Skip to content

Commit

Permalink
Merge pull request #4363 from pmatos/ReciprocalsFix
Browse files Browse the repository at this point in the history
Improve reciprocal estimate and tests
  • Loading branch information
Sonicadvance1 authored Mar 1, 2025
2 parents 57ed466 + 8e4a471 commit b7f58e6
Show file tree
Hide file tree
Showing 23 changed files with 1,338 additions and 254 deletions.
113 changes: 112 additions & 1 deletion FEXCore/Source/Interface/Core/JIT/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,10 @@ DEF_OP(VFRecp) {
fdiv(Dst.D(), VTMP1.D(), Vector.D());
break;
}
default: break;
default: {
LOGMAN_MSG_A_FMT("Unexpected ElementSize for {}", __func__);
FEX_UNREACHABLE;
}
}
} else {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
Expand All @@ -1539,6 +1542,46 @@ DEF_OP(VFRecp) {
}
}

DEF_OP(VFRecpPrecision) {
  const auto Op = IROp->C<IR::IROp_VFRecpPrecision>();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;

  // Higher-precision reciprocal used by 3DNow! PFRCP/PFRCPV (at least 14 bits
  // of precision per IR.json). Only 32-bit elements are supported, either as a
  // 32-bit scalar or a 64-bit two-element vector.
  // Fix: the format string was missing the `{}` placeholder, so the passed
  // __func__ argument was never rendered into the assert message.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation {}.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = OpSize == ElementSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector.ID());

  if (IsScalar) {
    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
      // The raw RPRES estimate is not precise enough on its own, so refine it
      // with one frecps (Newton-Raphson reciprocal step) and a multiply.
      frecpe(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      frecps(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }

    // No RPRES: compute an exact 1.0 / x with a real division.
    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
    // Element size is known to be 32bits
    fdiv(Dst.S(), VTMP1.S(), Vector.S());
  } else { // Vector operation - Opsize 64bits, elementsize 32bits
    if (HostSupportsRPRES) {
      // Same estimate-plus-refinement sequence, element-wise over the vector.
      frecpe(SubRegSize.Vector, VTMP1.D(), Vector.D());
      frecps(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), Vector.D());
      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
      return;
    }

    // No RPRES, so normal division
    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
  }
}

DEF_OP(VFRSqrt) {
const auto Op = IROp->C<IR::IROp_VFRSqrt>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -1608,6 +1651,50 @@ DEF_OP(VFRSqrt) {
}
}

DEF_OP(VFRSqrtPrecision) {
  const auto Op = IROp->C<IR::IROp_VFRSqrtPrecision>();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;

  // Higher-precision reciprocal square root used by 3DNow! PFRSQRT/PFRSQRTV
  // (at least 15 bits of precision per IR.json). Only 32-bit elements are
  // supported, either as a 32-bit scalar or a 64-bit two-element vector.
  // Fix: the format string was missing the `{}` placeholder, so the passed
  // __func__ argument was never rendered into the assert message.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation {}.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector.ID());

  if (IsScalar) {
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      // Improve initial estimate which is not good enough:
      // one Newton-Raphson step via frsqrts (est * rsqrt_step(est^2, x)).
      fmul(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), VTMP1.S());
      frsqrts(SubRegSize.Scalar, VTMP2.S(), VTMP2.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }

    // No RPRES: compute an exact 1.0 / sqrt(x).
    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
    // element size is known to be 32bits
    fsqrt(VTMP2.S(), Vector.S());
    fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
  } else { // Vector operation - Opsize 64bits, elementsize 32bits
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Vector, VTMP1.D(), Vector.D());
      // Improve initial estimate which is not good enough.
      fmul(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), VTMP1.D());
      frsqrts(SubRegSize.Vector, VTMP2.D(), VTMP2.D(), Vector.D());
      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
      return;
    }
    // No RPRES: exact per-element 1.0 / sqrt(x).
    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
    fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
  }
}

DEF_OP(VNot) {
const auto Op = IROp->C<IR::IROp_VNot>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -4466,5 +4553,29 @@ DEF_OP(VFNMLS) {
}
}

DEF_OP(VFCopySign) {
  auto Op = IROp->C<IR::IROp_VFCopySign>();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize248(IROp);

  // Vector1 supplies the magnitude, Vector2 supplies the sign.
  ARMEmitter::VRegister Magnitude = GetVReg(Op->Vector1.ID());
  ARMEmitter::VRegister Sign = GetVReg(Op->Vector2.ID());

  // We don't assign explicitly to Dst, but Dst and Magnitude are tied to the same register
  // (TiedSource in IR.json), so inserting into Magnitude writes the result.
  // Similar in semantics to C's copysignf.
  // movi builds the per-element mask 0x80 << 24; for 32-bit elements that is
  // the sign bit (0x80000000). bit then copies only the masked bits from Sign
  // into Magnitude, replacing its sign with Vector2's sign.
  switch (OpSize) {
  case IR::OpSize::i64Bit:
    movi(SubRegSize, VTMP1.D(), 0x80, 24);
    bit(Magnitude.D(), Sign.D(), VTMP1.D());
    break;
  case IR::OpSize::i128Bit:
    movi(SubRegSize, VTMP1.Q(), 0x80, 24);
    bit(Magnitude.Q(), Sign.Q(), VTMP1.Q());
    break;
  default: LOGMAN_MSG_A_FMT("Unsupported element size for operation {}", __func__); FEX_UNREACHABLE;
  }
}


#undef DEF_OP
} // namespace FEXCore::CPU
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ class OpDispatchBuilder final : public IREmitter {

void VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void RSqrt3DNowOp(OpcodeArgs, bool Duplicate);
template<FEXCore::IR::IROps IROp, IR::OpSize ElementSize>
void VectorUnaryDuplicateOp(OpcodeArgs);

Expand Down
8 changes: 4 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ constexpr std::tuple<uint8_t, uint8_t, FEXCore::X86Tables::OpDispatchPtr> OpDisp
{0x1C, 1, &OpDispatchBuilder::PF2IWOp},
{0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int<OpSize::i32Bit, false>},

{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, false>},

{0x8A, 1, &OpDispatchBuilder::PFNACCOp},
{0x8E, 1, &OpDispatchBuilder::PFPNACCOp},

{0x90, 1, &OpDispatchBuilder::VPFCMPOp<1>},
{0x94, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, true>},

{0x9A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
{0x9E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},
Expand Down
31 changes: 25 additions & 6 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,17 +626,36 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i64Bit>(OpcodeArgs);

void OpDispatchBuilder::RSqrt3DNowOp(OpcodeArgs, bool Duplicate) {
  constexpr auto ElementSize = OpSize::i32Bit;
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Source = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);

  // 3DNow! reciprocal square root: a negative source yields a result with the
  // source's sign, but the magnitude is computed as if the source were
  // positive. So take rsqrt(|x|) and then re-apply the original sign.
  Ref Magnitude = _VFAbs(SrcSize, ElementSize, Source);
  Ref Estimate = _VFRSqrtPrecision(SrcSize, ElementSize, Magnitude);
  Ref SignedEstimate = _VFCopySign(SrcSize, ElementSize, Estimate, Source);

  // When Duplicate is set, element 0 is broadcast across the destination.
  Ref Result = Duplicate ? _VDupElement(SrcSize, ElementSize, SignedEstimate, 0) : SignedEstimate;

  StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
}

void OpDispatchBuilder::VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
  // NOTE(review): the scraped diff interleaved the pre- and post-change lines
  // here (duplicate declarations of the source size and `Src`), which would
  // not compile. This is the coherent post-change version.
  //
  // In the event of a scalar operation and a vector source, then
  // we can specify the entire vector length in order to avoid
  // unnecessary sign extension on the element to be operated on.
  // In the event of a memory operand, we load the exact element size.
  const auto Size = OpSizeFromSrc(Op);

  Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);

  // DeriveOp uses the _VFSqrt call as an operand-layout template and
  // substitutes `IROp` as the actual opcode of the emitted node.
  DeriveOp(ALUOp, IROp, _VFSqrt(Size, ElementSize, Src));
  StoreResult(FPRClass, Op, ALUOp, OpSize::iInvalid);
}

Expand Down Expand Up @@ -676,8 +695,8 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
VectorUnaryDuplicateOpImpl(Op, IROp, ElementSize);
}

template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>(OpcodeArgs);
// TODO: there's only one instantiation of this template. Lets remove it.
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>(OpcodeArgs);

void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op);
Expand Down
39 changes: 39 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,12 @@
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
},
"FPR = VFCopySign OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": ["Returns a vector where each element has has the magniture of each corresponding element in vector1 and the sign of vector 2."],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
}
},
"Vector": {
Expand Down Expand Up @@ -1967,19 +1973,52 @@
},

"FPR = VFRecp OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal value - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRecpPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRecp but carrying more precision for 3DNow!",
"It provides at least 14 bits precision, with a relative error of at most 2^-14"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VFSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},

"FPR = VFRSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal Square Root - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRSqrtPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRSqrt but carrying more precision for 3DNow!",
"It provides at least 15 bits precision, with a relative error of at most 2^-15"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VCMPEQZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
Expand Down
80 changes: 77 additions & 3 deletions unittests/ASM/3DNow/86.asm
Original file line number Diff line number Diff line change
@@ -1,20 +1,72 @@
%ifdef CONFIG
{
"RegData": {
"MM0": "0x3f800000bf800000",
"MM1": "0x3c000000bc000000",
"MM2": "0xbf8000003f800000"
"RBX": "1",
"MM3": "0xff8000007f800000"
},
"HostFeatures": ["3DNOW"]
}
%endif

%include "checkprecision.mac"

section .text
global _start

_start:
pfrcpv mm0, [rel data1]
pfrcpv mm1, [rel data2]
pfrcpv mm2, [rel data3]
pfrcpv mm3, [rel data4]

; All calculated
; Now we extract all the values into memory to call check_relerr.
movd edx, mm0
mov [rel result11], edx

psrlq mm0, 32
movd edx, mm0
mov [rel result12], edx

movd edx, mm1
mov [rel result21], edx

psrlq mm1, 32
movd edx, mm1
mov [rel result22], edx

movd edx, mm2
mov [rel result31], edx

psrlq mm2, 32
movd edx, mm2
mov [rel result32], edx

check_relerr rel eresult11, rel result11, rel tolerance
mov ebx, eax
check_relerr rel eresult12, rel result12, rel tolerance
and ebx, eax
check_relerr rel eresult21, rel result21, rel tolerance
and ebx, eax
check_relerr rel eresult22, rel result22, rel tolerance
and ebx, eax
check_relerr rel eresult31, rel result31, rel tolerance
and ebx, eax
check_relerr rel eresult32, rel result32, rel tolerance
and ebx, eax

hlt

section .bss
align 32
result11: resd 1
result12: resd 1
result21: resd 1
result22: resd 1
result31: resd 1
result32: resd 1

section .data
align 8
data1:
dd -1.0
Expand All @@ -27,3 +79,25 @@ dd 128.0
data3:
dd 1.0
dd -1.0

data4:
dd 0.0
dd -0.0

eresult11:
dd -1.0
eresult12:
dd 1.0
eresult21:
dd 0xbc000000 ; -1/128
eresult22:
dd 0x3c000000 ; 1/128
eresult31:
dd 1.0
eresult32:
dd -1.0

tolerance:
dd 0x38800000 ; 2^-14 - 14bit accuracy

define_check_data_constants
Loading

0 comments on commit b7f58e6

Please sign in to comment.