Improve reciprocal estimate and tests

Reciprocal estimations did not have enough accuracy. Tests were enabled to check for accurate values of reciprocals. * where needed, reciprocal accuracy was increased. * 3DNow sqrt reciprocal fixed for negative values. * New helper VFCopySign IR op added. Fixes #4319.
FEX-Emu · Feb 27, 2025 · c58ba7b · c58ba7b
1 parent 717015b
commit c58ba7b
Show file tree

Hide file tree

Showing 21 changed files with 1,315 additions and 241 deletions.
diff --git a/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp b/FEXCore/Source/Interface/Core/JIT/VectorOps.cpp
@@ -5,8 +5,13 @@ tags: backend|arm64
 $end_info$
 */
 
+
+#include "FEXCore/Utils/CompilerDefs.h"
+#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
 #include "Interface/Core/Dispatcher/Dispatcher.h"
 #include "Interface/Core/JIT/JITClass.h"
+#include "CodeEmitter/Emitter.h"
+#include "Interface/IR/IR.h"
 
 #include <FEXCore/Utils/MathUtils.h>
 
@@ -529,7 +534,11 @@ DEF_OP(VFRSqrtScalarInsert) {
 
   auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
     auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
-    frsqrte(SubRegSize.Scalar, Dst.D(), Src.D());
+    frsqrte(SubRegSize.Scalar, VTMP1.D(), Src.D());
+    fmul(SubRegSize.Scalar, VTMP2.D(), VTMP1.D(), VTMP1.D());
+    frsqrts(SubRegSize.Scalar, VTMP2.D(), VTMP2.D(), Src.D());
+    fmul(SubRegSize.Scalar, VTMP1.D(), VTMP1.D(), VTMP2.D());
+    ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
   };
 
   std::array<ScalarUnaryOpCaller, 2> Handlers = {
@@ -556,12 +565,18 @@ DEF_OP(VFRecpScalarInsert) {
     auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
 
     fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
-    fdiv(SubRegSize.Scalar, Dst, VTMP1, Src);
+    if (HostSupportsAFP) {
+      fdiv(SubRegSize.Scalar, VTMP1, VTMP1, Src);
+      ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
+    } else {
+      fdiv(SubRegSize.Scalar, Dst, VTMP1, Src);
+    }
   };
 
   auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
     auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
-    frecpe(SubRegSize.Scalar, Dst, Src);
+    frecpe(SubRegSize.Scalar, VTMP1, Src);
+    ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
   };
 
   std::array<ScalarUnaryOpCaller, 2> Handlers = {
@@ -1488,7 +1503,6 @@ DEF_OP(VFRecp) {
   } else {
     if (IsScalar) {
       if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
-        // RPRES gives enough precision for this.
         frecpe(SubRegSize.Scalar, Dst.S(), Vector.S());
         return;
       }
@@ -1507,7 +1521,10 @@ DEF_OP(VFRecp) {
         fdiv(Dst.D(), VTMP1.D(), Vector.D());
         break;
       }
-      default: break;
+      default: {
+        LOGMAN_MSG_A_FMT("Unexpected ElementSize for {}", __func__);
+        FEX_UNREACHABLE;
+      }
       }
     } else {
       if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
@@ -1526,6 +1543,46 @@ DEF_OP(VFRecp) {
   }
 }
 
+DEF_OP(VFRecpPrecision) {
+  const auto Op = IROp->C<IR::IROp_VFRecpPrecision>();
+  const auto OpSize = IROp->Size;
+  const auto ElementSize = Op->Header.ElementSize;
+
+  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
+                     "Unexpected sizes for operation.", __func__);
+
+  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
+  const auto IsScalar = OpSize == ElementSize;
+
+  const auto Dst = GetVReg(Node);
+  const auto Vector = GetVReg(Op->Vector.ID());
+
+  if (IsScalar) {
+    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
+      // Not enough precision so we need to improve it with frecps
+      frecpe(SubRegSize.Scalar, VTMP1.S(), Vector.S());
+      frecps(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), Vector.S());
+      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
+      return;
+    }
+
+    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
+    // Element size is known to be 32bits
+    fdiv(Dst.S(), VTMP1.S(), Vector.S());
+  } else { // Vector operation - Opsize 64bits, elementsize 32bits
+    if (HostSupportsRPRES) {
+      frecpe(SubRegSize.Vector, VTMP1.D(), Vector.D());
+      frecps(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), Vector.D());
+      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
+      return;
+    }
+
+    // No RPRES, so normal division
+    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
+    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
+  }
+}
+
 DEF_OP(VFRSqrt) {
   const auto Op = IROp->C<IR::IROp_VFRSqrt>();
   const auto OpSize = IROp->Size;
@@ -1553,7 +1610,6 @@ DEF_OP(VFRSqrt) {
   } else {
     if (IsScalar) {
       if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
-        // RPRES gives enough precision for this.
         frsqrte(SubRegSize.Scalar, Dst.S(), Vector.S());
         return;
       }
@@ -1587,14 +1643,57 @@ DEF_OP(VFRSqrt) {
         }
         return;
       }
-
       fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
       fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
       fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
     }
   }
 }
 
+DEF_OP(VFRSqrtPrecision) {
+  const auto Op = IROp->C<IR::IROp_VFRSqrtPrecision>();
+  const auto OpSize = IROp->Size;
+  const auto ElementSize = Op->Header.ElementSize;
+
+
+  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
+                     "Unexpected sizes for operation.", __func__);
+
+  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
+  const auto IsScalar = ElementSize == OpSize;
+
+  const auto Dst = GetVReg(Node);
+  const auto Vector = GetVReg(Op->Vector.ID());
+
+  if (IsScalar) {
+    if (HostSupportsRPRES) {
+      frsqrte(SubRegSize.Scalar, VTMP1.S(), Vector.S());
+      // Improve initial estimate which is not good enough.
+      fmul(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), VTMP1.S());
+      frsqrts(SubRegSize.Scalar, VTMP2.S(), VTMP2.S(), Vector.S());
+      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
+      return;
+    }
+
+    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
+    // element size is known to be 32bits
+    fsqrt(VTMP2.S(), Vector.S());
+    fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
+  } else {
+    if (HostSupportsRPRES) {
+      frsqrte(SubRegSize.Vector, VTMP1.D(), Vector.D());
+      // Improve initial estimate which is not good enough.
+      fmul(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), VTMP1.D());
+      frsqrts(SubRegSize.Vector, VTMP2.D(), VTMP2.D(), Vector.D());
+      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
+      return;
+    }
+    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
+    fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
+    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
+  }
+}
+
 DEF_OP(VNot) {
   const auto Op = IROp->C<IR::IROp_VNot>();
   const auto OpSize = IROp->Size;
@@ -4453,5 +4552,29 @@ DEF_OP(VFNMLS) {
   }
 }
 
+DEF_OP(VFCopySign) {
+  auto Op = IROp->C<IR::IROp_VFCopySign>();
+  const auto OpSize = IROp->Size;
+  const auto SubRegSize = ConvertSubRegSize248(IROp);
+
+  ARMEmitter::VRegister Magnitude = GetVReg(Op->Vector1.ID());
+  ARMEmitter::VRegister Sign = GetVReg(Op->Vector2.ID());
+
+  //  We don't assign explicity to Dst but Dst and Magniture are tied to the same register.
+  //  Similar in semantics to C's copysignf.
+  switch (OpSize) {
+  case IR::OpSize::i64Bit:
+    movi(SubRegSize, VTMP1.D(), 0x80, 24);
+    bit(Magnitude.D(), Sign.D(), VTMP1.D());
+    break;
+  case IR::OpSize::i128Bit:
+    movi(SubRegSize, VTMP1.Q(), 0x80, 24);
+    bit(Magnitude.Q(), Sign.Q(), VTMP1.Q());
+    break;
+  default: LOGMAN_MSG_A_FMT("Unsupported element size for operation {}", __func__); FEX_UNREACHABLE;
+  }
+}
+
+
 #undef DEF_OP
 } // namespace FEXCore::CPU
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -434,6 +434,7 @@ class OpDispatchBuilder final : public IREmitter {
 
   void VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
   void VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
+  void RSqrt3DNowOp(OpcodeArgs, bool Duplicate);
   template<FEXCore::IR::IROps IROp, IR::OpSize ElementSize>
   void VectorUnaryDuplicateOp(OpcodeArgs);
 

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h
@@ -9,16 +9,16 @@ constexpr std::tuple<uint8_t, uint8_t, FEXCore::X86Tables::OpDispatchPtr> OpDisp
   {0x1C, 1, &OpDispatchBuilder::PF2IWOp},
   {0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int<OpSize::i32Bit, false>},
 
-  {0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>},
-  {0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>},
+  {0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
+  {0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, false>},
 
   {0x8A, 1, &OpDispatchBuilder::PFNACCOp},
   {0x8E, 1, &OpDispatchBuilder::PFPNACCOp},
 
   {0x90, 1, &OpDispatchBuilder::VPFCMPOp<1>},
   {0x94, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
-  {0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>},
-  {0x97, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>},
+  {0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
+  {0x97, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, true>},
 
   {0x9A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
   {0x9E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
@@ -626,17 +626,36 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
 template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i32Bit>(OpcodeArgs);
 template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i64Bit>(OpcodeArgs);
 
+void OpDispatchBuilder::RSqrt3DNowOp(OpcodeArgs, bool Duplicate) {
+  const auto Size = OpSizeFromSrc(Op);
+  const auto ElementSize = OpSize::i32Bit;
+
+  Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);
+
+  // For the sqrt reciprocal in 3DNow!, if the source is negative,
+  // then the result has the same sign as the source but the result is always calculated
+  // as if the source was positive.
+  Ref AbsSrc = _VFAbs(Size, ElementSize, Src);
+  Ref PosRSqrt = _VFRSqrtPrecision(Size, ElementSize, AbsSrc);
+  Ref Result = _VFCopySign(Size, ElementSize, PosRSqrt, Src);
+
+  if (Duplicate) {
+    Result = _VDupElement(Size, ElementSize, Result, 0);
+  }
+
+  StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
+}
+
 void OpDispatchBuilder::VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
   // In the event of a scalar operation and a vector source, then
   // we can specify the entire vector length in order to avoid
   // unnecessary sign extension on the element to be operated on.
   // In the event of a memory operand, we load the exact element size.
-  const auto SrcSize = OpSizeFromSrc(Op);
-
-  Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);
+  const auto Size = OpSizeFromSrc(Op);
 
-  DeriveOp(ALUOp, IROp, _VFSqrt(SrcSize, ElementSize, Src));
+  Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);
 
+  DeriveOp(ALUOp, IROp, _VFSqrt(Size, ElementSize, Src));
   StoreResult(FPRClass, Op, ALUOp, OpSize::iInvalid);
 }
 
@@ -676,8 +695,8 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
   VectorUnaryDuplicateOpImpl(Op, IROp, ElementSize);
 }
 
-template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>(OpcodeArgs);
-template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>(OpcodeArgs);
+// TODO: there's only one instantiation of this template. Lets remove it.
+template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>(OpcodeArgs);
 
 void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
   const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op);

diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
@@ -1881,6 +1881,12 @@
         "DestSize": "RegisterSize",
         "ElementSize": "ElementSize",
         "TiedSource": 0
+      },
+      "FPR = VFCopySign OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1,  FPR:$Vector2": {
+        "Desc": ["Returns a vector where each element has has the magniture of each corresponding element in vector1 and the sign of vector 2."],
+        "DestSize": "RegisterSize",
+        "ElementSize": "ElementSize",
+        "TiedSource": 0
       }
     },
     "Vector": {
@@ -1967,19 +1973,52 @@
       },
 
       "FPR = VFRecp OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
+        "Desc": [
+          "Reciprocal value - matches the precision required by the x86 spec.",
+          "It has a relative error of at most 1.5 * 2^-12"
+        ],
         "DestSize": "RegisterSize",
         "ElementSize": "ElementSize"
       },
+      "FPR = VFRecpPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
+        "Desc": [
+          "Similar to VFRecp but carrying more precision for 3DNow!",
+          "It provides at least 14 bits precision, with a relative error of at most 2^-14"
+        ],
+        "DestSize": "RegisterSize",
+        "ElementSize": "ElementSize",
+        "EmitValidation": [
+          "RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
+          "ElementSize == FEXCore::IR::OpSize::i32Bit"
+        ]
+      },
 
       "FPR = VFSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
         "DestSize": "RegisterSize",
         "ElementSize": "ElementSize"
       },
 
       "FPR = VFRSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
+        "Desc": [
+          "Reciprocal Square Root - matches the precision required by the x86 spec.",
+          "It has a relative error of at most 1.5 * 2^-12"
+        ],
         "DestSize": "RegisterSize",
         "ElementSize": "ElementSize"
       },
+      "FPR = VFRSqrtPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
+        "Desc": [
+          "Similar to VFRSqrt but carrying more precision for 3DNow!",
+          "It provides at least 15 bits precision, with a relative error of at most 2^-15"
+        ],
+        "DestSize": "RegisterSize",
+        "ElementSize": "ElementSize",
+        "EmitValidation": [
+          "RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
+          "ElementSize == FEXCore::IR::OpSize::i32Bit"
+        ]
+      },
+
       "FPR = VCMPEQZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
         "DestSize": "RegisterSize",
         "ElementSize": "ElementSize"