Skip to content

Commit

Permalink
Improve reciprocal estimate and tests
Browse files Browse the repository at this point in the history
Reciprocal estimations did not have enough accuracy. Tests were enabled
to check for accurate values of reciprocals.

* where needed, reciprocal accuracy was increased.
* 3DNow sqrt reciprocal fixed for negative values.
* New helper VFCopySign IR op added.

Fixes #4319.
  • Loading branch information
pmatos committed Feb 27, 2025
1 parent 717015b commit c58ba7b
Show file tree
Hide file tree
Showing 21 changed files with 1,315 additions and 241 deletions.
137 changes: 130 additions & 7 deletions FEXCore/Source/Interface/Core/JIT/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ tags: backend|arm64
$end_info$
*/


#include "FEXCore/Utils/CompilerDefs.h"
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
#include "Interface/Core/Dispatcher/Dispatcher.h"
#include "Interface/Core/JIT/JITClass.h"
#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"

#include <FEXCore/Utils/MathUtils.h>

Expand Down Expand Up @@ -529,7 +534,11 @@ DEF_OP(VFRSqrtScalarInsert) {

auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
frsqrte(SubRegSize.Scalar, Dst.D(), Src.D());
frsqrte(SubRegSize.Scalar, VTMP1.D(), Src.D());
fmul(SubRegSize.Scalar, VTMP2.D(), VTMP1.D(), VTMP1.D());
frsqrts(SubRegSize.Scalar, VTMP2.D(), VTMP2.D(), Src.D());
fmul(SubRegSize.Scalar, VTMP1.D(), VTMP1.D(), VTMP2.D());
ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
};

std::array<ScalarUnaryOpCaller, 2> Handlers = {
Expand All @@ -556,12 +565,18 @@ DEF_OP(VFRecpScalarInsert) {
auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);

fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
fdiv(SubRegSize.Scalar, Dst, VTMP1, Src);
if (HostSupportsAFP) {
fdiv(SubRegSize.Scalar, VTMP1, VTMP1, Src);
ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
} else {
fdiv(SubRegSize.Scalar, Dst, VTMP1, Src);
}
};

auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant<ARMEmitter::VRegister, ARMEmitter::Register> SrcVar) {
auto Src = *std::get_if<ARMEmitter::VRegister>(&SrcVar);
frecpe(SubRegSize.Scalar, Dst, Src);
frecpe(SubRegSize.Scalar, VTMP1, Src);
ins(SubRegSize.Vector, Dst, 0, VTMP1, 0);
};

std::array<ScalarUnaryOpCaller, 2> Handlers = {
Expand Down Expand Up @@ -1488,7 +1503,6 @@ DEF_OP(VFRecp) {
} else {
if (IsScalar) {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
// RPRES gives enough precision for this.
frecpe(SubRegSize.Scalar, Dst.S(), Vector.S());
return;
}
Expand All @@ -1507,7 +1521,10 @@ DEF_OP(VFRecp) {
fdiv(Dst.D(), VTMP1.D(), Vector.D());
break;
}
default: break;
default: {
LOGMAN_MSG_A_FMT("Unexpected ElementSize for {}", __func__);
FEX_UNREACHABLE;
}
}
} else {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
Expand All @@ -1526,6 +1543,46 @@ DEF_OP(VFRecp) {
}
}

DEF_OP(VFRecpPrecision) {
const auto Op = IROp->C<IR::IROp_VFRecpPrecision>();
const auto OpSize = IROp->Size;
const auto ElementSize = Op->Header.ElementSize;

LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
"Unexpected sizes for operation.", __func__);

const auto SubRegSize = ConvertSubRegSizePair16(IROp);
const auto IsScalar = OpSize == ElementSize;

const auto Dst = GetVReg(Node);
const auto Vector = GetVReg(Op->Vector.ID());

if (IsScalar) {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
// Not enough precision so we need to improve it with frecps
frecpe(SubRegSize.Scalar, VTMP1.S(), Vector.S());
frecps(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), Vector.S());
fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
return;
}

fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
// Element size is known to be 32bits
fdiv(Dst.S(), VTMP1.S(), Vector.S());
} else { // Vector operation - Opsize 64bits, elementsize 32bits
if (HostSupportsRPRES) {
frecpe(SubRegSize.Vector, VTMP1.D(), Vector.D());
frecps(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), Vector.D());
fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
return;
}

// No RPRES, so normal division
fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
}
}

DEF_OP(VFRSqrt) {
const auto Op = IROp->C<IR::IROp_VFRSqrt>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -1553,7 +1610,6 @@ DEF_OP(VFRSqrt) {
} else {
if (IsScalar) {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
// RPRES gives enough precision for this.
frsqrte(SubRegSize.Scalar, Dst.S(), Vector.S());
return;
}
Expand Down Expand Up @@ -1587,14 +1643,57 @@ DEF_OP(VFRSqrt) {
}
return;
}

fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
}
}
}

DEF_OP(VFRSqrtPrecision) {
const auto Op = IROp->C<IR::IROp_VFRSqrtPrecision>();
const auto OpSize = IROp->Size;
const auto ElementSize = Op->Header.ElementSize;


LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
"Unexpected sizes for operation.", __func__);

const auto SubRegSize = ConvertSubRegSizePair16(IROp);
const auto IsScalar = ElementSize == OpSize;

const auto Dst = GetVReg(Node);
const auto Vector = GetVReg(Op->Vector.ID());

if (IsScalar) {
if (HostSupportsRPRES) {
frsqrte(SubRegSize.Scalar, VTMP1.S(), Vector.S());
// Improve initial estimate which is not good enough.
fmul(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), VTMP1.S());
frsqrts(SubRegSize.Scalar, VTMP2.S(), VTMP2.S(), Vector.S());
fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
return;
}

fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
// element size is known to be 32bits
fsqrt(VTMP2.S(), Vector.S());
fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
} else {
if (HostSupportsRPRES) {
frsqrte(SubRegSize.Vector, VTMP1.D(), Vector.D());
// Improve initial estimate which is not good enough.
fmul(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), VTMP1.D());
frsqrts(SubRegSize.Vector, VTMP2.D(), VTMP2.D(), Vector.D());
fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
return;
}
fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
}
}

DEF_OP(VNot) {
const auto Op = IROp->C<IR::IROp_VNot>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -4453,5 +4552,29 @@ DEF_OP(VFNMLS) {
}
}

DEF_OP(VFCopySign) {
auto Op = IROp->C<IR::IROp_VFCopySign>();
const auto OpSize = IROp->Size;
const auto SubRegSize = ConvertSubRegSize248(IROp);

ARMEmitter::VRegister Magnitude = GetVReg(Op->Vector1.ID());
ARMEmitter::VRegister Sign = GetVReg(Op->Vector2.ID());

// We don't assign explicity to Dst but Dst and Magniture are tied to the same register.
// Similar in semantics to C's copysignf.
switch (OpSize) {
case IR::OpSize::i64Bit:
movi(SubRegSize, VTMP1.D(), 0x80, 24);
bit(Magnitude.D(), Sign.D(), VTMP1.D());
break;
case IR::OpSize::i128Bit:
movi(SubRegSize, VTMP1.Q(), 0x80, 24);
bit(Magnitude.Q(), Sign.Q(), VTMP1.Q());
break;
default: LOGMAN_MSG_A_FMT("Unsupported element size for operation {}", __func__); FEX_UNREACHABLE;
}
}


#undef DEF_OP
} // namespace FEXCore::CPU
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ class OpDispatchBuilder final : public IREmitter {

void VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void RSqrt3DNowOp(OpcodeArgs, bool Duplicate);
template<FEXCore::IR::IROps IROp, IR::OpSize ElementSize>
void VectorUnaryDuplicateOp(OpcodeArgs);

Expand Down
8 changes: 4 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ constexpr std::tuple<uint8_t, uint8_t, FEXCore::X86Tables::OpDispatchPtr> OpDisp
{0x1C, 1, &OpDispatchBuilder::PF2IWOp},
{0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int<OpSize::i32Bit, false>},

{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, false>},

{0x8A, 1, &OpDispatchBuilder::PFNACCOp},
{0x8E, 1, &OpDispatchBuilder::PFPNACCOp},

{0x90, 1, &OpDispatchBuilder::VPFCMPOp<1>},
{0x94, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, true>},

{0x9A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
{0x9E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},
Expand Down
31 changes: 25 additions & 6 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,17 +626,36 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i64Bit>(OpcodeArgs);

void OpDispatchBuilder::RSqrt3DNowOp(OpcodeArgs, bool Duplicate) {
const auto Size = OpSizeFromSrc(Op);
const auto ElementSize = OpSize::i32Bit;

Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);

// For the sqrt reciprocal in 3DNow!, if the source is negative,
// then the result has the same sign as the source but the result is always calculated
// as if the source was positive.
Ref AbsSrc = _VFAbs(Size, ElementSize, Src);
Ref PosRSqrt = _VFRSqrtPrecision(Size, ElementSize, AbsSrc);
Ref Result = _VFCopySign(Size, ElementSize, PosRSqrt, Src);

if (Duplicate) {
Result = _VDupElement(Size, ElementSize, Result, 0);
}

StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
}

void OpDispatchBuilder::VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
// In the event of a scalar operation and a vector source, then
// we can specify the entire vector length in order to avoid
// unnecessary sign extension on the element to be operated on.
// In the event of a memory operand, we load the exact element size.
const auto SrcSize = OpSizeFromSrc(Op);

Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);
const auto Size = OpSizeFromSrc(Op);

DeriveOp(ALUOp, IROp, _VFSqrt(SrcSize, ElementSize, Src));
Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);

DeriveOp(ALUOp, IROp, _VFSqrt(Size, ElementSize, Src));
StoreResult(FPRClass, Op, ALUOp, OpSize::iInvalid);
}

Expand Down Expand Up @@ -676,8 +695,8 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
VectorUnaryDuplicateOpImpl(Op, IROp, ElementSize);
}

template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>(OpcodeArgs);
// TODO: there's only one instantiation of this template. Lets remove it.
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>(OpcodeArgs);

void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op);
Expand Down
39 changes: 39 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,12 @@
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
},
"FPR = VFCopySign OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": ["Returns a vector where each element has has the magniture of each corresponding element in vector1 and the sign of vector 2."],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
}
},
"Vector": {
Expand Down Expand Up @@ -1967,19 +1973,52 @@
},

"FPR = VFRecp OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal value - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRecpPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRecp but carrying more precision for 3DNow!",
"It provides at least 14 bits precision, with a relative error of at most 2^-14"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VFSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},

"FPR = VFRSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal Square Root - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRSqrtPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRSqrt but carrying more precision for 3DNow!",
"It provides at least 15 bits precision, with a relative error of at most 2^-15"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VCMPEQZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
Expand Down
Loading

0 comments on commit c58ba7b

Please sign in to comment.