Skip to content

Commit

Permalink
Merge pull request #4363 from pmatos/ReciprocalsFix
Browse files Browse the repository at this point in the history
Improve reciprocal estimate and tests
  • Loading branch information
Sonicadvance1 authored Mar 1, 2025
2 parents 57ed466 + 8e4a471 commit b7f58e6
Show file tree
Hide file tree
Showing 23 changed files with 1,338 additions and 254 deletions.
113 changes: 112 additions & 1 deletion FEXCore/Source/Interface/Core/JIT/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,10 @@ DEF_OP(VFRecp) {
fdiv(Dst.D(), VTMP1.D(), Vector.D());
break;
}
default: break;
default: {
LOGMAN_MSG_A_FMT("Unexpected ElementSize for {}", __func__);
FEX_UNREACHABLE;
}
}
} else {
if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
Expand All @@ -1539,6 +1542,46 @@ DEF_OP(VFRecp) {
}
}

DEF_OP(VFRecpPrecision) {
  const auto Op = IROp->C<IR::IROp_VFRecpPrecision>();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;

  // Higher-precision reciprocal used by 3DNow! PFRCP/PFRCPV (at least 14 bits
  // of precision per IR.json). Only 32-bit elements are supported, either as a
  // 32-bit scalar or a 64-bit two-element vector.
  // Fix: the format string was missing the `{}` placeholder, so the passed
  // __func__ argument was never rendered into the assert message.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation {}.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = OpSize == ElementSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector.ID());

  if (IsScalar) {
    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
      // The raw RPRES estimate is not precise enough on its own, so refine it
      // with one frecps (Newton-Raphson reciprocal step) and a multiply.
      frecpe(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      frecps(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }

    // No RPRES: compute an exact 1.0 / x with a real division.
    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
    // Element size is known to be 32bits
    fdiv(Dst.S(), VTMP1.S(), Vector.S());
  } else { // Vector operation - Opsize 64bits, elementsize 32bits
    if (HostSupportsRPRES) {
      // Same estimate-plus-refinement sequence, element-wise over the vector.
      frecpe(SubRegSize.Vector, VTMP1.D(), Vector.D());
      frecps(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), Vector.D());
      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
      return;
    }

    // No RPRES, so normal division
    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
  }
}

DEF_OP(VFRSqrt) {
const auto Op = IROp->C<IR::IROp_VFRSqrt>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -1608,6 +1651,50 @@ DEF_OP(VFRSqrt) {
}
}

DEF_OP(VFRSqrtPrecision) {
  const auto Op = IROp->C<IR::IROp_VFRSqrtPrecision>();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;

  // Higher-precision reciprocal square root used by 3DNow! PFRSQRT/PFRSQRTV
  // (at least 15 bits of precision per IR.json). Only 32-bit elements are
  // supported, either as a 32-bit scalar or a 64-bit two-element vector.
  // Fix: the format string was missing the `{}` placeholder, so the passed
  // __func__ argument was never rendered into the assert message.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation {}.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector.ID());

  if (IsScalar) {
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      // Improve initial estimate which is not good enough:
      // one Newton-Raphson step via frsqrts (est * rsqrt_step(est^2, x)).
      fmul(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), VTMP1.S());
      frsqrts(SubRegSize.Scalar, VTMP2.S(), VTMP2.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }

    // No RPRES: compute an exact 1.0 / sqrt(x).
    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
    // element size is known to be 32bits
    fsqrt(VTMP2.S(), Vector.S());
    fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
  } else { // Vector operation - Opsize 64bits, elementsize 32bits
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Vector, VTMP1.D(), Vector.D());
      // Improve initial estimate which is not good enough.
      fmul(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), VTMP1.D());
      frsqrts(SubRegSize.Vector, VTMP2.D(), VTMP2.D(), Vector.D());
      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
      return;
    }
    // No RPRES: exact per-element 1.0 / sqrt(x).
    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
    fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
  }
}

DEF_OP(VNot) {
const auto Op = IROp->C<IR::IROp_VNot>();
const auto OpSize = IROp->Size;
Expand Down Expand Up @@ -4466,5 +4553,29 @@ DEF_OP(VFNMLS) {
}
}

DEF_OP(VFCopySign) {
  auto Op = IROp->C<IR::IROp_VFCopySign>();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize248(IROp);

  // Vector1 supplies the magnitude, Vector2 supplies the sign.
  ARMEmitter::VRegister Magnitude = GetVReg(Op->Vector1.ID());
  ARMEmitter::VRegister Sign = GetVReg(Op->Vector2.ID());

  // We don't assign explicitly to Dst, but Dst and Magnitude are tied to the same register
  // (TiedSource in IR.json), so inserting into Magnitude writes the result.
  // Similar in semantics to C's copysignf.
  // movi builds the per-element mask 0x80 << 24; for 32-bit elements that is
  // the sign bit (0x80000000). bit then copies only the masked bits from Sign
  // into Magnitude, replacing its sign with Vector2's sign.
  switch (OpSize) {
  case IR::OpSize::i64Bit:
    movi(SubRegSize, VTMP1.D(), 0x80, 24);
    bit(Magnitude.D(), Sign.D(), VTMP1.D());
    break;
  case IR::OpSize::i128Bit:
    movi(SubRegSize, VTMP1.Q(), 0x80, 24);
    bit(Magnitude.Q(), Sign.Q(), VTMP1.Q());
    break;
  default: LOGMAN_MSG_A_FMT("Unsupported element size for operation {}", __func__); FEX_UNREACHABLE;
  }
}


#undef DEF_OP
} // namespace FEXCore::CPU
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ class OpDispatchBuilder final : public IREmitter {

void VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize);
void RSqrt3DNowOp(OpcodeArgs, bool Duplicate);
template<FEXCore::IR::IROps IROp, IR::OpSize ElementSize>
void VectorUnaryDuplicateOp(OpcodeArgs);

Expand Down
8 changes: 4 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ constexpr std::tuple<uint8_t, uint8_t, FEXCore::X86Tables::OpDispatchPtr> OpDisp
{0x1C, 1, &OpDispatchBuilder::PF2IWOp},
{0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int<OpSize::i32Bit, false>},

{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, false>},

{0x8A, 1, &OpDispatchBuilder::PFNACCOp},
{0x8E, 1, &OpDispatchBuilder::PFPNACCOp},

{0x90, 1, &OpDispatchBuilder::VPFCMPOp<1>},
{0x94, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>},
{0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
{0x97, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, true>},

{0x9A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
{0x9E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},
Expand Down
31 changes: 25 additions & 6 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,17 +626,36 @@ void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalarFCMPOp<OpSize::i64Bit>(OpcodeArgs);

void OpDispatchBuilder::RSqrt3DNowOp(OpcodeArgs, bool Duplicate) {
  constexpr auto ElementSize = OpSize::i32Bit;
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Source = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags);

  // 3DNow! reciprocal square root: a negative source yields a result with the
  // source's sign, but the magnitude is computed as if the source were
  // positive. So take rsqrt(|x|) and then re-apply the original sign.
  Ref Magnitude = _VFAbs(SrcSize, ElementSize, Source);
  Ref Estimate = _VFRSqrtPrecision(SrcSize, ElementSize, Magnitude);
  Ref SignedEstimate = _VFCopySign(SrcSize, ElementSize, Estimate, Source);

  // When Duplicate is set, element 0 is broadcast across the destination.
  Ref Result = Duplicate ? _VDupElement(SrcSize, ElementSize, SignedEstimate, 0) : SignedEstimate;

  StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
}

void OpDispatchBuilder::VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
  // NOTE(review): the scraped diff interleaved the pre- and post-change lines
  // here (duplicate declarations of the source size and `Src`), which would
  // not compile. This is the coherent post-change version.
  //
  // In the event of a scalar operation and a vector source, then
  // we can specify the entire vector length in order to avoid
  // unnecessary sign extension on the element to be operated on.
  // In the event of a memory operand, we load the exact element size.
  const auto Size = OpSizeFromSrc(Op);

  Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], Size, Op->Flags);

  // DeriveOp uses the _VFSqrt call as an operand-layout template and
  // substitutes `IROp` as the actual opcode of the emitted node.
  DeriveOp(ALUOp, IROp, _VFSqrt(Size, ElementSize, Src));
  StoreResult(FPRClass, Op, ALUOp, OpSize::iInvalid);
}

Expand Down Expand Up @@ -676,8 +695,8 @@ void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
VectorUnaryDuplicateOpImpl(Op, IROp, ElementSize);
}

template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRSQRT, OpSize::i32Bit>(OpcodeArgs);
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECP, OpSize::i32Bit>(OpcodeArgs);
// TODO: there's only one instantiation of this template. Lets remove it.
template void OpDispatchBuilder::VectorUnaryDuplicateOp<IR::OP_VFRECPPRECISION, OpSize::i32Bit>(OpcodeArgs);

void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op);
Expand Down
39 changes: 39 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,12 @@
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
},
"FPR = VFCopySign OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": {
"Desc": ["Returns a vector where each element has has the magniture of each corresponding element in vector1 and the sign of vector 2."],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"TiedSource": 0
}
},
"Vector": {
Expand Down Expand Up @@ -1967,19 +1973,52 @@
},

"FPR = VFRecp OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal value - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRecpPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRecp but carrying more precision for 3DNow!",
"It provides at least 14 bits precision, with a relative error of at most 2^-14"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VFSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},

"FPR = VFRSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Reciprocal Square Root - matches the precision required by the x86 spec.",
"It has a relative error of at most 1.5 * 2^-12"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
"FPR = VFRSqrtPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"Desc": [
"Similar to VFRSqrt but carrying more precision for 3DNow!",
"It provides at least 15 bits precision, with a relative error of at most 2^-15"
],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize",
"EmitValidation": [
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit",
"ElementSize == FEXCore::IR::OpSize::i32Bit"
]
},

"FPR = VCMPEQZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": {
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
Expand Down
80 changes: 77 additions & 3 deletions unittests/ASM/3DNow/86.asm
Original file line number Diff line number Diff line change
@@ -1,20 +1,72 @@
%ifdef CONFIG
{
"RegData": {
"MM0": "0x3f800000bf800000",
"MM1": "0x3c000000bc000000",
"MM2": "0xbf8000003f800000"
"RBX": "1",
"MM3": "0xff8000007f800000"
},
"HostFeatures": ["3DNOW"]
}
%endif

%include "checkprecision.mac"

section .text
global _start

_start:
pfrcpv mm0, [rel data1]
pfrcpv mm1, [rel data2]
pfrcpv mm2, [rel data3]
pfrcpv mm3, [rel data4]

; All calculated
; Now we extract all the values into memory to call check_relerr.
movd edx, mm0
mov [rel result11], edx

psrlq mm0, 32
movd edx, mm0
mov [rel result12], edx

movd edx, mm1
mov [rel result21], edx

psrlq mm1, 32
movd edx, mm1
mov [rel result22], edx

movd edx, mm2
mov [rel result31], edx

psrlq mm2, 32
movd edx, mm2
mov [rel result32], edx

check_relerr rel eresult11, rel result11, rel tolerance
mov ebx, eax
check_relerr rel eresult12, rel result12, rel tolerance
and ebx, eax
check_relerr rel eresult21, rel result21, rel tolerance
and ebx, eax
check_relerr rel eresult22, rel result22, rel tolerance
and ebx, eax
check_relerr rel eresult31, rel result31, rel tolerance
and ebx, eax
check_relerr rel eresult32, rel result32, rel tolerance
and ebx, eax

hlt

section .bss
align 32
result11: resd 1
result12: resd 1
result21: resd 1
result22: resd 1
result31: resd 1
result32: resd 1

section .data
align 8
data1:
dd -1.0
Expand All @@ -27,3 +79,25 @@ dd 128.0
data3:
dd 1.0
dd -1.0

data4:
dd 0.0
dd -0.0

eresult11:
dd -1.0
eresult12:
dd 1.0
eresult21:
dd 0xbc000000 ; -1/128
eresult22:
dd 0x3c000000 ; 1/128
eresult31:
dd 1.0
eresult32:
dd -1.0

tolerance:
dd 0x38800000 ; 2^-14 - 14bit accuracy

define_check_data_constants
Loading

0 comments on commit b7f58e6

Please sign in to comment.