Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tensor primitives divide int32 #111505

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
dfa44fa
Add vectorized path for Int32 type in TensorPrimitives.Divide
Jan 16, 2025
37afdd2
Add ISA guards and Debug.Assert
Jan 16, 2025
0cc3b79
Simplify vectorizable check, simplify preprocessor guard
Jan 16, 2025
b8965ae
Use XPlat intrinsics, use x86 intrinsics just for conversion
Jan 21, 2025
d5ebf81
Get Vector128<int> operator/ working
Jan 31, 2025
ef596d9
Working for Vector256<int> operator/
Jan 31, 2025
1da230f
Works for operator/ where op2 is scalar int
Jan 31, 2025
c8b913e
Consolidate to one path
Jan 31, 2025
6ac02b3
Move logic from importer to gentree
Feb 7, 2025
2bc816b
Add support to vectorize Vector512<int> operator /
Feb 7, 2025
77842e4
Working GenTreeSIMDDivByZeroCheck node
Feb 11, 2025
16a5178
Make GenTreeSIMDDivByZeroCheck a unary op
Feb 11, 2025
1ca404f
Remove empty lower functions, remove leftover gtGetOp2
Feb 11, 2025
d9b18a0
JIT formatting
Feb 11, 2025
34329ad
Only use low SIMD registers for ptest
Feb 11, 2025
65d2f8c
Merge branch 'main' into tensor-primitives-divide-int32
alexcovington Feb 12, 2025
839baa9
Merge branch 'main' into tensor-primitives-divide-int32
alexcovington Feb 17, 2025
c6e18e6
Use gtNewSimdCmpOpAnyNode for div-by-zero check
Feb 18, 2025
078aff6
Add spill for op1
Feb 18, 2025
f994b09
Jit formatting
Feb 18, 2025
6767384
Rework to use NI_Vector*_op_Division
Feb 20, 2025
e61e97f
Remove GT_SIMD_DIV_BY_ZERO_CHECK and related class/functions
Feb 20, 2025
08e0d7c
Re-use tmp reg for converts
Feb 21, 2025
66580e1
Comments, jit formatting
Feb 21, 2025
3feb91b
Remove leftover SIMDDivByZero
Feb 21, 2025
1807a4c
Don't flag with GTF_OVERFLOW since it conflicts with GTF_HW_EM_OP
Feb 21, 2025
4929106
Use correct INT_MIN / -1, simplify vec const generation, add assert t…
Feb 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/coreclr/jit/fgdiagnostic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3432,6 +3432,12 @@ void Compiler::fgDebugCheckFlags(GenTree* tree, BasicBlock* block)
expectedFlags |= GTF_GLOB_REF;
break;
}

case NI_Vector128_op_Division:
case NI_Vector256_op_Division:
{
break;
}
#endif // TARGET_XARCH

#if defined(TARGET_ARM64)
Expand Down
74 changes: 59 additions & 15 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7269,6 +7269,16 @@ bool GenTree::OperMayThrow(Compiler* comp)
{
return true;
}

#ifdef TARGET_XARCH
NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId();
if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division ||
intrinsicId == NI_Vector512_op_Division)
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably assert that we're only encountering it for varTypeIsInteger(AsHWIntrinsic()->GetSimdBaseType()), just to avoid issues from potential future refactorings.

I'd expect we want to go ahead and include Vector512_op_Division here as well, even if it can't be encountered quite yet.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I've added the assert and included Vector512_op_Division.

assert(varTypeIsInt(AsHWIntrinsic()->GetSimdBaseType()));
return true;
}
#endif // TARGET_XARCH
}
#endif // FEATURE_HW_INTRINSICS

Expand Down Expand Up @@ -21234,6 +21244,26 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
}
#endif // TARGET_XARCH
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
case GT_DIV:
{
if (simdBaseType == TYP_INT)
{
assert(compOpportunisticallyDependsOn(InstructionSet_AVX) ||
compOpportunisticallyDependsOn(InstructionSet_AVX512F));

assert(simdSize == 16 || simdSize == 32);

NamedIntrinsic divIntrinsic = simdSize == 16 ? NI_Vector128_op_Division : NI_Vector256_op_Division;
unsigned int divideOpSimdSize = simdSize * 2;

GenTree* divOp =
gtNewSimdHWIntrinsicNode(op1->TypeGet(), op1, op2, divIntrinsic, simdBaseJitType, divideOpSimdSize);
return divOp;
}
unreached();
}
#endif // defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)

case GT_MUL:
{
Expand Down Expand Up @@ -28156,6 +28186,13 @@ void GenTreeHWIntrinsic::Initialize(NamedIntrinsic intrinsicId)
gtFlags |= (GTF_CALL | GTF_GLOB_REF);
break;
}

case NI_Vector128_op_Division:
case NI_Vector256_op_Division:
{
gtFlags |= GTF_EXCEPT;
break;
}
#endif // TARGET_XARCH

#if defined(TARGET_ARM64)
Expand Down Expand Up @@ -29073,26 +29110,33 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,

case GT_DIV:
{
#if defined(TARGET_XARCH)
assert(varTypeIsFloating(simdBaseType) || varTypeIsInt(simdBaseType));
#else
assert(varTypeIsFloating(simdBaseType));
#endif
assert(op2->TypeIs(simdType));

#if defined(TARGET_XARCH)
if (simdSize == 64)
{
id = NI_AVX512F_Divide;
}
else if (simdSize == 32)
{
id = NI_AVX_Divide;
}
else if (simdBaseType == TYP_FLOAT)
{
id = isScalar ? NI_SSE_DivideScalar : NI_SSE_Divide;
}
else
if (varTypeIsFloating(simdBaseType))
{
assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
id = isScalar ? NI_SSE2_DivideScalar : NI_SSE2_Divide;
if (simdSize == 64)
{
id = NI_AVX512F_Divide;
}
else if (simdSize == 32)
{
id = NI_AVX_Divide;
}
else if (simdBaseType == TYP_FLOAT)
{
id = isScalar ? NI_SSE_DivideScalar : NI_SSE_Divide;
}
else
{
assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
id = isScalar ? NI_SSE2_DivideScalar : NI_SSE2_Divide;
}
}
#elif defined(TARGET_ARM64)
if ((simdSize == 8) && (isScalar || (simdBaseType == TYP_DOUBLE)))
Expand Down
66 changes: 66 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1871,6 +1871,72 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}

case NI_Vector128_op_Division:
case NI_Vector256_op_Division:
{
// We can emulate SIMD integer division by converting the 32-bit integer -> 64-bit double,
// perform a 64-bit double divide, then convert back to a 32-bit integer. This is generating
// something similar to the following managed code:
// if (Vector128.EqualsAny(op2, Vector128<int>.Zero))
// {
// throw new DivideByZeroException();
// }
//
// Vector128<int> overflowMask =
// Vector128.Equals(op1, Vector128.Create(int.MinValue))
// & Vector128.Equals(op2, Vector128.Create(-1));
// if (!Vector128.EqualsAll(overflowMask, Vector128<int>.Zero))
// {
// throw new OverflowException();
// }
//
// Vector256<double> op1_f64 =
// Vector256.ConvertToDouble(Vector256.WidenLower(Vector128.ToVector256Unsafe(op1)));
// Vector256<double> op2_f64 =
// Vector256.ConvertToDouble(Vector256.WidenLower(Vector128.ToVector256Unsafe(op2)));
// Vector256<double> div_f64 = op1_f64 / op2_f64;
// Vector256<long> div_i64 = Vector256.ConvertToInt64(div_f64);
// Vector128<int> div_i32 = Vector256.Narrow(div_i64.GetLower(), div_i64.GetUpper());
// return div_i32;
regNumber op2Reg = op2->GetRegNum();
regNumber tmpReg1 = internalRegisters.Extract(node, RBM_ALLFLOAT);
regNumber tmpReg2 = internalRegisters.Extract(node, RBM_ALLFLOAT);
emitAttr typeSize = emitTypeSize(node->TypeGet());
noway_assert(typeSize == EA_16BYTE || typeSize == EA_32BYTE);
emitAttr divTypeSize = typeSize == EA_16BYTE ? EA_32BYTE : EA_64BYTE;

simd_t negOneIntVec = simd_t::AllBitsSet();
simd_t minValueInt{};
int numElements = genTypeSize(node->TypeGet()) / 4;
for (int i = 0; i < numElements; i++)
{
minValueInt.i32[i] = INT_MIN;
}
CORINFO_FIELD_HANDLE minValueFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(minValueInt.v128[0])
: emit->emitSimd32Const(minValueInt.v256[0]);
CORINFO_FIELD_HANDLE negOneFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(negOneIntVec.v128[0])
: emit->emitSimd32Const(negOneIntVec.v256[0]);

// div-by-zero check
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions);
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg1, tmpReg1, op2Reg, instOptions);
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
genJumpToThrowHlpBlk(EJ_jne, SCK_DIV_BY_ZERO);

// overflow check
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg1, op1Reg, minValueFld, 0, instOptions);
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op2Reg, negOneFld, 0, instOptions);
emit->emitIns_SIMD_R_R_R(INS_pand, typeSize, tmpReg1, tmpReg1, tmpReg2, instOptions);
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);

emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op2Reg, instOptions);
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, targetReg, tmpReg1, tmpReg2, instOptions);
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, targetReg, targetReg, instOptions);
break;
}

default:
{
unreached();
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ HARDWARE_INTRINSIC(Vector128, get_Zero,
HARDWARE_INTRINSIC(Vector128, op_Addition, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, op_BitwiseAnd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, op_BitwiseOr, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, op_Division, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, op_Division, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialSideEffect_Other|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector128, op_ExclusiveOr, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, op_Inequality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
Expand Down Expand Up @@ -249,7 +249,7 @@ HARDWARE_INTRINSIC(Vector256, get_Zero,
HARDWARE_INTRINSIC(Vector256, op_Addition, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector256, op_BitwiseAnd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, op_BitwiseOr, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, op_Division, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector256, op_Division, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialSideEffect_Other|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector256, op_Equality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(Vector256, op_ExclusiveOr, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, op_Inequality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
Expand Down
14 changes: 13 additions & 1 deletion src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2417,8 +2417,19 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

if (!varTypeIsFloating(simdBaseType))
{
// We can't trivially handle division for integral types using SIMD
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
// Check to see if it is possible to emulate the integer division
if (!(simdBaseType == TYP_INT &&
((simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX)) ||
(simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512F)))))
{
break;
}
impSpillSideEffect(true, stackState.esStackDepth -
2 DEBUGARG("Spilling op1 side effects for vector integer division"));
#else
break;
#endif // defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
}

CORINFO_ARG_LIST_HANDLE arg1 = sig->args;
Expand All @@ -2433,6 +2444,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
op1 = getArgForHWIntrinsic(argType, argClass);

retNode = gtNewSimdBinOpNode(GT_DIV, retType, op1, op2, simdBaseJitType, simdSize);

break;
}

Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10422,6 +10422,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Vector128_op_Division:
case NI_Vector256_op_Division:
{
break;
}

default:
{
assert(!"Unhandled containment for helper binary hardware intrinsic");
Expand Down
17 changes: 17 additions & 0 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2786,6 +2786,23 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_Vector128_op_Division:
case NI_Vector256_op_Division:
{
srcCount = BuildOperandUses(op1, lowSIMDRegs());
srcCount += BuildOperandUses(op2, lowSIMDRegs());

// get a tmp register for div-by-zero check
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());

// get a tmp register for overflow check
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
setInternalRegsDelayFree = true;

buildUses = false;
break;
}

default:
{
assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
Expand Down
8 changes: 8 additions & 0 deletions src/coreclr/jit/morph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11383,6 +11383,14 @@ GenTree* Compiler::fgMorphHWIntrinsic(GenTreeHWIntrinsic* tree)
tree->AddAllEffectsFlags(operand);
}

#ifdef TARGET_XARCH
if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division)
{
fgAddCodeRef(compCurBB, SCK_DIV_BY_ZERO);
fgAddCodeRef(compCurBB, SCK_OVERFLOW);
}
#endif // TARGET_XARCH

if (opts.OptimizationEnabled())
{
var_types retType = tree->TypeGet();
Expand Down
14 changes: 14 additions & 0 deletions src/coreclr/jit/stacklevelsetter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,20 @@ void StackLevelSetter::SetThrowHelperBlocks(GenTree* node, BasicBlock* block)
}
break;

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
case GT_HWINTRINSIC:
{

NamedIntrinsic intrinsicId = node->AsHWIntrinsic()->GetHWIntrinsicId();
if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division)
{
SetThrowHelperBlock(SCK_DIV_BY_ZERO, block);
SetThrowHelperBlock(SCK_OVERFLOW, block);
}
}
break;
#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)

case GT_INDEX_ADDR:
case GT_ARR_ELEM:
SetThrowHelperBlock(SCK_RNGCHK_FAIL, block);
Expand Down
Loading
Loading