diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 91046c5474b73a..9a54d934305419 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1460,9 +1460,29 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     // Special case: "vec ==/!= zero_vector"
     if (!varTypeIsFloating(simdBaseType) && (op != nullptr) && (simdSize != 12))
     {
-        GenTree* cmp = op;
+        uint64_t scalarAndMask = UINT64_MAX;
+        GenTree* cmp           = op;
         if (simdSize != 8) // we don't need compression for Vector64
         {
+            CorInfoType pairwiseMaxType = CORINFO_TYPE_UINT;
+
+            // If op is "vec & cnsVec" where both u64 components in that cnsVec are the same (for both SIMD12 and
+            // SIMD16), then we'd better do this AND on top of the TYP_LONG NI_AdvSimd_Extract in the end - it
+            // produces more optimal codegen.
+            if (op->OperIsHWIntrinsic(NI_AdvSimd_And) && op->AsHWIntrinsic()->Op(2)->OperIs(GT_CNS_VEC))
+            {
+                GenTreeVecCon* andMask = op->AsHWIntrinsic()->Op(2)->AsVecCon();
+                simd16_t       val     = andMask->gtSimd16Val;
+                if (ElementsAreSame(val.i8, 16) && emitter::emitIns_valid_imm_for_alu(val.i64[0], EA_8BYTE))
+                {
+                    pairwiseMaxType = CORINFO_TYPE_UBYTE;
+                    scalarAndMask   = val.u64[0];
+                    BlockRange().Remove(op);
+                    BlockRange().Remove(andMask);
+                    op = op->AsHWIntrinsic()->Op(1);
+                }
+            }
+
             node->Op(1) = op;
             LIR::Use tmp1Use(BlockRange(), &node->Op(1), node);
             ReplaceWithLclVar(tmp1Use);
@@ -1470,7 +1490,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
             GenTree* opClone = comp->gtClone(op);
             BlockRange().InsertAfter(op, opClone);
 
-            cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op, opClone, NI_AdvSimd_Arm64_MaxPairwise, CORINFO_TYPE_UINT,
+            cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op, opClone, NI_AdvSimd_Arm64_MaxPairwise, pairwiseMaxType,
                                                  simdSize);
             BlockRange().InsertBefore(node, cmp);
             LowerNode(cmp);
@@ -1483,7 +1503,20 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
 
         GenTree* val =
             comp->gtNewSimdHWIntrinsicNode(TYP_LONG, cmp, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_ULONG, simdSize);
-        BlockRange().InsertAfter(zroCns, val);
+
+        // Apply the scalar AND mask
+        if (scalarAndMask != UINT64_MAX)
+        {
+            GenTree* andMaskNode = comp->gtNewIconNode(static_cast<ssize_t>(scalarAndMask), TYP_LONG);
+            GenTree* andNode     = comp->gtNewOperNode(GT_AND, TYP_LONG, val, andMaskNode);
+            BlockRange().InsertAfter(zroCns, val, andMaskNode, andNode);
+            LowerNode(val);
+            val = andNode;
+        }
+        else
+        {
+            BlockRange().InsertAfter(zroCns, val);
+        }
         LowerNode(val);
 
         GenTree* cmpZeroCns = comp->gtNewIconNode(0, TYP_LONG);
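
The change above rewrites "(vec & cnsVec) ==/!= Vector128.Zero" so that, when the constant's elements are all identical and its 64-bit value encodes as an Arm64 ALU immediate, the AND is dropped from the vector side and reapplied as a scalar AND on the 64-bit value extracted after the pairwise max. Below is a minimal C++/NEON sketch of the two shapes involved; the helper names are made up for illustration, and this only approximates the codegen the lowering steers toward - it is not the JIT's actual output.

// Illustrative sketch only (AArch64, arm_neon.h); function names are hypothetical.
#include <arm_neon.h>
#include <cstdint>

// Shape before the fold: the AND stays a vector op, UMAXP compresses the 128-bit
// result into its low 64 bits, and the extracted lane is compared against zero.
bool IsMaskedZero_VectorAnd(uint8x16_t v, uint8_t maskByte)
{
    uint8x16_t masked = vandq_u8(v, vdupq_n_u8(maskByte)); // vector AND with the constant
    uint8x16_t folded = vpmaxq_u8(masked, masked);         // pairwise max over bytes
    return vgetq_lane_u64(vreinterpretq_u64_u8(folded), 0) == 0;
}

// Shape the diff steers codegen toward when the guard passes: the AND moves onto the
// extracted 64-bit scalar, where it can be encoded as a single AND-immediate.
bool IsMaskedZero_ScalarAnd(uint8x16_t v, uint64_t scalarMask)
{
    uint8x16_t folded = vpmaxq_u8(v, v);                   // pairwise max over bytes
    uint64_t   lane   = vgetq_lane_u64(vreinterpretq_u64_u8(folded), 0);
    return (lane & scalarMask) == 0;                       // scalar AND folded onto the extract
}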