Skip to content

Commit

Permalink
[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatibl…
Browse files Browse the repository at this point in the history
…e] functions (1/n) (#118505)

In streaming[-compatible] functions, use SVE for scalar FP conversions
to/from integer types. This can help avoid moves between FPRs and GPRs,
which could be costly.

This patch also updates definitions of SCVTF_ZPmZ_StoD and
UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so
requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent
types.

Follow up to #112213.

Note: This PR does not include support for f64 <-> i32 conversions (like
#112564), which needs a bit more work to support.
  • Loading branch information
MacDue authored Dec 19, 2024
1 parent 9bb1d03 commit ca98a3d
Show file tree
Hide file tree
Showing 7 changed files with 857 additions and 254 deletions.
60 changes: 59 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19115,13 +19115,67 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}

/// Rewrites a scalar FP <-> INT conversion so that it is carried out on
/// element zero of a scalable (SVE) vector. This is only attempted for
/// streaming and streaming-compatible functions, where it can reduce the
/// number of fmovs to/from GPRs.
///
/// \param N         The conversion node. Its opcode is reused unchanged for
///                  the vector-typed conversion.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \param DCI       Combiner state; only queried to find out whether operation
///                  legalization has already happened.
/// \param Subtarget Queried for SVE availability and the streaming mode.
/// \returns The scalar extracted from lane zero of the vector conversion, or
///          an empty SDValue when the rewrite does not apply.
static SDValue
tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
  // Strict FP opcodes are never rewritten, and nothing is done before
  // operation legalization.
  if (N->isStrictFPOpcode() || DCI.isBeforeLegalizeOps())
    return SDValue();

  // Requires SVE (or streaming SVE) in a streaming[-compatible] function.
  const bool CanUseSVE =
      Subtarget->isSVEorStreamingSVEAvailable() &&
      (Subtarget->isStreaming() || Subtarget->isStreamingCompatible());
  if (!CanUseSVE)
    return SDValue();

  SDValue Scalar = N->getOperand(0);
  const EVT FromVT = Scalar.getValueType();
  const EVT ToVT = N->getValueType(0);

  // Only scalar types other than bf16 and f128 are handled.
  auto IsRejectedType = [](EVT VT) {
    return VT.isVector() || VT == MVT::bf16 || VT == MVT::f128;
  };
  if (IsRejectedType(FromVT) || IsRejectedType(ToVT))
    return SDValue();

  // The packed SVE vector type is derived from the strictly wider scalar type
  // (the source type when neither is wider); the other vector type keeps the
  // same element count and only swaps the element type.
  const bool ResultIsWider = ToVT.bitsGT(FromVT);
  const EVT PackedVT = getPackedSVEVectorVT(ResultIsWider ? ToVT : FromVT);
  const EVT CompanionVT =
      PackedVT.changeVectorElementType(ResultIsWider ? FromVT : ToVT);
  const EVT FromVecVT = ResultIsWider ? CompanionVT : PackedVT;
  const EVT ToVecVT = ResultIsWider ? PackedVT : CompanionVT;

  // Give up if either vector type came out as nxv2i32; this is the check that
  // keeps the source/destination vector types legal.
  if (FromVecVT == MVT::nxv2i32 || ToVecVT == MVT::nxv2i32)
    return SDValue();

  // Insert the scalar into lane 0 of an undef vector, convert the whole
  // vector with the original opcode, then read lane 0 back out.
  // NOTE(review): only operand 0 of N is forwarded to the new conversion
  // node; confirm that every opcode routed to this helper is unary.
  SDLoc DL(N);
  SDValue Lane0 = DAG.getVectorIdxConstant(0, DL);
  SDValue UndefVec = DAG.getUNDEF(FromVecVT);
  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FromVecVT,
                                 UndefVec, Scalar, Lane0);
  SDValue Converted = DAG.getNode(N->getOpcode(), DL, ToVecVT, Inserted);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ToVT, Converted, Lane0);
}

static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;

if (SDValue Res =
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
return Res;

EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
Expand Down Expand Up @@ -19160,6 +19214,10 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (SDValue Res =
tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
return Res;

if (!Subtarget->isNeonAvailable())
return SDValue();

Expand Down Expand Up @@ -26240,7 +26298,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
return performIntToFpCombine(N, DAG, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT_SAT:
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2348,8 +2348,8 @@ let Predicates = [HasSVEorSME] in {
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>;
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, null_frag, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>;
Expand Down
93 changes: 74 additions & 19 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
; RUN: llc -force-streaming-compatible -mattr=+sme2p2 < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
; RUN: llc -mattr=+sme2p2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=USE-NEON-NO-GPRS
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE

target triple = "aarch64-unknown-linux-gnu"

define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs x8, d0
; CHECK-NEXT: scvtf d0, x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t1:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0
; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t1:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzs x8, d0
; NONEON-NOSVE-NEXT: scvtf d0, x8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi double %x to i64
%conv1 = sitofp i64 %conv to double
Expand All @@ -26,15 +36,24 @@ entry:
define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t2:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0
; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t2:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
; NONEON-NOSVE-NEXT: scvtf s0, w8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi float %x to i32
%conv1 = sitofp i32 %conv to float
Expand All @@ -44,11 +63,20 @@ entry:
define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: t3:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvtzs w8, s0
; NONEON-NOSVE-NEXT: scvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptosi half %x to i32
%conv1 = sitofp i32 %conv to half
Expand All @@ -58,15 +86,24 @@ entry:
define double @t4(double %x) {
; CHECK-LABEL: t4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu x8, d0
; CHECK-NEXT: ucvtf d0, x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t4:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu d0, d0
; USE-NEON-NO-GPRS-NEXT: ucvtf d0, d0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t4:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzu x8, d0
; NONEON-NOSVE-NEXT: ucvtf d0, x8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui double %x to i64
%conv1 = uitofp i64 %conv to double
Expand All @@ -76,15 +113,24 @@ entry:
define float @t5(float %x) {
; CHECK-LABEL: t5:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
;
; USE-NEON-NO-GPRS-LABEL: t5:
; USE-NEON-NO-GPRS: // %bb.0: // %entry
; USE-NEON-NO-GPRS-NEXT: fcvtzu s0, s0
; USE-NEON-NO-GPRS-NEXT: ucvtf s0, s0
; USE-NEON-NO-GPRS-NEXT: ret
;
; NONEON-NOSVE-LABEL: t5:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui float %x to i32
%conv1 = uitofp i32 %conv to float
Expand All @@ -94,11 +140,20 @@ entry:
define half @t6(half %x) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: t6:
; NONEON-NOSVE: // %bb.0: // %entry
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: fcvtzu w8, s0
; NONEON-NOSVE-NEXT: ucvtf s0, w8
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: ret
entry:
%conv = fptoui half %x to i32
%conv1 = uitofp i32 %conv to half
Expand Down
Loading

0 comments on commit ca98a3d

Please sign in to comment.