[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

MacDue · 2024-10-14T14:28:06Z

The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are only supported in streaming[-compatible] functions with +sme2p2.

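Reference:

- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--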

Codegen will be improved in follow-up patches.

llvmbot · 2024-10-14T14:28:42Z

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are only supported in streaming[-compatible] functions with +sme2p2.

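Reference:

- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--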

Full diff: https://github.com/llvm/llvm-project/pull/112213.diff

2 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+11-6)
(added) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll (+121)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 325508b62a9f14..bd9da10300c7fd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -247,6 +247,11 @@ def HasSMEF16F16orSMEF8F16
 def HasNEONandIsStreamingSafe
     : Predicate<"Subtarget->hasNEON()">,
       AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+// A subset of NEON instructions legal in Streaming SVE mode with +sme2p2.
+// TODO: Change to check for hasSME2p2() once FEAT_SME2p2 is implemented.
+def HasNEONandIsSME2p2StreamingSafe
+    : Predicate<"Subtarget->isNeonAvailable()">,
+      AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -6237,7 +6242,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
 // Some float -> int -> float conversion patterns for which we want to keep the
 // int values in FP registers using the corresponding NEON instructions to
 // avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONandIsStreamingSafe] in {
+let Predicates = [HasNEONandIsSME2p2StreamingSafe] in {
 def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
           (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
 def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6247,7 +6252,7 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
 def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
           (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
 
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
 def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
           (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
 def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
@@ -6270,9 +6275,9 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
 
 // fp16: integer extraction from vector must be at least 32-bits to be legal.
 // Actual extraction result is then an in-reg sign-extension of lower 16-bits.
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract 
-                (v8i16 FPR128:$Rn), (i64 0))), i16)))), 
+let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
+                (v8i16 FPR128:$Rn), (i64 0))), i16)))),
           (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
 
 // unsigned 32-bit extracted element is truncated to 16-bits using AND
@@ -6367,7 +6372,7 @@ def : Pat <(f64 (uint_to_fp (i32
                           (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
 // 64-bits -> double are handled in target specific dag combine:
 // performIntToFpCombine.
-} // let Predicates = [HasNEONandIsStreamingSafe]
+} // let Predicates = [HasNEONandIsSME2p2StreamingSafe]
 
 //===----------------------------------------------------------------------===//
 // Advanced SIMD three different-sized vector instructions.
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
new file mode 100644
index 00000000000000..9aadf3133ba197
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define double @t1(double %x) {
+; CHECK-LABEL: t1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t1:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzs d0, d0
+; NON-STREAMING-NEXT:    scvtf d0, d0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi double %x to i64
+  %conv1 = sitofp i64 %conv to double
+  ret double %conv1
+}
+
+define float @t2(float %x) {
+; CHECK-LABEL: t2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t2:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzs s0, s0
+; NON-STREAMING-NEXT:    scvtf s0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi float %x to i32
+  %conv1 = sitofp i32 %conv to float
+  ret float %conv1
+}
+
+define half @t3(half %x)  {
+; CHECK-LABEL: t3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t3:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvt s0, h0
+; NON-STREAMING-NEXT:    fcvtzs s0, s0
+; NON-STREAMING-NEXT:    scvtf s0, s0
+; NON-STREAMING-NEXT:    fcvt h0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptosi half %x to i32
+  %conv1 = sitofp i32 %conv to half
+  ret half %conv1
+}
+
+define double @t4(double %x) {
+; CHECK-LABEL: t4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu x8, d0
+; CHECK-NEXT:    ucvtf d0, x8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t4:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzu d0, d0
+; NON-STREAMING-NEXT:    ucvtf d0, d0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui double %x to i64
+  %conv1 = uitofp i64 %conv to double
+  ret double %conv1
+}
+
+define float @t5(float %x) {
+; CHECK-LABEL: t5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu w8, s0
+; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t5:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvtzu s0, s0
+; NON-STREAMING-NEXT:    ucvtf s0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui float %x to i32
+  %conv1 = uitofp i32 %conv to float
+  ret float %conv1
+}
+
+define half @t6(half %x)  {
+; CHECK-LABEL: t6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    fcvtzu w8, s0
+; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ret
+;
+; NON-STREAMING-LABEL: t6:
+; NON-STREAMING:       // %bb.0: // %entry
+; NON-STREAMING-NEXT:    fcvt s0, h0
+; NON-STREAMING-NEXT:    fcvtzu s0, s0
+; NON-STREAMING-NEXT:    ucvtf s0, s0
+; NON-STREAMING-NEXT:    fcvt h0, s0
+; NON-STREAMING-NEXT:    ret
+entry:
+  %conv = fptoui half %x to i32
+  %conv1 = uitofp i32 %conv to half
+  ret half %conv1
+}

llvm/lib/Target/AArch64/AArch64InstrInfo.td

sdesmalen-arm · 2024-10-14T15:41:54Z

llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    scvtf d0, x8


This operation may be expensive when executed in streaming mode, because the operations move results between register files (FPR <-> GPR). Could you emulate this operation with SVE instructions instead?
I'm happy for this to be done in a separate patch so that this bugfix can already land.

I'll look into that in a follow up patch 👍 I need to check how to do something ~equivalent for SVE since for Neon this is done via a load of .td patterns (matching both the to/from int parts).

llvm/lib/Target/AArch64/AArch64InstrInfo.td

…tible] functions The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are only supported in streaming[-compatible] functions with `+sme2p2`. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive. Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet). Follow up to llvm#112213.

Follow up to llvm#112213 now that the +sme2p2 feature flag has landed. The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are allowed in streaming SVE mode with +sme2p2. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive. Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet). Follow up to llvm#112213.

Follow up to llvm#112213 now that the +sme2p2 feature flag has landed. The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are allowed in streaming SVE mode with +sme2p2. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

Follow up to #112213 now that the +sme2p2 feature flag has landed. The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are allowed in streaming SVE mode with +sme2p2. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

When Neon is not available use SVE variants of FCVTZS, FCVTZU, UCVTF, and SCVTF for fp -> int -> fp conversions to avoid moving values to/from GPRs which may be expensive. Note: With +sme2p2 the single-element vector Neon variants of these instructions could be used instead (but that feature is not implemented yet). Follow up to llvm#112213.

…112905) Follow up to llvm#112213 now that the +sme2p2 feature flag has landed. The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are allowed in streaming SVE mode with +sme2p2. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

…e] functions (1/n) In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to llvm#112213. Note: This PR does not include support for f64 <-> i32 conversions (like llvm#112564), which needs a bit more work to support.

…e] functions (1/n) (#118505) In streaming[-compatible] functions, use SVE for scalar FP conversions to/from integer types. This can help avoid moves between FPRs and GPRs, which could be costly. This patch also updates definitions of SCVTF_ZPmZ_StoD and UCVTF_ZPmZ_StoD to disallow lowering to them from ISD nodes, as doing so requires creating a [U|S]INT_TO_FP_MERGE_PASSTHRU node with inconsistent types. Follow up to #112213. Note: This PR does not include support for f64 <-> i32 conversions (like #112564), which needs a bit more work to support.

MacDue requested review from sdesmalen-arm and kmclaughlin-arm October 14, 2024 14:28

llvmbot added the backend:AArch64 label Oct 14, 2024

sdesmalen-arm reviewed Oct 14, 2024

View reviewed changes

MacDue force-pushed the sme_fp_converts branch from a005fce to 74577f7 Compare October 14, 2024 19:06

sdesmalen-arm approved these changes Oct 15, 2024

View reviewed changes

llvm/lib/Target/AArch64/AArch64InstrInfo.td Show resolved Hide resolved

MacDue added 4 commits October 15, 2024 08:47

Fix typo

decbae4

Fixups

122d5ea

Add TODOs & test updates

80227ae

MacDue force-pushed the sme_fp_converts branch from 74577f7 to 80227ae Compare October 15, 2024 09:06

MacDue merged commit 4c28d21 into llvm:main Oct 16, 2024
8 checks passed

MacDue deleted the sme_fp_converts branch October 16, 2024 09:00

MacDue mentioned this pull request Oct 16, 2024

[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions #112564

Open

MacDue mentioned this pull request Oct 18, 2024

[AArch64] Allow single-element vector FP converts with +sme2p2 #112905

Merged

MacDue mentioned this pull request Dec 3, 2024

[AArch64][SVE] Use SVE for scalar FP converts in streaming[-compatible] functions (1/n) #118505

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

MacDue commented Oct 14, 2024 •

edited

Loading

llvmbot commented Oct 14, 2024

sdesmalen-arm Oct 14, 2024

MacDue Oct 14, 2024 •

edited

Loading

[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

Conversation

MacDue commented Oct 14, 2024 • edited Loading

llvmbot commented Oct 14, 2024

sdesmalen-arm Oct 14, 2024

Choose a reason for hiding this comment

MacDue Oct 14, 2024 • edited Loading

Choose a reason for hiding this comment

MacDue commented Oct 14, 2024 •

edited

Loading

MacDue Oct 14, 2024 •

edited

Loading