Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions #112213

Merged
merged 4 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6237,7 +6237,8 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
let Predicates = [HasNEONandIsStreamingSafe] in {
// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
let Predicates = [HasNEON] in {
def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
Expand All @@ -6247,7 +6248,8 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;

let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
let Predicates = [HasNEON, HasFullFP16] in {
def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
Expand All @@ -6270,9 +6272,10 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),

// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
(v8i16 FPR128:$Rn), (i64 0))), i16)))),
// TODO: Allow these in streaming[-compatible] functions with +sme2p2.
let Predicates = [HasNEON, HasFullFP16] in {
def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
(v8i16 FPR128:$Rn), (i64 0))), i16)))),
(SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;

// unsigned 32-bit extracted element is truncated to 16-bits using AND
Expand Down Expand Up @@ -6367,7 +6370,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
} // let Predicates = [HasNEONandIsStreamingSafe]
} // let Predicates = [HasNEON]

//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
Expand Down
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming-compatible < %s | FileCheck %s
; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING

target triple = "aarch64-unknown-linux-gnu"

define double @t1(double %x) {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs x8, d0
; CHECK-NEXT: scvtf d0, x8
Comment on lines +10 to +11
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This operation may be expensive when executed in streaming mode, because the operations move results between register files (FPR <-> GPR). Could you emulate this operation with SVE instructions instead?
I'm happy for this to be done in a separate patch so that this bugfix can already land.

Copy link
Member Author

@MacDue MacDue Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look into that in a follow up patch 👍 I need to check how to do something ~equivalent for SVE since for Neon this is done via a load of .td patterns (matching both the to/from int parts).

; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t1:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs d0, d0
; NON-STREAMING-NEXT: scvtf d0, d0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptosi double %x to i64
%conv1 = sitofp i64 %conv to double
ret double %conv1
}

define float @t2(float %x) {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t2:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzs s0, s0
; NON-STREAMING-NEXT: scvtf s0, s0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptosi float %x to i32
%conv1 = sitofp i32 %conv to float
ret float %conv1
}

define half @t3(half %x) {
; CHECK-LABEL: t3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: scvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t3:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
; NON-STREAMING-NEXT: fcvtzs s0, s0
; NON-STREAMING-NEXT: scvtf s0, s0
; NON-STREAMING-NEXT: fcvt h0, s0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptosi half %x to i32
%conv1 = sitofp i32 %conv to half
ret half %conv1
}

define double @t4(double %x) {
; CHECK-LABEL: t4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu x8, d0
; CHECK-NEXT: ucvtf d0, x8
; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t4:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu d0, d0
; NON-STREAMING-NEXT: ucvtf d0, d0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptoui double %x to i64
%conv1 = uitofp i64 %conv to double
ret double %conv1
}

define float @t5(float %x) {
; CHECK-LABEL: t5:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t5:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvtzu s0, s0
; NON-STREAMING-NEXT: ucvtf s0, s0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptoui float %x to i32
%conv1 = uitofp i32 %conv to float
ret float %conv1
}

define half @t6(half %x) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: ucvtf s0, w8
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
;
; NON-STREAMING-LABEL: t6:
; NON-STREAMING: // %bb.0: // %entry
; NON-STREAMING-NEXT: fcvt s0, h0
; NON-STREAMING-NEXT: fcvtzu s0, s0
; NON-STREAMING-NEXT: ucvtf s0, s0
; NON-STREAMING-NEXT: fcvt h0, s0
; NON-STREAMING-NEXT: ret
entry:
%conv = fptoui half %x to i32
%conv1 = uitofp i32 %conv to half
ret half %conv1
}
Loading
Loading