[AArch64-SVE]: Force generating code compatible to streaming mode.

When streaming mode is enabled, lower some operations and disable some code paths; to force generateing code compatible to streaming mode. Add streaming-mode flag for new sve-fixed-length testing files: build_vector.ll concat.ll extract-subvector.ll extract-vector-elt.ll int-shifts.ll loads.ll shuffle.ll stores.ll Differential Revision: https://reviews.llvm.org/D135564
MaxMood96 · Oct 31, 2022 · 681888e · 681888e
1 parent 325a308
commit 681888e
Show file tree

Hide file tree

Showing 11 changed files with 1,913 additions and 23 deletions.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1391,6 +1391,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
+    if (Subtarget->forceStreamingCompatibleSVE()) {
+      for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                     MVT::v4i32, MVT::v2i64})
+        addTypeForStreamingSVE(VT);
+
+      for (MVT VT :
+           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
+        addTypeForStreamingSVE(VT);
+    }
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1597,6 +1607,14 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
   return false;
 }
 
+void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
+  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  setOperationAction(ISD::AND, VT, Custom);
+}
+
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
 
@@ -5773,8 +5791,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::MLOAD:
     return LowerMLOAD(Op, DAG);
   case ISD::LOAD:
-    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
-                                     Subtarget->forceStreamingCompatibleSVE()))
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
     return LowerLOAD(Op, DAG);
   case ISD::ADD:
@@ -11400,9 +11417,13 @@ static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
                                   const SDValue *LHS = nullptr) {
+  EVT VT = Op.getValueType();
+  if (VT.isFixedLengthVector() &&
+      DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
+    return SDValue();
+
   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
-    EVT VT = Op.getValueType();
     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
     bool isAdvSIMDModImm = false;
     uint64_t Shift;
@@ -11448,9 +11469,13 @@ static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
                                   const SDValue *LHS = nullptr) {
+  EVT VT = Op.getValueType();
+  if (VT.isFixedLengthVector() &&
+      DAG.getSubtarget<AArch64Subtarget>().forceStreamingCompatibleSVE())
+    return SDValue();
+
   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
-    EVT VT = Op.getValueType();
     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
     bool isAdvSIMDModImm = false;
     uint64_t Shift;
@@ -12128,7 +12153,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
 
 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                    SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
 
   assert(Op.getValueType().isScalableVector() &&
@@ -12234,7 +12260,8 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
   }
 
-  if (useSVEForFixedLengthVectorVT(VT))
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthExtractVectorElt(Op, DAG);
 
   // Check for non-constant or out of range lane.
@@ -12296,10 +12323,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   // If this is extracting the upper 64-bits of a 128-bit vector, we match
   // that directly.
   if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
-      InVT.getSizeInBits() == 128)
+      InVT.getSizeInBits() == 128 && !Subtarget->forceStreamingCompatibleSVE())
     return Op;
 
-  if (useSVEForFixedLengthVectorVT(InVT)) {
+  if (useSVEForFixedLengthVectorVT(InVT,
+                                   Subtarget->forceStreamingCompatibleSVE())) {
     SDLoc DL(Op);
 
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
@@ -12487,7 +12515,8 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
 
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
-  if (useSVEForFixedLengthVectorVT(VT))
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return false;
 
   if (VT.getVectorNumElements() == 4 &&
@@ -12597,7 +12626,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
 
   switch (Op.getOpcode()) {
   case ISD::SHL:
-    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
+    if (VT.isScalableVector() ||
+        useSVEForFixedLengthVectorVT(VT,
+                                     Subtarget->forceStreamingCompatibleSVE()))
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
 
     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
@@ -12609,7 +12640,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::SRA:
   case ISD::SRL:
-    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
+    if (VT.isScalableVector() ||
+        useSVEForFixedLengthVectorVT(
+            VT, Subtarget->forceStreamingCompatibleSVE())) {
       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                 : AArch64ISD::SRL_PRED;
       return LowerToPredicatedOp(Op, DAG, Opc);
@@ -14008,6 +14041,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
+  // Skip if streaming compatible SVE is enabled, because it generates invalid
+  // code in streaming mode when SVE length is not specified.
+  if (Subtarget->forceStreamingCompatibleSVE())
+    return false;
+
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
@@ -22489,7 +22527,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  assert(useSVEForFixedLengthVectorVT(VT) &&
+  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
          "Only expected to lower fixed length vector operation!");
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
 
@@ -22505,7 +22543,8 @@ SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
     }
 
     // "cast" fixed length vector to a scalable vector.
-    assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+    assert(V.getValueType().isFixedLengthVector() &&
+           isTypeLegal(V.getValueType()) &&
            "Only fixed length vectors are supported!");
     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
   }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -899,6 +899,7 @@ class AArch64TargetLowering : public TargetLowering {
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
   void addTypeForNEON(MVT VT);
+  void addTypeForStreamingSVE(MVT VT);
   void addTypeForFixedLengthSVE(MVT VT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3032,7 +3032,7 @@ let Predicates = [HasSVEorSME] in {
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let AddedComplexity = 1 in {
+  let Predicates = [NotInStreamingSVEMode], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
             (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
   def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
@@ -3041,8 +3041,9 @@ let Predicates = [HasSVEorSME] in {
             (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
   def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
             (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
-  }
+  } // End NotInStreamingSVEMode
 
+  let Predicates = [NotInStreamingSVEMode] in {
   def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
             (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
   def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8),
@@ -3055,6 +3056,7 @@ let Predicates = [HasSVEorSME] in {
 
   def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
             (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+  } // End NotInStreamingSVEMode
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @build_vector_7_inc1_v4i1(ptr %a) #0 {
+; CHECK-LABEL: build_vector_7_inc1_v4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+  store <4 x i1> <i1 true, i1 false, i1 true, i1 false>, ptr %a, align 1
+  ret void
+}
+
+define void @build_vector_7_inc1_v32i8(ptr %a) #0 {
+; CHECK-LABEL: build_vector_7_inc1_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.b, #0, #1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add z0.b, z0.b, #7 // =0x7
+; CHECK-NEXT:    add z1.b, z1.b, #23 // =0x17
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  store <32 x i8> <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38>, ptr %a, align 1
+  ret void
+}
+
+define void @build_vector_0_inc2_v16i16(ptr %a) #0 {
+; CHECK-LABEL: build_vector_0_inc2_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.h, #0, #2
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    add z0.h, z0.h, #16 // =0x10
+; CHECK-NEXT:    str q0, [x0, #16]
+; CHECK-NEXT:    ret
+  store <16 x i16> <i16 0, i16 2, i16 4, i16 6, i16 8, i16 10, i16 12, i16 14, i16 16, i16 18, i16 20, i16 22, i16 24, i16 26, i16 28, i16 30>, ptr %a, align 2
+  ret void
+}
+
+; Negative const stride.
+define void @build_vector_0_dec3_v8i32(ptr %a) #0 {
+; CHECK-LABEL: build_vector_0_dec3_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.s, #0, #-3
+; CHECK-NEXT:    mov z1.s, #-12 // =0xfffffffffffffff4
+; CHECK-NEXT:    add z1.s, z0.s, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  store <8 x i32> <i32 0, i32 -3, i32 -6, i32 -9, i32 -12, i32 -15, i32 -18, i32 -21>, ptr %a, align 4
+  ret void
+}
+
+; Constant stride that's too big to be directly encoded into the index.
+define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 {
+; CHECK-LABEL: build_vector_minus2_dec32_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-32
+; CHECK-NEXT:    mov z0.d, #-66 // =0xffffffffffffffbe
+; CHECK-NEXT:    mov z2.d, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    index z1.d, #0, x8
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <4 x i64> <i64 -2, i64 -34, i64 -66, i64 -98>, ptr %a, align 8
+  ret void
+}
+
+; Constant but not a sequence.
+define void @build_vector_no_stride_v4i64(ptr %a) #0 {
+; CHECK-LABEL: build_vector_no_stride_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, #1, #7
+; CHECK-NEXT:    index z1.d, #0, #4
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <4 x i64> <i64 0, i64 4, i64 1, i64 8>, ptr %a, align 8
+  ret void
+}
+
+define void @build_vector_0_inc2_v16f16(ptr %a) #0 {
+; CHECK-LABEL: build_vector_0_inc2_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NEXT:    adrp x9, .LCPI6_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <16 x half> <half 0.0, half 2.0, half 4.0, half 6.0, half 8.0, half 10.0, half 12.0, half 14.0, half 16.0, half 18.0, half 20.0, half 22.0, half 24.0, half 26.0, half 28.0, half 30.0>, ptr %a, align 2
+  ret void
+}
+
+; Negative const stride.
+define void @build_vector_0_dec3_v8f32(ptr %a) #0 {
+; CHECK-LABEL: build_vector_0_dec3_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI7_0
+; CHECK-NEXT:    adrp x9, .LCPI7_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <8 x float> <float 0.0, float -3.0, float -6.0, float -9.0, float -12.0, float -15.0, float -18.0, float -21.0>, ptr %a, align 4
+  ret void
+}
+
+; Constant stride that's too big to be directly encoded into the index.
+define void @build_vector_minus2_dec32_v4f64(ptr %a) #0 {
+; CHECK-LABEL: build_vector_minus2_dec32_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    adrp x9, .LCPI8_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <4 x double> <double -2.0, double -34.0, double -66.0, double -98.0>, ptr %a, align 8
+  ret void
+}
+
+; Constant but not a sequence.
+define void @build_vector_no_stride_v4f64(ptr %a) #0 {
+; CHECK-LABEL: build_vector_no_stride_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI9_0
+; CHECK-NEXT:    adrp x9, .LCPI9_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
+; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ret
+  store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+sve" }