Revert "[RISCV] Recurse on first operand of two operand shuffles (#79180)" #80238
Revert "[RISCV] Recurse on first operand of two operand shuffles (#79180)" #80238
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes

This reverts commit bdc4110 on the release/18.x branch. This change was the first in a mini-series and, while I'm not aware of any particular problem from having it on its own in the branch, it seems safer to ship with the previous known good state.

@tstellar This is my first backport in the new process, so please bear with me and double check I got all pieces of this right.

Patch is 54.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80238.diff

6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 47c6cd6e5487b..c8f7b5c35a381 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5033,60 +5033,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
MVT IndexContainerVT =
ContainerVT.changeVectorElementType(IndexVT.getScalarType());
- // Base case for the recursion just below - handle the worst case
- // single source permutation. Note that all the splat variants
- // are handled above.
- if (V2.isUndef()) {
+ SDValue Gather;
+ // TODO: This doesn't trigger for i64 vectors on RV32, since there we
+ // encounter a bitcasted BUILD_VECTOR with low/high i32 values.
+ if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
+ Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG,
+ Subtarget);
+ } else {
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
- SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
- LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
- Subtarget);
- SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
- DAG.getUNDEF(ContainerVT), TrueMask, VL);
- return convertFromScalableVector(VT, Gather, DAG, Subtarget);
- }
-
- // Translate the gather index we computed above (and possibly swapped)
- // back to a shuffle mask. This step should disappear once we complete
- // the migration to recursive design.
- SmallVector<int> ShuffleMaskLHS;
- ShuffleMaskLHS.reserve(GatherIndicesLHS.size());
- for (SDValue GatherIndex : GatherIndicesLHS) {
- if (GatherIndex.isUndef()) {
- ShuffleMaskLHS.push_back(-1);
- continue;
+ // If only one index is used, we can use a "splat" vrgather.
+ // TODO: We can splat the most-common index and fix-up any stragglers, if
+ // that's beneficial.
+ if (LHSIndexCounts.size() == 1) {
+ int SplatIndex = LHSIndexCounts.begin()->getFirst();
+ Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+ DAG.getConstant(SplatIndex, DL, XLenVT),
+ DAG.getUNDEF(ContainerVT), TrueMask, VL);
+ } else {
+ SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+ LHSIndices =
+ convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+ Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+ DAG.getUNDEF(ContainerVT), TrueMask, VL);
}
- auto *IdxC = cast<ConstantSDNode>(GatherIndex);
- ShuffleMaskLHS.push_back(IdxC->getZExtValue());
}
- // Recursively invoke lowering for the LHS as if there were no RHS.
- // This allows us to leverage all of our single source permute tricks.
- SDValue Gather =
- DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS);
- Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget);
+ // If a second vector operand is used by this shuffle, blend it in with an
+ // additional vrgather.
+ if (!V2.isUndef()) {
+ V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
- // Blend in second vector source with an additional vrgather.
- V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+ MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
+ SelectMask =
+ convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
- MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
- SelectMask =
- convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
-
- // If only one index is used, we can use a "splat" vrgather.
- // TODO: We can splat the most-common index and fix-up any stragglers, if
- // that's beneficial.
- if (RHSIndexCounts.size() == 1) {
- int SplatIndex = RHSIndexCounts.begin()->getFirst();
- Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
- DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
- SelectMask, VL);
- } else {
- SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
- RHSIndices =
- convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
- Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
- SelectMask, VL);
+ // If only one index is used, we can use a "splat" vrgather.
+ // TODO: We can splat the most-common index and fix-up any stragglers, if
+ // that's beneficial.
+ if (RHSIndexCounts.size() == 1) {
+ int SplatIndex = RHSIndexCounts.begin()->getFirst();
+ Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+ DAG.getConstant(SplatIndex, DL, XLenVT), Gather,
+ SelectMask, VL);
+ } else {
+ SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+ RHSIndices =
+ convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+ Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather,
+ SelectMask, VL);
+ }
}
return convertFromScalableVector(VT, Gather, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index dab530751ef96..799aebcaa6302 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -238,26 +238,39 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) {
define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-LABEL: interleave_v32f32:
; V128: # %bb.0:
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v8, 16
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v0, v8
-; V128-NEXT: li a0, -1
-; V128-NEXT: vwmaccu.vx v24, a0, v8
-; V128-NEXT: lui a1, %hi(.LCPI10_0)
-; V128-NEXT: addi a1, a1, %lo(.LCPI10_0)
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT: vle16.v v12, (a1)
-; V128-NEXT: lui a1, 699051
-; V128-NEXT: addi a1, a1, -1366
-; V128-NEXT: vmv.s.x v0, a1
+; V128-NEXT: addi sp, sp, -16
+; V128-NEXT: .cfi_def_cfa_offset 16
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 2
+; V128-NEXT: sub sp, sp, a0
+; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; V128-NEXT: lui a0, %hi(.LCPI10_0)
+; V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; V128-NEXT: li a1, 32
+; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; V128-NEXT: vle16.v v4, (a0)
+; V128-NEXT: lui a0, %hi(.LCPI10_1)
+; V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
+; V128-NEXT: vle16.v v24, (a0)
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT: lui a0, 699051
+; V128-NEXT: addi a0, a0, -1366
+; V128-NEXT: vmv.s.x v0, a0
+; V128-NEXT: vrgatherei16.vv v24, v8, v4
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 2
+; V128-NEXT: add sp, sp, a0
+; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32f32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 9e21cc9e3d624..e1bd16649eede 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -188,30 +188,24 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128: # %bb.0:
-; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT: vwaddu.vv v10, v8, v8
-; V128-NEXT: li a0, -1
-; V128-NEXT: vwmaccu.vx v10, a0, v8
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT: vid.v v8
-; V128-NEXT: vsrl.vi v8, v8, 1
+; V128-NEXT: vid.v v10
+; V128-NEXT: vsrl.vi v11, v10, 1
+; V128-NEXT: vrgather.vv v10, v8, v11
; V128-NEXT: vmv.v.i v0, 10
-; V128-NEXT: vadd.vi v8, v8, 1
+; V128-NEXT: vadd.vi v8, v11, 1
; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
; V128-NEXT: vmv.v.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512: # %bb.0:
-; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT: vwaddu.vv v10, v8, v8
-; V512-NEXT: li a0, -1
-; V512-NEXT: vwmaccu.vx v10, a0, v8
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT: vid.v v8
-; V512-NEXT: vsrl.vi v8, v8, 1
+; V512-NEXT: vid.v v10
+; V512-NEXT: vsrl.vi v11, v10, 1
+; V512-NEXT: vrgather.vv v10, v8, v11
; V512-NEXT: vmv.v.i v0, 10
-; V512-NEXT: vadd.vi v8, v8, 1
+; V512-NEXT: vadd.vi v8, v11, 1
; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
@@ -403,26 +397,39 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128: # %bb.0:
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v8, 16
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v0, v8
-; V128-NEXT: li a0, -1
-; V128-NEXT: vwmaccu.vx v24, a0, v8
-; V128-NEXT: lui a1, %hi(.LCPI17_0)
-; V128-NEXT: addi a1, a1, %lo(.LCPI17_0)
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu
-; V128-NEXT: vle16.v v12, (a1)
-; V128-NEXT: lui a1, 699051
-; V128-NEXT: addi a1, a1, -1366
-; V128-NEXT: vmv.s.x v0, a1
+; V128-NEXT: addi sp, sp, -16
+; V128-NEXT: .cfi_def_cfa_offset 16
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 2
+; V128-NEXT: sub sp, sp, a0
+; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; V128-NEXT: lui a0, %hi(.LCPI17_0)
+; V128-NEXT: addi a0, a0, %lo(.LCPI17_0)
+; V128-NEXT: li a1, 32
+; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
+; V128-NEXT: vle16.v v4, (a0)
+; V128-NEXT: lui a0, %hi(.LCPI17_1)
+; V128-NEXT: addi a0, a0, %lo(.LCPI17_1)
+; V128-NEXT: vle16.v v24, (a0)
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT: lui a0, 699051
+; V128-NEXT: addi a0, a0, -1366
+; V128-NEXT: vmv.s.x v0, a0
+; V128-NEXT: vrgatherei16.vv v24, v8, v4
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
+; V128-NEXT: csrr a0, vlenb
+; V128-NEXT: slli a0, a0, 2
+; V128-NEXT: add sp, sp, a0
+; V128-NEXT: addi sp, sp, 16
; V128-NEXT: ret
;
; V512-LABEL: interleave_v32i32:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index a26a87a1f3c13..a56a81f5f793b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -612,11 +612,13 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vid.v v11
+; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 224
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v10, v10, -4
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vadd.vi v8, v11, -4
+; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
ret <8 x i8> %res
@@ -626,11 +628,13 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_start_into_end_non_contiguous:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vid.v v11
+; CHECK-NEXT: vrgather.vv v10, v8, v11
; CHECK-NEXT: li a0, 144
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v10, v10, -4
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vadd.vi v8, v11, -4
+; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 11>
ret <8 x i8> %res
@@ -671,11 +675,13 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: merge_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vid.v v11
+; CHECK-NEXT: vadd.vi v12, v11, 1
; CHECK-NEXT: li a0, 195
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vrgather.vv v10, v8, v12
+; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
+; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 14, i32 15>
ret <8 x i8> %res
@@ -686,12 +692,14 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w
; CHECK-LABEL: merge_non_contiguous_slideup_slidedown:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vadd.vi v10, v10, -1
+; CHECK-NEXT: vid.v v11
+; CHECK-NEXT: vadd.vi v12, v11, 2
+; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vadd.vi v8, v11, -1
+; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t
+; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 10, i32 6, i32 12, i32 13, i32 14>
ret <8 x i8> %res
@@ -702,13 +710,16 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: unmergable:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vadd.vi v11, v10, 2
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT: vle8.v v10, (a0)
+; CHECK-NEXT: vle8.v v12, (a0)
; CHECK-NEXT: li a0, 234
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
+; CHECK-NEXT: vrgather.vv v10, v8, v11
+; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t
+; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
ret <8 x i8> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index f889041647b23..eeb8e517d01d2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -8,51 +8,23 @@
; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3
define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
-; RV32-LABEL: load_factor2_v3:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v10, (a0)
-; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v10, 2
-; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vwaddu.vv v8, v10, v9
-; RV32-NEXT: li a0, -1
-; RV32-NEXT: vwmaccu.vx v8, a0, v9
-; RV32-NEXT: vmv.v.i v0, 4
-; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v10, 4
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t
-; RV32-NEXT: vid.v v9
-; RV32-NEXT: vadd.vv v9, v9, v9
-; RV32-NEXT: vadd.vi v11, v9, 1
-; RV32-NEXT: vrgather.vv v9, v10, v11
-; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t
-; RV32-NEXT: ret
-;
-; RV64-LABEL: load_factor2_v3:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma
-; RV64-NEXT: vle32.v v10, (a0)
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vid.v v8
-; RV64-NEXT: vadd.vv v8, v8, v8
-; RV64-NEXT: vadd.vi v8, v8, 1
-; RV64-NEXT: vrgather.vv v9, v10, v8
-; RV64-NEXT: vmv.v.i v0, 4
-; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v10, 4
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t
-; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v10, 2
-; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT: vwaddu.vv v8, v10, v11
-; RV64-NEXT: li a0, -1
-; RV64-NEXT: vwmaccu.vx v8, a0, v11
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t
-; RV64-NEXT: ret
+; CHECK-LABEL: load_factor2_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v9, v8, v8
+; CHECK-NEXT: vrgather.vv v8, v10, v9
+; CHECK-NEXT: vmv.v.i v0, 4
+; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v10, 4
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t
+; CHECK-NEXT: vadd.vi v11, v9, 1
+; CHECK-NEXT: vrgather.vv v9, v10, v11
+; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t
+; CHECK-NEXT: ret
%interleaved.vec = load <6 x i32>, ptr %ptr
%v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 0, i32 2, i32 4>
%v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> <i32 1, i32 3, i32 5>
@@ -159,142 +131,163 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a3, 58
+; RV32-NEXT: li a3, 62
; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: sub sp, sp, a2
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb
-; RV32-NEXT: addi a3, a1, 256
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb
+; RV32-NEXT: addi a3, a1, 128
+; RV32-NEXT: addi a4, a1, 256
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v8, (a3)
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: li a4, 25
-; RV32-NEXT: mul a3, a3, a4
-; RV32-NEXT: add a3, sp, a3
-; RV32-NEXT: addi a3, a3, 16
-; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: addi a3, a1, 128
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vslideup.vi v16, v8, 4
+; RV32...
[truncated]
@tstellar This backport has been outstanding for a while now.
In case it's helpful, explicit LGTM from me on backporting this.
@preames looks like you need to rebase
This reverts commit bdc4110 on the
release/18.x branch. This change was the first in a mini-series
and while I'm not aware of any particular problem from having it on
its own in the branch, it seems safer to ship with the previous
known good state.
@tstellar This is my first backport in the new process, so please bear with me and double check I got all pieces of this right.
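For reference, here is a minimal LLVM IR sketch of the two shuffle shapes the reverted lowering distinguishes, adapted from the test cases in the patch above (function names are illustrative): a single-source permutation, which the reverted change treated as the recursion base case, and a two-operand shuffle, which it lowered by recursing on the first operand and then blending in the second with a masked vrgather.

; Single-source permutation: the second shufflevector operand is poison,
; so every result element is taken from %v.
define <8 x i8> @single_source(<8 x i8> %v) {
  %res = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 2, i32 0, i32 3, i32 1, i32 6, i32 4, i32 7, i32 5>
  ret <8 x i8> %res
}

; Two-operand shuffle (the unmergable test case above): result elements
; alternate between %v (mask indices 0-7) and %w (mask indices 8-15).
define <8 x i8> @two_operand(<8 x i8> %v, <8 x i8> %w) {
  %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
  ret <8 x i8> %res
}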