From 41e050b525566b8f94692008c98dd7b2b2154d49 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407
Date: Sun, 28 Jan 2024 23:16:42 -0500
Subject: [PATCH] [VPlan] Delete the redundant overflow check for
 DataAndControlFlowWithoutRuntimeCheck

Fix https://github.com/llvm/llvm-project/issues/71917

When tail folding with DataAndControlFlowWithoutRuntimeCheck, the in-loop
active.lane.mask no longer needs a VF-adjusted trip count: the mask for the
next iteration is computed from the incremented canonical IV against the
unmodified trip count, so the CalculateTripCountMinusVF expansion (the
sub/icmp/select sequence in the vector preheader) becomes redundant and is
removed. A minimal IR sketch of the resulting loop shape follows the patch.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   33 +-
 .../AArch64/scalable-reductions-tf.ll         |    6 +-
 .../AArch64/scalable-strict-fadd.ll           | 1778 ++++++++---------
 .../sve-interleaved-masked-accesses.ll        |  245 ++-
 .../AArch64/sve-tail-folding-forced.ll        |   57 +-
 .../sve-tail-folding-overflow-checks.ll       |   37 +-
 .../AArch64/sve-tail-folding-reductions.ll    |  248 +--
 .../AArch64/sve-tail-folding-unroll.ll        |  340 ++--
 .../LoopVectorize/AArch64/sve-tail-folding.ll |  324 ++-
 .../AArch64/tail-fold-uniform-memops.ll       |    7 +-
 .../AArch64/tail-folding-styles.ll            |   89 +-
 .../AArch64/uniform-args-call-variants.ll     |  192 +-
 12 files changed, 1533 insertions(+), 1823 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5d7ac2bf9bb05..7833bc06e04c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1107,21 +1107,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
 
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
-  VPValue *TripCount, *IncrementValue;
-  if (!DataAndControlFlowWithoutRuntimeCheck) {
-    // When the loop is guarded by a runtime overflow check for the loop
-    // induction variable increment by VF, we can increment the value before
-    // the get.active.lane mask and use the unmodified tripcount.
-    IncrementValue = CanonicalIVIncrement;
-    TripCount = TC;
-  } else {
-    // When avoiding a runtime check, the active.lane.mask inside the loop
-    // uses a modified trip count and the induction variable increment is
-    // done after the active.lane.mask intrinsic is called.
-    IncrementValue = CanonicalIVPHI;
-    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
-                                     {TC}, DL);
-  }
   auto *EntryIncrement = Builder.createOverflowingOp(
       VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
       "index.part.next");
@@ -1140,12 +1125,18 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   // original terminator.
   VPRecipeBase *OriginalTerminator = EB->getTerminator();
   Builder.setInsertPoint(OriginalTerminator);
-  auto *InLoopIncrement =
-      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
-                                  {IncrementValue}, {false, false}, DL);
-  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+
+  auto *InLoopIncrement = CanonicalIVIncrement;
+  // When the loop is guarded by a runtime overflow check for the loop
+  // induction variable increment by VF, we can increment the value before
+  // the get.active.lane.mask and use the unmodified trip count.
+  if (!DataAndControlFlowWithoutRuntimeCheck)
+    InLoopIncrement =
+        Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+                                    {InLoopIncrement}, {false, false}, DL);
+  auto *ALM =
+      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {InLoopIncrement, TC},
+                           DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);
 
   // Replace the original terminator with BranchOnCond. We have to invert the
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
index 7bb25b8b3ea30..b551423ebee12 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -4,9 +4,6 @@ define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n) {
 ; CHECK-LABEL: @invariant_store_red_exit_is_phi(
 ; CHECK: vector.ph:
-; CHECK: %[[N_MINUS_VF:.*]] = sub i64 %n, %[[VSCALE_X_4:.*]]
-; CHECK: %[[CMP:.*]] = icmp ugt i64 %n, %[[VSCALE_X_4]]
-; CHECK: %[[N2:.*]] = select i1 %[[CMP]], i64 %[[N_MINUS_VF]], i64 0
 ; CHECK: %[[ACTIVE_LANE_MASK_ENTRY:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
 ; CHECK: vector.body:
 ; CHECK: %[[ACTIVE_LANE_MASK:.*]] = phi [ %[[ACTIVE_LANE_MASK_ENTRY]], %vector.ph ], [ %[[ACTIVE_LANE_MASK_NEXT:.*]], %vector.body ]
@@ -14,7 +11,8 @@ define void @invariant_store_red_exit_is_phi(ptr %dst, ptr readonly %src, i64 %n
 ; CHECK: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0
 ; CHECK-NEXT: %[[ADD:.*]] = add %[[VEC_PHI]], %[[LOAD]]
 ; CHECK-NEXT: %[[SELECT:.*]] = select %[[ACTIVE_LANE_MASK]], %[[ADD]], %[[VEC_PHI]]
-; CHECK: %[[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %{{.*}}, i64 %[[N2]])
+; CHECK-NEXT: %[[INDEX_NEXT:.*]] = add i64 %index, %{{.*}}
+; CHECK: %[[ACTIVE_LANE_MASK_NEXT:.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %[[INDEX_NEXT]], i64 %n)
 ; CHECK: middle.block:
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SELECT]])
 ; CHECK-NEXT: store i32 %[[SUM]], ptr %dst, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index fc67fb5aded6a..d14f4794b8365 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -42,22 +42,22 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ 
insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP7]]) +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP9]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -89,18 +89,18 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -108,7 +108,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 
%n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -120,7 +120,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict @@ -136,47 +136,42 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 
[[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP10]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP19]], [[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP14]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -230,60 +225,60 @@ define float 
@fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: 
[[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[TMP35]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] -; CHECK-UNORDERED-NEXT: [[TMP36]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]] -; CHECK-UNORDERED-NEXT: [[TMP37]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 
[[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP36]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[TMP37]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP38]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]] +; CHECK-UNORDERED-NEXT: [[TMP39]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP35]], [[TMP34]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd [[TMP36]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd [[TMP37]], [[BIN_RDX7]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP37]], [[TMP36]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd [[TMP38]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd [[TMP39]], [[BIN_RDX7]] ; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX8]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -316,51 +311,51 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], [[WIDE_LOAD1]]) -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP35]], [[WIDE_LOAD2]]) -; CHECK-ORDERED-NEXT: [[TMP37]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], [[WIDE_LOAD3]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: 
[[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], [[WIDE_LOAD1]]) +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP37]], [[WIDE_LOAD2]]) +; CHECK-ORDERED-NEXT: [[TMP39]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], [[WIDE_LOAD3]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -368,7 +363,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -380,7 +375,7 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_strict_unroll @@ -396,128 +391,102 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; 
CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: 
vector.body: -; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 +; CHECK-ORDERED-TF-NEXT: 
[[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP35]] ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP61]]) -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select [[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP63]]) -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) -; CHECK-ORDERED-TF-NEXT: 
[[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = add i64 [[INDEX]], [[TMP72]] -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], [[TMP75]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = add i64 [[INDEX]], [[TMP78]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP73]], i64 [[TMP14]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP76]], i64 [[TMP19]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP79]], i64 [[TMP24]]) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = extractelement [[TMP80]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP42]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP43]]) +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = select 
[[ACTIVE_LANE_MASK6]], [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP44]], [[TMP45]]) +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = select [[ACTIVE_LANE_MASK7]], [[WIDE_MASKED_LOAD10]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP46]], [[TMP47]]) +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP50]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP48]], [[TMP49]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT12:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT13:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT14]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT12]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT13]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT14]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = extractelement [[TMP51]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP85]], 
[[SUM_07]] +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP56]], [[SUM_07]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] ; @@ -589,37 +558,37 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; 
CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd [[TMP12]], [[VEC_PHI1]] -; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd [[TMP13]], [[VEC_PHI]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd [[TMP14]], [[VEC_PHI1]] +; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd [[TMP15]], [[VEC_PHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP15]]) -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP17]]) +; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP16]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: ; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-UNORDERED: for.body: ; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -636,8 +605,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-UNORDERED: for.end: -; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-UNORDERED-NEXT: ret void @@ -661,24 +630,24 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul 
i64 [[N_VEC]], 2 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 ; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP11]]) -; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP10]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) +; CHECK-ORDERED-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -686,8 +655,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX2:%.*]] = phi 
float [ [[A1]], [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -704,8 +673,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-NEXT: ret void @@ -730,62 +699,57 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 ; 
CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i1( [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK]]) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[INTERLEAVED_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( [[WIDE_MASKED_VEC]]) -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP18]]) -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP16]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP15]]) +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP17]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP2]]) +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi 
float [ [[A1]], [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP26]], [[ADD_PHI2]] +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP21]], [[ADD_PHI2]] ; CHECK-ORDERED-TF-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OR]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP27]], [[ADD_PHI1]] +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP22]], [[ADD_PHI1]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 ; CHECK-ORDERED-TF-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-TF-NEXT: ret void @@ -867,26 +831,26 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[TMP10]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP11]]) +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -929,22 +893,22 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 
4 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP10]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 4 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP12]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -952,7 +916,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -967,7 +931,7 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-ORDERED: for.end.loopexit: -; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_END]] ; CHECK-ORDERED: for.end: ; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] @@ -991,54 +955,49 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP5]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: 
[[TMP20:%.*]] = mul i64 [[TMP19]], 4
; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]]
-; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
-; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP17]])
-; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP13]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP14]])
+; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED-TF: scalar.ph:
; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED-TF: for.body:
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-ORDERED-TF-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], [[TMP19]]
; CHECK-ORDERED-TF-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-ORDERED-TF: for.end.loopexit:
-; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]]
; CHECK-ORDERED-TF: for.end:
; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -1110,28 +1069,28 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] -; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[PREDPHI]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison) +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd [[VEC_PHI]], [[PREDPHI]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP11]]) +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call 
float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP13]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -1171,24 +1130,24 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison) -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] -; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], zeroinitializer +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0 +; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison) +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] +; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 
[[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -1196,7 +1155,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1216,7 +1175,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[RDX]] ; ; CHECK-ORDERED-TF-LABEL: define float @fadd_conditional @@ -1232,64 +1191,59 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select 
[[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP15]], [[TMP18]] -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = fcmp une [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP7]] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = or [[TMP11]], [[TMP15]] +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[TMP16]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP26]], 0.000000e+00 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP21]], 0.000000e+00 ; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-ORDERED-TF: if.then: ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] ; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP27]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP22]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] ; @@ -1358,26 +1312,26 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; 
CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fadd [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd [[TMP7]], [[WIDE_LOAD1]] -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = fadd [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd [[TMP9]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP10]]) +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-UNORDERED: scalar.ph: @@ -1498,78 +1452,78 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 
[[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP48]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) -; CHECK-UNORDERED-NEXT: [[TMP49]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) -; CHECK-UNORDERED-NEXT: [[TMP50]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) -; CHECK-UNORDERED-NEXT: [[TMP51]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; 
CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP50]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP51]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP52]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP53]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP49]], [[TMP48]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd [[TMP50]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd [[TMP51]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd [[TMP51]], [[TMP50]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd [[TMP52]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd [[TMP53]], [[BIN_RDX11]] ; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1604,73 +1558,73 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call 
i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP48]]) -; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) -; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) -; CHECK-ORDERED-NEXT: [[TMP55]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 
[[TMP12]], 16
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]]
+; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16
+; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]]
+; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24
+; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP40]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP43]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP46]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP49]], align 4
+; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+;
CHECK-ORDERED-NEXT: [[TMP52:%.*]] = fmul [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = fmul [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP50]]) +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], [[TMP52]]) +; CHECK-ORDERED-NEXT: [[TMP57]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], [[TMP53]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -1678,7 +1632,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1692,7 +1646,7 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict @@ -1708,152 +1662,126 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 -; CHECK-ORDERED-TF-NEXT: 
[[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: 
[[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP35]] ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] 
-; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul 
[[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]]) -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr 
[[TMP29]], i64 [[TMP41]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP42]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP18]]
+; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]]
+; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0
+; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP49]]
+; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP52]]
+; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP55]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP50]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
+; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
+; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
+; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
+; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP57]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP61]])
+; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP58]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP63]])
+; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP59]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float
@llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP60]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT16:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT17:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT18]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT16]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT17]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT18]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = extractelement [[TMP69]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP73]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP74]], float 
[[TMP75]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; @@ -1912,78 +1840,78 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr 
inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 -; CHECK-UNORDERED-NEXT: [[TMP48]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) -; CHECK-UNORDERED-NEXT: [[TMP49]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) -; CHECK-UNORDERED-NEXT: [[TMP50]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) -; CHECK-UNORDERED-NEXT: [[TMP51]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) -; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] +; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ 
[[TMP51:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0
+; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1
+; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24
+; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0
+; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1
+; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]]
+; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
+; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
+; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]]
+; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16
+; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]]
+; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24
+; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP26]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP29]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP32]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP35]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
+; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]]
+; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0
+; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+;
CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP50]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) +; CHECK-UNORDERED-NEXT: [[TMP51]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) +; CHECK-UNORDERED-NEXT: [[TMP52]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) +; CHECK-UNORDERED-NEXT: [[TMP53]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-UNORDERED: middle.block: -; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan [[TMP49]], [[TMP48]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan [[TMP50]], [[BIN_RDX]] -; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan [[TMP51]], [[BIN_RDX11]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan [[TMP51]], [[TMP50]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan [[TMP52]], [[BIN_RDX]] +; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan [[TMP53]], [[BIN_RDX11]] ; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX12]]) ; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2018,73 +1946,73 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = 
add i64 [[INDEX]], [[TMP8]] -; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 -; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]] -; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24 -; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 -; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 -; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] -; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 -; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 -; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 -; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] -; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] -; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] -; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] -; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 -; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 -; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 -; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 -; 
CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] -; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] -; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] -; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] -; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP48]]) -; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) -; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) -; CHECK-ORDERED-NEXT: [[TMP55]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) -; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] +; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 0 +; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 1 +; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 24 +; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], 0 +; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 1 +; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 +; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP28]] +; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 16 +; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP31]] +; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 24 +; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[TMP34]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP26]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP29]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP32]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 +; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] +; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr 
inbounds float, ptr [[B]], i64 [[TMP11]] +; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP16]] +; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP21]] +; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i32 0 +; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8 +; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP42]] +; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 16 +; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP45]] +; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 24 +; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP36]], i64 [[TMP48]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP40]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP43]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP46]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP49]], align 4 +; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] +; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] +; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = fmul nnan [[WIDE_LOAD2]], [[WIDE_LOAD6]] +; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = fmul nnan [[WIDE_LOAD3]], [[WIDE_LOAD7]] +; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP50]]) +; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP55]], [[TMP52]]) +; CHECK-ORDERED-NEXT: [[TMP57]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP56]], [[TMP53]]) +; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED: middle.block: @@ -2092,7 +2020,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED: scalar.ph: ; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED: for.body: ; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -2106,7 +2034,7 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED: for.end: -; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-NEXT: 
[[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP57]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]] ; ; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict_fmf @@ -2122,152 +2050,126 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] -; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[N]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 32 -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = sub i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[N]], [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 -; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 24 -; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP8]] +; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP10]] +; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 24 +; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 
[[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: -; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP86:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT21:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], [[TMP22]] +; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds float, ptr [[A]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP35]] ; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], [[TMP40]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] -; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] -; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] -; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] -; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: 
[[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] -; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] -; CHECK-ORDERED-TF-NEXT: [[TMP77:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] -; CHECK-ORDERED-TF-NEXT: [[TMP78:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] -; CHECK-ORDERED-TF-NEXT: [[TMP79:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP75]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP80:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP79]]) -; CHECK-ORDERED-TF-NEXT: [[TMP81:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP76]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP82:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP80]], [[TMP81]]) -; CHECK-ORDERED-TF-NEXT: [[TMP83:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP77]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) -; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) -; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] -; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 -; CHECK-ORDERED-TF-NEXT: [[TMP91:%.*]] = add i64 [[INDEX]], [[TMP90]] -; CHECK-ORDERED-TF-NEXT: [[TMP92:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP93:%.*]] = mul i64 [[TMP92]], 16 -; CHECK-ORDERED-TF-NEXT: [[TMP94:%.*]] = add i64 [[INDEX]], [[TMP93]] -; CHECK-ORDERED-TF-NEXT: [[TMP95:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP96:%.*]] = mul i64 [[TMP95]], 24 -; CHECK-ORDERED-TF-NEXT: [[TMP97:%.*]] = add i64 [[INDEX]], [[TMP96]] -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP91]], i64 [[TMP14]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP94]], i64 [[TMP19]]) -; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP97]], i64 [[TMP24]]) -; CHECK-ORDERED-TF-NEXT: [[TMP98:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP99:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP100:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP101:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP102:%.*]] = extractelement [[TMP98]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP102]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP38]] +; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP41]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP42]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]] +; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP18]] +; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP23]] +; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP28]] +; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0 +; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul i64 [[TMP48]], 8 +; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP49]] +; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul i64 [[TMP51]], 16 +; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP52]] +; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul i64 [[TMP54]], 24 +; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i64 [[TMP55]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP47]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: 
[[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP50]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP53]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP56]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] +; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] +; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = fmul nnan [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] +; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = fmul nnan [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]] +; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select nnan [[ACTIVE_LANE_MASK]], [[TMP57]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP61]]) +; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = select nnan [[ACTIVE_LANE_MASK6]], [[TMP58]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], [[TMP63]]) +; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = select nnan [[ACTIVE_LANE_MASK7]], [[TMP59]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]]) +; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP60]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]]) +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT16:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT17:%.*]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT18]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT16]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT20]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT17]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT21]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT18]], i64 [[N]]) +; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = extractelement [[TMP69]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 
[[TMP73]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP103:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP104:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP103]], float [[TMP104]], float [[SUM_07]]) +; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP74]], float [[TMP75]], float [[SUM_07]]) ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP68]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 3ba91360850e7..46a31d702df8c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -35,38 +35,38 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 
[[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, [[INTERLEAVED_MASK]], poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: 
[[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint i32 [[TMP10]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sext i32 [[TMP15]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -1 +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP16]], [[TMP17]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP9]], [[TMP9]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -111,16 +111,13 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -128,31 +125,31 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ 
[[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP5]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1 -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint i32 [[TMP7]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sext i32 [[TMP12]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP15]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i64 -1 +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = 
call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP6]], [[TMP6]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -227,29 +224,29 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( 
insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg [[TMP9]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP10]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -286,16 +283,13 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -303,22 +297,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = zext nneg [[TMP5]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP6]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add 
i32 [[INDEX]], [[TMP15]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -389,12 +383,12 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -403,18 +397,18 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP7]] 
to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP10]], i32 1, [[TMP8]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP11]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP9]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP10]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = or disjoint [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg [[TMP14]] to +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP15]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP16]], i32 1, [[TMP13]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -457,16 +451,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 
@llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -476,24 +467,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP6]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP5]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint [[TMP5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP15]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index cc72dfa4ce639..34bb57359298e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,33 +9,31 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { -; VPLANS-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; VPLANS-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS-NEXT: Live-in vp<%0> = VF * UF +; VPLANS-NEXT: vp<%3> = original trip-count ; VPLANS-EMPTY: ; VPLANS-NEXT: ph: -; VPLANS-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 umax %n) +; VPLANS-NEXT: EMIT vp<%3> = EXPAND SCEV (1 umax %n) ; VPLANS-NEXT: No successors ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: -; VPLANS-NEXT: EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? 
TC - VF : 0 vp<[[TC]]> -; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> -; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> +; VPLANS-NEXT: EMIT vp<%4> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<%5> = active lane mask vp<%4>, vp<%3> ; VPLANS-NEXT: Successor(s): vector loop ; VPLANS-EMPTY: ; VPLANS-NEXT: vector loop: { ; VPLANS-NEXT: vector.body: -; VPLANS-NEXT: EMIT vp<[[INDV:%[0-9]+]]> = CANONICAL-INDUCTION -; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]> -; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1> -; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]> -; VPLANS-NEXT: vp<[[VEC_PTR:%[0-9]+]]> = vector-pointer ir<%gep> -; VPLANS-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%val>, vp<[[LANEMASK_PHI]]> -; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]> -; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]> -; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]> -; VPLANS-NEXT: EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]> -; VPLANS-NEXT: EMIT branch-on-cond vp<[[NOT]]> +; VPLANS-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%11> +; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%5>, vp<%12> +; VPLANS-NEXT: vp<%8> = SCALAR-STEPS vp<%6>, ir<1> +; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%8> +; VPLANS-NEXT: vp<%10> = vector-pointer ir<%gep> +; VPLANS-NEXT: WIDEN store vp<%10>, ir<%val>, vp<%7> +; VPLANS-NEXT: EMIT vp<%11> = add vp<%6>, vp<%0> +; VPLANS-NEXT: EMIT vp<%12> = active lane mask vp<%11>, vp<%3> +; VPLANS-NEXT: EMIT vp<%13> = not vp<%12> +; VPLANS-NEXT: EMIT branch-on-cond vp<%13> ; VPLANS-NEXT: No successors ; VPLANS-NEXT: } @@ -53,13 +51,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -67,15 +60,15 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; 
CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index df107847e3e32..c7d7919591f68 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -15,19 +15,19 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP0]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP1:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP1]], ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP3:%.*]] = add nsw 
[[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP3]], ptr [[TMP4]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] @@ -76,25 +76,22 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK: for.body.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP3:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP3]], ptr [[TMP4]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 9dcc751db7cf0..96dc6cebfd9c5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -20,36 +20,31 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP14]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP11]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: 
[[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -61,7 +56,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end.loopexit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; ; CHECK-IN-LOOP-LABEL: @add_reduction_i32( @@ -77,36 +72,31 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) -; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-IN-LOOP-NEXT: 
[[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 -; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) +; CHECK-IN-LOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0 +; CHECK-IN-LOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-IN-LOOP: scalar.ph: ; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-IN-LOOP: while.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -118,7 +108,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] ; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-IN-LOOP: while.end.loopexit: -; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; entry: @@ -152,35 +142,30 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] 
= icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -192,7 +177,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: while.end.loopexit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] ; ; CHECK-IN-LOOP-LABEL: @add_reduction_f32( @@ -208,35 +193,30 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP11]], i32 0 -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 +; CHECK-IN-LOOP-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-IN-LOOP-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP10]]) +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-IN-LOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-IN-LOOP: scalar.ph: ; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-IN-LOOP: while.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -248,7 +228,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] ; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-IN-LOOP: while.end.loopexit: -; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: ret float [[RED_NEXT_LCSSA]] ; entry: @@ -281,56 +261,51 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; 
CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP18:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; CHECK-NEXT: [[TMP14:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP15:%.*]] = xor [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[TMP14]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 
[[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP26]], 5 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP21]], 5 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP27]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP22]] ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ] @@ -338,7 +313,7 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RES_LCSSA]] ; ; CHECK-IN-LOOP-LABEL: @cond_xor_reduction( @@ -353,53 +328,48 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]] ; CHECK-IN-LOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-IN-LOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-IN-LOOP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-IN-LOOP-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ 
[[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]] -; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) -; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer -; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) -; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] -; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 -; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-IN-LOOP-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP7]] +; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-IN-LOOP-NEXT: [[TMP10:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i64 0), poison, zeroinitializer) +; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = select [[TMP11]], [[WIDE_MASKED_LOAD1]], zeroinitializer +; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP14]]) +; CHECK-IN-LOOP-NEXT: [[TMP16]] = xor i32 [[TMP15]], [[VEC_PHI]] +; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-IN-LOOP-NEXT: 
[[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-IN-LOOP: middle.block: ; CHECK-IN-LOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-IN-LOOP: scalar.ph: ; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-IN-LOOP: for.body: ; CHECK-IN-LOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-IN-LOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] ; CHECK-IN-LOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[IV]] -; CHECK-IN-LOOP-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-IN-LOOP-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP24]], 5 +; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-IN-LOOP-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP19]], 5 ; CHECK-IN-LOOP-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-IN-LOOP: if.then: ; CHECK-IN-LOOP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-IN-LOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-IN-LOOP-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP25]] +; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-IN-LOOP-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP20]] ; CHECK-IN-LOOP-NEXT: br label [[FOR_INC]] ; CHECK-IN-LOOP: for.inc: ; CHECK-IN-LOOP-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ] @@ -407,7 +377,7 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-IN-LOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-IN-LOOP: for.end: -; CHECK-IN-LOOP-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-IN-LOOP-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-IN-LOOP-NEXT: ret i32 [[RES_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 1a6e83a61ce74..c5a1f76109d27 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -18,37 +18,17 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; 
CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 -; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -57,65 +37,59 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: 
[[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX6]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX6]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 12 +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[INDEX6]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] 
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) -; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4 -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX6]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 -; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[INDEX6]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 12 -; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[INDEX6]], [[TMP70]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT11]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP65]], i64 [[TMP14]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP68]], i64 [[TMP19]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP71]], i64 [[TMP24]]) -; CHECK-NEXT: [[TMP72:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP73:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP74:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP75:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0 -; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 12 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP41]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP42]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: [[INDEX_NEXT10:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT11:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT12:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT10]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT11]], i64 
[[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT12]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT13]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP43:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP44:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP45:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP46:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP47:%.*]] = extractelement [[TMP43]], i32 0 +; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -160,37 +134,17 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[UMAX]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt i64 [[UMAX]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 -; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[UMAX]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp ugt i64 [[UMAX]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4 -; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 12 -; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 
[[TMP9]], 8 +; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 12 +; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP12]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[UMAX]]) @@ -199,91 +153,85 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 -; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 1 -; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[INDEX6]], [[TMP35]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX6]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX6]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 12 +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[INDEX6]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = 
getelementptr i32, ptr [[COND_PTR]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP38:%.*]] = mul i64 [[TMP37]], 8 -; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 1 -; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[INDEX6]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 12 -; CHECK-NEXT: [[TMP44:%.*]] = add i64 [[TMP43]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP44]], 1 -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX6]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]], poison) -; CHECK-NEXT: [[TMP61:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP62:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer -; CHECK-NEXT: [[TMP63:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer -; CHECK-NEXT: [[TMP64:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer -; CHECK-NEXT: [[TMP69:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP61]], zeroinitializer -; CHECK-NEXT: [[TMP70:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP62]], zeroinitializer -; CHECK-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP63]], zeroinitializer -; CHECK-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], zeroinitializer -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP31]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4 -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 8 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]] -; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP69]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP70]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, [[TMP71]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) -; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]] -; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 4 -; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[INDEX6]], [[TMP86]] -; CHECK-NEXT: [[TMP88:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP89:%.*]] = mul i64 [[TMP88]], 8 -; CHECK-NEXT: [[TMP90:%.*]] = add i64 [[INDEX6]], [[TMP89]] -; CHECK-NEXT: [[TMP91:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP92:%.*]] = mul i64 [[TMP91]], 12 -; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX6]], [[TMP92]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX6]], i64 [[TMP9]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP87]], i64 [[TMP14]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT15]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP90]], i64 [[TMP19]]) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP24]]) -; CHECK-NEXT: [[TMP94:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP95:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP96:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP97:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 12 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP41]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP33]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP36]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP39]], i32 4, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP42]], i32 4, [[ACTIVE_LANE_MASK9]], poison) +; 
CHECK-NEXT: [[TMP43:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer +; CHECK-NEXT: [[TMP45:%.*]] = icmp ne [[WIDE_MASKED_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne [[WIDE_MASKED_LOAD12]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP43]], zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = select [[ACTIVE_LANE_MASK7]], [[TMP44]], zeroinitializer +; CHECK-NEXT: [[TMP49:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP46]], zeroinitializer +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i32, ptr [[TMP51]], i32 0 +; CHECK-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 4 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i32, ptr [[TMP51]], i64 [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP60:%.*]] = mul i64 [[TMP59]], 8 +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[TMP51]], i64 [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP63:%.*]] = mul i64 [[TMP62]], 12 +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[TMP51]], i64 [[TMP63]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP55]], i32 4, [[TMP47]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP58]], i32 4, [[TMP48]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP61]], i32 4, [[TMP49]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP64]], i32 4, [[TMP50]]) +; CHECK-NEXT: [[INDEX_NEXT13:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT14:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT15:%.*]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT16]] = add i64 [[INDEX6]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT13]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT14]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT15]], i64 [[UMAX]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT19]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT16]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP65:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP66:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP67:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP68:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT19]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP69:%.*]] = extractelement [[TMP65]], i32 0 +; CHECK-NEXT: br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 2b2742ca7ccbc..b5875f3e2fef8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -18,13 +18,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -32,15 +27,15 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: 
scalar.ph: @@ -81,9 +76,6 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[UMAX]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[UMAX]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -91,15 +83,15 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -145,30 +137,25 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, 
i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -222,38 +209,33 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: 
[[TMP14:%.*]] = add [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = mul [[TMP14]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 4, [[TMP17]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 4, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 4, [[TMP14]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP15]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP16]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP9]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], 
label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -303,31 +285,26 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = 
extractelement [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -382,30 +359,25 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP11:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label 
[[SCALAR_PH]] ; CHECK: scalar.ph: @@ -457,13 +429,8 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -471,25 +438,25 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP19]], i32 4, [[TMP18]]) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] 
= icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP12]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], zeroinitializer, [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP16]], i32 4, [[TMP14]]) +; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[N]]) +; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -498,14 +465,14 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK: for.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP24]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP25]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP20]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 @@ -557,13 +524,8 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; 
CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -571,16 +533,16 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -629,32 +591,27 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], 
[ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP15:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP15]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP12]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP13:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -709,33 +666,28 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP16:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP16]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; CHECK-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP12]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP13]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP6]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) +; CHECK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -786,18 +738,18 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP5]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index 187f50f2e76a4..e0b51af9aba94 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -10,9 +10,6 @@ target triple = "aarch64-linux-gnu" define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 { ; CHECK-LABEL: @uniform_load( ; CHECK: vector.ph: -; CHECK: [[N_MINUS_VF:%.*]] = sub i64 %n, [[VSCALE_X_VF:.*]] -; CHECK: [[CMP:%.*]] = icmp ugt i64 %n, [[VSCALE_X_VF]] -; CHECK: [[N2:%.*]] = select i1 [[CMP]], i64 [[N_MINUS_VF]], i64 0 ; CHECK: [[INIT_ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 %n) ; CHECK: vector.body: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] @@ -25,8 +22,8 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4 -; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]]) +; CHECK-NEXT: [[IDX_NEXT:%.*]] = add i64 [[IDX]], 4 +; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX_NEXT]], i64 %n) ; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], ; CHECK-NEXT: [[FIRST_LANE_SET:%.*]] = extractelement <4 x i1> [[NOT_ACTIVE_LANE_MASK]], i32 0 ; CHECK-NEXT: br i1 [[FIRST_LANE_SET]], label %middle.block, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index 94f24fea3609c..fa252ecf0a076 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -22,18 +22,18 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]] ; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; NONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NONE-NEXT: br label [[VECTOR_BODY:%.*]] ; NONE: vector.body: ; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] +; NONE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX1]], 0 +; NONE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP6]] +; NONE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP8]], align 4 +; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP5]] ; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NONE: middle.block: @@ -69,19 +69,19 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]] ; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; DATA-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; DATA-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 +; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 ; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA-NEXT: br label [[VECTOR_BODY:%.*]] ; DATA: vector.body: ; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; DATA-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0 -; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP9]], i64 [[UMAX]]) -; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]] -; DATA-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) -; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]] +; DATA-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], 0 +; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP11]], i64 [[UMAX]]) +; DATA-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP11]] +; 
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
 ; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
 ; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA: middle.block:
@@ -119,24 +119,24 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DATA_NO_LANEMASK: vector.body:
 ; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
-; DATA_NO_LANEMASK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
+; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], 0
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i64 0
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
-; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
-; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
-; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
+; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP13]]
+; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0
+; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]])
+; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP10]]
 ; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
 ; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_NO_LANEMASK: middle.block:
@@ -171,8 +171,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP8]]
 ; DATA_AND_CONTROL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; DATA_AND_CONTROL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; DATA_AND_CONTROL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -180,11 +180,11 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL: vector.body:
 ; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA_AND_CONTROL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 0
-; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
-; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
+; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], 0
+; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
 ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
 ; DATA_AND_CONTROL-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; DATA_AND_CONTROL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
@@ -217,13 +217,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -231,15 +226,15 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK: vector.body:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP6]]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[TMP10]], i32 0
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index 17edfe513dd01..45d47bc791418 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -12,23 +12,20 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
@@ -38,42 +35,34 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
-; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]])
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP3]], i64 [[N]])
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[TMP4]], i64 [[TMP6]]
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]]
-; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; INTERLEAVE-NEXT: [[TMP15:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1
-; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]]
-; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP15]], ptr [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1
-; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP7]])
-; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP12]]
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT4]], i64 [[N]])
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
 ;
@@ -101,23 +90,20 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP2]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
@@ -127,42 +113,34 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]])
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
-; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]])
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP3]], i64 [[N]])
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[TMP4]], i64 [[TMP6]]
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP4]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]]
-; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
-; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; INTERLEAVE-NEXT: [[TMP15:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1
-; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]]
-; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP15]], ptr [[TMP19]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1
-; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]])
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP7]])
-; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 [[TMP12]]
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP8]], ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP1]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT4]], i64 [[N]])
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
 ;
@@ -194,7 +172,7 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8
-; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -206,42 +184,40 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64
 ; INTERLEAVE-LABEL: define void @test_uniform_not_invariant
 ; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; INTERLEAVE-NEXT: entry:
-; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2)
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 0, i64 [[N]])
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 1, i64 [[N]])
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT5:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i64 0
-; INTERLEAVE-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; INTERLEAVE-NEXT: [[TMP0:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; INTERLEAVE: pred.store.if:
-; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8
+; INTERLEAVE-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]]
+; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-NEXT: store double [[TMP3]], ptr [[TMP4]], align 8
 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]]
 ; INTERLEAVE: pred.store.continue:
-; INTERLEAVE-NEXT: [[TMP7:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i64 0
-; INTERLEAVE-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; INTERLEAVE-NEXT: [[TMP5:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
 ; INTERLEAVE: pred.store.if3:
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = or disjoint i64 [[INDEX]], 1
-; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP8]]
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 8
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = call double @foo(double [[TMP10]], i64 [[TMP8]]) #[[ATTR5]]
-; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP8]]
-; INTERLEAVE-NEXT: store double [[TMP11]], ptr [[TMP12]], align 8
+; INTERLEAVE-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDEX]], 1
+; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP6]]
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP7]], align 8
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = call double @foo(double [[TMP8]], i64 [[TMP6]]) #[[ATTR4]]
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP6]]
+; INTERLEAVE-NEXT: store double [[TMP9]], ptr [[TMP10]], align 8
 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; INTERLEAVE: pred.store.continue4:
-; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = or disjoint i64 [[INDEX]], 1
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[INDEX]], i64 [[TMP0]])
-; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[TMP13]], i64 [[TMP1]])
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], 2
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[INDEX_NEXT5]], i64 [[N]])
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; INTERLEAVE-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; INTERLEAVE: for.cond.cleanup:
 ; INTERLEAVE-NEXT: ret void
 ;