[X86] X86DAGToDAGISel - attempt to merge XMM/YMM loads with YMM/ZMM loads of the same ptr #73126
Conversation
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)

Changes: If we are loading the same ptr at different vector widths, then reuse the larger load and just extract the low subvector. This is mainly useful for better constant sharing. Unlike the equivalent VBROADCAST_LOAD/SUBV_BROADCAST_LOAD folds which can occur in DAG, we have to wait until DAGISel, otherwise we can hit infinite loops if constant folding recreates the original constant value.

Patch is 1.26 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73126.diff

37 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 93e184eca9bc515..8b9393580dc45bf 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1036,6 +1036,42 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
+ case ISD::LOAD: {
+ // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
+ // load, then just extract the lower subvector and avoid the second load.
+ auto *Ld = cast<LoadSDNode>(N);
+ MVT VT = N->getSimpleValueType(0);
+ if (ISD::isNormalLoad(Ld) && Ld->isSimple() &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ auto *UserLd = dyn_cast<LoadSDNode>(N);
+ MVT UserVT = User->getSimpleValueType(0);
+ if (User != N && UserLd && ISD::isNormalLoad(User) &&
+ UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
+ !User->hasAnyUseOfValue(1) &&
+ (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
+ UserVT.getSizeInBits() > VT.getSizeInBits()) {
+ SDLoc dl(N);
+ unsigned NumSubElts =
+ VT.getSizeInBits() / UserVT.getScalarSizeInBits();
+ MVT SubVT = MVT::getVectorVT(UserVT.getScalarType(), NumSubElts);
+ SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
+ SDValue(User, 0),
+ CurDAG->getIntPtrConstant(0, dl));
+ SDValue Res = CurDAG->getBitcast(VT, Extract);
+ --I;
+ SDValue To[] = {Res, SDValue(UserLd, 1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+ break;
+ }
case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index 34a205a7baa8641..b3a0c7dffae117c 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -98,10 +98,8 @@ define dso_local i64 @caller_argv64i1() #0 {
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: subl $88, %esp
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
-; X32-NEXT: # xmm0 = mem[0,0]
-; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vmovups %zmm0, (%esp)
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 7a82515ad24b72c..4792e8343d7589f 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1401,9 +1401,9 @@ define <32 x bfloat> @pr63017_2() nounwind {
; AVXNC-NEXT: jne .LBB12_2
; AVXNC-NEXT: # %bb.1: # %cond.load
; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024]
-; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
-; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm2
+; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT: .LBB12_2: # %else
; AVXNC-NEXT: xorl %eax, %eax
; AVXNC-NEXT: testb %al, %al
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index a995e93708456e7..adf7fa97b776527 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -209,8 +209,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -255,8 +256,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -303,8 +305,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -421,13 +424,15 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -482,13 +487,15 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -549,17 +556,16 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i16:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 544d9b21eca7b9a..50c132b6c34de98 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -265,8 +265,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -324,8 +325,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -385,8 +387,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -544,15 +547,17 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -623,15 +628,17 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -705,22 +712,21 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlw $15, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index b69d22e04d7deb7..13f1451bbc8b027 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -123,12 +123,12 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i16:
@@ -140,12 +140,12 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
;
; AVX-64-LABEL: f32xi8_i16:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i16:
@@ -245,12 +245,13 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i128:
; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i128:
@@ -263,12 +264,13 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
;
; AVX-64-LABEL: f32xi8_i128:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: # ymm1 = m...
[truncated]
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
This is (re)loading the upper bits [4,8] of the ymm data, technically we could extract the upper subvector, but that would create an extra instruction - is that worth it do you think?
Not only an extra instruction but also an extra register, I think. It doesn't look worthwhile.
That would remove the constants too
.LCPI0_1:
.quad 4 # 0x4
.quad 8 # 0x8
SDValue Ptr = Ld->getBasePtr();
SDValue Chain = Ld->getChain();
for (SDNode *User : Ptr->uses()) {
auto *UserLd = dyn_cast<LoadSDNode>(N);
Do you not need some check that there is no store between the two loads?
We check that the chains match
I.e. could you have load; store; load all chained together?
If the store changes the value for the second load, I think they will not have the same chain.
A store node always builds a new chain with the node.
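To illustrate the point being made in this thread, here is a minimal sketch of the chain relationships, assuming the usual SelectionDAG builder API (DAG, DL, Ptr and Val are placeholder names, not taken from the patch):

// If a store is chained between the two loads, the narrow load's input chain
// is the store's output chain, not the chain the wide load used.
SDValue Entry    = DAG.getEntryNode();
SDValue WideLd   = DAG.getLoad(MVT::v16i32, DL, Entry, Ptr,
                               MachinePointerInfo());  // input chain: Entry
SDValue St       = DAG.getStore(WideLd.getValue(1), DL, Val, Ptr,
                                MachinePointerInfo()); // chained after WideLd
SDValue NarrowLd = DAG.getLoad(MVT::v4i32, DL, St, Ptr,
                               MachinePointerInfo());  // input chain: the store
// The wide load's chain operand is Entry while the narrow load's chain operand
// is the store, so the UserLd->getChain() == Chain check in the patch rejects
// this pair and the fold does not fire across the store.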
// load, then just extract the lower subvector and avoid the second load.
auto *Ld = cast<LoadSDNode>(N);
MVT VT = N->getSimpleValueType(0);
if (ISD::isNormalLoad(Ld) && Ld->isSimple() &&
Should we check !isVolatile instead of isSimple?
I suggest an early bail-out here by reversing the condition.
If we do that then we'd probably need to ensure Ld->isAtomic() == UserLd->isAtomic() - would that be acceptable?
Good point. Then isSimple sounds good to me.
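For reference, "simple" here is the MemSDNode notion of an access that is neither volatile nor atomic - roughly the following, paraphrased rather than quoted verbatim from SelectionDAGNodes.h:

// A "simple" memory access: not volatile and not atomic. isVolatile() and
// isAtomic() are the two flags being discussed in this thread, so isSimple()
// is the stricter of the proposed checks.
bool isSimple() const { return !isVolatile() && !isAtomic(); }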
Force-pushed from add9352 to 20df8e4
LGTM
Force-pushed from 20df8e4 to 75747ed
…oads of the same ptr

If we are loading the same ptr at different vector widths, then reuse the larger load and just extract the low subvector. This is mainly useful for better constant sharing.

Unlike the equivalent VBROADCAST_LOAD/SUBV_BROADCAST_LOAD folds which can occur in DAG, we have to wait until DAGISel otherwise we can hit infinite loops if constant folding recreates the original constant value.
Force-pushed from 75747ed to 2a95cfd
…M/YMM loads with YMM/ZMM loads of the same ptr (#73126)"

Missed an issue that we were calling continue from within the for loop - fixed version incoming shortly.
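For context, a stripped-down sketch of the reverted control flow (heavily abbreviated; FoundWiderLoad is a placeholder for the real candidate checks, not a function in the patch), showing why that continue was a problem:

// PreprocessISelDAG walks every node with an explicit iterator so that
// replacements can adjust the walk. The reverted patch issued its `continue`
// from inside the inner uses() loop, so it resumed that inner loop - right
// after ReplaceAllUsesWith had modified the use list - instead of moving the
// outer walk on to the next node as intended.
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
                                     E = CurDAG->allnodes_end();
     I != E;) {
  SDNode *N = &*I++;                     // outer walk over the whole DAG
  if (N->getOpcode() == ISD::LOAD) {
    SDValue Ptr = cast<LoadSDNode>(N)->getBasePtr();
    for (SDNode *User : Ptr->uses()) {   // inner walk over the pointer's users
      if (FoundWiderLoad(N, User)) {     // placeholder for the real checks
        // ... build EXTRACT_SUBVECTOR and call ReplaceAllUsesWith(N, To) ...
        continue;                        // resumes the inner loop only
      }
    }
  }
}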
…oads of the same ptr (#73126)

If we are loading the same ptr at different vector widths, then reuse the largest load and just extract the low subvector.

Unlike the equivalent VBROADCAST_LOAD/SUBV_BROADCAST_LOAD folds which can occur in DAG, we have to wait until DAGISel otherwise we can hit infinite loops if constant folding recreates the original constant value.

This is mainly useful for better constant sharing.
We were casting the LoadSDNode from the wrong node in the base pointer uses list, meaning the ptr/chain comparisons were comparing against themselves.
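In code terms, a sketch of the one-line correction described above:

// Cast the user currently being examined, not the original narrow load N;
// otherwise UserLd is just N and the ptr/chain tests trivially pass.
auto *UserLd = dyn_cast<LoadSDNode>(User); // previously: dyn_cast<LoadSDNode>(N)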
Local branch amd-gfx df4f507: Merged main:0bc7cd4d5122 into amd-gfx:94601047c932
Remote branch main 381efa4: Revert rG67275263b3b781a "[X86] X86DAGToDAGISel - attempt to merge XMM/YMM loads with YMM/ZMM loads of the same ptr (llvm#73126)"