From 6a1ca7e5a1f444c9ad405cbacdaade2031dbaf2f Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 19 Oct 2023 11:30:24 +0200 Subject: [PATCH] [AMDGPU] PeepholeSDWA: Don't assume inst srcs are registers To fix that ticket we only needed to address the V_LSHLREV_B16 case, but I did it for all insts just in case. Fixes #66899 --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12 ++- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 107 ++++++++++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 97b3161c7f98b..53fc2c0686245 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -546,7 +546,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src1->isReg() || Src1->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || @@ -584,7 +585,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src1->isReg() || Src1->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || @@ -647,7 +649,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src0->isReg() || Src0->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; return std::make_unique( @@ -675,7 +678,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; return std::make_unique( diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 79e96b45a901c..086fab3f52175 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2098,6 +2098,113 @@ bb11: ; preds = %bb10, %bb2 br label %bb1 } +define void @crash_lshlrevb16_not_reg_op() { +; NOSDWA-LABEL: crash_lshlrevb16_not_reg_op: +; NOSDWA: ; %bb.0: ; %bb0 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NOSDWA-NEXT: s_mov_b64 s[4:5], 0 +; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 +; NOSDWA-NEXT: v_lshlrev_b16_e64 v3, 8, 1 +; NOSDWA-NEXT: .LBB22_1: ; %bb1 +; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v2, 0xff +; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_mov_b64 s[4:5], 1 +; NOSDWA-NEXT: v_and_b32_e32 v2, s4, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v2, s6, v2 +; NOSDWA-NEXT: flat_store_byte v[0:1], v2 +; NOSDWA-NEXT: s_mov_b64 vcc, vcc +; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1 +; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock +; NOSDWA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; NOSDWA-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: crash_lshlrevb16_not_reg_op: +; GFX89: ; %bb.0: ; %bb0 +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b64 s[4:5], 0 +; GFX89-NEXT: s_and_b64 vcc, exec, -1 +; GFX89-NEXT: v_lshlrev_b16_e64 v0, 8, 1 +; GFX89-NEXT: .LBB22_1: ; %bb1 +; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX89-NEXT: v_mov_b32_e32 v3, s4 +; GFX89-NEXT: s_lshl_b32 s6, s4, 3 +; GFX89-NEXT: v_mov_b32_e32 v1, s4 +; GFX89-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_mov_b32_e32 v2, s5 +; GFX89-NEXT: s_mov_b64 s[4:5], 1 +; GFX89-NEXT: v_lshrrev_b16_e32 v3, s6, v3 +; GFX89-NEXT: flat_store_byte v[1:2], v3 +; GFX89-NEXT: s_mov_b64 vcc, vcc +; GFX89-NEXT: s_cbranch_vccnz .LBB22_1 +; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: crash_lshlrevb16_not_reg_op: +; GFX9: ; %bb.0: ; %bb0 +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1 +; GFX9-NEXT: s_and_b64 vcc, exec, -1 +; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: .LBB22_1: ; %bb1 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_lshl_b32 s6, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: s_mov_b64 s[4:5], 1 +; GFX9-NEXT: v_lshrrev_b16_e32 v3, s6, v0 +; GFX9-NEXT: flat_store_byte v[1:2], v3 +; GFX9-NEXT: s_mov_b64 vcc, vcc +; GFX9-NEXT: s_cbranch_vccnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: crash_lshlrevb16_not_reg_op: +; GFX10: ; %bb.0: ; %bb0 +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1 +; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: .LBB22_1: ; %bb1 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_lshl_b32 s6, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], 1 +; GFX10-NEXT: flat_store_byte v[1:2], v3 +; GFX10-NEXT: s_cbranch_vccnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + %1 = alloca [2 x i8], align 1, addrspace(5) + %2 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1 + br label %bb0 + +bb0: + store i8 1, ptr addrspace(5) %2, align 1 + br label %bb1 + +bb1: + %3 = phi i64 [ 1, %bb1 ], [ 0, %bb0 ] + %4 = trunc i64 %3 to i32 + %5 = getelementptr i8, ptr addrspace(5) %1, i32 %4 + %6 = load i8, ptr addrspace(5) %5, align 1 + %7 = getelementptr i8, ptr null, i64 %3 + store i8 %6, ptr %7, align 1 + br i1 false, label %bb2, label %bb1 + +bb2: + br label %bb0 +} + declare i32 @llvm.amdgcn.workitem.id.x() attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }