From cc3d2533cc2e4ea06981b86ede5087fbf801e789 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Mon, 16 Oct 2023 16:18:27 +0200 Subject: [PATCH 1/6] [AMDGPU] Add i1 mul patterns (#67291) i1 muls can sometimes happen after SCEV. They resulted in ISel failures because we were missing the patterns for them. Solves SWDEV-423354 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 + llvm/test/CodeGen/AMDGPU/mul.ll | 403 +++++++++++++++++----- 2 files changed, 328 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cd849560feac2..9c5b166c96522 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -769,6 +769,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // extract of relevant bits. setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); + setOperationAction(ISD::MUL, MVT::i1, Promote); + setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, ISD::SUB, diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b4e9376d82777..da7645d5011fc 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1059,6 +1059,255 @@ entry: ret void } +define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { +; SI-LABEL: s_mul_i1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_mul_i1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s4, s[0:1], 0x70 +; VI-NEXT: s_load_dword s5, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_mul_i1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_mul_i1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_mul_i1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; EG-LABEL: s_mul_i1: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: 
MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, 1, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +entry: + %mul = mul i1 %a, %b + store i1 %mul, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_mul_i1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_lo_u32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_mul_i1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, 
s[8:11], 0 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_mul_i1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_mul_i1: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_mul_i1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0 +; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; EG-LABEL: v_mul_i1: +; EG: ; %bb.0: ; %entry +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, 1, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +entry: + %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 + %a = load i1, ptr addrspace(1) %in + %b = load i1, ptr addrspace(1) %b_ptr + %result = mul i1 %a, %b + store i1 %result, ptr addrspace(1) %out + ret void +} + ; A standard 64-bit multiply. The expansion should be around 6 instructions. 
; It would be difficult to match the expansion correctly without writing ; a really complicated list of FileCheck expressions. I don't want @@ -1213,7 +1462,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 ; SI-NEXT: v_mul_lo_u32 v3, v3, v0 ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1367,30 +1616,30 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_cbranch_scc0 .LBB11_2 +; SI-NEXT: s_cbranch_scc0 .LBB13_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mul_i32 s6, s2, s3 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB11_3 -; SI-NEXT: .LBB11_2: +; SI-NEXT: s_branch .LBB13_3 +; SI-NEXT: .LBB13_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: .LBB11_3: ; %Flow +; SI-NEXT: .LBB13_3: ; %Flow ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB11_5 +; SI-NEXT: s_cbranch_vccnz .LBB13_5 ; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: s_branch .LBB11_6 -; SI-NEXT: .LBB11_5: +; SI-NEXT: s_branch .LBB13_6 +; SI-NEXT: .LBB13_5: ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: .LBB11_6: ; %endif +; SI-NEXT: .LBB13_6: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1402,18 +1651,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_cbranch_scc0 .LBB11_2 +; VI-NEXT: s_cbranch_scc0 .LBB13_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mul_i32 s6, s2, s3 ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: s_branch .LBB11_3 -; VI-NEXT: .LBB11_2: +; VI-NEXT: s_branch .LBB13_3 +; VI-NEXT: .LBB13_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: .LBB11_3: ; %Flow +; VI-NEXT: .LBB13_3: ; %Flow ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: s_cbranch_vccnz .LBB11_5 +; VI-NEXT: s_cbranch_vccnz .LBB13_5 ; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1421,10 +1670,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_branch .LBB11_6 -; VI-NEXT: .LBB11_5: +; VI-NEXT: s_branch .LBB13_6 +; VI-NEXT: .LBB13_5: ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: .LBB11_6: ; %endif +; VI-NEXT: .LBB13_6: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1437,18 +1686,18 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: s_mul_i32 s6, s2, s3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_branch .LBB11_3 -; GFX9-NEXT: .LBB11_2: +; GFX9-NEXT: s_branch .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 -; GFX9-NEXT: .LBB11_3: ; %Flow +; GFX9-NEXT: .LBB13_3: ; %Flow ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX9-NEXT: s_cbranch_vccnz .LBB11_5 +; GFX9-NEXT: 
s_cbranch_vccnz .LBB13_5 ; GFX9-NEXT: ; %bb.4: ; %if ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1456,10 +1705,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GFX9-NEXT: s_branch .LBB11_6 -; GFX9-NEXT: .LBB11_5: +; GFX9-NEXT: s_branch .LBB13_6 +; GFX9-NEXT: .LBB13_5: ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: .LBB11_6: ; %endif +; GFX9-NEXT: .LBB13_6: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -1473,17 +1722,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX10-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_mul_i32 s5, s2, s3 -; GFX10-NEXT: s_branch .LBB11_3 -; GFX10-NEXT: .LBB11_2: +; GFX10-NEXT: s_branch .LBB13_3 +; GFX10-NEXT: .LBB13_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 -; GFX10-NEXT: .LBB11_3: ; %Flow +; GFX10-NEXT: .LBB13_3: ; %Flow ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_vccnz .LBB11_5 +; GFX10-NEXT: s_cbranch_vccnz .LBB13_5 ; GFX10-NEXT: ; %bb.4: ; %if ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1491,10 +1740,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: s_mov_b32 s5, s3 ; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; GFX10-NEXT: s_branch .LBB11_6 -; GFX10-NEXT: .LBB11_5: +; GFX10-NEXT: s_branch .LBB13_6 +; GFX10-NEXT: .LBB13_5: ; GFX10-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-NEXT: .LBB11_6: ; %endif +; GFX10-NEXT: .LBB13_6: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -1508,17 +1757,17 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB11_2 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_mul_i32 s5, s2, s3 -; GFX11-NEXT: s_branch .LBB11_3 -; GFX11-NEXT: .LBB11_2: +; GFX11-NEXT: s_branch .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 -; GFX11-NEXT: .LBB11_3: ; %Flow +; GFX11-NEXT: .LBB13_3: ; %Flow ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_vccnz .LBB11_5 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 ; GFX11-NEXT: ; %bb.4: ; %if ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 @@ -1526,10 +1775,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_mov_b32 s4, s2 ; GFX11-NEXT: s_mov_b32 s5, s3 ; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_branch .LBB11_6 -; GFX11-NEXT: .LBB11_5: +; GFX11-NEXT: s_branch .LBB13_6 +; GFX11-NEXT: .LBB13_5: ; GFX11-NEXT: v_mov_b32_e32 v0, s5 -; GFX11-NEXT: .LBB11_6: ; %endif +; GFX11-NEXT: .LBB13_6: ; %endif ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -1601,7 +1850,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 ; SI-NEXT: s_and_b64 vcc, exec, s[10:11] -; SI-NEXT: s_cbranch_vccz .LBB12_4 +; SI-NEXT: s_cbranch_vccz .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -1612,22 +1861,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: v_add_i32_e32 v1, vcc, 
s5, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz .LBB12_3 -; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_cbranch_vccnz .LBB14_3 +; SI-NEXT: .LBB14_2: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: .LBB12_3: ; %endif +; SI-NEXT: .LBB14_3: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB12_4: +; SI-NEXT: .LBB14_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB12_2 +; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry @@ -1635,7 +1884,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-NEXT: s_cbranch_scc0 .LBB12_4 +; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 @@ -1644,22 +1893,22 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_mul_i32 s4, s5, s6 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VI-NEXT: s_cbranch_vccnz .LBB12_3 -; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_cbranch_vccnz .LBB14_3 +; VI-NEXT: .LBB14_2: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; VI-NEXT: .LBB12_3: ; %endif +; VI-NEXT: .LBB14_3: ; %endif ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB12_4: +; VI-NEXT: .LBB14_4: ; VI-NEXT: ; 
implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB12_2 +; VI-NEXT: s_branch .LBB14_2 ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry @@ -1667,7 +1916,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX9-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: s_mul_i32 s7, s4, s7 ; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 @@ -1676,21 +1925,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_add_i32 s5, s7, s5 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX9-NEXT: s_cbranch_vccnz .LBB12_4 -; GFX9-NEXT: .LBB12_2: ; %if +; GFX9-NEXT: s_cbranch_vccnz .LBB14_4 +; GFX9-NEXT: .LBB14_2: ; %if ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: s_branch .LBB12_5 -; GFX9-NEXT: .LBB12_3: +; GFX9-NEXT: s_branch .LBB14_5 +; GFX9-NEXT: .LBB14_3: ; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB12_2 -; GFX9-NEXT: .LBB12_4: +; GFX9-NEXT: s_branch .LBB14_2 +; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: .LBB12_5: ; %endif +; GFX9-NEXT: .LBB14_5: ; %endif ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1702,7 +1951,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX10-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_mul_i32 s7, s4, s7 ; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 @@ -1711,22 +1960,22 @@ define 
amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: s_mul_i32 s4, s4, s6 ; GFX10-NEXT: s_add_i32 s5, s7, s5 ; GFX10-NEXT: s_mov_b32 s6, 0 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 -; GFX10-NEXT: .LBB12_2: ; %if +; GFX10-NEXT: s_cbranch_execnz .LBB14_4 +; GFX10-NEXT: .LBB14_2: ; %if ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: s_mov_b32 s5, s3 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_branch .LBB12_5 -; GFX10-NEXT: .LBB12_3: +; GFX10-NEXT: s_branch .LBB14_5 +; GFX10-NEXT: .LBB14_3: ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX10-NEXT: s_branch .LBB12_2 -; GFX10-NEXT: .LBB12_4: +; GFX10-NEXT: s_branch .LBB14_2 +; GFX10-NEXT: .LBB14_4: ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: .LBB12_5: ; %endif +; GFX10-NEXT: .LBB14_5: ; %endif ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1738,7 +1987,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX11-NEXT: s_cbranch_scc0 .LBB14_3 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_mul_i32 s7, s4, s7 ; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 @@ -1747,21 +1996,21 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_mul_i32 s4, s4, s6 ; GFX11-NEXT: s_add_i32 s5, s7, s5 ; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: .LBB12_2: ; %if +; GFX11-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-NEXT: .LBB14_2: ; %if ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s4, s2 ; GFX11-NEXT: s_mov_b32 s5, s3 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 -; 
GFX11-NEXT: s_branch .LBB12_5 -; GFX11-NEXT: .LBB12_3: +; GFX11-NEXT: s_branch .LBB14_5 +; GFX11-NEXT: .LBB14_3: ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB12_2 -; GFX11-NEXT: .LBB12_4: +; GFX11-NEXT: s_branch .LBB14_2 +; GFX11-NEXT: .LBB14_4: ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: .LBB12_5: ; %endif +; GFX11-NEXT: .LBB14_5: ; %endif ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) From 97217d188469c78d69b65059cabc123e847a2c66 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Sat, 14 Oct 2023 17:27:37 -0400 Subject: [PATCH 2/6] [mlir] Fix '-Wunused' warning. NFC --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index d070e42ac0c7d..e3562049cd81c 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -991,7 +991,7 @@ FailureOr ModuleImport::convertConstant(llvm::Constant *constant) { } // Convert none token constants. 
- if (auto *noneToken = dyn_cast(constant)) { + if (isa(constant)) { return builder.create(loc).getResult(); } From e9c101a7533a829f48678589c7382d4c21c2eb1b Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 16 Oct 2023 17:08:12 +0200 Subject: [PATCH 3/6] [libc++] Add missing <__availability> include --- libcxx/include/sstream | 1 + 1 file changed, 1 insertion(+) diff --git a/libcxx/include/sstream b/libcxx/include/sstream index 47c2d0553a57c..7db5409871873 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -267,6 +267,7 @@ typedef basic_stringstream wstringstream; */ #include <__assert> // all public C++ headers provide the assertion handler +#include <__availability> #include <__config> #include <__fwd/sstream.h> #include <__utility/swap.h> From 903faefc14eb838a20c0526a14d44dbb0fcea85b Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Mon, 16 Oct 2023 15:14:48 +0000 Subject: [PATCH 4/6] [Flang][OpenMP] Port three tests to HLFIR flow These are copies of tests from flang/test/Lower/OpenMP/FIR --- .../Lower/OpenMP/firstprivate-commonblock.f90 | 34 ++ flang/test/Lower/OpenMP/unstructured.f90 | 348 ++++++++++++++++++ flang/test/Lower/OpenMP/wsloop.f90 | 75 ++++ 3 files changed, 457 insertions(+) create mode 100644 flang/test/Lower/OpenMP/firstprivate-commonblock.f90 create mode 100644 flang/test/Lower/OpenMP/unstructured.f90 create mode 100644 flang/test/Lower/OpenMP/wsloop.f90 diff --git a/flang/test/Lower/OpenMP/firstprivate-commonblock.f90 b/flang/test/Lower/OpenMP/firstprivate-commonblock.f90 new file mode 100644 index 0000000000000..ff064a74d491a --- /dev/null +++ b/flang/test/Lower/OpenMP/firstprivate-commonblock.f90 @@ -0,0 +1,34 @@ +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +!CHECK: func.func @_QPfirstprivate_common() { +!CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref> +!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[val_c0:.*]] = arith.constant 0 : index +!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c0]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref) -> !fir.ref +!CHECK: %[[VAL_3_DECL:.*]]:2 = hlfir.declare %[[val_3]] {uniq_name = "_QFfirstprivate_commonEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref>) -> !fir.ref> +!CHECK: %[[val_c4:.*]] = arith.constant 4 : index +!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref) -> !fir.ref +!CHECK: %[[VAL_6_DECL:.*]]:2 = hlfir.declare %[[val_6]] {uniq_name = "_QFfirstprivate_commonEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: omp.parallel { +!CHECK: %[[val_7:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFfirstprivate_commonEx"} +!CHECK: %[[VAL_7_DECL:.*]]:2 = hlfir.declare %[[val_7]] {uniq_name = "_QFfirstprivate_commonEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[val_8:.*]] = fir.load %[[VAL_3_DECL]]#1 : !fir.ref +!CHECK: fir.store %[[val_8]] to %[[VAL_7_DECL]]#1 : !fir.ref +!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFfirstprivate_commonEy"} +!CHECK: %[[VAL_9_DECL:.*]]:2 = hlfir.declare %[[val_9]] {uniq_name = "_QFfirstprivate_commonEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[val_10:.*]] = fir.load %[[VAL_6_DECL]]#1 : !fir.ref +!CHECK: fir.store %[[val_10]] to %[[VAL_9_DECL]]#1 : !fir.ref +!CHECK: omp.terminator +!CHECK: } +!CHECK: return +!CHECK: } + +subroutine firstprivate_common + common /c/ x, y + real x, y + !$omp parallel firstprivate(/c/) + !$omp end parallel +end subroutine 
diff --git a/flang/test/Lower/OpenMP/unstructured.f90 b/flang/test/Lower/OpenMP/unstructured.f90 new file mode 100644 index 0000000000000..e5bf980ce90fd --- /dev/null +++ b/flang/test/Lower/OpenMP/unstructured.f90 @@ -0,0 +1,348 @@ +! Test unstructured code adjacent to and inside OpenMP constructs. + +! RUN: bbc %s -fopenmp -emit-hlfir -o "-" | FileCheck %s + +! CHECK-LABEL: func @_QPss1{{.*}} { +! CHECK: br ^bb1 +! CHECK: ^bb1: // 2 preds: ^bb0, ^bb4 +! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb5 +! CHECK: ^bb2: // pred: ^bb1 +! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb4 +! CHECK: ^bb4: // pred: ^bb2 +! CHECK: fir.call @_FortranAioBeginExternalListOutput +! CHECK: br ^bb1 +! CHECK: ^bb5: // 2 preds: ^bb1, ^bb3 +! CHECK: omp.master { +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: omp.terminator +! CHECK: } +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: } +subroutine ss1(n) ! unstructured code followed by a structured OpenMP construct + do i = 1, 3 + if (i .eq. n) exit + print*, 'ss1-A', i + enddo + !$omp master + print*, 'ss1-B', i + !$omp end master + print* +end + +! CHECK-LABEL: func @_QPss2{{.*}} { +! CHECK: omp.master { +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: br ^bb1 +! CHECK: ^bb1: // 2 preds: ^bb0, ^bb4 +! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb5 +! CHECK: ^bb2: // pred: ^bb1 +! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb4 +! CHECK: ^bb3: // pred: ^bb2 +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: br ^bb1 +! CHECK: ^bb5: // 2 preds: ^bb1, ^bb3 +! CHECK: omp.terminator +! CHECK: } +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: } +subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct + !$omp master + print*, 'ss2-A', n + do i = 1, 3 + if (i .eq. n) exit + print*, 'ss2-B', i + enddo + !$omp end master + print*, 'ss2-C', i + print* +end + +! CHECK-LABEL: func @_QPss3{{.*}} { +! CHECK: omp.parallel { +! 
CHECK: %[[ALLOCA_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned} +! CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_K]] {uniq_name = "_QFss3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {{{.*}}, pinned} +! CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFss3Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {{{.*}}, pinned} +! CHECK: %[[OMP_LOOP_K_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFss3Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: br ^bb1 +! CHECK: ^bb1: // 2 preds: ^bb0, ^bb3 +! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb4 +! CHECK: ^bb2: // pred: ^bb1 +! CHECK: omp.wsloop for (%[[ARG1:.*]]) : {{.*}} { +! CHECK: fir.store %[[ARG1]] to %[[OMP_LOOP_K_DECL]]#1 : !fir.ref +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: %[[LOAD_1:.*]] = fir.load %[[OMP_LOOP_K_DECL]]#0 : !fir.ref +! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_1]]) +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.wsloop for (%[[ARG2:.*]]) : {{.*}} { +! CHECK: fir.store %[[ARG2]] to %[[OMP_LOOP_J_DECL]]#1 : !fir.ref +! CHECK: br ^bb1 +! CHECK: ^bb2: // 2 preds: ^bb1, ^bb5 +! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb6 +! CHECK: ^bb3: // pred: ^bb2 +! CHECK: cond_br %{{[0-9]*}}, ^bb4, ^bb5 +! CHECK: ^bb4: // pred: ^bb3 +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: %[[LOAD_2:.*]] = fir.load %[[K_DECL]]#0 : !fir.ref +! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_2]]) +! CHECK: br ^bb2 +! CHECK: ^bb6: // 2 preds: ^bb2, ^bb4 +! CHECK: omp.yield +! CHECK: } +! CHECK: br ^bb1 +! CHECK: ^bb4: // pred: ^bb1 +! CHECK: omp.terminator +! CHECK: } +! CHECK: } +subroutine ss3(n) ! nested unstructured OpenMP constructs + !$omp parallel + do i = 1, 3 + !$omp do + do k = 1, 3 + print*, 'ss3-A', k + enddo + !$omp end do + !$omp do + do j = 1, 3 + do k = 1, 3 + if (k .eq. 
n) exit + print*, 'ss3-B', k + enddo + enddo + !$omp end do + enddo + !$omp end parallel +end + +! CHECK-LABEL: func @_QPss4{{.*}} { +! CHECK: omp.parallel { +! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {{{.*}}, pinned} +! CHECK: %[[OMP_LOOP_J_DECL:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFss4Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: omp.wsloop for (%[[ARG:.*]]) : {{.*}} { +! CHECK: fir.store %[[ARG]] to %[[OMP_LOOP_J_DECL]]#1 : !fir.ref +! CHECK: %[[COND:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}} +! CHECK: %[[COND_XOR:.*]] = arith.xori %[[COND]], %{{.*}} +! CHECK: fir.if %[[COND_XOR]] { +! CHECK: @_FortranAioBeginExternalListOutput +! CHECK: %[[LOAD:.*]] = fir.load %[[OMP_LOOP_J_DECL]]#0 : !fir.ref +! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]]) +! CHECK: } else { +! CHECK: } +! CHECK-NEXT: omp.yield +! CHECK-NEXT: } +! CHECK: omp.terminator +! CHECK-NEXT:} +subroutine ss4(n) ! CYCLE in OpenMP wsloop constructs + !$omp parallel + do i = 1, 3 + !$omp do + do j = 1, 3 + if (j .eq. n) cycle + print*, 'ss4', j + enddo + !$omp end do + enddo + !$omp end parallel +end + +! CHECK-LABEL: func @_QPss5() { +! CHECK: omp.parallel { +! CHECK: omp.wsloop {{.*}} { +! CHECK: br ^[[BB1:.*]] +! CHECK: ^[[BB1]]: +! CHECK: br ^[[BB2:.*]] +! CHECK: ^[[BB2]]: +! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]] +! CHECK: ^[[BB3]]: +! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB3:.*]] +! CHECK: ^[[BB4]]: +! CHECK: br ^[[BB6]] +! CHECK: ^[[BB3]]: +! CHECK: br ^[[BB2]] +! CHECK: ^[[BB6]]: +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +subroutine ss5() ! EXIT inside OpenMP wsloop (inside parallel) + integer :: x + !$omp parallel private(x) + !$omp do + do j = 1, 3 + x = j * i + do k = 1, 3 + if (k .eq. n) exit + x = k + x = x + k + enddo + x = j - 222 + enddo + !$omp end do + !$omp end parallel +end + +! CHECK-LABEL: func @_QPss6() { +! CHECK: omp.parallel { +! CHECK: br ^[[BB1_OUTER:.*]] +! CHECK: ^[[BB1_OUTER]]: +! 
CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]] +! CHECK: ^[[BB2_OUTER]]: +! CHECK: omp.wsloop {{.*}} { +! CHECK: br ^[[BB1:.*]] +! CHECK: ^[[BB1]]: +! CHECK: br ^[[BB2:.*]] +! CHECK: ^[[BB2]]: +! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]] +! CHECK: ^[[BB3]]: +! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]] +! CHECK: ^[[BB4]]: +! CHECK: br ^[[BB6]] +! CHECK: ^[[BB5]] +! CHECK: br ^[[BB2]] +! CHECK: ^[[BB6]]: +! CHECK: omp.yield +! CHECK: } +! CHECK: br ^[[BB1_OUTER]] +! CHECK: ^[[BB3_OUTER]]: +! CHECK: omp.terminator +! CHECK: } +subroutine ss6() ! EXIT inside OpenMP wsloop in a do loop (inside parallel) + integer :: x + !$omp parallel private(x) + do i = 1, 3 + !$omp do + do j = 1, 3 + x = j * i + do k = 1, 3 + if (k .eq. n) exit + x = k + x = x + k + enddo + x = j - 222 + enddo + !$omp end do + enddo + !$omp end parallel +end + +! CHECK-LABEL: func @_QPss7() { +! CHECK: br ^[[BB1_OUTER:.*]] +! CHECK: ^[[BB1_OUTER]]: +! CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]] +! CHECK-NEXT: ^[[BB2_OUTER:.*]]: +! CHECK: omp.parallel { +! CHECK: omp.wsloop {{.*}} { +! CHECK: br ^[[BB1:.*]] +! CHECK-NEXT: ^[[BB1]]: +! CHECK: br ^[[BB2:.*]] +! CHECK-NEXT: ^[[BB2]]: +! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]] +! CHECK-NEXT: ^[[BB3]]: +! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]] +! CHECK-NEXT: ^[[BB4]]: +! CHECK: br ^[[BB6]] +! CHECK-NEXT: ^[[BB5]]: +! CHECK: br ^[[BB2]] +! CHECK-NEXT: ^[[BB6]]: +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: br ^[[BB1_OUTER]] +! CHECK-NEXT: ^[[BB3_OUTER]]: +! CHECK-NEXT: return +subroutine ss7() ! EXIT inside OpenMP parallel do (inside do loop) + integer :: x + do i = 1, 3 + !$omp parallel do private(x) + do j = 1, 3 + x = j * i + do k = 1, 3 + if (k .eq. n) exit + x = k + x = x + k + enddo + enddo + !$omp end parallel do + enddo +end + +! CHECK-LABEL: func @_QPss8() { +! CHECK: omp.parallel { +! CHECK: omp.wsloop {{.*}} { +! CHECK: br ^[[BB1:.*]] +! 
CHECK-NEXT: ^[[BB1]]: +! CHECK: br ^[[BB2:.*]] +! CHECK: ^[[BB2]]: +! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]] +! CHECK: ^[[BB3]]: +! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]] +! CHECK: ^[[BB4]]: +! CHECK-NEXT: br ^[[BB6]] +! CHECK: ^[[BB5]]: +! CHECK: br ^[[BB2]] +! CHECK-NEXT: ^[[BB6]]: +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +subroutine ss8() ! EXIT inside OpenMP parallel do + integer :: x + !$omp parallel do private(x) + do j = 1, 3 + x = j * i + do k = 1, 3 + if (k .eq. n) exit + x = k + x = x + k + enddo + enddo + !$omp end parallel do +end + +! CHECK-LABEL: func @_QPss9() { +! CHECK: omp.parallel { +! CHECK-NEXT: omp.parallel { +! CHECK: br ^[[BB1:.*]] +! CHECK: ^[[BB1]]: +! CHECK: cond_br %{{.*}}, ^[[BB2:.*]], ^[[BB5:.*]] +! CHECK-NEXT: ^[[BB2]]: +! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB4:.*]] +! CHECK-NEXT: ^[[BB3]]: +! CHECK-NEXT: br ^[[BB5]] +! CHECK-NEXT: ^[[BB4]]: +! CHECK: br ^[[BB1]] +! CHECK-NEXT: ^[[BB5]]: +! CHECK: omp.terminator +! CHECK-NEXT: } +! CHECK: omp.terminator +! CHECK-NEXT } +! CHECK: } +subroutine ss9() ! EXIT inside OpenMP parallel (inside parallel) + integer :: x + !$omp parallel + !$omp parallel private(x) + do k = 1, 3 + if (k .eq. n) exit + x = k + x = x + k + end do + !$omp end parallel + !$omp end parallel +end + +! CHECK-LABEL: func @_QQmain +program p + call ss1(2) + call ss2(2) + call ss3(2) + call ss4(2) + call ss5() + call ss6() + call ss7() + call ss8() + call ss9() +end diff --git a/flang/test/Lower/OpenMP/wsloop.f90 b/flang/test/Lower/OpenMP/wsloop.f90 new file mode 100644 index 0000000000000..4068f715c3e18 --- /dev/null +++ b/flang/test/Lower/OpenMP/wsloop.f90 @@ -0,0 +1,75 @@ +! This test checks lowering of OpenMP DO Directive (Worksharing). + +! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func @_QPsimple_loop() +subroutine simple_loop + integer :: i + ! CHECK: omp.parallel + !$OMP PARALLEL + ! 
CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned} + ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 + ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 + ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 + ! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) + !$OMP DO + do i=1, 9 + ! CHECK: fir.store %[[I]] to %[[IV_DECL:.*]]#1 : !fir.ref + ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref + ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 + print*, i + end do + ! CHECK: omp.yield + !$OMP END DO + ! CHECK: omp.terminator + !$OMP END PARALLEL +end subroutine + +!CHECK-LABEL: func @_QPsimple_loop_with_step() +subroutine simple_loop_with_step + integer :: i + ! CHECK: omp.parallel + !$OMP PARALLEL + ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned} + ! CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loop_with_stepEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 + ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 + ! CHECK: %[[WS_STEP:.*]] = arith.constant 2 : i32 + ! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) + ! CHECK: fir.store %[[I]] to %[[IV_DECL]]#1 : !fir.ref + ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref + !$OMP DO + do i=1, 9, 2 + ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 + print*, i + end do + ! CHECK: omp.yield + !$OMP END DO + ! CHECK: omp.terminator + !$OMP END PARALLEL +end subroutine + +!CHECK-LABEL: func @_QPloop_with_schedule_nowait() +subroutine loop_with_schedule_nowait + integer :: i + ! CHECK: omp.parallel + !$OMP PARALLEL + ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned} + ! 
CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[ALLOCA_IV]] {uniq_name = "_QFloop_with_schedule_nowaitEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32 + ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32 + ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32 + ! CHECK: omp.wsloop schedule(runtime) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) + !$OMP DO SCHEDULE(runtime) + do i=1, 9 + ! CHECK: fir.store %[[I]] to %[[IV_DECL]]#1 : !fir.ref + ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_DECL]]#0 : !fir.ref + ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1 + print*, i + end do + ! CHECK: omp.yield + !$OMP END DO NOWAIT + ! CHECK: omp.terminator + !$OMP END PARALLEL +end subroutine From 144c5b6d58803a2d4a0fe92a0fe331ff0347dc3b Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 16 Oct 2023 15:25:44 +0000 Subject: [PATCH 5/6] [compiler-rt][hwasan] Disable deep-recursion.c test on AArch64 Linux The test program occasionally fails to detect the fault as it should. See https://github.com/llvm/llvm-project/issues/69221. --- compiler-rt/test/hwasan/TestCases/deep-recursion.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/hwasan/TestCases/deep-recursion.c b/compiler-rt/test/hwasan/TestCases/deep-recursion.c index fde8a0db5ad15..39902d072a0d3 100644 --- a/compiler-rt/test/hwasan/TestCases/deep-recursion.c +++ b/compiler-rt/test/hwasan/TestCases/deep-recursion.c @@ -17,6 +17,9 @@ // Stack histories are currently not recorded on x86. // XFAIL: target=x86_64{{.*}} +// Flaky on AArch64 Linux, see https://github.com/llvm/llvm-project/issues/69221. +// UNSUPPORTED: target=aarch64-linux{{.*}} + #include // At least -O1 is needed for this function to not have a stack frame on // AArch64. 
From 6ade5183232dc1398205d7c9dbe21243b2560837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 16 Oct 2023 08:52:02 -0700 Subject: [PATCH 6/6] [flang][openacc][NFC] Issue better error message when directive is wrong (#69034) --- flang/lib/Parser/openacc-parsers.cpp | 32 ++++++++++++---------- flang/test/Semantics/OpenACC/acc-error.f90 | 15 ++++++++++ 2 files changed, 33 insertions(+), 14 deletions(-) create mode 100644 flang/test/Semantics/OpenACC/acc-error.f90 diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp index 09b30e679de0e..75aeffd29f92f 100644 --- a/flang/lib/Parser/openacc-parsers.cpp +++ b/flang/lib/Parser/openacc-parsers.cpp @@ -150,11 +150,12 @@ TYPE_PARSER(sourced(construct( TYPE_PARSER(construct( sourced(Parser{}), Parser{})) -TYPE_PARSER(construct(startAccLine >> "END LOOP"_tok)) +TYPE_PARSER(construct("END LOOP"_tok)) TYPE_PARSER(construct( sourced(Parser{} / endAccLine), - maybe(Parser{}), maybe(Parser{} / endAccLine))) + maybe(Parser{}), + maybe(startAccLine >> Parser{} / endAccLine))) // 2.15.1 Routine directive TYPE_PARSER(sourced(construct(verbatim("ROUTINE"_tok), @@ -227,22 +228,25 @@ TYPE_PARSER(construct( TYPE_PARSER(construct( sourced(Parser{}), Parser{})) -TYPE_PARSER( - startAccLine >> first(sourced(construct( - Parser{})), - sourced(construct( - Parser{})))) +TYPE_PARSER(startAccLine >> + withMessage("expected OpenACC directive"_err_en_US, + first(sourced(construct( + Parser{})), + sourced(construct( + Parser{}))))) // OpenACC constructs TYPE_CONTEXT_PARSER("OpenACC construct"_en_US, startAccLine >> - first(construct(Parser{}), - construct(Parser{}), - construct(Parser{}), - construct(Parser{}), - construct(Parser{}), - construct(Parser{}), - construct(Parser{}))) + withMessage("expected OpenACC directive"_err_en_US, + first(construct(Parser{}), + 
construct(Parser{}), + construct(Parser{}), + construct( + Parser{}), + construct(Parser{}), + construct(Parser{}), + construct(Parser{})))) TYPE_PARSER(startAccLine >> sourced(construct(sourced("END"_tok >> diff --git a/flang/test/Semantics/OpenACC/acc-error.f90 b/flang/test/Semantics/OpenACC/acc-error.f90 new file mode 100644 index 0000000000000..b1c3b77847429 --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-error.f90 @@ -0,0 +1,15 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenacc + +! Check parser specific error for OpenACC + + +subroutine test(a, n) + integer :: a(n) + !ERROR: expected OpenACC directive + !$acc p + integer :: i,j + + i = 0 + !ERROR: expected OpenACC directive + !$acc p + end subroutine