
[mlir] adapt sm_90 integration test mbarrier.group #67423

Merged: 1 commit into llvm:main on Sep 26, 2023

Conversation


@grypp grypp commented Sep 26, 2023

#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.
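In short, the `!nvgpu.mbarrier.barrier` type is now `!nvgpu.mbarrier.group`, and the mbarrier ops address an individual barrier in the group with an explicit index. A condensed sketch of the updated pattern, lifted from the tests in the diff below (the SSA values and constants are simply the ones those tests use, not part of the op signatures):

  !barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>

  // Create a barrier group and initialize barrier 0 with the arrival count %5
  // (the block size in these tests).
  %9 = nvgpu.mbarrier.create -> !barrierType
  nvgpu.mbarrier.init %9[%c0], %5 : !barrierType

  // The first thread issues the TMA load onto barrier 0 and signals the
  // expected transaction count.
  nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
  nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType

  // All threads poll barrier 0 until the TMA transfer completes.
  nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

The functional change in each test is this extra `[%c0]` index plus the type rename; the 64x64 test additionally drops the host-side printing and adds a `gpu.barrier` before termination.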


llvmbot commented Sep 26, 2023

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-gpu

Changes

#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.


Full diff: https://github.com/llvm/llvm-project/pull/67423.diff

3 Files Affected:

  • (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir (+6-6)
  • (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir (+16-21)
  • (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir (+7-6)
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index c9538ea3e6af531..aa11773defdb15f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -35,7 +35,7 @@
 // |-------------------------------|
 
 
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token
 
 !lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {
     
       // Step 6. Initialize the mbarrier
       %9 = nvgpu.mbarrier.create -> !barrierType
-      nvgpu.mbarrier.init %9, %5 : !barrierType
+      nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
       %10 = arith.cmpi eq, %6, %c0 : index
       
       
       // Step 7. First thread does TMA load
       scf.if %10 {
         gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
-        nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
-        nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+        nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
       } else {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
       }
 
       // Step 8. Wait until TMA is done
-      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+      nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
 
       // Step 9. Print loaded data in 128b swizzled
       scf.if %10 {        
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index c75be107ca4c276..5c465f7de8abdb5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -40,7 +40,7 @@
 // |-------------------------------|
 
 
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token
 
 !lhs = memref<128x64xf16>
@@ -96,28 +96,22 @@ module @mymod {
         memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
       }
     }
-
-    // Step 2. Print on the host
-    %lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
-    call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
-    %rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
-    call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()
     
-    // Step 3. Copy host to device
+    // Step 2. Copy host to device
     %0 = gpu.wait async
     %d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
     %d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
     %1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
     %2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs
     
-    // Step 4. Create TMA tensor descriptor
+    // Step 3. Create TMA tensor descriptor
     %d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs  to memref<*xf16>
     %d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs  to memref<*xf16>
 
     %d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
     %d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap
 
-    // Step 5. Launch a GPU kernel
+    // Step 4. Launch a GPU kernel
     gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
       %5 = gpu.block_dim  x
       %6 = gpu.thread_id  x
@@ -125,27 +119,27 @@ module @mymod {
       %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
       %rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
     
-      // Step 6. Initialize the mbarrier
+      // Step 5. Initialize the mbarrier
       %9 = nvgpu.mbarrier.create -> !barrierType
-      nvgpu.mbarrier.init %9, %5 : !barrierType
+      nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
       %10 = arith.cmpi eq, %6, %c0 : index
       
       
-      // Step 7. First thread does TMA load
+      // Step 6. First thread does TMA load
       scf.if %10 {
         gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
-        nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
-        nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
-        nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
-        nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+        nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+        nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+        nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
       } else {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
       }
 
-      // Step 8. Wait until TMA is done
-      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+      // Step 7. Wait until TMA is done
+      nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
 
-      // Step 9. Print loaded data in 128b swizzled
+      // Step 8. Print loaded data in 128b swizzled
       scf.if %10 {        
         gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
         scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
         }
         gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
       }
+      gpu.barrier
       gpu.terminator
     }
     return
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 11cf63548a551bb..5331ebb87d37de5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -39,6 +39,7 @@
 // RUN:   --entry-point-result=void \
 // RUN:  | FileCheck %s
 
+
 // CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
 // CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
 // CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
       %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
       %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
       %9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
-      nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
       gpu.barrier
       %10 = arith.cmpi eq, %6, %c0 : index
       scf.if %10 {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
         %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
         gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
         gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
-        nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
-        nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+        nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+        nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
       } else {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
       }
-      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
       scf.if %10 {
         %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>

@grypp grypp merged commit f9149a3 into llvm:main Sep 26, 2023
Guzhu-AMD pushed a commit to GPUOpen-Drivers/llvm-project that referenced this pull request Sep 28, 2023
Local branch amd-gfx 0bf8d84 Merged main:a09e32e5fe13 into amd-gfx:2b22973c3979
Remote branch main f9149a3 [mlir] adapt sm_90 integration test `mbarrier.group` (llvm#67423)
legrosbuffle pushed a commit to legrosbuffle/llvm-project that referenced this pull request Sep 29, 2023
llvm#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.