[mlir] adapt sm_90 integration test mbarrier.group
#67423
Merged
Conversation
llvm#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.
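For context, a minimal before/after sketch of the syntax change (the SSA names `%bar`, `%numThreads`, and `%c0` are illustrative; the exact forms appear in the diff below):

```mlir
// Before: single-barrier type; ops address the barrier value directly.
//   !barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
//   nvgpu.mbarrier.init %bar, %numThreads : !barrierType

// After: mbarrier group type; ops take an index into the group (here %c0, i.e. barrier 0).
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
%bar = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %bar[%c0], %numThreads : !barrierType
```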
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Changes: #65951 improved mbarrier support. This PR adapts the integration tests to the new usage.

Full diff: https://github.com/llvm/llvm-project/pull/67423.diff

3 Files Affected:
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index c9538ea3e6af531..aa11773defdb15f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -35,7 +35,7 @@
// |-------------------------------|
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
!lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {
// Step 6. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
- nvgpu.mbarrier.init %9, %5 : !barrierType
+ nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index
// Step 7. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
- nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
- nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+ nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}
// Step 8. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
// Step 9. Print loaded data in 128b swizzled
scf.if %10 {
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index c75be107ca4c276..5c465f7de8abdb5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -40,7 +40,7 @@
// |-------------------------------|
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
!lhs = memref<128x64xf16>
@@ -96,28 +96,22 @@ module @mymod {
memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
}
}
-
- // Step 2. Print on the host
- %lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
- call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
- %rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
- call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()
- // Step 3. Copy host to device
+ // Step 2. Copy host to device
%0 = gpu.wait async
%d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
%d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
%1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
%2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs
- // Step 4. Create TMA tensor descriptor
+ // Step 3. Create TMA tensor descriptor
%d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
%d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>
%d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
%d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap
- // Step 5. Launch a GPU kernel
+ // Step 4. Launch a GPU kernel
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
%5 = gpu.block_dim x
%6 = gpu.thread_id x
@@ -125,27 +119,27 @@ module @mymod {
%rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
%rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
- // Step 6. Initialize the mbarrier
+ // Step 5. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
- nvgpu.mbarrier.init %9, %5 : !barrierType
+ nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index
- // Step 7. First thread does TMA load
+ // Step 6. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
- nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
- nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
- nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
- nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+ nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+ nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+ nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}
- // Step 8. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+ // Step 7. Wait until TMA is done
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
- // Step 9. Print loaded data in 128b swizzled
+ // Step 8. Print loaded data in 128b swizzled
scf.if %10 {
gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
}
gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
}
+ gpu.barrier
gpu.terminator
}
return
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 11cf63548a551bb..5331ebb87d37de5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -39,6 +39,7 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
+
// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
%7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
%8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
%9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
- nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
gpu.barrier
%10 = arith.cmpi eq, %6, %c0 : index
scf.if %10 {
- nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
%11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
%12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
- nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
- nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+ nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+ nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
}
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
scf.if %10 {
%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
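Condensed from the tests above, the per-kernel synchronization pattern with the indexed mbarrier looks roughly as follows (illustrative SSA names; op forms and constants mirror the diff):

```mlir
// Thread 0 issues the TMA copy into shared memory and declares the expected
// transaction byte count on mbarrier 0; all other threads arrive expecting 0 bytes.
scf.if %isThread0 {
  nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %bar[%c0] to %lhsShmem
      : !lhsTensorMap, !barrierType -> !shmemlhs
  nvgpu.mbarrier.arrive.expect_tx %bar[%c0], %c8192 : !barrierType
} else {
  nvgpu.mbarrier.arrive.expect_tx %bar[%c0], %c0 : !barrierType
}
// Every thread then polls mbarrier 0 (phase parity 0, with a retry bound)
// until the asynchronous copy has completed.
nvgpu.mbarrier.try_wait.parity %bar[%c0], %c0, %c10000000 : !barrierType
```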
Guzhu-AMD pushed a commit to GPUOpen-Drivers/llvm-project that referenced this pull request on Sep 28, 2023:
Local branch amd-gfx 0bf8d84 Merged main:a09e32e5fe13 into amd-gfx:2b22973c3979. Remote branch main f9149a3 [mlir] adapt sm_90 integration test `mbarrier.group` (llvm#67423)
legrosbuffle pushed a commit to legrosbuffle/llvm-project that referenced this pull request on Sep 29, 2023:
llvm#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.