[mlir] adapt sm_90 integration test mbarrier.group (#67423)
#65951 improved mbarrier support by introducing the `!nvgpu.mbarrier.group` type, whose operations take an index into the group. This PR adapts the sm_90 integration tests to the new usage.
grypp authored Sep 26, 2023
1 parent 287f6cd commit f9149a3
Showing 3 changed files with 29 additions and 33 deletions.
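
For reference, a minimal sketch of the indexed mbarrier usage that the hunks below adopt (distilled from the diffs; %bar, %numThreads, %txCount, %phase, and %ticks are placeholder names, not values taken from the tests):

// The barrier type is now a group; each op selects a member barrier by index.
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>

%bar = nvgpu.mbarrier.create -> !barrierType
// Initialize barrier 0 of the group with the expected arrival (thread) count.
nvgpu.mbarrier.init %bar[%c0], %numThreads : !barrierType
// Announce the transaction byte count expected from the pending TMA load.
nvgpu.mbarrier.arrive.expect_tx %bar[%c0], %txCount : !barrierType
// Poll barrier 0 until its phase flips, giving up after %ticks attempts.
nvgpu.mbarrier.try_wait.parity %bar[%c0], %phase, %ticks : !barrierType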
@@ -35,7 +35,7 @@
// |-------------------------------|


!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token

!lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {

// Step 6. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %9, %5 : !barrierType
nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index


// Step 7. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
} else {
nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}

// Step 8. Wait until TMA is done
nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

// Step 9. Print loaded data in 128b swizzled
scf.if %10 {
37 changes: 16 additions & 21 deletions mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -40,7 +40,7 @@
// |-------------------------------|


!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token

!lhs = memref<128x64xf16>
@@ -96,56 +96,50 @@ module @mymod {
memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
}
}

// Step 2. Print on the host
%lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
%rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()

// Step 3. Copy host to device
// Step 2. Copy host to device
%0 = gpu.wait async
%d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
%d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
%1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
%2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs

// Step 4. Create TMA tensor descriptor
// Step 3. Create TMA tensor descriptor
%d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
%d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>

%d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
%d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap

// Step 5. Launch a GPU kernel
// Step 4. Launch a GPU kernel
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
%5 = gpu.block_dim x
%6 = gpu.thread_id x
%lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs
%rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
%rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>

// Step 6. Initialize the mbarrier
// Step 5. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %9, %5 : !barrierType
nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index


// Step 7. First thread does TMA load
// Step 6. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
} else {
nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}

// Step 8. Wait until TMA is done
nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
// Step 7. Wait until TMA is done
nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

// Step 9. Print loaded data in 128b swizzled
// Step 8. Print loaded data in 128b swizzled
scf.if %10 {
gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@
}
gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
}
gpu.barrier
gpu.terminator
}
return
@@ -39,6 +39,7 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s


// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
%7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
%8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
%9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
gpu.barrier
%10 = arith.cmpi eq, %6, %c0 : index
scf.if %10 {
nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
%11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
%12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
} else {
nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
}
nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
scf.if %10 {
%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
