[mlir] adapt sm_90 integration test mbarrier.group #67423

Merged 1 commit on Sep 26, 2023
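
The change is mechanical across the three integration tests in the diff below: the barrier type is renamed from !nvgpu.mbarrier.barrier to !nvgpu.mbarrier.group, and every mbarrier operation (init, arrive.expect_tx, try_wait.parity) as well as nvgpu.tma.async.load now selects a barrier within that group by index. A minimal before/after sketch distilled from the diff; %barrier, %numThreads, and %c0 are illustrative SSA names rather than the exact values used in the tests:

// Before: ops act directly on a single mbarrier object.
!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
%barrier = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %barrier, %numThreads : !barrierType

// After: the type models a group of mbarriers; ops take an index into the group
// (%c0 selects the first barrier, matching the tests below).
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
%barrier = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %barrier[%c0], %numThreads : !barrierType
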
@@ -35,7 +35,7 @@
// |-------------------------------|


-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token

!lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {

// Step 6. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
-nvgpu.mbarrier.init %9, %5 : !barrierType
+nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index


// Step 7. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
-nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
-nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
} else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}

// Step 8. Wait until TMA is done
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

// Step 9. Print loaded data in 128b swizzled
scf.if %10 {
@@ -40,7 +40,7 @@
// |-------------------------------|


-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token

!lhs = memref<128x64xf16>
@@ -96,56 +96,50 @@ module @mymod {
memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
}
}

-// Step 2. Print on the host
-%lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
-call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
-%rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
-call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()

-// Step 3. Copy host to device
+// Step 2. Copy host to device
%0 = gpu.wait async
%d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
%d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
%1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
%2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs

-// Step 4. Create TMA tensor descriptor
+// Step 3. Create TMA tensor descriptor
%d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
%d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>

%d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
%d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap

-// Step 5. Launch a GPU kernel
+// Step 4. Launch a GPU kernel
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
%5 = gpu.block_dim x
%6 = gpu.thread_id x
%lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs
%rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
%rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>

-// Step 6. Initialize the mbarrier
+// Step 5. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
-nvgpu.mbarrier.init %9, %5 : !barrierType
+nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index


-// Step 7. First thread does TMA load
+// Step 6. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
-nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
-nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
-nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
-nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
} else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}

-// Step 8. Wait until TMA is done
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+// Step 7. Wait until TMA is done
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType

-// Step 9. Print loaded data in 128b swizzled
+// Step 8. Print loaded data in 128b swizzled
scf.if %10 {
gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
}
gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
}
+gpu.barrier
gpu.terminator
}
return
@@ -39,6 +39,7 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s


// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
Expand Down Expand Up @@ -87,21 +88,21 @@ module @mymod {
%7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
%8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
%9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
-nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
gpu.barrier
%10 = arith.cmpi eq, %6, %c0 : index
scf.if %10 {
-nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
%11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
%12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
-nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
-nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
} else {
-nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
}
-nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
scf.if %10 {
%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>