diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index c9538ea3e6af5..aa11773defdb1 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -35,7 +35,7 @@
 // |-------------------------------|
 
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token
 
 !lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {
 
       // Step 6. Initialize the mbarrier
       %9 = nvgpu.mbarrier.create -> !barrierType
-      nvgpu.mbarrier.init %9, %5 : !barrierType
+      nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
       %10 = arith.cmpi eq, %6, %c0 : index
 
      // Step 7. First thread does TMA load
      scf.if %10 {
        gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
-        nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
-        nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+        nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
       } else {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
       }
 
      // Step 8. Wait until TMA is done
-      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+      nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
 
      // Step 9. Print loaded data in 128b swizzled
      scf.if %10 {
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index c75be107ca4c2..5c465f7de8abd 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -40,7 +40,7 @@
 // |-------------------------------|
 
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 !tokenType = !nvgpu.mbarrier.token
 
 !lhs = memref<128x64xf16>
@@ -96,28 +96,22 @@ module @mymod {
         memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
       }
     }
-
-    // Step 2. Print on the host
-    %lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
-    call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
-    %rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
-    call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()
 
-    // Step 3. Copy host to device
+    // Step 2. Copy host to device
     %0 = gpu.wait async
     %d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
     %d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
     %1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
     %2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs
 
-    // Step 4. Create TMA tensor descriptor
+    // Step 3. Create TMA tensor descriptor
     %d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
     %d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>
 
     %d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
     %d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap
 
-    // Step 5. Launch a GPU kernel
+    // Step 4. Launch a GPU kernel
     gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
       %5 = gpu.block_dim x
       %6 = gpu.thread_id x
@@ -125,27 +119,27 @@ module @mymod {
       %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
       %rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
 
-      // Step 6. Initialize the mbarrier
+      // Step 5. Initialize the mbarrier
       %9 = nvgpu.mbarrier.create -> !barrierType
-      nvgpu.mbarrier.init %9, %5 : !barrierType
+      nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
       %10 = arith.cmpi eq, %6, %c0 : index
 
-      // Step 7. First thread does TMA load
+      // Step 6. First thread does TMA load
      scf.if %10 {
        gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
-        nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
-        nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
-        nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
-        nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+        nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+        nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+        nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
       } else {
-        nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
       }
 
-      // Step 8. Wait until TMA is done
-      nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+      // Step 7. Wait until TMA is done
+      nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
 
-      // Step 9. Print loaded data in 128b swizzled
+      // Step 8. Print loaded data in 128b swizzled
      scf.if %10 {
        gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
        scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
        }
        gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
      }
+      gpu.barrier
      gpu.terminator
    }
    return
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 11cf63548a551..5331ebb87d37d 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -39,6 +39,7 @@
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
 
+
 // CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
 // CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
 // CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
     %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
     %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
     %9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
-    nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+    nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
     gpu.barrier
     %10 = arith.cmpi eq, %6, %c0 : index
     scf.if %10 {
-      nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
       %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
       %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
       gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
       gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
-      nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
-      nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+      nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+      nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
     } else {
-      nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+      nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
     }
-    nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+    nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
     scf.if %10 {
       %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
       %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
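
For reference, a minimal sketch of the indexed mbarrier-group pattern these tests migrate to, assuming a group that holds a single barrier addressed with %c0 and reusing the !barrierType, !lhsTensorMap, and !shmemlhs aliases and SSA names from the first test above; the trailing comments are explanatory and not part of the patch:

  !barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>

  // Create a group of mbarriers in shared memory; every later use names one barrier by index.
  %9 = nvgpu.mbarrier.create -> !barrierType
  nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
  // The TMA copy and its expected-transaction byte count are posted on the same indexed barrier.
  nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
  nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
  // Wait on phase parity (%c0) with a large tick budget until the transfer completes.
  nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType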