[Encoding][Codegen] Add initial pad encoding layout attrs (#19865)
These allow us to pad allocations without changing the logical tensor
sizes or data layouts.

Split the encoding layout attribute interface into two:
* One with target-specific information that allows us to decide layouts.
* One with serialized target-agnostic padding information.
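
For a concrete picture, enabling the new flag attaches the pad layout to the
ROCm executable target configuration, as checked by the gpu_encoding_attrs.mlir
test added below (abridged; the elided parts are target-dependent):

  #hal.executable.target<"rocm", ...
      encoding = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>, ...>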

Signed-off-by: Jakub Kuderski <jakub@nod-labs.com>
kuhar authored Feb 3, 2025
1 parent 78f312b commit 86244de
Showing 16 changed files with 365 additions and 68 deletions.
10 changes: 10 additions & 0 deletions compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -66,6 +66,7 @@ struct ROCMOptions {
  std::string bitcodeDirectory = getDefaultBitcodeDirectory();
  int wavesPerEu = 0;
  std::string enableROCMUkernels = "none";
  bool experimentalPadLayout = false;
  bool slpVectorization = true;
  bool globalISel = false;

@@ -105,6 +106,10 @@ struct ROCMOptions {
cl::desc("Enables microkernels in the HIP compiler backend. May be "
"`default`, `none`, `all`, or a comma-separated list of "
"specific unprefixed microkernels to enable, e.g. `mmt4d`."));
binder.opt<bool>("iree-hip-enable-experimental-pad-layout",
experimentalPadLayout, cl::cat(category),
cl::desc("Enables additional padding on allocations to "
"maximize cache bandwidth."));

    binder.list<std::string>(
        "iree-hip-pass-plugin-path", passPlugins,
@@ -248,6 +253,11 @@ class ROCMTargetBackend final : public TargetBackend {
    if (auto target = GPU::getHIPTargetDetails(
            options.target, options.targetFeatures, context)) {
      addConfig("iree.gpu.target", target);
      if (options.experimentalPadLayout) {
        if (Attribute encoding = GPU::getHIPTargetEncodingLayoutAttr(target)) {
          addConfig("encoding", encoding);
        }
      }
    }

    addConfig("ukernels", b.getStringAttr(options.enableROCMUkernels));
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -19,6 +19,7 @@ iree_lit_test_suite(
"config_ukernel_argmax_gfx942.mlir",
"config_ukernel_multi_mma_gfx942.mlir",
"default_tuning_specs_amdgpu.mlir",
"gpu_encoding_attrs.mlir",
"lowering_strategy_from_tuning_spec.mlir",
"ukernel_pipeline_transform.mlir",
],
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -18,6 +18,7 @@ iree_lit_test_suite(
"config_ukernel_argmax_gfx942.mlir"
"config_ukernel_multi_mma_gfx942.mlir"
"default_tuning_specs_amdgpu.mlir"
"gpu_encoding_attrs.mlir"
"lowering_strategy_from_tuning_spec.mlir"
"ukernel_pipeline_transform.mlir"
TOOLS
26 changes: 26 additions & 0 deletions compiler/plugins/target/ROCM/test/gpu_encoding_attrs.mlir
@@ -0,0 +1,26 @@
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx942 --iree-hip-enable-experimental-pad-layout %s | FileCheck %s --check-prefix=PAD
//
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx90a --iree-hip-enable-experimental-pad-layout %s | FileCheck %s --check-prefix=PAD

// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx90a --iree-hip-enable-experimental-pad-layout=false %s | FileCheck %s --check-prefix=NOPAD

// PAD: #hal.executable.target<"rocm"
// PAD-SAME: encoding = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>

// NOPAD: #hal.executable.target<"rocm"
// NOPAD-NOT: encoding = #iree_gpu.gpu_pad_layout

stream.executable public @main {
  stream.executable.export @main workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main() {
      return
    }
  }
}
@@ -417,7 +417,7 @@ def IREECodegen_ExportConfig : AttrDef<IREECodegen_Dialect, "ExportConfig", []>
}

//===---------------------------------------------------------------------===//
// iree_codegen.encoding_layout
// iree_codegen.encoding_nop_layout
//===---------------------------------------------------------------------===//

def IREECodegen_EncodingNopLayoutAttr :
27 changes: 27 additions & 0 deletions compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -340,6 +340,33 @@ def IREEGPU_GPUEncodingLayoutAttr :
  );
}

//===----------------------------------------------------------------------===//
// iree_gpu.gpu_pad_layout
//===----------------------------------------------------------------------===//

def IREEGPU_GPUPadLayoutAttr : AttrDef<IREEGPU_Dialect, "GPUPadLayout"> {
  let mnemonic = "gpu_pad_layout";
  let summary = "The padded encoding layout attribute for GPU targets.";
  let assemblyFormat = "`<` struct(params) `>`";

  let description = [{
    Describes padding preferences for a given GPU target.
    This attribute can implement any encoding interface used for data-tiling,
    e.g., Encoding::EncodingLayoutAttrInterface. Such interfaces should be
    implemented through the external model mechanism, because we do not want
    domain-specific logic to live in the dialect implementation, and this
    keeps the code better structured. See the implementations in
    compiler/Codegen/ExternalInterfaces/*.
  }];

  let parameters = (ins
    // Relevant target properties that will later allow us to decide the
    // serialized pad layout.
    "uint32_t":$cache_line_bytes,
    "uint32_t":$cache_sets
  );
}
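
Since the description above defers the actual interface implementations to
external models, here is a minimal sketch of what such an attachment could
look like. This is illustrative only: the struct name and registration hook
are hypothetical, and the real implementations live under
compiler/Codegen/ExternalInterfaces/* (compare the CPU/VMVX registrations
further down in this diff).

// Hypothetical sketch: attach an encoding interface to GPUPadLayoutAttr via
// an external model, keeping domain-specific logic out of the dialect.
struct GPUPadEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          GPUPadEncodingLayoutAttrInterface, GPUPadLayoutAttr> {
  // Interface methods would be implemented here.
};

void registerGPUPadEncodingExternalModels(DialectRegistry &registry) {
  registry.addExtension(
      +[](MLIRContext *ctx, IREE::GPU::IREEGPUDialect *dialect) {
        IREE::GPU::GPUPadLayoutAttr::attachInterface<
            GPUPadEncodingLayoutAttrInterface>(*ctx);
      });
}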

//===----------------------------------------------------------------------===//
// Workgroup processor level description
//===----------------------------------------------------------------------===//
@@ -9,8 +9,12 @@
#include <optional>
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

namespace mlir::iree_compiler::IREE::GPU {

@@ -707,6 +711,18 @@ TargetAttr getHIPTargetDetails(StringRef target, StringRef features,
  return nullptr;
}

Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target) {
  // This is only enabled for CDNA2 and CDNA3 for the time being.
  // TODO(kuhar): Enable for other HIP targets.
  if (!llvm::is_contained({"gfx90a", "gfx940", "gfx941", "gfx942"},
                          target.getArch())) {
    return nullptr;
  }

  return IREE::GPU::GPUPadLayoutAttr::get(
      target.getContext(), /*cacheLineBytes=*/128, /*cacheSets=*/4);
}
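
To make the arithmetic concrete: with cache_line_bytes = 128 and
cache_sets = 4, the cache-set assignment repeats every 128 * 4 = 512 bytes,
so any row stride that is a multiple of 512 maps the start of every row to
the same set. A hypothetical helper (not part of this commit, shown only to
illustrate how these two parameters could drive a padding decision):

// Hypothetical illustration only: pad a row stride so that consecutive rows
// do not all start in the same cache set.
static uint32_t padRowStrideBytes(uint32_t strideBytes, uint32_t cacheLineBytes,
                                  uint32_t cacheSets) {
  // Bytes after which the cache-set assignment repeats (128 B * 4 = 512 B).
  uint32_t setPeriod = cacheLineBytes * cacheSets;
  if (strideBytes % setPeriod == 0)
    return strideBytes + cacheLineBytes; // Shift each row over by one set.
  return strideBytes;
}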

StringRef normalizeHIPTarget(StringRef target) {
  return normalizeAMDGPUTarget(target);
}
@@ -36,6 +36,10 @@ StringRef normalizeCUDATarget(StringRef target);
TargetAttr getHIPTargetDetails(llvm::StringRef target, llvm::StringRef features,
                               MLIRContext *context);

// Returns an attribute implementing `EncodingLayoutAttrInterface` if
// |target| has known encoding preferences.
Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target);

// Normalizes the given HIP |target| to the gfx target commonly used for
// compiling towards HIP. For example, "gfx90a" for "cdna2", "gfx1100" for
// "rx7900xtx". Returns empty StringRef if the given |target| is not recognized.
@@ -670,17 +670,9 @@ struct CPUDeviceEncodingLayoutAttrInterface
  }
};

struct CPUHostEncodingLayoutAttrInterface
    : public IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
struct CPUHostEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          CPUHostEncodingLayoutAttrInterface, CPUEncodingLayoutAttr> {

  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }

  Attribute cloneWithSimplifiedConfig(Attribute attr,
                                      DictionaryAttr config) const {
    MLIRContext *ctx = attr.getContext();
@@ -697,6 +689,18 @@ struct CPUHostEncodingLayoutAttrInterface
  }
};

struct CPUHostSerializedEncodingLayoutAttrInterface final
    : IREE::Encoding::SerializedEncodingLayoutAttrInterface::ExternalModel<
          CPUHostSerializedEncodingLayoutAttrInterface, CPUEncodingLayoutAttr> {

  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }
};
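
With the split in place, host-side code that only needs serialized buffer
sizes can query the serialized interface alone. A hedged caller-side sketch,
assuming the generated interface method mirrors the external model's
signature minus the leading attribute parameter:

// Hypothetical usage sketch (signature assumed, not taken from this commit).
if (auto layout =
        dyn_cast<IREE::Encoding::SerializedEncodingLayoutAttrInterface>(
            encodingAttr)) {
  Value sizeInBytes = layout.calculateStorageSizeInBytes(
      loc, builder, tensorType, dynamicDims);
  // Use sizeInBytes when materializing the allocation.
}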

//===----------------------------------------------------------------------===//
// Interface methods implementation for iree_cpu.vmvx_encoding_layout.
//===----------------------------------------------------------------------===//
@@ -731,8 +735,8 @@ enumerateVMVXMatmulTiles(linalg::ContractionDimensions cDims,
  };
}

struct VMVXDeviceEncodingLayoutAttrInterface
    : public Codegen::LayoutAttrInterface::ExternalModel<
struct VMVXDeviceEncodingLayoutAttrInterface final
    : Codegen::LayoutAttrInterface::ExternalModel<
          VMVXDeviceEncodingLayoutAttrInterface, VMVXEncodingLayoutAttr> {
  MaterializeEncodingInfo getEncodingInfo(Attribute attr,
                                          RankedTensorType type) const {
@@ -797,16 +801,9 @@ struct VMVXDeviceEncodingLayoutAttrInterface
  }
};

struct VMVXHostEncodingLayoutAttrInterface
    : public IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
struct VMVXHostEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          VMVXHostEncodingLayoutAttrInterface, VMVXEncodingLayoutAttr> {
  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }

  Attribute cloneWithSimplifiedConfig(Attribute attr,
                                      DictionaryAttr config) const {
    MLIRContext *ctx = attr.getContext();
@@ -822,17 +819,31 @@ struct VMVXHostEncodingLayoutAttrInterface
  }
};

struct VMVXHostSerializedEncodingLayoutAttrInterface final
    : IREE::Encoding::SerializedEncodingLayoutAttrInterface::ExternalModel<
          VMVXHostSerializedEncodingLayoutAttrInterface,
          VMVXEncodingLayoutAttr> {
  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }
};

} // namespace

void registerCPUEncodingExternalModels(DialectRegistry &registry) {
  registry.addExtension(
      +[](MLIRContext *ctx, IREE::CPU::IREECPUDialect *dialect) {
        IREE::CPU::CPUEncodingLayoutAttr::attachInterface<
            CPUDeviceEncodingLayoutAttrInterface,
            CPUHostEncodingLayoutAttrInterface>(*ctx);
            CPUHostEncodingLayoutAttrInterface,
            CPUHostSerializedEncodingLayoutAttrInterface>(*ctx);
        IREE::CPU::VMVXEncodingLayoutAttr::attachInterface<
            VMVXDeviceEncodingLayoutAttrInterface,
            VMVXHostEncodingLayoutAttrInterface>(*ctx);
            VMVXHostEncodingLayoutAttrInterface,
            VMVXHostSerializedEncodingLayoutAttrInterface>(*ctx);
      });
}
