[Encoding][Codegen] Add initial pad encoding layout attrs (#19865)
These allow us to pad allocations without changing the logical tensor
sizes or data layouts.

Split the encoding layout attribute interface into two:
* One with target-specific information that allows us to decide layouts.
* One with serialized target-agnostic padding information.
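
For a concrete picture, enabling the new flag attaches the pad layout to the
ROCm executable target configuration, as checked by the gpu_encoding_attrs.mlir
test added below (abridged; the elided parts are target-dependent):

  #hal.executable.target<"rocm", ...
      encoding = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>, ...>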

Signed-off-by: Jakub Kuderski <jakub@nod-labs.com>
kuhar authored Feb 3, 2025
1 parent 78f312b commit 86244de
Showing 16 changed files with 365 additions and 68 deletions.
10 changes: 10 additions & 0 deletions compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -66,6 +66,7 @@ struct ROCMOptions {
  std::string bitcodeDirectory = getDefaultBitcodeDirectory();
  int wavesPerEu = 0;
  std::string enableROCMUkernels = "none";
  bool experimentalPadLayout = false;
  bool slpVectorization = true;
  bool globalISel = false;

@@ -105,6 +106,10 @@ struct ROCMOptions {
cl::desc("Enables microkernels in the HIP compiler backend. May be "
"`default`, `none`, `all`, or a comma-separated list of "
"specific unprefixed microkernels to enable, e.g. `mmt4d`."));
binder.opt<bool>("iree-hip-enable-experimental-pad-layout",
experimentalPadLayout, cl::cat(category),
cl::desc("Enables additional padding on allocations to "
"maximize cache bandwidth."));

    binder.list<std::string>(
        "iree-hip-pass-plugin-path", passPlugins,
@@ -248,6 +253,11 @@ class ROCMTargetBackend final : public TargetBackend {
    if (auto target = GPU::getHIPTargetDetails(
            options.target, options.targetFeatures, context)) {
      addConfig("iree.gpu.target", target);
      if (options.experimentalPadLayout) {
        if (Attribute encoding = GPU::getHIPTargetEncodingLayoutAttr(target)) {
          addConfig("encoding", encoding);
        }
      }
    }

    addConfig("ukernels", b.getStringAttr(options.enableROCMUkernels));
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -19,6 +19,7 @@ iree_lit_test_suite(
"config_ukernel_argmax_gfx942.mlir",
"config_ukernel_multi_mma_gfx942.mlir",
"default_tuning_specs_amdgpu.mlir",
"gpu_encoding_attrs.mlir",
"lowering_strategy_from_tuning_spec.mlir",
"ukernel_pipeline_transform.mlir",
],
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -18,6 +18,7 @@ iree_lit_test_suite(
"config_ukernel_argmax_gfx942.mlir"
"config_ukernel_multi_mma_gfx942.mlir"
"default_tuning_specs_amdgpu.mlir"
"gpu_encoding_attrs.mlir"
"lowering_strategy_from_tuning_spec.mlir"
"ukernel_pipeline_transform.mlir"
TOOLS
26 changes: 26 additions & 0 deletions compiler/plugins/target/ROCM/test/gpu_encoding_attrs.mlir
@@ -0,0 +1,26 @@
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx942 --iree-hip-enable-experimental-pad-layout %s | FileCheck %s --check-prefix=PAD
//
// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx90a --iree-hip-enable-experimental-pad-layout %s | FileCheck %s --check-prefix=PAD

// RUN: iree-opt --pass-pipeline='builtin.module(iree-hal-assign-target-devices{targetDevices=hip},iree-hal-transformation-pipeline{serialize-executables=false})' \
// RUN: --iree-hip-target=gfx90a --iree-hip-enable-experimental-pad-layout=false %s | FileCheck %s --check-prefix=NOPAD

// PAD: #hal.executable.target<"rocm"
// PAD-SAME: encoding = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>

// NOPAD: #hal.executable.target<"rocm"
// NOPAD-NOT: encoding = #iree_gpu.gpu_pad_layout

stream.executable public @main {
  stream.executable.export @main workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @main() {
      return
    }
  }
}
@@ -417,7 +417,7 @@ def IREECodegen_ExportConfig : AttrDef<IREECodegen_Dialect, "ExportConfig", []>
}

//===---------------------------------------------------------------------===//
// iree_codegen.encoding_layout
// iree_codegen.encoding_nop_layout
//===---------------------------------------------------------------------===//

def IREECodegen_EncodingNopLayoutAttr :
27 changes: 27 additions & 0 deletions compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -340,6 +340,33 @@ def IREEGPU_GPUEncodingLayoutAttr :
  );
}

//===----------------------------------------------------------------------===//
// iree_gpu.gpu_pad_layout
//===----------------------------------------------------------------------===//

def IREEGPU_GPUPadLayoutAttr : AttrDef<IREEGPU_Dialect, "GPUPadLayout"> {
  let mnemonic = "gpu_pad_layout";
  let summary = "The padded encoding layout attribute for GPU targets.";
  let assemblyFormat = "`<` struct(params) `>`";

  let description = [{
    Describes padding preferences for a given GPU target.
    This attribute can implement any encoding interface used for data-tiling,
    e.g., Encoding::EncodingLayoutAttrInterface. Such interfaces should be
    implemented through the external model mechanism, because we do not want
    domain-specific logic to live in the dialect implementation, and this
    keeps the code better structured. See the implementations in
    compiler/Codegen/ExternalInterfaces/*.
  }];

  let parameters = (ins
    // Relevant target properties that will later allow us to decide the
    // serialized pad layout.
    "uint32_t":$cache_line_bytes,
    "uint32_t":$cache_sets
  );
}
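
Since the description above defers the actual interface implementations to
external models, here is a minimal sketch of what such an attachment could
look like. This is illustrative only: the struct name and registration hook
are hypothetical, and the real implementations live under
compiler/Codegen/ExternalInterfaces/* (compare the CPU/VMVX registrations
further down in this diff).

// Hypothetical sketch: attach an encoding interface to GPUPadLayoutAttr via
// an external model, keeping domain-specific logic out of the dialect.
struct GPUPadEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          GPUPadEncodingLayoutAttrInterface, GPUPadLayoutAttr> {
  // Interface methods would be implemented here.
};

void registerGPUPadEncodingExternalModels(DialectRegistry &registry) {
  registry.addExtension(
      +[](MLIRContext *ctx, IREE::GPU::IREEGPUDialect *dialect) {
        IREE::GPU::GPUPadLayoutAttr::attachInterface<
            GPUPadEncodingLayoutAttrInterface>(*ctx);
      });
}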

//===----------------------------------------------------------------------===//
// Workgroup processor level description
//===----------------------------------------------------------------------===//
@@ -9,8 +9,12 @@
#include <optional>
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

namespace mlir::iree_compiler::IREE::GPU {

@@ -707,6 +711,18 @@ TargetAttr getHIPTargetDetails(StringRef target, StringRef features,
  return nullptr;
}

Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target) {
  // This is only enabled for CDNA2 and CDNA3 for the time being.
  // TODO(kuhar): Enable for other HIP targets.
  if (!llvm::is_contained({"gfx90a", "gfx940", "gfx941", "gfx942"},
                          target.getArch())) {
    return nullptr;
  }

  return IREE::GPU::GPUPadLayoutAttr::get(
      target.getContext(), /*cacheLineBytes=*/128, /*cacheSets=*/4);
}
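
To make the arithmetic concrete: with cache_line_bytes = 128 and
cache_sets = 4, the cache-set assignment repeats every 128 * 4 = 512 bytes,
so any row stride that is a multiple of 512 maps the start of every row to
the same set. A hypothetical helper (not part of this commit, shown only to
illustrate how these two parameters could drive a padding decision):

// Hypothetical illustration only: pad a row stride so that consecutive rows
// do not all start in the same cache set.
static uint32_t padRowStrideBytes(uint32_t strideBytes, uint32_t cacheLineBytes,
                                  uint32_t cacheSets) {
  // Bytes after which the cache-set assignment repeats (128 B * 4 = 512 B).
  uint32_t setPeriod = cacheLineBytes * cacheSets;
  if (strideBytes % setPeriod == 0)
    return strideBytes + cacheLineBytes; // Shift each row over by one set.
  return strideBytes;
}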

StringRef normalizeHIPTarget(StringRef target) {
  return normalizeAMDGPUTarget(target);
}
@@ -36,6 +36,10 @@ StringRef normalizeCUDATarget(StringRef target);
TargetAttr getHIPTargetDetails(llvm::StringRef target, llvm::StringRef features,
                               MLIRContext *context);

// Returns an attribute implementing `EncodingLayoutAttrInterface` if
// |target| has known encoding preferences.
Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target);

// Normalizes the given HIP |target| to the gfx target commonly used for
// compiling towards HIP. For example, "gfx90a" for "cdna2", "gfx1100" for
// "rx7900xtx". Returns empty StringRef if the given |target| is not recognized.
@@ -670,17 +670,9 @@ struct CPUDeviceEncodingLayoutAttrInterface
  }
};

struct CPUHostEncodingLayoutAttrInterface
    : public IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
struct CPUHostEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          CPUHostEncodingLayoutAttrInterface, CPUEncodingLayoutAttr> {

  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }

  Attribute cloneWithSimplifiedConfig(Attribute attr,
                                      DictionaryAttr config) const {
    MLIRContext *ctx = attr.getContext();
@@ -697,6 +689,18 @@ struct CPUHostEncodingLayoutAttrInterface
  }
};

struct CPUHostSerializedEncodingLayoutAttrInterface final
    : IREE::Encoding::SerializedEncodingLayoutAttrInterface::ExternalModel<
          CPUHostSerializedEncodingLayoutAttrInterface, CPUEncodingLayoutAttr> {

  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }
};
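
With the split in place, host-side code that only needs serialized buffer
sizes can query the serialized interface alone. A hedged caller-side sketch,
assuming the generated interface method mirrors the external model's
signature minus the leading attribute parameter:

// Hypothetical usage sketch (signature assumed, not taken from this commit).
if (auto layout =
        dyn_cast<IREE::Encoding::SerializedEncodingLayoutAttrInterface>(
            encodingAttr)) {
  Value sizeInBytes = layout.calculateStorageSizeInBytes(
      loc, builder, tensorType, dynamicDims);
  // Use sizeInBytes when materializing the allocation.
}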

//===----------------------------------------------------------------------===//
// Interface methods implementation for iree_cpu.vmvx_encoding_layout.
//===----------------------------------------------------------------------===//
@@ -731,8 +735,8 @@ enumerateVMVXMatmulTiles(linalg::ContractionDimensions cDims,
  };
}

struct VMVXDeviceEncodingLayoutAttrInterface
    : public Codegen::LayoutAttrInterface::ExternalModel<
struct VMVXDeviceEncodingLayoutAttrInterface final
    : Codegen::LayoutAttrInterface::ExternalModel<
          VMVXDeviceEncodingLayoutAttrInterface, VMVXEncodingLayoutAttr> {
  MaterializeEncodingInfo getEncodingInfo(Attribute attr,
                                          RankedTensorType type) const {
@@ -797,16 +801,9 @@ struct VMVXDeviceEncodingLayoutAttrInterface
  }
};

struct VMVXHostEncodingLayoutAttrInterface
    : public IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
struct VMVXHostEncodingLayoutAttrInterface final
    : IREE::Encoding::EncodingLayoutAttrInterface::ExternalModel<
          VMVXHostEncodingLayoutAttrInterface, VMVXEncodingLayoutAttr> {
  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }

  Attribute cloneWithSimplifiedConfig(Attribute attr,
                                      DictionaryAttr config) const {
    MLIRContext *ctx = attr.getContext();
@@ -822,17 +819,31 @@ struct VMVXHostEncodingLayoutAttrInterface
  }
};

struct VMVXHostSerializedEncodingLayoutAttrInterface final
    : IREE::Encoding::SerializedEncodingLayoutAttrInterface::ExternalModel<
          VMVXHostSerializedEncodingLayoutAttrInterface,
          VMVXEncodingLayoutAttr> {
  Value calculateStorageSizeInBytes(Attribute attr, Location loc,
                                    OpBuilder &builder, RankedTensorType type,
                                    ValueRange dynamicDims) const {
    return calculateStorageSizeInBytesImpl(attr, loc, builder, type,
                                           dynamicDims);
  }
};

} // namespace

void registerCPUEncodingExternalModels(DialectRegistry &registry) {
  registry.addExtension(
      +[](MLIRContext *ctx, IREE::CPU::IREECPUDialect *dialect) {
        IREE::CPU::CPUEncodingLayoutAttr::attachInterface<
            CPUDeviceEncodingLayoutAttrInterface,
            CPUHostEncodingLayoutAttrInterface>(*ctx);
            CPUHostEncodingLayoutAttrInterface,
            CPUHostSerializedEncodingLayoutAttrInterface>(*ctx);
        IREE::CPU::VMVXEncodingLayoutAttr::attachInterface<
            VMVXDeviceEncodingLayoutAttrInterface,
            VMVXHostEncodingLayoutAttrInterface>(*ctx);
            VMVXHostEncodingLayoutAttrInterface,
            VMVXHostSerializedEncodingLayoutAttrInterface>(*ctx);
      });
}
