[LLVMCPU] Enable tileDispatchUsingForall as default #18777

Merged · 6 commits · Oct 25, 2024
Changes from 5 commits
@@ -202,13 +202,16 @@ static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter,
llvm::SmallDenseSet<int> droppedLoops;
for (auto [index, lb, ub, step] :
llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
-    if (!isa<Attribute>(lb) || !isa<Attribute>(ub) || !isa<Attribute>(step)) {
+
+    std::optional<int64_t> lbVal = getConstantIntValue(lb);
+    std::optional<int64_t> ubVal = getConstantIntValue(ub);
+    std::optional<int64_t> stepVal = getConstantIntValue(step);
+
+    if (!(lbVal && ubVal && stepVal)) {
continue;
}
-    int64_t lbVal = getConstantIntValue(lb).value();
-    int64_t ubVal = getConstantIntValue(ub).value();
-    int64_t stepVal = getConstantIntValue(step).value();
-    if (CEILDIV(ubVal - lbVal, stepVal) == 1) {
+
+    if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) {
droppedLoops.insert(index);
}
}
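The functional change in this hunk is that the bounds check now goes through getConstantIntValue, which also recovers constants that are ordinary SSA values rather than attributes, so more single-trip distributed loops are recognized and dropped. A minimal standalone sketch of the same trip-count test, with a hypothetical ceilDiv helper standing in for the pass's CEILDIV macro (illustration only, not IREE code):

#include <cstdint>
#include <cstdio>
#include <optional>

// Hypothetical stand-in for the CEILDIV macro used by the pass (positive steps).
static int64_t ceilDiv(int64_t num, int64_t den) { return (num + den - 1) / den; }

// Returns true when a distributed loop with these bounds runs exactly once and
// can therefore be dropped; dynamic (non-constant) bounds keep the loop.
static bool hasUnitTripCount(std::optional<int64_t> lb,
                             std::optional<int64_t> ub,
                             std::optional<int64_t> step) {
  if (!(lb && ub && step))
    return false;
  return ceilDiv(*ub - *lb, *step) == 1;
}

int main() {
  std::printf("%d\n", hasUnitTripCount(0, 4, 64));             // 1: single trip, dropped
  std::printf("%d\n", hasUnitTripCount(0, 720, 2));            // 0: 360 iterations, kept
  std::printf("%d\n", hasUnitTripCount(std::nullopt, 720, 2)); // 0: dynamic bound, kept
  return 0;
}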
28 changes: 10 additions & 18 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -95,7 +95,7 @@ static llvm::cl::opt<bool> clEnableVectorContractCustomKernels(
static llvm::cl::opt<bool> clTileDispatchUsingForall(
"iree-llvmcpu-tile-dispatch-using-forall",
llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));

// By default, IREE does not enable the Armv9-A streaming SVE mode in the
// presence of scalable vectors (even when using `+sme`), as currently there's
@@ -111,9 +111,8 @@ static llvm::cl::opt<bool> clForceArmStreaming(
llvm::cl::init(false));

// TODO: Enable `TileDispatchUsingForall` for every pipeline.
-static void addTileAndDistributePasses(OpPassManager &funcPassManager,
-                                       bool enableTileDispatchUsingForall) {
-  if (enableTileDispatchUsingForall || clTileDispatchUsingForall) {
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+  if (clTileDispatchUsingForall) {
funcPassManager.addPass(
createTileAndDistributeToWorkgroupsUsingForallOpPass());
} else {
@@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline(
void addCPUBufferOpsTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);

// Skip tiling reduction loops because this is expected to apply on copy ops
// only.
@@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);

SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
// Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
void addConvTileAndDecomposeExpertPassPipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);

// Run LLVMTileAndFuse firstly in case that we have fill + conv + generic
// ops. At this stage, we do not apply vectorization. The reduction dim won't
@@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);

funcPassManager.addPass(createLLVMCPUTileAndFusePass(
static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
@@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/true);
+  addTileAndDistributePasses(funcPassManager);

// The below two passes are nop if pack/unpack is not specified in ukernels
// attribute. By default, they are disabled.
@@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
void addCPULinalgExtTileAndVectorizePipeline(
OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
// TODO: Remove the pass once we have PartialReductionOpInterface implemented
@@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
}

void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
-  addTileAndDistributePasses(funcPassManager,
-                             /*enableTileDispatchUsingForall=*/false);
+  addTileAndDistributePasses(funcPassManager);
addCPUBufferizePasses(funcPassManager);
}

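With the default flipped, every pipeline above now takes the scf.forall-based distribution path unless the flag is explicitly disabled. A tiny standalone illustration, not IREE code, of how an llvm::cl::opt<bool> initialized to true interacts with command-line overrides (the flag name below is made up; the real option is iree-llvmcpu-tile-dispatch-using-forall):

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

// Made-up flag mirroring the pattern in Passes.cpp: enabled by default,
// still overridable from the command line.
static llvm::cl::opt<bool> useForall(
    "tile-dispatch-using-forall",
    llvm::cl::desc("Use scf.forall-based workgroup distribution"),
    llvm::cl::init(true));

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  // Prints "true" with no arguments; passing
  // --tile-dispatch-using-forall=false prints "false".
  llvm::outs() << (useForall ? "true" : "false") << "\n";
  return 0;
}

So, presumably, any regression with the new path can be worked around by passing --iree-llvmcpu-tile-dispatch-using-forall=false to the compiler while the issue is triaged.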
@@ -290,7 +290,7 @@ hal.executable private @main {
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
-// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) {
+// CHECK: scf.forall ({{.*}}) in (2, 4, 5) {
// CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>)
// CHECK: gpu.barrier
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
@@ -307,7 +307,7 @@ hal.executable private @main {
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
// CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]]
-// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}

// -----

@@ -9,16 +9,17 @@

stream.executable private @__builtin_fill_i64 {
stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
@@ -9,16 +9,17 @@

stream.executable private @__builtin_splat_i64 {
stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) {
-    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
+    %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0
stream.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) {
%c0 = arith.constant 0 : index
-      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
-      %0 = tensor.empty(%count) : tensor<?xi64>
+      %count0 = flow.dispatch.workload.ordinal %count, 0 : index
+      %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
+      %0 = tensor.empty(%count0) : tensor<?xi64>
%1 = linalg.fill ins(%value : i64) outs(%0 : tensor<?xi64>) -> tensor<?xi64>
-      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count}
+      flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:tensor<?xi64>>{%count0}
return
}
}
@@ -547,6 +547,14 @@ isFusableWithConsumer(OpOperand &fusedOperand,
return false;
}

+  // TODO: Enable grouped convolution and depthwise pooling fusion.
+  // Right now, these are going through the default CPU pipeline and not
+  // through CONVTilingExpert.
+  if (isa<linalg::Conv2DNgchwFgchwOp, linalg::Conv2DNgchwGfchwOp,
+          linalg::PoolingNdhwcSumOp>(producer)) {
+    return false;
+  }
Comment on lines +550 to +556

Contributor:
Do we have an issue for this?

Contributor Author:
These originate from some onnx models. I will file an issue.

auto producerFusionOp =
dyn_cast<IREE::LinalgExt::LinalgFusionOpInterface>(producer);
auto consumerFusionOp =