Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CodeGen]Support Convert NVVM IR to Cubin With LibDevice Linked #10200

Merged
merged 26 commits into from
Apr 27, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ef90a10
fix
howin98 Apr 26, 2023
bd91bb4
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
howin98 Apr 26, 2023
9d54af1
trim
howin98 Apr 26, 2023
e6f2e65
Merge branch 'support-nvvm-to-cubin-serial-passes' of github.com:Onef…
howin98 Apr 26, 2023
4b6bef0
auto format by CI
oneflow-ci-bot Apr 26, 2023
10dd5b4
auto fetch version
howin98 Apr 26, 2023
d1bdd38
Merge branch 'support-nvvm-to-cubin-serial-passes' of github.com:Onef…
howin98 Apr 26, 2023
6e8b08f
trim
howin98 Apr 26, 2023
2041a67
beautify
howin98 Apr 26, 2023
00255b5
fix
howin98 Apr 26, 2023
391cb48
auto format by CI
oneflow-ci-bot Apr 26, 2023
76f5294
fix version bug
howin98 Apr 26, 2023
9c87825
Merge branch 'support-nvvm-to-cubin-serial-passes' of github.com:Onef…
howin98 Apr 26, 2023
45e9814
auto format by CI
oneflow-ci-bot Apr 26, 2023
2dd6ec2
fix
howin98 Apr 26, 2023
90f6df7
fix
howin98 Apr 26, 2023
7da6adf
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
howin98 Apr 27, 2023
78d3494
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
mergify[bot] Apr 27, 2023
b075327
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
mergify[bot] Apr 27, 2023
f38d98a
fix cpu
howin98 Apr 27, 2023
124d8bf
Merge branch 'support-nvvm-to-cubin-serial-passes' of github.com:Onef…
howin98 Apr 27, 2023
df7e0ba
brace
howin98 Apr 27, 2023
1158867
add flag in cmake
howin98 Apr 27, 2023
9e17dad
fix lit cfg support in cpu
howin98 Apr 27, 2023
35bdfba
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
howin98 Apr 27, 2023
34d58ab
Merge branch 'master' into support-nvvm-to-cubin-serial-passes
howin98 Apr 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_PTXTOCUBIN_H_
#define ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_PTXTOCUBIN_H_
#ifndef ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_NVVMTOCUBIN_H_
#define ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_NVVMTOCUBIN_H_

#ifdef WITH_MLIR_CUDA_CODEGEN

#include "mlir/Pass/Pass.h"

namespace mlir {

namespace oneflow {
const std::string& getArchVersion();

std::unique_ptr<mlir::Pass> createSerializeToCubinPass();
void InitializeLLVMNVPTXBackend();
void registerGpuSerializeToCubinPass();
namespace gpu {

inline std::string getCubinAnnotation() { return "gpu.binary"; }

} // namespace oneflow
} // namespace gpu

void InitializeLLVMNVPTXBackend();
std::unique_ptr<mlir::Pass> createNVVMToCubinPass();

} // namespace mlir

#endif // ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_PTXTOCUBIN_H_
#endif // WITH_MLIR_CUDA_CODEGEN

#endif // ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_NVVMTOCUBIN_H_
10 changes: 10 additions & 0 deletions oneflow/ir/include/OneFlow/OneFlowPasses.td
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@

include "OneFlow/OneFlowOps.td"

// Serializes NVVM IR inside a symbol-defining op (e.g. a gpu.module) into a
// cubin binary blob; the pass implementation lives in
// oneflow/ir/lib/OneFlow/Conversion/NVVMToCubin.cpp.
// NOTE(review): the `chip` default embeds a call to getArchVersion(), so that
// function must be declared wherever the generated pass-base evaluates option
// defaults — confirm against the generated *.h.inc.
def NVVMToCubinPass : InterfacePass<"nvvm-to-cubin", "SymbolOpInterface"> {
let summary = "convert nvvm ir to cubin";
let constructor = "mlir::createNVVMToCubinPass()";
let options = [
Option<"triple", "triple", "StringRef", "\"nvptx64-nvidia-cuda\"", "Target triple">,
Option<"chip", "chip", "StringRef", "\"sm_\" + getArchVersion()", "Target architecture">,
Option<"features", "features", "StringRef", "\"+ptx60\"", "Target features">,
];
}

def LowerOneFlowToTosaPass : Pass<"lower-oneflow-to-tosa", "ModuleOp"> {
let summary = "lower oneflow dialect to tosa dialect";
let constructor = "mlir::oneflow::createLowerOneFlowToTosaPass()";
Expand Down
2 changes: 1 addition & 1 deletion oneflow/ir/include/OneFlow/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ limitations under the License.
#include "OneFlow/Transform/EliminateAllocOps.h"

#ifdef WITH_MLIR_CUDA_CODEGEN
#include "OneFlow/Conversion/PTXToCubin.h"
#include "OneFlow/Conversion/NVVMToCubin.h"
#endif // WITH_MLIR_CUDA_CODEGEN

namespace mlir {
Expand Down
2 changes: 1 addition & 1 deletion oneflow/ir/lib/OneFlow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ oneflow_add_mlir_dialect_library(
OneFlowOpFolders.cpp
Conversion/OneFlowToTosa.cpp
Conversion/OneFlowToLinalg.cpp
Conversion/PTXToCubin.cpp
Conversion/NVVMToCubin.cpp
Transform/BufferHostRegister.cpp
Transform/OutlineAndFuse.cpp
Transform/JITPasses.cpp
Expand Down
251 changes: 251 additions & 0 deletions oneflow/ir/lib/OneFlow/Conversion/NVVMToCubin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_MLIR_CUDA_CODEGEN
#include "oneflow/core/common/util.h"
#include "OneFlow/Passes.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Support/FileUtilities.h"
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Linker/Linker.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar/DCE.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"

#include <cuda.h>
#include <cuda_runtime_api.h>

// Emits an MLIR error diagnostic at `loc` describing a failed CUDA driver-API
// call. `expr` is the stringified call expression, `buffer` is the JIT error
// log buffer contents, and `result` is the CUresult the call returned.
static void emitCudaError(const llvm::Twine& expr, const char* buffer, CUresult result,
                          mlir::Location loc) {
  const char* error = nullptr;
  // Fix: cuGetErrorString leaves `error` untouched (and itself fails) when
  // `result` is not a recognized CUresult, which previously left `error`
  // uninitialized and fed it to Twine. Fall back to a placeholder instead.
  if (cuGetErrorString(result, &error) != CUDA_SUCCESS || error == nullptr) {
    error = "<unknown cuda error>";
  }
  emitError(loc, expr.concat(" failed with error code ")
                     .concat(llvm::Twine{error})
                     .concat("[")
                     .concat(buffer)
                     .concat("]"));
}

// Evaluates a CUDA driver-API expression; on a non-zero CUresult it reports
// the error (including the contents of the local JIT error-log buffer) via
// emitCudaError and returns a default-constructed value from the enclosing
// function. Requires `jitErrorBuffer` and `loc` to be in scope at the
// expansion site, and an enclosing function whose return type can be
// list-initialized from `{}`.
#define RETURN_ON_CUDA_ERROR(expr) \
do { \
if (auto status = (expr)) { \
emitCudaError(#expr, jitErrorBuffer, status, loc); \
return {}; \
} \
} while (false)

namespace mlir {
namespace {

// Returns the filesystem path of NVIDIA's libdevice bitcode library. The CUDA
// toolkit root is read from the CUDA_TOOLKIT_ROOT_DIR environment variable,
// defaulting to "/usr/local/cuda/".
const std::string& getLibDevice() {
  // A function-local static with a single initializer is initialized exactly
  // once and is thread-safe, unlike the previous check-then-assign pattern
  // which could race when two threads called this concurrently.
  static const std::string p = [] {
    const auto toolkit_env_name = "CUDA_TOOLKIT_ROOT_DIR";
    std::string root = ::oneflow::GetStringFromEnv(toolkit_env_name, "/usr/local/cuda/");
    // Fix: tolerate a toolkit root supplied without a trailing slash
    // (e.g. CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda), which previously
    // produced ".../cudanvvm/libdevice/libdevice.10.bc".
    if (!root.empty() && root.back() != '/') { root += '/'; }
    return root + "nvvm/libdevice/libdevice.10.bc";
  }();
  return p;
}

// Links NVIDIA's libdevice bitcode into `llvmModule` so calls to the __nv_*
// math intrinsics resolve. Only symbols actually needed are linked, and the
// imported libdevice globals are internalized so they do not leak out of the
// module. Returns failure() on any error so the caller can signalPassFailure().
LogicalResult linkLibdevice(llvm::Module& llvmModule, llvm::LLVMContext& llvmContext) {
  // Note: infer libdevice path from environment variable
  const auto& libDevice = getLibDevice();

  // Note: load raw data from file
  std::string errorMessage;
  auto libDeviceBuf = openInputFile(libDevice, &errorMessage);
  if (!libDeviceBuf) {
    // Fix: the function is declared to return LogicalResult, but every error
    // path previously aborted the whole process via LOG(FATAL), leaving the
    // caller's failed() check dead. Log and return failure() instead.
    LOG(ERROR) << "Open File error when link libdevice: " << errorMessage;
    return failure();
  }

  // Note: load module from raw data
  auto moduleOrErr = llvm::getOwningLazyBitcodeModule(std::move(libDeviceBuf), llvmContext);
  if (!moduleOrErr) {
    LOG(ERROR) << "Failed to load: " << libDevice << "\n";
    return failure();
  }
  std::unique_ptr<llvm::Module> libDeviceModule = std::move(moduleOrErr.get());

  // Note: link libdevice with module; internalize every imported symbol that
  // is not referenced by the destination module.
  if (llvm::Linker::linkModules(llvmModule, std::move(libDeviceModule),
                                llvm::Linker::Flags::LinkOnlyNeeded,
                                [](llvm::Module& M, const llvm::StringSet<>& GS) {
                                  llvm::internalizeModule(M, [&GS](const llvm::GlobalValue& GV) {
                                    return !GV.hasName() || (GS.count(GV.getName()) == 0);
                                  });
                                })) {
    LOG(ERROR) << "failed to link libdevice module\n";
    return failure();
  }

  return success();
}

// Returns the compute capability of CUDA device 0 as a string such as "80"
// (major digit(s) followed by minor), used to form the "sm_XX" chip name for
// the NVPTX target machine. Aborts if device properties cannot be queried.
// NOTE(review): the header declares getArchVersion() in mlir::oneflow, but
// this definition sits in an anonymous namespace (internal linkage) — confirm
// no other translation unit expects to link against the header declaration.
const std::string& getArchVersion() {
  // Thread-safe one-shot initialization (the previous check-then-assign on a
  // mutable static could race).
  static const std::string version = [] {
    cudaDeviceProp prop{};
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    // Consistency fix: use LOG(FATAL) like the rest of this file instead of
    // printf + exit(1); both terminate, but LOG(FATAL) goes through the
    // project's logging sink.
    if (err != cudaSuccess) { LOG(FATAL) << cudaGetErrorString(err); }
    return std::to_string(prop.major) + std::to_string(prop.minor);
  }();
  return version;
}

// Pass that serializes an NVVM-dialect module to a cubin binary:
// NVVM IR -> LLVM IR (with libdevice linked) -> PTX -> cubin, then attaches
// the cubin bytes as the gpu::getCubinAnnotation() ("gpu.binary") string
// attribute on the operation. Option defaults (triple/chip/features) come
// from the TableGen-generated NVVMToCubinPassBase.
class NVVMToCubinPass : public NVVMToCubinPassBase<NVVMToCubinPass> {
// Translates the operation this pass runs on into an LLVM IR module named
// "LLVMDialectModule".
std::unique_ptr<llvm::Module> translateToLLVMIR(llvm::LLVMContext& llvmContext) {
return translateModuleToLLVMIR(getOperation(), llvmContext, "LLVMDialectModule");
}

public:
// Lowers `llvmModule` to target assembly (PTX for the NVPTX target) using
// `targetMachine`. Returns std::nullopt if the backend cannot emit an
// assembly file for this target.
std::optional<std::string> translateToISA(llvm::Module& llvmModule,
llvm::TargetMachine& targetMachine) {
llvmModule.setDataLayout(targetMachine.createDataLayout());

// TODO: optimizeLlvm

std::string targetISA;
llvm::raw_string_ostream stream(targetISA);

{ // Drop pstream after this to prevent the ISA from being stuck buffering
llvm::buffer_ostream pstream(stream);
llvm::legacy::PassManager codegenPasses;

if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
llvm::CGFT_AssemblyFile))
return std::nullopt;

codegenPasses.run(llvmModule);
}
return stream.str();
}
// Creates an llvm::TargetMachine from the pass options (triple, chip,
// features). Emits an MLIR diagnostic and returns nullptr on failure.
std::unique_ptr<llvm::TargetMachine> createTargetMachine() {
Location loc = getOperation().getLoc();
std::string error;
const llvm::Target* target = ::llvm::TargetRegistry::lookupTarget(triple.str(), error);
if (!target) {
emitError(loc, Twine("failed to lookup target: ") + error);
return {};
}
llvm::TargetMachine* machine =
target->createTargetMachine(triple.str(), chip.str(), features.str(), {}, {});
if (!machine) {
emitError(loc, "failed to create target machine");
return {};
}

return std::unique_ptr<llvm::TargetMachine>{machine};
}
// JIT-links the PTX text in `isa` into a cubin via the CUDA driver linker
// and returns the binary bytes; returns nullptr on any driver error (the
// error is reported through RETURN_ON_CUDA_ERROR / emitCudaError).
// NOTE(review): if RETURN_ON_CUDA_ERROR fires after cuCtxCreate succeeds,
// the created context (and link state) is leaked — consider RAII cleanup.
std::unique_ptr<std::vector<char>> serializeISA(const std::string& isa) {
Location loc = getOperation().getLoc();
char jitErrorBuffer[4096] = {0};

RETURN_ON_CUDA_ERROR(cuInit(0));

// Linking requires a device context.
CUdevice device;
RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
CUcontext context;
RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
CUlinkState linkState;

CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
void* jitOptionsVals[] = {jitErrorBuffer, reinterpret_cast<void*>(sizeof(jitErrorBuffer))};

RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */
jitOptions, /* jit options */
jitOptionsVals, /* jit option values */
&linkState));

// The operation's symbol name is used only as the input's display name in
// linker diagnostics.
auto kernelName = getOperation().getName().str();
RETURN_ON_CUDA_ERROR(cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX,
const_cast<void*>(static_cast<const void*>(isa.c_str())),
isa.length(), kernelName.c_str(),
0, /* number of jit options */
nullptr, /* jit options */
nullptr /* jit option values */
));

void* cubinData;
size_t cubinSize;
RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));

// Copy out before destroying the link state, which owns cubinData.
char* cubinAsChar = static_cast<char*>(cubinData);
auto result = std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);

// This will also destroy the cubin data.
RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));

return result;
}

// Drives the full pipeline: translate to LLVM IR, link libdevice, lower to
// PTX, JIT-link to cubin, and attach the cubin as a module attribute. Any
// stage failure signals pass failure.
void runOnOperation() override {
llvm::LLVMContext llvmContext;
std::unique_ptr<llvm::Module> llvmModule = translateToLLVMIR(llvmContext);
if (!llvmModule) return signalPassFailure();
if (failed(linkLibdevice(*llvmModule, llvmContext))) { return signalPassFailure(); }

// Lower the LLVM IR module to target ISA.
std::unique_ptr<llvm::TargetMachine> targetMachine = createTargetMachine();
if (!targetMachine) return signalPassFailure();

std::optional<std::string> maybeTargetISA = translateToISA(*llvmModule, *targetMachine);

if (!maybeTargetISA.has_value()) return signalPassFailure();

std::string targetISA = std::move(*maybeTargetISA);

// Serialize the target ISA.
std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
if (!blob) return signalPassFailure();

// Add the blob as module attribute.
auto attr = StringAttr::get(&getContext(), StringRef(blob->data(), blob->size()));
getOperation()->setAttr(gpu::getCubinAnnotation(), attr);
}

// Registers the LLVM and NVVM dialect-to-LLVM-IR translations needed by
// translateModuleToLLVMIR above.
void getDependentDialects(::mlir::DialectRegistry& registry) const override {
registerLLVMDialectTranslation(registry);
registerNVVMDialectTranslation(registry);
}
};
} // namespace

// Factory for the NVVM-to-cubin serialization pass (registered via
// OneFlowPasses.td).
std::unique_ptr<mlir::Pass> createNVVMToCubinPass() { return std::make_unique<NVVMToCubinPass>(); }

// Registers the LLVM NVPTX backend components (target info, MC layer, asm
// printer) so TargetRegistry::lookupTarget in createTargetMachine can find
// the NVPTX target. Must be called before running NVVMToCubinPass.
void InitializeLLVMNVPTXBackend() {
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
}

} // namespace mlir
#endif // WITH_MLIR_CUDA_CODEGEN
Loading