Skip to content

Commit

Permalink
[OpenMP] Lower printf to __llvm_omp_vprintf
Browse files Browse the repository at this point in the history
Extension of D112504. Lower amdgpu printf to `__llvm_omp_vprintf`,
which takes the same `const char *`, `void *` arguments as CUDA vprintf and also
passes the size of the alloca behind the `void *`, which will be needed by a
non-stub implementation of `__llvm_omp_vprintf` for amdgpu.

This removes the amdgpu link error on any printf in a target region in favour
of silently compiling code that doesn't print anything to stdout.

The exact set of changes to check-openmp probably needs revision before commit.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112680
  • Loading branch information
JonChesterfield committed Nov 8, 2021
1 parent 1658980 commit db81d8f
Show file tree
Hide file tree
Showing 21 changed files with 147 additions and 75 deletions.
15 changes: 10 additions & 5 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5106,11 +5106,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
}
case Builtin::BIprintf:
if (getTarget().getTriple().isNVPTX())
return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
if (getTarget().getTriple().getArch() == Triple::amdgcn &&
getLangOpts().HIP)
return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue);
if (getTarget().getTriple().isNVPTX() ||
getTarget().getTriple().isAMDGCN()) {
if (getLangOpts().OpenMPIsDevice)
return EmitOpenMPDevicePrintfCallExpr(E);
if (getTarget().getTriple().isNVPTX())
return EmitNVPTXDevicePrintfCallExpr(E);
if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
return EmitAMDGPUDevicePrintfCallExpr(E);
}

break;
case Builtin::BI__builtin_canonicalize:
case Builtin::BI__builtin_canonicalizef:
Expand Down
109 changes: 80 additions & 29 deletions clang/lib/CodeGen/CGGPUBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@
using namespace clang;
using namespace CodeGen;

static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
namespace {
llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
llvm::Type::getInt8PtrTy(M.getContext())};
llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);

if (auto* F = M.getFunction("vprintf")) {
if (auto *F = M.getFunction("vprintf")) {
// Our CUDA system header declares vprintf with the right signature, so
// nobody else should have been able to declare vprintf with a bogus
// signature.
Expand All @@ -41,6 +42,28 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
}

// Looks up, or creates, the module-level declaration of `__llvm_omp_vprintf`
// with the signature `i32 (i8*, i8*, i32)`.
//
// If the module already contains a function of that name with a different
// type, an error is reported through CGM and nullptr is returned. Otherwise
// the existing function is reused, or a new externally-linked declaration is
// added to the module.
llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
  llvm::Module &Mod = CGM.getModule();
  llvm::LLVMContext &Ctx = Mod.getContext();

  // Expected prototype: (format string, packed argument buffer, buffer size).
  llvm::Type *ParamTys[] = {llvm::Type::getInt8PtrTy(Ctx),
                            llvm::Type::getInt8PtrTy(Ctx),
                            llvm::Type::getInt32Ty(Ctx)};
  llvm::FunctionType *ExpectedTy = llvm::FunctionType::get(
      llvm::Type::getInt32Ty(Ctx), ParamTys, /*isVarArg=*/false);

  const char *FnName = "__llvm_omp_vprintf";
  llvm::Function *Existing = Mod.getFunction(FnName);

  // Nothing declared yet: add a fresh external declaration.
  if (!Existing)
    return llvm::Function::Create(
        ExpectedTy, llvm::GlobalVariable::ExternalLinkage, FnName, &Mod);

  // A previous declaration is only usable when its type matches exactly.
  if (Existing->getFunctionType() == ExpectedTy)
    return Existing;

  CGM.Error(SourceLocation(),
            "Invalid type declaration for __llvm_omp_vprintf");
  return nullptr;
}

// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
// isn't particularly special; it's invoked just like a regular function).
// vprintf takes two args: A format string, and a pointer to a buffer containing
Expand All @@ -67,17 +90,17 @@ static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
// Note that by the time this function runs, E's args have already undergone the
// standard C vararg promotion (short -> int, float -> double, etc.).

namespace {
llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF,
const CallArgList &Args) {
std::pair<llvm::Value *, llvm::TypeSize>
packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) {
const llvm::DataLayout &DL = CGF->CGM.getDataLayout();
llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext();
CGBuilderTy &Builder = CGF->Builder;

// Construct and fill the args buffer that we'll pass to vprintf.
if (Args.size() <= 1) {
// If there are no args, pass a null pointer to vprintf.
return llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
// If there are no args, pass a null pointer and size 0
llvm::Value * BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
return {BufferPtr, llvm::TypeSize::Fixed(0)};
} else {
llvm::SmallVector<llvm::Type *, 8> ArgTypes;
for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
Expand All @@ -96,43 +119,64 @@ llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF,
llvm::Value *Arg = Args[I].getRValue(*CGF).getScalarVal();
Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
}
return Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
llvm::Value *BufferPtr =
Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
return {BufferPtr, DL.getTypeAllocSize(AllocaTy)};
}
}
} // namespace

RValue
CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
ReturnValueSlot ReturnValue) {
assert(getTarget().getTriple().isNVPTX());
bool containsNonScalarVarargs(CodeGenFunction *CGF, CallArgList Args) {
return llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
return !A.getRValue(*CGF).isScalar();
});
}

RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
llvm::Function *Decl, bool WithSizeArg) {
CodeGenModule &CGM = CGF->CGM;
CGBuilderTy &Builder = CGF->Builder;
assert(E->getBuiltinCallee() == Builtin::BIprintf);
assert(E->getNumArgs() >= 1); // printf always has at least one arg.

// Uses the same format as nvptx for the argument packing, but also passes
// an i32 for the total size of the passed pointer
CallArgList Args;
EmitCallArgs(Args,
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
E->arguments(), E->getDirectCallee(),
/* ParamsToSkip = */ 0);
CGF->EmitCallArgs(Args,
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
E->arguments(), E->getDirectCallee(),
/* ParamsToSkip = */ 0);

// We don't know how to emit non-scalar varargs.
if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
return !A.getRValue(*this).isScalar();
})) {
if (containsNonScalarVarargs(CGF, Args)) {
CGM.ErrorUnsupported(E, "non-scalar arg to printf");
return RValue::get(llvm::ConstantInt::get(IntTy, 0));
return RValue::get(llvm::ConstantInt::get(CGF->IntTy, 0));
}

llvm::Value *BufferPtr = packArgsIntoNVPTXFormatBuffer(this, Args);
auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args);
llvm::Value *BufferPtr = r.first;

llvm::SmallVector<llvm::Value *, 3> Vec = {
Args[0].getRValue(*CGF).getScalarVal(), BufferPtr};
if (WithSizeArg) {
// Passing > 32bit of data as a local alloca doesn't work for nvptx or
// amdgpu
llvm::Constant *Size =
llvm::ConstantInt::get(llvm::Type::getInt32Ty(CGM.getLLVMContext()),
static_cast<uint32_t>(r.second.getFixedSize()));

// Invoke vprintf and return.
llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
return RValue::get(Builder.CreateCall(
VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
Vec.push_back(Size);
}
return RValue::get(Builder.CreateCall(Decl, Vec));
}
} // namespace

RValue
CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
ReturnValueSlot ReturnValue) {
// Emits a device printf call for NVPTX by forwarding to the shared
// EmitDevicePrintfCallExpr helper with the `vprintf` declaration and no
// trailing size argument.
RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) {
  assert(getTarget().getTriple().isNVPTX());
  llvm::Function *VprintfDecl = GetVprintfDeclaration(CGM.getModule());
  return EmitDevicePrintfCallExpr(E, this, VprintfDecl,
                                  /*WithSizeArg=*/false);
}

RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn);
assert(E->getBuiltinCallee() == Builtin::BIprintf ||
E->getBuiltinCallee() == Builtin::BI__builtin_printf);
Expand Down Expand Up @@ -162,3 +206,10 @@ CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
return RValue::get(Printf);
}

// Emits an OpenMP device printf call (NVPTX or AMDGCN) as a call to
// `__llvm_omp_vprintf`, passing the format string, the packed argument buffer
// and the buffer size.
//
// GetOpenMPVprintfDeclaration reports an error and returns nullptr when the
// module already contains an incompatible `__llvm_omp_vprintf`. In that case
// no call is emitted and a constant 0 is returned, matching the recovery value
// used for unsupported non-scalar printf arguments, so that a null callee is
// never handed to the IR builder.
RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) {
  assert(getTarget().getTriple().isNVPTX() ||
         getTarget().getTriple().isAMDGCN());
  llvm::Function *Decl = GetOpenMPVprintfDeclaration(CGM);
  if (!Decl) {
    // The error has already been diagnosed; just recover gracefully.
    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
  }
  return EmitDevicePrintfCallExpr(E, this, Decl, /*WithSizeArg=*/true);
}
7 changes: 3 additions & 4 deletions clang/lib/CodeGen/CodeGenFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -4070,10 +4070,9 @@ class CodeGenFunction : public CodeGenTypeCache {
RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue);

RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
ReturnValueSlot ReturnValue);
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
ReturnValueSlot ReturnValue);
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E);

RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
const CallExpr *E, ReturnValueSlot ReturnValue);
Expand Down
12 changes: 2 additions & 10 deletions openmp/libomptarget/DeviceRTL/include/Debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,15 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
///}

/// Print
/// TODO: For now we have to use macros to guard the code because Clang lowers
/// `printf` to different function calls on NVPTX and AMDGCN platforms, and it
/// doesn't work for AMDGCN. After it can work on AMDGCN, we will remove the
/// macro.
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
/// {

#ifndef __AMDGCN__
extern "C" {
int printf(const char *format, ...);
}

#define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__);
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
#define PRINT(str) PRINTF("%s", str)
#else
#define PRINTF(fmt, ...)
#define PRINT(str)
#endif

///}

Expand Down
23 changes: 23 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,29 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
assertion);
__builtin_trap();
}

// Variant selected for the nvptx / nvptx64 architectures: omp_vprintf forwards
// the format string and argument buffer to vprintf. The trailing uint32_t
// (size) parameter is unnamed and not used by this variant.
#pragma omp begin declare variant match(                                       \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
// Declaration of the vprintf function that omp_vprintf calls below.
int32_t vprintf(const char *, void *);
namespace impl {
static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return vprintf(Format, Arguments);
}
} // namespace impl
#pragma omp end declare variant

// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
// The stub ignores all of its arguments and always returns -1, so nothing is
// printed when this variant is selected.
#pragma omp begin declare variant match(device = {arch(amdgcn)})
namespace impl {
static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
} // namespace impl
#pragma omp end declare variant

/// Entry point that printf calls in target regions are lowered to.
/// \param Format    the printf-style format string.
/// \param Arguments pointer to the packed argument buffer.
/// \param Size      size of the argument buffer.
/// \returns whatever the selected impl::omp_vprintf variant returns.
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
  const int32_t Result = impl::omp_vprintf(Format, Arguments, Size);
  return Result;
}
}

/// Current indentation level for the function trace. Only accessed by thread 0.
Expand Down
5 changes: 5 additions & 0 deletions openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@ __attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
}
__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}

// Stub implementation of __llvm_omp_vprintf for amdgcn: nothing is printed and
// -1 is always returned.
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
                                  uint32_t) {
  // The arguments are intentionally ignored by this stub.
  (void)Format;
  (void)Arguments;
  return -1;
}

EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
Expand Down
6 changes: 6 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,15 @@ EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
// Declarations with C linkage of the functions that the wrappers below
// forward to.
extern "C" {
void *malloc(size_t);
void free(void *);
// Takes a format string and a pointer to the packed argument buffer.
int32_t vprintf(const char *, void *);
}

// Allocates x bytes by forwarding to malloc.
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
// Releases x by forwarding to free.
EXTERN void __kmpc_impl_free(void *x) { free(x); }

// NVPTX implementation of __llvm_omp_vprintf: forwards the format string and
// the packed argument buffer to vprintf. The buffer size argument is unnamed
// and not used here.
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
                                  uint32_t) {
  const int32_t Result = vprintf(Format, Arguments);
  return Result;
}

#pragma omp end declare target
2 changes: 1 addition & 1 deletion openmp/libomptarget/test/mapping/data_member_ref.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// amdgcn does not have printf definition
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// amdgcn does not have printf definition
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// amdgcn does not have printf definition
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/test/mapping/lambda_by_value.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// amdgcn does not have printf definition
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/test/mapping/ompx_hold/struct.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %libomptarget-compile-generic -fopenmp-extensions
// RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace

// amdgcn does not have printf definition
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
4 changes: 0 additions & 4 deletions openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
// RUN: %libomptarget-compile-run-and-check-generic

// amdgcn does not have printf definition
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>

typedef struct {
Expand Down
6 changes: 3 additions & 3 deletions openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// amdgcn does not have printf definition
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>

Expand Down
5 changes: 2 additions & 3 deletions openmp/libomptarget/test/offloading/bug49021.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic

// Wrong results on amdgcn
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa

#include <iostream>

Expand Down
3 changes: 0 additions & 3 deletions openmp/libomptarget/test/offloading/bug50022.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
// RUN: %libomptarget-compilexx-and-run-generic

// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL

#include <cassert>
#include <iostream>
#include <stdexcept>
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/test/offloading/host_as_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

// RUN: %libomptarget-compile-run-and-check-generic

// amdgcn does not have printf definition
// amdgpu does not have a working printf definition
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/test/unified_shared_memory/api.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newRTL

// Fails on amdgcn with error: GPU Memory Error
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// REQUIRES: unified_shared_memory
// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9

// Fails on amdgcn with error: GPU Memory Error
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

Expand Down
Loading

0 comments on commit db81d8f

Please sign in to comment.