Skip to content

Add CUDA wrapper capability. #714

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ jobs:

- name: Install FFCx (Linux, with optional dependencies)
if: runner.os == 'Linux'
run: pip install .[ci,optional]
run: |
pip install .[ci,optional]
- name: Install FFCx (macOS, Windows)
if: runner.os != 'Linux'
run: pip install .[ci]
Expand Down
110 changes: 110 additions & 0 deletions demo/nvrtc_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include "Components.h"
#include "FacetIntegrals.h"
#include "HyperElasticity.h"
#include "MathFunctions.h"
#include "StabilisedStokes.h"
#include "VectorPoisson.h"
#include "ufcx.h"
#include "nvrtc.h"
#include <iostream>
#include <stdexcept>
#include <sstream>
#include <string>
#include <vector>

void check_nvrtc_compilation(ufcx_form* form)
{
// extract kernel
ufcx_integral* integral = form->form_integrals[0];
ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc;
// call kernel to get CUDA-wrapped source code
int num_program_headers;
const char** program_headers;
const char** program_include_names;
const char* program_src;
const char* tabulate_tensor_function_name;
if (!kernel) {
throw std::runtime_error("NVRTC wrapper function is NULL!");
}
(*kernel)(
&num_program_headers, &program_headers,
&program_include_names, &program_src,
&tabulate_tensor_function_name);
// compile CUDA-wrapped source code with NVRTC
// with proper error checking

nvrtcResult nvrtc_err;
nvrtcProgram program;
nvrtc_err = nvrtcCreateProgram(
&program, program_src, tabulate_tensor_function_name,
num_program_headers, program_headers,
program_include_names);

if (nvrtc_err != NVRTC_SUCCESS) {
throw std::runtime_error(
"nvrtcCreateProgram() failed with " +
std::string(nvrtcGetErrorString(nvrtc_err)) + " "
"at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
}

int num_compile_options = 0;
const char** compile_options;
// Compile the CUDA C++ program
nvrtcResult nvrtc_compile_err = nvrtcCompileProgram(
program, num_compile_options, compile_options);
if (nvrtc_compile_err != NVRTC_SUCCESS) {
// If the compiler failed, obtain the compiler log
std::string program_log;
size_t log_size;
nvrtc_err = nvrtcGetProgramLogSize(program, &log_size);
if (nvrtc_err != NVRTC_SUCCESS) {
program_log = std::string(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Std::format?

"nvrtcGetProgramLogSize() failed with " +
std::string(nvrtcGetErrorString(nvrtc_err)) + " "
"at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
} else {
program_log.resize(log_size);
nvrtc_err = nvrtcGetProgramLog(
program, const_cast<char*>(program_log.c_str()));
if (nvrtc_err != NVRTC_SUCCESS) {
program_log = std::string(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Std::format?

"nvrtcGetProgramLog() failed with " +
std::string(nvrtcGetErrorString(nvrtc_err))) + " "
"at " + std::string(__FILE__) + ":" + std::to_string(__LINE__);
}
if (log_size > 0)
program_log.resize(log_size-1);
}
nvrtcDestroyProgram(&program);

std::stringstream ss;
ss << "nvrtcCompileProgram() failed with "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use std::format if you can switch to C++20, removes the need for using C++ terrible string formatting.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I tried switching this code to use std::format, however the C++ compiler in Github's CI environment appears to not support c++20 (tests worked locally, but broke during CI on Github).

<< nvrtcGetErrorString(nvrtc_compile_err) << "\n"
<< "CUDA C++ source code:\n"
<< std::string(60, '-') << "\n"
<< program_src
<< std::string(60, '-') << "\n"
<< "NVRTC compiler log:\n"
<< std::string(60, '-') << "\n"
<< program_log << "\n"
<< std::string(60, '-') << "\n";
throw std::runtime_error(ss.str());
}
}

int main()
{
std::vector<ufcx_form*> forms = {
form_Components_L,
form_FacetIntegrals_a,
form_HyperElasticity_a_F, form_HyperElasticity_a_J,
form_MathFunctions_a,
form_StabilisedStokes_a, form_StabilisedStokes_L,
form_VectorPoisson_a, form_VectorPoisson_L
};

for (ufcx_form* form : forms) check_nvrtc_compilation(form);

return 0;
}

47 changes: 47 additions & 0 deletions demo/test_demos.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,50 @@ def test_demo(file, scalar_type):
os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
== 0
)


@pytest.mark.parametrize("scalar_type", ["float64", "float32"])
def test_demo_nvrtc(scalar_type):
    """Test generated CUDA code with NVRTC.

    Generates the demo forms with ``--cuda_nvrtc``, compiles them with the C
    compiler, then builds and runs ``nvrtc_test.cpp`` against the NVRTC library
    shipped in the ``nvidia.cuda_nvrtc`` pip package.

    The test is skipped when that package is not installed and on Windows.
    """
    import importlib.util

    # find_spec raises ModuleNotFoundError when the parent package ("nvidia")
    # is missing, but returns None when only the submodule is missing; treat
    # both as "not installed" so the test skips rather than crashes below.
    try:
        spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
    except ModuleNotFoundError:
        spec = None
    if spec is None or not spec.submodule_search_locations:
        pytest.skip(reason="Must have NVRTC pip package installed to run test.")

    if sys.platform.startswith("win32"):
        pytest.skip(reason="NVRTC CUDA wrappers not currently supported for Windows.")

    files = [
        "Components",
        "FacetIntegrals",
        "HyperElasticity",
        "MathFunctions",
        "StabilisedStokes",
        "VectorPoisson",
    ]
    opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
    nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0])
    cc = os.environ.get("CC", "cc")
    extra_flags = (
        "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
    )
    # Generate and compile each demo form to an object file
    for file in files:
        assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
        assert (
            os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
            == 0
        )

    # Link the C++ driver against the object files and the packaged NVRTC library
    cxx = os.environ.get("CXX", "c++")
    assert (
        os.system(
            f"cd {demo_dir} && "
            f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
            f" -Werror -o nvrtc_test nvrtc_test.cpp "
            f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12"
        )
        == 0
    )
    # Run the driver with the packaged NVRTC library on the loader path
    assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0
13 changes: 12 additions & 1 deletion ffcx/codegeneration/C/integrals.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,27 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
else:
code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
if options.get("cuda_nvrtc"):
code["tabulate_tensor_cuda_nvrtc"] = (
f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
)
code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "')
else:
code["tabulate_tensor_cuda_nvrtc"] = ""
code["tabulate_tensor_quoted"] = ""

np_scalar_type = np.dtype(options["scalar_type"]).name
code[f"tabulate_tensor_{np_scalar_type}"] = (
f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
)

assert ir.expression.coordinate_element_hash is not None
implementation = ufcx_integrals.factory.format(
implementation = ufcx_integrals.get_factory(options).format(
factory_name=factory_name,
enabled_coefficients=code["enabled_coefficients"],
enabled_coefficients_init=code["enabled_coefficients_init"],
tabulate_tensor=code["tabulate_tensor"],
tabulate_tensor_quoted=code["tabulate_tensor_quoted"],
needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false",
scalar_type=dtype_to_c_type(options["scalar_type"]),
geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])),
Expand All @@ -88,6 +98,7 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
tabulate_tensor_float64=code["tabulate_tensor_float64"],
tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
domain=int(domain),
)

Expand Down
51 changes: 51 additions & 0 deletions ffcx/codegeneration/C/integrals_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,61 @@
{tabulate_tensor_float64}
{tabulate_tensor_complex64}
{tabulate_tensor_complex128}
{tabulate_tensor_cuda_nvrtc}
.needs_facet_permutations = {needs_facet_permutations},
.coordinate_element_hash = {coordinate_element_hash},
.domain = {domain},
}};

// End of code for integral {factory_name}
"""

# Template for the NVRTC wrapper function that get_factory() places ahead of
# the integral factory template when the "cuda_nvrtc" option is set.
#
# The combined template is filled in with str.format, so literal C braces are
# doubled ({{ and }}) and the placeholders used here are {factory_name},
# {scalar_type}, {geom_type} and {tabulate_tensor_quoted}. A "\\n" below ends
# up as the two characters \n inside the generated C string literal, i.e. a
# newline in the CUDA source that the wrapper returns.
#
# NOTE(review): ufc_scalar_t is typedef'd to double regardless of
# {scalar_type} -- confirm this is intended when generating float32 kernels.
cuda_wrapper = """

// Begin NVRTC CUDA wrapper for integral {factory_name}
// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate
// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** out_program_src,
const char** tabulate_tensor_function_name)
{{
// The below typedefs are needed due to issues with including stdint.h in NVRTC source code
const char* program_src = ""
"#define alignas(x)\\n"
"#define restrict __restrict__\\n"
"\\n"
"typedef unsigned char uint8_t;\\n"
"typedef unsigned int uint32_t;\\n"
"typedef double ufc_scalar_t;\\n"
"\\n"
"extern \\"C\\" __global__\\n"
"void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
" const {scalar_type}* restrict w,\\n"
" const {scalar_type}* restrict c,\\n"
" const {geom_type}* restrict coordinate_dofs,\\n"
" const int* restrict entity_local_index,\\n"
" const uint8_t* restrict quadrature_permutation\\n"
" )\\n"
"{{\\n"
"{tabulate_tensor_quoted}\\n"
"}}";
*num_program_headers = 0;
*program_headers = NULL;
*program_include_names = NULL;
*out_program_src = program_src;
*tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
}}

// End NVRTC CUDA wrapper for integral {factory_name}

"""


def get_factory(options):
    """Pick the template used to generate a form integral.

    The plain ``factory`` template is returned unless the ``cuda_nvrtc``
    option is truthy, in which case the NVRTC wrapper template is placed
    in front of it.
    """
    template = factory
    if options.get("cuda_nvrtc"):
        template = cuda_wrapper + template
    return template
3 changes: 3 additions & 0 deletions ffcx/codegeneration/jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
)
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
)

UFC_INTEGRAL_DECL += "\n".join(
re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)
Expand Down
23 changes: 23 additions & 0 deletions ffcx/codegeneration/ufcx.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ extern "C"
const uint8_t* restrict quadrature_permutation, void* custom_data);
#endif // __STDC_NO_COMPLEX__

/// Return CUDA C++ source code for the ufc_tabulate_tensor kernel
/// The resulting source code is passed to NVRTC for runtime compilation
///
/// The function returns nothing; every result is written through the
/// output pointers below.
///
/// @param[out] num_program_headers
/// The number of headers required by the program
/// @param[out] program_headers
/// Entire contents of each header file
/// @param[out] program_include_names
/// Names of each header file
/// @param[out] program_src
/// CUDA C++ source code for the program containing the
/// tabulate_tensor function.
/// @param[out] tabulate_tensor_function_name
/// The name of the device-side function.
///
typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** program_src,
const char** tabulate_tensor_function_name);

typedef struct ufcx_integral
{
const bool* enabled_coefficients;
Expand All @@ -138,6 +160,7 @@ extern "C"
ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
#endif // __STDC_NO_COMPLEX__
ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
bool needs_facet_permutations;

/// Hash of the coordinate element associated with the geometry of the mesh.
Expand Down
6 changes: 6 additions & 0 deletions ffcx/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
logger = logging.getLogger("ffcx")

FFCX_DEFAULT_OPTIONS = {
"cuda_nvrtc": (
bool,
False,
"generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC",
None,
),
"epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
"scalar_type": (
str,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
[project.optional-dependencies]
lint = ["ruff"]
docs = ["sphinx", "sphinx_rtd_theme"]
optional = ["numba", "pygraphviz==1.9"]
optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
test = ["pytest >= 6.0", "sympy", "numba"]
ci = [
"coveralls",
Expand Down