Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat[next]: DaCe backend - enable GPU test execution #1360

Merged
merged 14 commits into from
Nov 23, 2023
Merged
4 changes: 3 additions & 1 deletion docs/development/ADRs/0015-Test_Exclusion_Matrices.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,12 @@ by calling `next_tests.get_processor_id()`, which returns the so-called processo
The following backend processors are defined:

```python
DACE = "dace_iterator.run_dace_iterator"
DACE_CPU = "dace_iterator.run_dace_cpu"
DACE_GPU = "dace_iterator.run_dace_gpu"
GTFN_CPU = "otf_compile_executor.run_gtfn"
GTFN_CPU_IMPERATIVE = "otf_compile_executor.run_gtfn_imperative"
GTFN_CPU_WITH_TEMPORARIES = "otf_compile_executor.run_gtfn_with_temporaries"
GTFN_GPU = "gt4py.next.program_processors.runners.gtfn.run_gtfn_gpu"
```

Following the previous example, the GTFN backend with temporaries does not yet support dynamic offsets in ITIR:
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -332,12 +332,14 @@ markers = [
'uses_applied_shifts: tests that require backend support for applied-shifts',
'uses_constant_fields: tests that require backend support for constant fields',
'uses_dynamic_offsets: tests that require backend support for dynamic offsets',
'uses_floordiv: tests that require backend support for floor division',
'uses_if_stmts: tests that require backend support for if-statements',
'uses_index_fields: tests that require backend support for index fields',
'uses_lift_expressions: tests that require backend support for lift expressions',
'uses_negative_modulo: tests that require backend support for modulo on negative numbers',
'uses_origin: tests that require backend support for domain origin',
'uses_reduction_over_lift_expressions: tests that require backend support for reduction over lift expressions',
'uses_reduction_with_only_sparse_fields: tests that require backend support for reductions with only sparse fields',
'uses_scan_in_field_operator: tests that require backend support for scan in field operator',
'uses_sparse_fields: tests that require backend support for sparse fields',
'uses_strided_neighbor_offset: tests that require backend support for strided neighbor offset',
Expand Down
48 changes: 26 additions & 22 deletions src/gt4py/next/program_processors/runners/dace_iterator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
import hashlib
import warnings
from typing import Any, Mapping, Optional, Sequence

import dace
Expand All @@ -22,11 +23,11 @@
import gt4py.next.allocators as next_allocators
import gt4py.next.iterator.ir as itir
import gt4py.next.program_processors.otf_compile_executor as otf_exec
import gt4py.next.program_processors.processor_interface as ppi
from gt4py.next.common import Dimension, Domain, UnitRange, is_field
from gt4py.next.iterator.embedded import NeighborTableOffsetProvider, StridedNeighborOffsetProvider
from gt4py.next.iterator.transforms import LiftMode, apply_common_transforms
from gt4py.next.otf.compilation import cache
from gt4py.next.program_processors.processor_interface import program_executor
from gt4py.next.type_system import type_specifications as ts, type_translation

from .itir_to_sdfg import ItirToSDFG
Expand Down Expand Up @@ -94,10 +95,26 @@ def get_args(params: Sequence[itir.Sym], args: Sequence[Any]) -> dict[str, Any]:
return {name.id: convert_arg(arg) for name, arg in zip(params, args)}


def _ensure_is_on_device(
    connectivity_arg: np.typing.NDArray, device: dace.dtypes.DeviceType
) -> np.typing.NDArray:
    """Return a connectivity table that matches the requested ``device``.

    For a GPU ``device``, a table that is not already a ``cp.ndarray`` is
    converted with ``cp.asarray`` and a warning is emitted. In every other
    case the argument is handed back untouched.

    Args:
        connectivity_arg: The connectivity (neighbor) table to check.
        device: The DaCe device type the program is going to run on.

    Returns:
        ``connectivity_arg`` itself, or its ``cp.asarray`` conversion.
    """
    # NOTE(review): `cp` is presumably the optionally imported `cupy` module
    # and may be falsy when it is missing -- confirm. The `and` keeps the
    # `cp.ndarray` lookup from being evaluated for non-GPU devices.
    must_copy = device == dace.dtypes.DeviceType.GPU and not isinstance(
        connectivity_arg, cp.ndarray
    )
    if not must_copy:
        return connectivity_arg

    warnings.warn(
        "Copying connectivity to device. For performance make sure connectivity is provided on device."
    )
    return cp.asarray(connectivity_arg)


def get_connectivity_args(
neighbor_tables: Sequence[tuple[str, NeighborTableOffsetProvider]]
neighbor_tables: Sequence[tuple[str, NeighborTableOffsetProvider]],
device: dace.dtypes.DeviceType,
) -> dict[str, Any]:
return {connectivity_identifier(offset): table.table for offset, table in neighbor_tables}
return {
connectivity_identifier(offset): _ensure_is_on_device(table.table, device)
for offset, table in neighbor_tables
}


def get_shape_args(
Expand Down Expand Up @@ -167,7 +184,6 @@ def get_cache_id(
return m.hexdigest()


@program_executor
def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None:
# build parameters
auto_optimize = kwargs.get("auto_optimize", False)
Expand All @@ -182,6 +198,7 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None:
offset_provider = kwargs["offset_provider"]

arg_types = [type_translation.from_value(arg) for arg in args]
device = dace.DeviceType.GPU if run_on_gpu else dace.DeviceType.CPU
neighbor_tables = filter_neighbor_tables(offset_provider)

cache_id = get_cache_id(program, arg_types, column_axis, offset_provider)
Expand All @@ -192,26 +209,16 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None:
else:
# visit ITIR and generate SDFG
program = preprocess_program(program, offset_provider, lift_mode)
sdfg_genenerator = ItirToSDFG(arg_types, offset_provider, column_axis)
sdfg_genenerator = ItirToSDFG(arg_types, offset_provider, column_axis, run_on_gpu)
sdfg = sdfg_genenerator.visit(program)
sdfg.simplify()

# set array storage for GPU execution
if run_on_gpu:
device = dace.DeviceType.GPU
sdfg._name = f"{sdfg.name}_gpu"
for _, _, array in sdfg.arrays_recursive():
if not array.transient:
array.storage = dace.dtypes.StorageType.GPU_Global
else:
device = dace.DeviceType.CPU

# run DaCe auto-optimization heuristics
if auto_optimize:
# TODO Investigate how symbol definitions improve autoopt transformations,
# in which case the cache table should take the symbols map into account.
symbols: dict[str, int] = {}
sdfg = autoopt.auto_optimize(sdfg, device, symbols=symbols)
sdfg = autoopt.auto_optimize(sdfg, device, symbols=symbols, use_gpu_storage=run_on_gpu)

# compile SDFG and retrieve SDFG program
sdfg.build_folder = cache._session_cache_dir_path / ".dacecache"
Expand All @@ -226,7 +233,7 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None:

dace_args = get_args(program.params, args)
dace_field_args = {n: v for n, v in dace_args.items() if not np.isscalar(v)}
dace_conn_args = get_connectivity_args(neighbor_tables)
dace_conn_args = get_connectivity_args(neighbor_tables, device)
dace_shapes = get_shape_args(sdfg.arrays, dace_field_args)
dace_conn_shapes = get_shape_args(sdfg.arrays, dace_conn_args)
dace_strides = get_stride_args(sdfg.arrays, dace_field_args)
Expand Down Expand Up @@ -254,7 +261,6 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None:
sdfg_program(**expected_args)


@program_executor
def _run_dace_cpu(program: itir.FencilDefinition, *args, **kwargs) -> None:
run_dace_iterator(
program,
Expand All @@ -267,13 +273,12 @@ def _run_dace_cpu(program: itir.FencilDefinition, *args, **kwargs) -> None:


run_dace_cpu = otf_exec.OTFBackend(
executor=_run_dace_cpu,
executor=ppi.program_executor(_run_dace_cpu, name="run_dace_cpu"),
allocator=next_allocators.StandardCPUFieldBufferAllocator(),
)

if cp:

@program_executor
def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None:
run_dace_iterator(
program,
Expand All @@ -286,12 +291,11 @@ def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None:

else:

@program_executor
def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None:
raise RuntimeError("Missing `cupy` dependency for GPU execution.")


run_dace_gpu = otf_exec.OTFBackend(
executor=_run_dace_gpu,
executor=ppi.program_executor(_run_dace_gpu, name="run_dace_gpu"),
allocator=next_allocators.StandardGPUFieldBufferAllocator(),
)
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,20 @@ class ItirToSDFG(eve.NodeVisitor):
offset_provider: dict[str, Any]
node_types: dict[int, next_typing.Type]
unique_id: int
use_gpu_storage: bool

def __init__(
self,
param_types: list[ts.TypeSpec],
offset_provider: dict[str, NeighborTableOffsetProvider],
column_axis: Optional[Dimension] = None,
use_gpu_storage: bool = False,
):
self.param_types = param_types
self.column_axis = column_axis
self.offset_provider = offset_provider
self.storage_types = {}
self.use_gpu_storage = use_gpu_storage

def add_storage(self, sdfg: dace.SDFG, name: str, type_: ts.TypeSpec, has_offset: bool = True):
if isinstance(type_, ts.FieldType):
Expand All @@ -118,7 +121,14 @@ def add_storage(self, sdfg: dace.SDFG, name: str, type_: ts.TypeSpec, has_offset
else None
)
dtype = as_dace_type(type_.dtype)
sdfg.add_array(name, shape=shape, strides=strides, offset=offset, dtype=dtype)
storage = (
dace.dtypes.StorageType.GPU_Global
if self.use_gpu_storage
else dace.dtypes.StorageType.Default
)
sdfg.add_array(
name, shape=shape, strides=strides, offset=offset, dtype=dtype, storage=storage
)
elif isinstance(type_, ts.ScalarType):
sdfg.add_symbol(name, as_dace_type(type_))
else:
Expand Down Expand Up @@ -225,6 +235,7 @@ def visit_StencilClosure(
shape=array_table[name].shape,
strides=array_table[name].strides,
dtype=array_table[name].dtype,
storage=array_table[name].storage,
transient=True,
)
closure_init_state.add_nedge(
Expand All @@ -239,6 +250,7 @@ def visit_StencilClosure(
shape=array_table[name].shape,
strides=array_table[name].strides,
dtype=array_table[name].dtype,
storage=array_table[name].storage,
)
else:
assert isinstance(self.storage_types[name], ts.ScalarType)
Expand Down
54 changes: 27 additions & 27 deletions tests/next_tests/exclusion_matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class ProgramBackendId(_PythonObjectIdMixin, str, enum.Enum):

class OptionalProgramBackendId(_PythonObjectIdMixin, str, enum.Enum):
DACE_CPU = "gt4py.next.program_processors.runners.dace_iterator.run_dace_cpu"
DACE_GPU = "gt4py.next.program_processors.runners.dace_iterator.run_dace_gpu"


class ProgramExecutorId(_PythonObjectIdMixin, str, enum.Enum):
Expand All @@ -83,9 +84,9 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum):
# Test markers
REQUIRES_ATLAS = "requires_atlas"
USES_APPLIED_SHIFTS = "uses_applied_shifts"
USES_CAN_DEREF = "uses_can_deref"
USES_CONSTANT_FIELDS = "uses_constant_fields"
USES_DYNAMIC_OFFSETS = "uses_dynamic_offsets"
USES_FLOORDIV = "uses_floordiv"
USES_IF_STMTS = "uses_if_stmts"
USES_INDEX_FIELDS = "uses_index_fields"
USES_LIFT_EXPRESSIONS = "uses_lift_expressions"
Expand All @@ -111,54 +112,53 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum):
"We cannot unroll a reduction on a sparse field only (not clear if it is legal ITIR)"
)
# Common list of feature markers to skip
GTFN_SKIP_TEST_LIST = [
COMMON_SKIP_TEST_LIST = [
(REQUIRES_ATLAS, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
(USES_APPLIED_SHIFTS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_IF_STMTS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_NEGATIVE_MODULO, XFAIL, UNSUPPORTED_MESSAGE),
(USES_REDUCTION_WITH_ONLY_SPARSE_FIELDS, XFAIL, REDUCTION_WITH_ONLY_SPARSE_FIELDS_MESSAGE),
(USES_SCAN_IN_FIELD_OPERATOR, XFAIL, UNSUPPORTED_MESSAGE),
]
DACE_SKIP_TEST_LIST = COMMON_SKIP_TEST_LIST + [
(USES_CONSTANT_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_INDEX_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_ORIGIN, XFAIL, UNSUPPORTED_MESSAGE),
(USES_REDUCTION_OVER_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_SPARSE_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_TUPLE_ARGS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_TUPLE_RETURNS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_ZERO_DIMENSIONAL_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
]
EMBEDDED_SKIP_LIST = [
(USES_SCAN, XFAIL, UNSUPPORTED_MESSAGE),
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
(CHECKS_SPECIFIC_ERROR, XFAIL, UNSUPPORTED_MESSAGE),
]
GTFN_SKIP_TEST_LIST = COMMON_SKIP_TEST_LIST + [
# floordiv not yet supported, see https://github.com/GridTools/gt4py/issues/1136
(USES_FLOORDIV, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
(USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
]

#: Skip matrix, contains for each backend processor a list of tuples with the following fields:
#: (<test_marker>, <skip_definition>, <skip_message>)
BACKEND_SKIP_TEST_MATRIX = {
None: EMBEDDED_SKIP_LIST,
OptionalProgramBackendId.DACE_CPU: GTFN_SKIP_TEST_LIST
+ [
(USES_CAN_DEREF, XFAIL, UNSUPPORTED_MESSAGE),
(USES_CONSTANT_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_INDEX_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_ORIGIN, XFAIL, UNSUPPORTED_MESSAGE),
(USES_REDUCTION_OVER_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_SPARSE_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_TUPLE_ARGS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_TUPLE_RETURNS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_ZERO_DIMENSIONAL_FIELDS, XFAIL, UNSUPPORTED_MESSAGE),
],
ProgramBackendId.GTFN_CPU: GTFN_SKIP_TEST_LIST
+ [
(USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
],
ProgramBackendId.GTFN_GPU: GTFN_SKIP_TEST_LIST
+ [
(USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
],
ProgramBackendId.GTFN_CPU_IMPERATIVE: GTFN_SKIP_TEST_LIST
OptionalProgramBackendId.DACE_CPU: DACE_SKIP_TEST_LIST,
OptionalProgramBackendId.DACE_GPU: DACE_SKIP_TEST_LIST
+ [
(USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
# awaiting dace fix, see https://github.com/spcl/dace/pull/1442
(USES_FLOORDIV, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
],
ProgramBackendId.GTFN_CPU: GTFN_SKIP_TEST_LIST,
ProgramBackendId.GTFN_CPU_IMPERATIVE: GTFN_SKIP_TEST_LIST,
ProgramBackendId.GTFN_GPU: GTFN_SKIP_TEST_LIST,
ProgramBackendId.GTFN_CPU_WITH_TEMPORARIES: GTFN_SKIP_TEST_LIST
+ [
(USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE),
(USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE),
],
ProgramFormatterId.GTFN_CPP_FORMATTER: [
(USES_REDUCTION_WITH_ONLY_SPARSE_FIELDS, XFAIL, REDUCTION_WITH_ONLY_SPARSE_FIELDS_MESSAGE),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def no_backend(program: itir.FencilDefinition, *args: Any, **kwargs: Any) -> Non
# Optional processors are registered only when `dace_iterator` is truthy
# (presumably it is left empty/None when the optional DaCe import fails --
# confirm against the import guard earlier in this module).
OPTIONAL_PROCESSORS = []
if dace_iterator:
    OPTIONAL_PROCESSORS.append(definitions.OptionalProgramBackendId.DACE_CPU)
    # The GPU backend is wrapped in `pytest.param` so that it carries the
    # `requires_gpu` mark. The stray trailing comma after this call was
    # removed: it turned the statement into a discarded one-element tuple.
    OPTIONAL_PROCESSORS.append(
        pytest.param(definitions.OptionalProgramBackendId.DACE_GPU, marks=pytest.mark.requires_gpu)
    )


@pytest.fixture(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@


def test_external_local_field(unstructured_case):
# TODO(edopao): remove try/except after uplift of dace module to version > 0.15
try:
from gt4py.next.program_processors.runners.dace_iterator import run_dace_gpu

if unstructured_case.backend == run_dace_gpu:
# see https://github.com/spcl/dace/pull/1442
pytest.xfail("requires fix in dace module for cuda codegen")
except ImportError:
pass

@gtx.field_operator
def testee(
inp: gtx.Field[[Vertex, V2EDim], int32], ones: gtx.Field[[Edge], int32]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ def fencil(edge_f: cases.EField, out: cases.VField):

@pytest.mark.uses_unstructured_shift
def test_reduction_with_common_expression(unstructured_case):
# TODO(edopao): remove try/except after uplift of dace module to version > 0.15
try:
from gt4py.next.program_processors.runners.dace_iterator import run_dace_gpu

if unstructured_case.backend == run_dace_gpu:
# see https://github.com/spcl/dace/pull/1442
pytest.xfail("requires fix in dace module for cuda codegen")
except ImportError:
pass

@gtx.field_operator
def testee(flux: cases.EField) -> cases.VField:
return neighbor_sum(flux(V2E) + flux(V2E), axis=V2EDim)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
tanh,
trunc,
)
from gt4py.next.program_processors.runners import gtfn

from next_tests.integration_tests import cases
from next_tests.integration_tests.cases import IDim, cartesian_case, unstructured_case
Expand Down Expand Up @@ -67,17 +66,8 @@ def pow(inp1: cases.IField) -> cases.IField:
cases.verify_with_default_data(cartesian_case, pow, ref=lambda inp1: inp1**2)


@pytest.mark.uses_floordiv
def test_floordiv(cartesian_case):
if cartesian_case.backend in [
gtfn.run_gtfn,
gtfn.run_gtfn_imperative,
gtfn.run_gtfn_with_temporaries,
gtfn.run_gtfn_gpu,
]:
pytest.xfail(
"FloorDiv not yet supported."
) # see https://github.com/GridTools/gt4py/issues/1136

@gtx.field_operator
def floorDiv(inp1: cases.IField) -> cases.IField:
return inp1 // 2
Expand Down
Loading