diff --git a/docs/development/ADRs/0015-Test_Exclusion_Matrices.md b/docs/development/ADRs/0015-Test_Exclusion_Matrices.md index 6c6a043560..b338169d61 100644 --- a/docs/development/ADRs/0015-Test_Exclusion_Matrices.md +++ b/docs/development/ADRs/0015-Test_Exclusion_Matrices.md @@ -47,10 +47,12 @@ by calling `next_tests.get_processor_id()`, which returns the so-called processo The following backend processors are defined: ```python -DACE = "dace_iterator.run_dace_iterator" +DACE_CPU = "dace_iterator.run_dace_cpu" +DACE_GPU = "dace_iterator.run_dace_gpu" GTFN_CPU = "otf_compile_executor.run_gtfn" GTFN_CPU_IMPERATIVE = "otf_compile_executor.run_gtfn_imperative" GTFN_CPU_WITH_TEMPORARIES = "otf_compile_executor.run_gtfn_with_temporaries" +GTFN_GPU = "gt4py.next.program_processors.runners.gtfn.run_gtfn_gpu" ``` Following the previous example, the GTFN backend with temporaries does not support yet dynamic offsets in ITIR: diff --git a/pyproject.toml b/pyproject.toml index 041448e17d..2cf4fb12e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -332,12 +332,14 @@ markers = [ 'uses_applied_shifts: tests that require backend support for applied-shifts', 'uses_constant_fields: tests that require backend support for constant fields', 'uses_dynamic_offsets: tests that require backend support for dynamic offsets', + 'uses_floordiv: tests that require backend support for floor division', 'uses_if_stmts: tests that require backend support for if-statements', 'uses_index_fields: tests that require backend support for index fields', 'uses_lift_expressions: tests that require backend support for lift expressions', 'uses_negative_modulo: tests that require backend support for modulo on negative numbers', 'uses_origin: tests that require backend support for domain origin', 'uses_reduction_over_lift_expressions: tests that require backend support for reduction over lift expressions', + 'uses_reduction_with_only_sparse_fields: tests that require backend support for reductions with only sparse 
fields', 'uses_scan_in_field_operator: tests that require backend support for scan in field operator', 'uses_sparse_fields: tests that require backend support for sparse fields', 'uses_strided_neighbor_offset: tests that require backend support for strided neighbor offset', diff --git a/src/gt4py/next/program_processors/runners/dace_iterator/__init__.py b/src/gt4py/next/program_processors/runners/dace_iterator/__init__.py index e3fba87571..40b6d24b0e 100644 --- a/src/gt4py/next/program_processors/runners/dace_iterator/__init__.py +++ b/src/gt4py/next/program_processors/runners/dace_iterator/__init__.py @@ -12,6 +12,7 @@ # # SPDX-License-Identifier: GPL-3.0-or-later import hashlib +import warnings from typing import Any, Mapping, Optional, Sequence import dace @@ -22,11 +23,11 @@ import gt4py.next.allocators as next_allocators import gt4py.next.iterator.ir as itir import gt4py.next.program_processors.otf_compile_executor as otf_exec +import gt4py.next.program_processors.processor_interface as ppi from gt4py.next.common import Dimension, Domain, UnitRange, is_field from gt4py.next.iterator.embedded import NeighborTableOffsetProvider, StridedNeighborOffsetProvider from gt4py.next.iterator.transforms import LiftMode, apply_common_transforms from gt4py.next.otf.compilation import cache -from gt4py.next.program_processors.processor_interface import program_executor from gt4py.next.type_system import type_specifications as ts, type_translation from .itir_to_sdfg import ItirToSDFG @@ -94,10 +95,26 @@ def get_args(params: Sequence[itir.Sym], args: Sequence[Any]) -> dict[str, Any]: return {name.id: convert_arg(arg) for name, arg in zip(params, args)} +def _ensure_is_on_device( + connectivity_arg: np.typing.NDArray, device: dace.dtypes.DeviceType +) -> np.typing.NDArray: + if device == dace.dtypes.DeviceType.GPU: + if not isinstance(connectivity_arg, cp.ndarray): + warnings.warn( + "Copying connectivity to device. For performance make sure connectivity is provided on device." 
+ ) + return cp.asarray(connectivity_arg) + return connectivity_arg + + def get_connectivity_args( - neighbor_tables: Sequence[tuple[str, NeighborTableOffsetProvider]] + neighbor_tables: Sequence[tuple[str, NeighborTableOffsetProvider]], + device: dace.dtypes.DeviceType, ) -> dict[str, Any]: - return {connectivity_identifier(offset): table.table for offset, table in neighbor_tables} + return { + connectivity_identifier(offset): _ensure_is_on_device(table.table, device) + for offset, table in neighbor_tables + } def get_shape_args( @@ -167,7 +184,6 @@ def get_cache_id( return m.hexdigest() -@program_executor def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None: # build parameters auto_optimize = kwargs.get("auto_optimize", False) @@ -182,6 +198,7 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None: offset_provider = kwargs["offset_provider"] arg_types = [type_translation.from_value(arg) for arg in args] + device = dace.DeviceType.GPU if run_on_gpu else dace.DeviceType.CPU neighbor_tables = filter_neighbor_tables(offset_provider) cache_id = get_cache_id(program, arg_types, column_axis, offset_provider) @@ -192,26 +209,16 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None: else: # visit ITIR and generate SDFG program = preprocess_program(program, offset_provider, lift_mode) - sdfg_genenerator = ItirToSDFG(arg_types, offset_provider, column_axis) + sdfg_genenerator = ItirToSDFG(arg_types, offset_provider, column_axis, run_on_gpu) sdfg = sdfg_genenerator.visit(program) sdfg.simplify() - # set array storage for GPU execution - if run_on_gpu: - device = dace.DeviceType.GPU - sdfg._name = f"{sdfg.name}_gpu" - for _, _, array in sdfg.arrays_recursive(): - if not array.transient: - array.storage = dace.dtypes.StorageType.GPU_Global - else: - device = dace.DeviceType.CPU - # run DaCe auto-optimization heuristics if auto_optimize: # TODO Investigate how symbol definitions improve autoopt 
transformations, # in which case the cache table should take the symbols map into account. symbols: dict[str, int] = {} - sdfg = autoopt.auto_optimize(sdfg, device, symbols=symbols) + sdfg = autoopt.auto_optimize(sdfg, device, symbols=symbols, use_gpu_storage=run_on_gpu) # compile SDFG and retrieve SDFG program sdfg.build_folder = cache._session_cache_dir_path / ".dacecache" @@ -226,7 +233,7 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None: dace_args = get_args(program.params, args) dace_field_args = {n: v for n, v in dace_args.items() if not np.isscalar(v)} - dace_conn_args = get_connectivity_args(neighbor_tables) + dace_conn_args = get_connectivity_args(neighbor_tables, device) dace_shapes = get_shape_args(sdfg.arrays, dace_field_args) dace_conn_shapes = get_shape_args(sdfg.arrays, dace_conn_args) dace_strides = get_stride_args(sdfg.arrays, dace_field_args) @@ -254,7 +261,6 @@ def run_dace_iterator(program: itir.FencilDefinition, *args, **kwargs) -> None: sdfg_program(**expected_args) -@program_executor def _run_dace_cpu(program: itir.FencilDefinition, *args, **kwargs) -> None: run_dace_iterator( program, @@ -267,13 +273,12 @@ def _run_dace_cpu(program: itir.FencilDefinition, *args, **kwargs) -> None: run_dace_cpu = otf_exec.OTFBackend( - executor=_run_dace_cpu, + executor=ppi.program_executor(_run_dace_cpu, name="run_dace_cpu"), allocator=next_allocators.StandardCPUFieldBufferAllocator(), ) if cp: - @program_executor def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None: run_dace_iterator( program, @@ -286,12 +291,11 @@ def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None: else: - @program_executor def _run_dace_gpu(program: itir.FencilDefinition, *args, **kwargs) -> None: raise RuntimeError("Missing `cupy` dependency for GPU execution.") run_dace_gpu = otf_exec.OTFBackend( - executor=_run_dace_gpu, + executor=ppi.program_executor(_run_dace_gpu, name="run_dace_gpu"), 
allocator=next_allocators.StandardGPUFieldBufferAllocator(), ) diff --git a/src/gt4py/next/program_processors/runners/dace_iterator/itir_to_sdfg.py b/src/gt4py/next/program_processors/runners/dace_iterator/itir_to_sdfg.py index 9e9cc4bf29..a7cecf5fad 100644 --- a/src/gt4py/next/program_processors/runners/dace_iterator/itir_to_sdfg.py +++ b/src/gt4py/next/program_processors/runners/dace_iterator/itir_to_sdfg.py @@ -96,17 +96,20 @@ class ItirToSDFG(eve.NodeVisitor): offset_provider: dict[str, Any] node_types: dict[int, next_typing.Type] unique_id: int + use_gpu_storage: bool def __init__( self, param_types: list[ts.TypeSpec], offset_provider: dict[str, NeighborTableOffsetProvider], column_axis: Optional[Dimension] = None, + use_gpu_storage: bool = False, ): self.param_types = param_types self.column_axis = column_axis self.offset_provider = offset_provider self.storage_types = {} + self.use_gpu_storage = use_gpu_storage def add_storage(self, sdfg: dace.SDFG, name: str, type_: ts.TypeSpec, has_offset: bool = True): if isinstance(type_, ts.FieldType): @@ -118,7 +121,14 @@ def add_storage(self, sdfg: dace.SDFG, name: str, type_: ts.TypeSpec, has_offset else None ) dtype = as_dace_type(type_.dtype) - sdfg.add_array(name, shape=shape, strides=strides, offset=offset, dtype=dtype) + storage = ( + dace.dtypes.StorageType.GPU_Global + if self.use_gpu_storage + else dace.dtypes.StorageType.Default + ) + sdfg.add_array( + name, shape=shape, strides=strides, offset=offset, dtype=dtype, storage=storage + ) elif isinstance(type_, ts.ScalarType): sdfg.add_symbol(name, as_dace_type(type_)) else: @@ -225,6 +235,7 @@ def visit_StencilClosure( shape=array_table[name].shape, strides=array_table[name].strides, dtype=array_table[name].dtype, + storage=array_table[name].storage, transient=True, ) closure_init_state.add_nedge( @@ -239,6 +250,7 @@ def visit_StencilClosure( shape=array_table[name].shape, strides=array_table[name].strides, dtype=array_table[name].dtype, + 
storage=array_table[name].storage, ) else: assert isinstance(self.storage_types[name], ts.ScalarType) diff --git a/tests/next_tests/exclusion_matrices.py b/tests/next_tests/exclusion_matrices.py index a8a508b2fb..a6a302e143 100644 --- a/tests/next_tests/exclusion_matrices.py +++ b/tests/next_tests/exclusion_matrices.py @@ -57,6 +57,7 @@ class ProgramBackendId(_PythonObjectIdMixin, str, enum.Enum): class OptionalProgramBackendId(_PythonObjectIdMixin, str, enum.Enum): DACE_CPU = "gt4py.next.program_processors.runners.dace_iterator.run_dace_cpu" + DACE_GPU = "gt4py.next.program_processors.runners.dace_iterator.run_dace_gpu" class ProgramExecutorId(_PythonObjectIdMixin, str, enum.Enum): @@ -83,9 +84,9 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum): # Test markers REQUIRES_ATLAS = "requires_atlas" USES_APPLIED_SHIFTS = "uses_applied_shifts" -USES_CAN_DEREF = "uses_can_deref" USES_CONSTANT_FIELDS = "uses_constant_fields" USES_DYNAMIC_OFFSETS = "uses_dynamic_offsets" +USES_FLOORDIV = "uses_floordiv" USES_IF_STMTS = "uses_if_stmts" USES_INDEX_FIELDS = "uses_index_fields" USES_LIFT_EXPRESSIONS = "uses_lift_expressions" @@ -111,7 +112,7 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum): "We cannot unroll a reduction on a sparse field only (not clear if it is legal ITIR)" ) # Common list of feature markers to skip -GTFN_SKIP_TEST_LIST = [ +COMMON_SKIP_TEST_LIST = [ (REQUIRES_ATLAS, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), (USES_APPLIED_SHIFTS, XFAIL, UNSUPPORTED_MESSAGE), (USES_IF_STMTS, XFAIL, UNSUPPORTED_MESSAGE), @@ -119,46 +120,45 @@ class ProgramFormatterId(_PythonObjectIdMixin, str, enum.Enum): (USES_REDUCTION_WITH_ONLY_SPARSE_FIELDS, XFAIL, REDUCTION_WITH_ONLY_SPARSE_FIELDS_MESSAGE), (USES_SCAN_IN_FIELD_OPERATOR, XFAIL, UNSUPPORTED_MESSAGE), ] +DACE_SKIP_TEST_LIST = COMMON_SKIP_TEST_LIST + [ + (USES_CONSTANT_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_INDEX_FIELDS, XFAIL, 
UNSUPPORTED_MESSAGE), + (USES_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_ORIGIN, XFAIL, UNSUPPORTED_MESSAGE), + (USES_REDUCTION_OVER_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_SPARSE_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_TUPLE_ARGS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_TUPLE_RETURNS, XFAIL, UNSUPPORTED_MESSAGE), + (USES_ZERO_DIMENSIONAL_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), +] EMBEDDED_SKIP_LIST = [ (USES_SCAN, XFAIL, UNSUPPORTED_MESSAGE), (USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE), (CHECKS_SPECIFIC_ERROR, XFAIL, UNSUPPORTED_MESSAGE), ] +GTFN_SKIP_TEST_LIST = COMMON_SKIP_TEST_LIST + [ + # floordiv not yet supported, see https://github.com/GridTools/gt4py/issues/1136 + (USES_FLOORDIV, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), + (USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), +] #: Skip matrix, contains for each backend processor a list of tuples with following fields: #: (<test_marker>, <skip_definition>, <skip_message>) BACKEND_SKIP_TEST_MATRIX = { None: EMBEDDED_SKIP_LIST, - OptionalProgramBackendId.DACE_CPU: GTFN_SKIP_TEST_LIST - + [ - (USES_CAN_DEREF, XFAIL, UNSUPPORTED_MESSAGE), - (USES_CONSTANT_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_INDEX_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_ORIGIN, XFAIL, UNSUPPORTED_MESSAGE), - (USES_REDUCTION_OVER_LIFT_EXPRESSIONS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_SPARSE_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_TUPLE_ARGS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_TUPLE_RETURNS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_ZERO_DIMENSIONAL_FIELDS, XFAIL, UNSUPPORTED_MESSAGE), - ], - ProgramBackendId.GTFN_CPU: GTFN_SKIP_TEST_LIST - + [ - (USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), - ], - ProgramBackendId.GTFN_GPU: GTFN_SKIP_TEST_LIST - + [ - (USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), - ], - ProgramBackendId.GTFN_CPU_IMPERATIVE: GTFN_SKIP_TEST_LIST + 
OptionalProgramBackendId.DACE_CPU: DACE_SKIP_TEST_LIST, + OptionalProgramBackendId.DACE_GPU: DACE_SKIP_TEST_LIST + [ - (USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), + # awaiting dace fix, see https://github.com/spcl/dace/pull/1442 + (USES_FLOORDIV, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), ], + ProgramBackendId.GTFN_CPU: GTFN_SKIP_TEST_LIST, + ProgramBackendId.GTFN_CPU_IMPERATIVE: GTFN_SKIP_TEST_LIST, + ProgramBackendId.GTFN_GPU: GTFN_SKIP_TEST_LIST, ProgramBackendId.GTFN_CPU_WITH_TEMPORARIES: GTFN_SKIP_TEST_LIST + [ (USES_DYNAMIC_OFFSETS, XFAIL, UNSUPPORTED_MESSAGE), - (USES_STRIDED_NEIGHBOR_OFFSET, XFAIL, BINDINGS_UNSUPPORTED_MESSAGE), ], ProgramFormatterId.GTFN_CPP_FORMATTER: [ (USES_REDUCTION_WITH_ONLY_SPARSE_FIELDS, XFAIL, REDUCTION_WITH_ONLY_SPARSE_FIELDS_MESSAGE), diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py index 1537c01642..f8a3f6a975 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/ffront_test_utils.py @@ -45,6 +45,9 @@ def no_backend(program: itir.FencilDefinition, *args: Any, **kwargs: Any) -> Non OPTIONAL_PROCESSORS = [] if dace_iterator: OPTIONAL_PROCESSORS.append(definitions.OptionalProgramBackendId.DACE_CPU) + OPTIONAL_PROCESSORS.append( + pytest.param(definitions.OptionalProgramBackendId.DACE_GPU, marks=pytest.mark.requires_gpu) + ), @pytest.fixture( diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_external_local_field.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_external_local_field.py index 05adc63a45..42938e2f4b 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_external_local_field.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_external_local_field.py @@ -30,6 
+30,16 @@ def test_external_local_field(unstructured_case): + # TODO(edopao): remove this try/except once the dace dependency is upgraded to a version > 0.15 + try: + from gt4py.next.program_processors.runners.dace_iterator import run_dace_gpu + + if unstructured_case.backend == run_dace_gpu: + # see https://github.com/spcl/dace/pull/1442 + pytest.xfail("requires fix in dace module for cuda codegen") + except ImportError: + pass + @gtx.field_operator def testee( inp: gtx.Field[[Vertex, V2EDim], int32], ones: gtx.Field[[Edge], int32] diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py index e2434d860a..bbbac6c139 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py +++ b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_gt4py_builtins.py @@ -120,6 +120,16 @@ def fencil(edge_f: cases.EField, out: cases.VField): @pytest.mark.uses_unstructured_shift def test_reduction_with_common_expression(unstructured_case): + # TODO(edopao): remove this try/except once the dace dependency is upgraded to a version > 0.15 + try: + from gt4py.next.program_processors.runners.dace_iterator import run_dace_gpu + + if unstructured_case.backend == run_dace_gpu: + # see https://github.com/spcl/dace/pull/1442 + pytest.xfail("requires fix in dace module for cuda codegen") + except ImportError: + pass + @gtx.field_operator def testee(flux: cases.EField) -> cases.VField: return neighbor_sum(flux(V2E) + flux(V2E), axis=V2EDim) diff --git a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py index 8660ecfdbd..c2ab43773f 100644 --- a/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py +++ 
b/tests/next_tests/integration_tests/feature_tests/ffront_tests/test_math_unary_builtins.py @@ -37,7 +37,6 @@ tanh, trunc, ) -from gt4py.next.program_processors.runners import gtfn from next_tests.integration_tests import cases from next_tests.integration_tests.cases import IDim, cartesian_case, unstructured_case @@ -67,17 +66,8 @@ def pow(inp1: cases.IField) -> cases.IField: cases.verify_with_default_data(cartesian_case, pow, ref=lambda inp1: inp1**2) +@pytest.mark.uses_floordiv def test_floordiv(cartesian_case): - if cartesian_case.backend in [ - gtfn.run_gtfn, - gtfn.run_gtfn_imperative, - gtfn.run_gtfn_with_temporaries, - gtfn.run_gtfn_gpu, - ]: - pytest.xfail( - "FloorDiv not yet supported." - ) # see https://github.com/GridTools/gt4py/issues/1136 - @gtx.field_operator def floorDiv(inp1: cases.IField) -> cases.IField: return inp1 // 2 diff --git a/tests/next_tests/unit_tests/conftest.py b/tests/next_tests/unit_tests/conftest.py index 372062d08a..6f91557e46 100644 --- a/tests/next_tests/unit_tests/conftest.py +++ b/tests/next_tests/unit_tests/conftest.py @@ -50,6 +50,12 @@ def lift_mode(request): OPTIONAL_PROCESSORS = [] if dace_iterator: OPTIONAL_PROCESSORS.append((definitions.OptionalProgramBackendId.DACE_CPU, True)) + # TODO(havogt): update tests to use proper allocation + # OPTIONAL_PROCESSORS.append( + # pytest.param( + # (definitions.OptionalProgramBackendId.DACE_GPU, True), marks=pytest.mark.requires_gpu + # ) + # ), @pytest.fixture(