spcl · philip-paul-mueller · Mar 14, 2025 · Mar 14, 2025 · Mar 14, 2025 · Mar 14, 2025
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
@@ -161,35 +161,40 @@ def preprocess(self, sdfg: SDFG) -> None:
                 nsdfg = state.parent
                 if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global
                         and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global):
+
+                    # NOTE: If possible `memlet_copy_to_absolute_strides()` will collapse an
+                    #   ND copy into a 1D copy if the memory is contiguous. In that case
+                    #   `copy_shape` will only have one element.
                     copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides(
                         None, nsdfg, state, e, e.src, e.dst)
                     dims = len(copy_shape)
 
                     # Skip supported copy types
                     if dims == 1:
+                        # NOTE: We do not check if the stride is `1`. See `_emit_copy()` for more.
                         continue
                     elif dims == 2:
-                        if src_strides[-1] != 1 or dst_strides[-1] != 1:
-                            # NOTE: Special case of continuous copy
-                            # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                            # with copy shape [I, J] and strides [J*K, K], [J, 1]
-                            try:
-                                is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                                is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
-                            except (TypeError, ValueError):
-                                is_src_cont = False
-                                is_dst_cont = False
-                            if is_src_cont and is_dst_cont:
-                                continue
-                        else:
+                        # Because `memlet_copy_to_absolute_strides()` handles contiguous copies
+                        #  transparently, we only have to check if we have FORTRAN or C order.
+                        #  If we do not have them, then we have to turn this into a Map.
+                        is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+                        is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1
+                        if is_c_order or is_fortran_order:
                             continue
                     elif dims > 2:
-                        if not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+                        # Any higher dimensional copies must be C order. If not turn it
+                        #  into a copy map.
+                        if src_strides[-1] == 1 and dst_strides[-1] == 1:
                             continue
 
                     # Turn unsupported copy to a map
                     try:
-                        CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst)
+                        CopyToMap.apply_to(nsdfg,
+                                           save=False,
+                                           annotate=False,
+                                           a=e.src,
+                                           b=e.dst,
+                                           options={"ignore_strides": True})
                     except ValueError:  # If transformation doesn't match, continue normally
                         continue
 
@@ -973,32 +978,51 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
             copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                 self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types))
             dims = len(copy_shape)
-
             dtype = dst_node.desc(sdfg).dtype
 
-            # Handle unsupported copy types
-            if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1):
-                # NOTE: Special case of continuous copy
-                # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
-                # with copy shape [I, J] and strides [J*K, K], [J, 1]
+            # In 1D there is no difference between FORTRAN and C order, thus we will set them
+            #  to the same value. The value indicates if the stride is `1`.
+            # TODO: Figure out if this is enough for views.
+            is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+            is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1
+
+            # Test if it is possible to transform a 2D copy into a 1D copy, this is possible if
+            #  the allocation happens to be contiguous.
+            # NOTE: It seems that the `memlet_copy_to_absolute_strides()` function already does
+            #   this. The code below is kept in case it is still needed, but somebody who knows
+            #   the code generator should look at it. There are even tests for that, see
+            #   `cuda_memcopy_test.py::test_gpu_pseudo_1d_copy_f_order`, but they most likely
+            #   do not test the code below but the `memlet_copy_to_absolute_strides()` function.
+            # TODO: Figure out if this can be removed.
+            if dims == 2 and (is_fortran_order or is_c_order):
                 try:
 def memlet_copy_to_absolute_strides(self, 
 def memlet_copy_to_absolute_strides(self, 
-                    is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
-                    is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
+                    if is_c_order:
+                        is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1]
+                        is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1]
+                    elif is_fortran_order:
+                        is_src_cont = src_strides[1] / src_strides[0] == copy_shape[0]
+                        is_dst_cont = dst_strides[1] / dst_strides[0] == copy_shape[0]
+                    else:
+                        is_src_cont = False
+                        is_dst_cont = False
                 except (TypeError, ValueError):
                     is_src_cont = False
                     is_dst_cont = False
                 if is_src_cont and is_dst_cont:
-                    dims = 1
                     copy_shape = [copy_shape[0] * copy_shape[1]]
-                    src_strides = [src_strides[1]]
-                    dst_strides = [dst_strides[1]]
-                else:
-                    raise NotImplementedError('2D copy only supported with one stride')
+                    src_strides = [src_strides[1 if is_c_order else 0]]
+                    dst_strides = [dst_strides[1 if is_c_order else 0]]
+                    is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1
+                    is_c_order = is_fortran_order
+                    dims = 1
 
-            # Currently we only support ND copies when they can be represented
-            # as a 1D copy or as a 2D strided copy
             if dims > 2:
-                if src_strides[-1] != 1 or dst_strides[-1] != 1:
+                # Currently we only support ND copies when they can be represented
+                #  as a 1D copy or as a 2D strided copy
+                # NOTE: Not sure if this test is enough, it should also be tested that
+                #   they are ordered, i.e. largest stride on the left.
+                if not is_c_order:
+                    # TODO: Implement the FORTRAN case.
                     raise NotImplementedError(
                         'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n'
                         f'  Nodes: src {src_node} ({src_storage}), dst {dst_node}({dst_storage})\n'
@@ -1026,7 +1050,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                     for d in range(dims - 2):
                         callsite_stream.write("}")
 
-            if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1):
+            elif dims == 1 and is_c_order:
+                # A 1D copy, in which the stride is 1, known at code generation time.
                 copysize = ' * '.join(_topy(copy_shape))
                 array_length = copysize
                 copysize += ' * sizeof(%s)' % dtype.ctype
@@ -1064,22 +1089,70 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                                                                                     backend=self.backend), cfg,
                                 state_id, [src_node, dst_node])
                     callsite_stream.write('}')
-            elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)):
+
+            elif dims == 1 and not is_c_order:
+                # This is the case that is generated for expressions such as `A[::3]`; we
+                #  reduce it to a 2D copy.
                 callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
-            elif dims == 2:
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+
+            elif dims == 2 and is_c_order:
+                # Copying 2D arrays that are in C order, i.e. last stride is 1.
+                callsite_stream.write(
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[0]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            elif dims == 2 and is_fortran_order:
+                # Copying a 2D array into a 2D array that is in FORTRAN order, i.e. first stride
+                #  is one. The CUDA API cannot handle such cases directly, however, by "transposing"
+                #  it is possible to use `Memcpy2DAsync`.
                 callsite_stream.write(
-                    'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' %
-                    (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype,
-                     src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype,
-                     sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp(
-                         copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id,
-                    [src_node, dst_node])
+                    'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n'
+                    .format(
+                        backend=self.backend,
+                        dst=dst_expr,
+                        dst_stride=f'({_topy(dst_strides[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        src=src_expr,
+                        src_stride=f'({sym2cpp(src_strides[1])}) * sizeof({src_node.desc(sdfg).dtype.ctype})',
+                        width=f'({sym2cpp(copy_shape[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})',
+                        height=sym2cpp(copy_shape[1]),
+                        kind=f'{self.backend}Memcpy{src_location}To{dst_location}',
+                        stream=cudastream,
+                    ),
+                    cfg,
+                    state_id,
+                    [src_node, dst_node],
+                )
+            else:
+                raise NotImplementedError("The requested copy operation is not implemented.")
 
             # Post-copy synchronization
             if is_sync:
@@ -1126,7 +1199,6 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St
                 # Obtain copy information
                 copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides(
                     self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types))
-
                 dims = len(copy_shape)
 
                 funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims)

diff --git a/dace/data.py b/dace/data.py
@@ -210,6 +210,8 @@ def _validate(self):
         if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.shape):
             raise TypeError('Shape must be a list or tuple of integer values '
                             'or symbols')
+        if any((shp < 0) == True for shp in self.shape):
+            raise TypeError(f'Found negative shape in Data, its shape was {self.shape}')
         return True
 
     def to_json(self):
@@ -1471,12 +1473,20 @@ def validate(self):
         super(Array, self).validate()
         if len(self.strides) != len(self.shape):
             raise TypeError('Strides must be the same size as shape')
+        if len(self.offset) != len(self.shape):
+            raise TypeError('Offset must be the same size as shape')
 
         if any(not isinstance(s, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic)) for s in self.strides):
             raise TypeError('Strides must be a list or tuple of integer values or symbols')
-
-        if len(self.offset) != len(self.shape):
-            raise TypeError('Offset must be the same size as shape')
+        if any(not isinstance(off, (int, symbolic.SymExpr, symbolic.symbol, symbolic.sympy.Basic))
+               for off in self.offset):
+            raise TypeError('Offset must be a list or tuple of integer values or symbols')
+
+        # Actually, it would be enough to enforce non-negativity only if the shape is larger than one.
+        if any((stride < 0) == True for stride in self.strides):
+            raise TypeError(f'Found negative strides in array, they were {self.strides}')
+        if (self.total_size < 0) == True:
+            raise TypeError(f'The total size of an array must be positive but it was negative {self.total_size}')
 
     def covers_range(self, rng):
         if len(rng) != len(self.shape):

diff --git a/dace/memlet.py b/dace/memlet.py
@@ -534,6 +534,9 @@ def dst_subset(self, new_dst_subset):
     def validate(self, sdfg, state):
         if self.data is not None and self.data not in sdfg.arrays:
             raise KeyError('Array "%s" not found in SDFG' % self.data)
+        # NOTE: We do not check here if the subsets have a negative size, because such a subset
+        #  is valid in certain cases, for example if an AccessNode is connected to a MapEntry,
+        #  because the Map is not executed. Thus we do the check in the `validate_state()` function.
 
     def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]:
         """

diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py
@@ -9,7 +9,7 @@
 
 import networkx as nx
 
-from dace import dtypes, subsets, symbolic
+from dace import dtypes, subsets, symbolic, data
 from dace.dtypes import DebugInfo
 
 if TYPE_CHECKING:
@@ -656,7 +656,6 @@ def validate_state(state: 'dace.sdfg.SDFGState',
                 )
         ########################################
 
-    # Memlet checks
     for eid, e in enumerate(state.edges()):
         # Reference check
         if id(e) in references:
@@ -680,6 +679,27 @@ def validate_state(state: 'dace.sdfg.SDFGState',
         except Exception as ex:
             raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid)
 
+        # If the edge is a connection between two AccessNodes check if the subset has negative size.
+        # NOTE: We _should_ do this check in `Memlet.validate()` however, this is not possible,
+        #  because the connection between an AccessNode and a MapEntry, with a negative size, is
+        #  legal because the Map will not run in that case. However, this constellation cannot
+        #  be tested for in the Memlet's validation function, so we have to do it here.
+        # NOTE: Zero size is explicitly allowed because it is essentially `memcpy(dst, src, 0)`
+        #  which is safe.
+        # TODO: The AN to AN connection is the most obvious one, but it should be extended.
+        if isinstance(e.src, nd.AccessNode) and isinstance(e.dst, nd.AccessNode):
+            e_memlet: dace.Memlet = e.data
+            if e_memlet.subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.subset.size()}',
+                        sdfg, state_id, eid)
+            if e_memlet.other_subset is not None:
+                if any((ss < 0) == True for ss in e_memlet.other_subset.size()):
+                    raise InvalidSDFGEdgeError(
+                        f'`other_subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.other_subset.size()}',
+                        sdfg, state_id, eid)
+
         # For every memlet, obtain its full path in the DFG
         path = state.memlet_path(e)
         src_node = path[0].src