-
Notifications
You must be signed in to change notification settings - Fork 135
Improved GPU Copy #1976
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Improved GPU Copy #1976
Changes from 19 commits
feea97f
9b49c9e
d0a396f
c931b91
a67ad2a
61ea7a6
322ecda
0b15a74
76a1a58
66b43f8
801adb1
02d87b5
3166302
2801967
51182e5
065e0d7
b0b9945
ba97874
e5bf87f
aef9945
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -161,35 +161,40 @@ def preprocess(self, sdfg: SDFG) -> None: | |
nsdfg = state.parent | ||
if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global | ||
and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global): | ||
|
||
# NOTE: If possible `memlet_copy_to_absolute_strides()` will collapse a | ||
# ND copy into a 1D copy if the memory is contiguous. In that case | ||
# `copy_shape` will only have one element. | ||
copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides( | ||
None, nsdfg, state, e, e.src, e.dst) | ||
dims = len(copy_shape) | ||
|
||
# Skip supported copy types | ||
if dims == 1: | ||
# NOTE: We do not check if the stride is `1`. See `_emit_copy()` for more. | ||
continue | ||
elif dims == 2: | ||
if src_strides[-1] != 1 or dst_strides[-1] != 1: | ||
# NOTE: Special case of continuous copy | ||
# Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] | ||
# with copy shape [I, J] and strides [J*K, K], [J, 1] | ||
try: | ||
is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] | ||
is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] | ||
except (TypeError, ValueError): | ||
is_src_cont = False | ||
is_dst_cont = False | ||
if is_src_cont and is_dst_cont: | ||
continue | ||
else: | ||
# Because `memlet_copy_to_absolute_strides()` handles contiguous copies | ||
# transparently, we only have to check if we have FORTRAN or C order. | ||
# If we do not have them, then we have to turn this into a Map. | ||
is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1 | ||
is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1 | ||
if is_c_order or is_fortran_order: | ||
continue | ||
elif dims > 2: | ||
if not (src_strides[-1] != 1 or dst_strides[-1] != 1): | ||
# Any higher dimensional copies must be C order. If not turn it | ||
# into a copy map. | ||
if src_strides[-1] == 1 and dst_strides[-1] == 1: | ||
continue | ||
|
||
# Turn unsupported copy to a map | ||
try: | ||
CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst) | ||
CopyToMap.apply_to(nsdfg, | ||
save=False, | ||
annotate=False, | ||
a=e.src, | ||
b=e.dst, | ||
options={"ignore_strides": True}) | ||
except ValueError: # If transformation doesn't match, continue normally | ||
continue | ||
|
||
|
@@ -973,32 +978,51 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St | |
copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( | ||
self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types)) | ||
dims = len(copy_shape) | ||
|
||
dtype = dst_node.desc(sdfg).dtype | ||
|
||
# Handle unsupported copy types | ||
if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1): | ||
# NOTE: Special case of continuous copy | ||
# Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] | ||
# with copy shape [I, J] and strides [J*K, K], [J, 1] | ||
# In 1D there is no difference between FORTRAN or C order, thus we will set them | ||
# to the same value. The value indicates if the stride is `1` | ||
# TODO: Figuring out if this is enough for views. | ||
is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1 | ||
is_c_order = src_strides[-1] == 1 and dst_strides[-1] == 1 | ||
|
||
# Test if it is possible to transform a 2D copy into a 1D copy, this is possible if | ||
# the allocation happens to be contiguous. | ||
# NOTE: It seems that the `memlet_copy_to_absolute_strides()` function already does | ||
# this, the code below is kept if it is still needed, but somebody, who knows the | ||
# code generator should look at it. There are even tests for that, see | ||
# `cuda_memcopy_test.py::test_gpu_pseudo_1d_copy_f_order`, but they most likely | ||
# do not test the code below but the `memlet_copy_to_absolute_strides()` function. | ||
# TODO: Figuring out if this can be removed. | ||
if dims == 2 and (is_fortran_order or is_c_order): | ||
try: | ||
is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] | ||
is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] | ||
if is_c_order: | ||
is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] | ||
is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] | ||
elif is_fortran_order: | ||
is_src_cont = src_strides[1] / src_strides[0] == copy_shape[0] | ||
is_dst_cont = dst_strides[1] / dst_strides[0] == copy_shape[0] | ||
else: | ||
is_src_cont = False | ||
is_dst_cont = False | ||
except (TypeError, ValueError): | ||
is_src_cont = False | ||
is_dst_cont = False | ||
if is_src_cont and is_dst_cont: | ||
dims = 1 | ||
copy_shape = [copy_shape[0] * copy_shape[1]] | ||
src_strides = [src_strides[1]] | ||
dst_strides = [dst_strides[1]] | ||
else: | ||
raise NotImplementedError('2D copy only supported with one stride') | ||
src_strides = [src_strides[1 if is_c_order else 0]] | ||
dst_strides = [dst_strides[1 if is_c_order else 0]] | ||
is_fortran_order = src_strides[0] == 1 and dst_strides[0] == 1 | ||
philip-paul-mueller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
is_c_order = is_fortran_order | ||
philip-paul-mueller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
dims = 1 | ||
|
||
# Currently we only support ND copies when they can be represented | ||
# as a 1D copy or as a 2D strided copy | ||
if dims > 2: | ||
if src_strides[-1] != 1 or dst_strides[-1] != 1: | ||
# Currently we only support ND copies when they can be represented | ||
# as a 1D copy or as a 2D strided copy | ||
# NOTE: Not sure if this test is enough, it should also be tested that | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What are you agreeing to?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That the test is not enough. It doesn't consider, e.g., Views, where you can have multiple discontinuities, and I guess it is like that because the original code predates Views. I also think it is fine to leave it as-is for now, but it should be noted down as a possible source of errors. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't understand why people have the impression that this code is so old. It's from March 2022; it definitely does not predate Views: #961 |
||
# they are ordered, i.e. largest stride on the left. | ||
if not is_c_order: | ||
# TODO: Implement the FORTRAN case. | ||
raise NotImplementedError( | ||
'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n' | ||
f' Nodes: src {src_node} ({src_storage}), dst {dst_node}({dst_storage})\n' | ||
|
@@ -1026,7 +1050,8 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St | |
for d in range(dims - 2): | ||
callsite_stream.write("}") | ||
|
||
if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1): | ||
elif dims == 1 and is_c_order: | ||
# A 1D copy, in which the stride is 1, known at code generation time. | ||
copysize = ' * '.join(_topy(copy_shape)) | ||
array_length = copysize | ||
copysize += ' * sizeof(%s)' % dtype.ctype | ||
|
@@ -1064,22 +1089,70 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St | |
backend=self.backend), cfg, | ||
state_id, [src_node, dst_node]) | ||
callsite_stream.write('}') | ||
elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)): | ||
|
||
elif dims == 1 and not is_c_order: | ||
# This is the case that is generated for expressions such as `A[::3]`; we reduce it | ||
# to a 2D copy. | ||
callsite_stream.write( | ||
'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % | ||
(self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, | ||
src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, | ||
'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( | ||
copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, | ||
[src_node, dst_node]) | ||
elif dims == 2: | ||
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n' | ||
philip-paul-mueller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.format( | ||
backend=self.backend, | ||
dst=dst_expr, | ||
dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
src=src_expr, | ||
src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})', | ||
width=f'sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
height=sym2cpp(copy_shape[0]), | ||
kind=f'{self.backend}Memcpy{src_location}To{dst_location}', | ||
stream=cudastream, | ||
), | ||
cfg, | ||
state_id, | ||
[src_node, dst_node], | ||
) | ||
|
||
elif dims == 2 and is_c_order: | ||
# Copying 2D arrays that are in C order, i.e. last stride is 1. | ||
callsite_stream.write( | ||
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n' | ||
.format( | ||
backend=self.backend, | ||
dst=dst_expr, | ||
dst_stride=f'({_topy(dst_strides[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
src=src_expr, | ||
src_stride=f'({sym2cpp(src_strides[0])}) * sizeof({src_node.desc(sdfg).dtype.ctype})', | ||
width=f'({sym2cpp(copy_shape[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
height=sym2cpp(copy_shape[0]), | ||
kind=f'{self.backend}Memcpy{src_location}To{dst_location}', | ||
stream=cudastream, | ||
), | ||
cfg, | ||
state_id, | ||
[src_node, dst_node], | ||
) | ||
elif dims == 2 and is_fortran_order: | ||
philip-paul-mueller marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Copying a 2D array into a 2D array that is in FORTRAN order, i.e. first stride | ||
# is one. The CUDA API can not handle such cases directly, however, by "transposing" | ||
# it is possible to use `Memcpy2DAsync`. | ||
callsite_stream.write( | ||
'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % | ||
(self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, | ||
src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, | ||
sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( | ||
copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, | ||
[src_node, dst_node]) | ||
'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dst_stride}, {src}, {src_stride}, {width}, {height}, {kind}, {stream}));\n' | ||
.format( | ||
backend=self.backend, | ||
dst=dst_expr, | ||
dst_stride=f'({_topy(dst_strides[1])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
src=src_expr, | ||
src_stride=f'({sym2cpp(src_strides[1])}) * sizeof({src_node.desc(sdfg).dtype.ctype})', | ||
width=f'({sym2cpp(copy_shape[0])}) * sizeof({dst_node.desc(sdfg).dtype.ctype})', | ||
height=sym2cpp(copy_shape[1]), | ||
kind=f'{self.backend}Memcpy{src_location}To{dst_location}', | ||
stream=cudastream, | ||
), | ||
cfg, | ||
state_id, | ||
[src_node, dst_node], | ||
) | ||
else: | ||
raise NotImplementedError("The requested copy operation is not implemented.") | ||
|
||
# Post-copy synchronization | ||
if is_sync: | ||
|
@@ -1126,7 +1199,6 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St | |
# Obtain copy information | ||
copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( | ||
self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types)) | ||
|
||
dims = len(copy_shape) | ||
|
||
funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tbennun Since you want to review it and wrote the code, you can probably tell me what this operation, turning a 2D copy into a 1D copy if the memory is contiguous, is for.
I mean, I understand what it is doing, but I have no idea why it is doing it, especially since
`memlet_copy_to_absolute_strides()`
does the same thing already. My guess is that this code is older than
`memlet_copy_to_absolute_strides()`
and was forgotten, so I would propose to remove it; do you agree?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is about 5 years newer than memlet_copy_to_absolute_strides (which was there in the very first GitHub commit of dace):
dace/dace/codegen/targets/cpu.py
Line 970 in 256a0a6
Definitely not some code that was forgotten; you would see it very quickly if you ran
`git blame`
on the file. There is a reason it is there; I need to read the code again to get context. This will happen during April though. Thanks!