Skip to content

Commit 67ede4c

Browse files
implement fletcher32 (#412)
* implement fletcher32 * Update numcodecs/fletcher32.pyx Co-authored-by: Ryan Abernathey <ryan.abernathey@gmail.com> * Add docstring and error test * Use HDF C impl * Remove unused, add docstrings * to runtime and int test * to cython * Update numcodecs/fletcher32.pyx Co-authored-by: Ryan Abernathey <ryan.abernathey@gmail.com> * Add docs Co-authored-by: Ryan Abernathey <ryan.abernathey@gmail.com>
1 parent 4f2a2e3 commit 67ede4c

File tree

6 files changed

+170
-2
lines changed

6 files changed

+170
-2
lines changed

docs/checksum32.rst

+11
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,14 @@ Adler32
2222
.. automethod:: decode
2323
.. automethod:: get_config
2424
.. automethod:: from_config
25+
26+
27+
Fletcher32
28+
----------
29+
30+
.. autoclass:: numcodecs.fletcher32.Fletcher32
31+
32+
.. autoattribute:: codec_id
33+
.. automethod:: encode
34+
.. automethod:: decode
35+

docs/release.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ Unreleased
1515
Enhancements
1616
~~~~~~~~~~~~
1717

18-
*
18+
* Add ``fletcher32`` checksum codec
19+
By :user:`Martin Durant <martindurant>`, :issue:`410`.
1920

2021
Fix
2122
~~~

numcodecs/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,6 @@
111111
register_codec(VLenUTF8)
112112
register_codec(VLenBytes)
113113
register_codec(VLenArray)
114+
115+
from numcodecs.fletcher32 import Fletcher32
116+
register_codec(Fletcher32)

numcodecs/fletcher32.pyx

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# cython: language_level=3
2+
# cython: overflowcheck=False
3+
# cython: cdivision=True
4+
import struct
5+
6+
from numcodecs.abc import Codec
7+
from numcodecs.compat import ensure_contiguous_ndarray
8+
9+
from libc.stdint cimport uint8_t, uint16_t, uint32_t
10+
11+
12+
cdef uint32_t _fletcher32(const uint8_t[::1] _data):
    # Compute the Fletcher-32 checksum of a contiguous byte buffer.
    #
    # converted from
    # https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L109
    #
    # Bytes are consumed in pairs, each pair forming a 16-bit word with the
    # first byte in the high position. A trailing odd byte is used as the high
    # byte of one final word. The result packs sum2 into the upper 16 bits and
    # sum1 into the lower 16 bits. An empty buffer yields 0.
    cdef:
        const uint8_t *data
        size_t _len = _data.shape[0]
        size_t n_words = _len / 2
        size_t chunk
        uint32_t sum1 = 0, sum2 = 0

    if _len == 0:
        # Nothing to sum; taking &_data[0] on an empty buffer would index
        # out of bounds, so bail out before touching the pointer.
        return 0
    data = &_data[0]

    while n_words:
        # Process at most 360 words between reductions.
        # NOTE(review): 360 is taken from the referenced C implementation;
        # presumably it is the largest run that cannot overflow the 32-bit
        # accumulators -- confirm against the upstream source.
        chunk = 360 if n_words > 360 else n_words
        n_words -= chunk
        # do/while: chunk is always >= 1 when this loop is entered
        while True:
            sum1 += <uint32_t>((<uint16_t>data[0]) << 8) | (<uint16_t>data[1])
            data += 2
            sum2 += sum1
            chunk -= 1
            if chunk < 1:
                break
        # First reduction step: fold the carries back into the low 16 bits
        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
        sum2 = (sum2 & 0xffff) + (sum2 >> 16)

    if _len % 2:
        # Odd number of bytes: the last byte becomes the high byte of a word
        sum1 += <uint32_t>((<uint16_t>(data[0])) << 8)
        sum2 += sum1
        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
        sum2 = (sum2 & 0xffff) + (sum2 >> 16)

    # Second reduction step to bring both sums down to 16 bits
    sum1 = (sum1 & 0xffff) + (sum1 >> 16)
    sum2 = (sum2 & 0xffff) + (sum2 >> 16)

    return (sum2 << 16) | sum1
46+
47+
48+
class Fletcher32(Codec):
    """The fletcher checksum with 16-bit words and 32-bit output

    This is the netCDF4/HDF5 implementation, which is not equivalent
    to the one in wikipedia
    https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L95

    With this codec, the checksum is concatenated on the end of the data
    bytes when encoded. At decode time, the checksum is performed on
    the data portion and compared with the four-byte checksum, raising
    RuntimeError if inconsistent.
    """

    codec_id = "fletcher32"

    def encode(self, buf):
        """Return buffer plus 4-byte fletcher checksum

        The input is viewed as a flat ``uint8`` array; the returned ``bytes``
        object is that data followed by the checksum packed as a
        little-endian unsigned 32-bit integer.
        """
        buf = ensure_contiguous_ndarray(buf).ravel().view('uint8')
        cdef const uint8_t[::1] b_ptr = buf
        val = _fletcher32(b_ptr)
        return buf.tobytes() + struct.pack("<I", val)

    def decode(self, buf, out=None):
        """Check fletcher checksum, and return buffer without it

        The last four bytes of ``buf`` are read as a little-endian unsigned
        32-bit checksum and compared with the checksum of the preceding bytes.

        If ``out`` is given, the data bytes are copied into its ``uint8`` view
        and ``out`` is returned; otherwise a memoryview of the data portion
        is returned.

        Raises RuntimeError if the stored and computed checksums differ.
        """
        b = ensure_contiguous_ndarray(buf).view('uint8')
        cdef const uint8_t[::1] b_ptr = b[:-4]
        val = _fletcher32(b_ptr)
        found = b[-4:].view("<u4")[0]
        if val != found:
            raise RuntimeError(
                f"The fletcher32 checksum of the data ({val}) did not"
                f" match the expected checksum ({found}).\n"
                "This could be a sign that the data has been corrupted."
            )
        # Compare against None explicitly: the truth value of a multi-element
        # array is ambiguous (raises), and an all-zero single-element array
        # would be falsy and silently skipped.
        if out is not None:
            out.view("uint8")[:] = b[:-4]
            return out
        return memoryview(b[:-4])

numcodecs/tests/test_fletcher32.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import numpy as np
2+
import pytest
3+
4+
from numcodecs.fletcher32 import Fletcher32
5+
6+
7+
@pytest.mark.parametrize("dtype", ["uint8", "int32", "float32"])
def test_with_data(dtype):
    """Round-trip an array through encode/decode and check it is unchanged."""
    original = np.arange(100, dtype=dtype)
    codec = Fletcher32()
    encoded = codec.encode(original)
    decoded = codec.decode(encoded)
    restored = np.frombuffer(decoded, dtype=dtype)
    assert (restored == original).all()
16+
17+
18+
def test_error():
    """Corrupting one data byte must make decode raise RuntimeError."""
    payload = np.arange(100)
    codec = Fletcher32()
    corrupted = bytearray(codec.encode(payload))
    # Flip the first data byte so the stored checksum no longer matches
    corrupted[0] += 1
    with pytest.raises(RuntimeError) as excinfo:
        codec.decode(corrupted)
    assert "fletcher32 checksum" in str(excinfo.value)
27+
28+
29+
def test_known():
    """Decode a fixed payload with a trailing checksum and check the values."""
    expected = [
        1911, -2427, 1897, -2412, 2440, 873, -621, -829, 551, -2118,
    ]
    # Ten little-endian int64 values followed by the 4-byte checksum
    encoded = (
        b'w\x07\x00\x00\x00\x00\x00\x00\x85\xf6\xff\xff\xff\xff\xff\xff'
        b'i\x07\x00\x00\x00\x00\x00\x00\x94\xf6\xff\xff\xff\xff\xff\xff'
        b'\x88\t\x00\x00\x00\x00\x00\x00i\x03\x00\x00\x00\x00\x00\x00'
        b'\x93\xfd\xff\xff\xff\xff\xff\xff\xc3\xfc\xff\xff\xff\xff\xff\xff'
        b"'\x02\x00\x00\x00\x00\x00\x00\xba\xf7\xff\xff\xff\xff\xff\xff"
        b'\xfd%\x86d')
    decoded = Fletcher32().decode(encoded)
    values = np.frombuffer(decoded, dtype="<i8")
    assert values.tolist() == expected

setup.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,31 @@ def vlen_extension():
198198
return extensions
199199

200200

201+
def fletcher_extension():
    """Return the list of extension modules for the fletcher32 codec.

    The single ``numcodecs.fletcher32`` extension is built from
    ``numcodecs/fletcher32.pyx`` with a copy of the shared base compile
    arguments and no extra macros.
    """
    info('setting up fletcher32 extension')

    compile_args = base_compile_args.copy()
    # To enable Cython tracing, add ('CYTHON_TRACE', '1') to this list
    macros = []

    fletcher32 = Extension(
        'numcodecs.fletcher32',
        sources=['numcodecs/fletcher32.pyx'],
        include_dirs=['numcodecs'],
        define_macros=macros,
        extra_compile_args=compile_args,
    )
    return [fletcher32]
224+
225+
201226
def compat_extension():
202227
info('setting up compat extension')
203228

@@ -265,7 +290,8 @@ def run_setup(with_extensions):
265290

266291
if with_extensions:
267292
ext_modules = (blosc_extension() + zstd_extension() + lz4_extension() +
268-
compat_extension() + shuffle_extension() + vlen_extension())
293+
compat_extension() + shuffle_extension() + vlen_extension() +
294+
fletcher_extension())
269295

270296
cmdclass = dict(build_ext=ve_build_ext)
271297
else:

0 commit comments

Comments
 (0)