Commit 97e7df1

feat: Support v2 VLenUTF8 codec (#96)
1 parent f233388 commit 97e7df1

File tree

22 files changed: +428 −108 lines

.changeset/nice-jars-explode.md

+38
---
"@zarrita/indexing": patch
"@zarrita/ndarray": patch
"zarrita": patch
"@zarrita/core": patch
---

feat: Support `VLenUTF8` codec in v2 and introduce a strided JS "object" Array.

```python
import zarr
import numcodecs
import numpy as np

zarr.create_dataset(
    "data.zarr",
    data=np.array(
        [[["a", "aa"], ["aaa", "aaaa"]],
         [["b", "bb"], ["bbb", "bbbb"]]],
        dtype=object
    ),
    dtype="|O",
    object_codec=numcodecs.VLenUTF8(),
    chunks=(1, 1, 2),
)
```

```typescript
import * as zarr from "zarrita";

let store = new zarr.FetchStore("http://localhost:8080/data.zarr");
let arr = await zarr.open.v2(store, { kind: "array" });
let result = await zarr.get(arr);
// {
//   data: ["a", "aa", "aaa", "aaaa", "b", "bb", "bbb", "bbbb"],
//   shape: [2, 2, 2],
//   stride: [4, 2, 1],
// }
```
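The `result` above is a strided, C-ordered view rather than a nested array, so element `(i, j, k)` lives at offset `i * stride[0] + j * stride[1] + k * stride[2]` in `data`. A minimal lookup sketch (illustrative only; `at` is not part of the zarrita API):

```typescript
// Hypothetical helper: index element (i, j, k) of the strided "object" array
// returned by `zarr.get` in the example above.
function at(
  result: { data: string[]; shape: number[]; stride: number[] },
  i: number,
  j: number,
  k: number,
): string {
  const [si, sj, sk] = result.stride;
  return result.data[i * si + j * sj + k * sk];
}

at(result, 0, 1, 1); // "aaaa"
at(result, 1, 1, 0); // "bbb"
```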

fixtures/v2/data.zarr/.zmetadata

+28
@@ -419,6 +419,34 @@
         ],
         "zarr_format": 2
     },
+    "3d.chunked.O/.zarray": {
+        "chunks": [
+            1,
+            1,
+            2
+        ],
+        "compressor": {
+            "blocksize": 0,
+            "clevel": 5,
+            "cname": "lz4",
+            "id": "blosc",
+            "shuffle": 1
+        },
+        "dtype": "|O",
+        "fill_value": 0,
+        "filters": [
+            {
+                "id": "vlen-utf8"
+            }
+        ],
+        "order": "C",
+        "shape": [
+            2,
+            2,
+            2
+        ],
+        "zarr_format": 2
+    },
     "3d.chunked.i2/.zarray": {
         "chunks": [
             1,
fixtures/v2/data.zarr/3d.chunked.O/.zarray

+28
{
    "chunks": [
        1,
        1,
        2
    ],
    "compressor": {
        "blocksize": 0,
        "clevel": 5,
        "cname": "lz4",
        "id": "blosc",
        "shuffle": 1
    },
    "dtype": "|O",
    "fill_value": 0,
    "filters": [
        {
            "id": "vlen-utf8"
        }
    ],
    "order": "C",
    "shape": [
        2,
        2,
        2
    ],
    "zarr_format": 2
}
31 Bytes — Binary file not shown.
35 Bytes — Binary file not shown.
31 Bytes — Binary file not shown.
35 Bytes — Binary file not shown.
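The four binary files above are presumably the chunk files of the new `3d.chunked.O` fixture, each holding two strings encoded by `vlen-utf8` and then compressed with `blosc`. As a rough illustration of what the filter produces, here is a decoder sketch assuming the numcodecs `VLenUTF8` layout (a little-endian u32 item count, then a u32 byte length plus UTF-8 bytes per item); it is not the package's actual codec implementation:

```typescript
// Sketch of decoding a numcodecs VLenUTF8 payload (after any blosc layer has
// already been decompressed). Assumed layout: little-endian u32 item count,
// then for each item a u32 byte length followed by that many UTF-8 bytes.
function decodeVLenUTF8(bytes: Uint8Array): string[] {
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const utf8 = new TextDecoder();
  const count = view.getUint32(0, /* littleEndian */ true);
  const items: string[] = [];
  let offset = 4;
  for (let i = 0; i < count; i++) {
    const length = view.getUint32(offset, true);
    offset += 4;
    items.push(utf8.decode(bytes.subarray(offset, offset + length)));
    offset += length;
  }
  return items;
}
```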

fixtures/v2/generate-v2.py

+116 −30

The script is reformatted throughout; it now clears any stale `data.zarr` before writing, imports `VLenUTF8`, and adds the `3d.chunked.O` object-dtype fixture. The updated script:

import zarr
import numpy as np
import shutil
from numcodecs import Zlib, Blosc, LZ4, Zstd, VLenUTF8

shutil.rmtree("data.zarr", ignore_errors=True)

store = zarr.DirectoryStore("data.zarr")
root = zarr.open_group(store)
root.attrs["answer"] = 42

# 1d.contiguous.zlib.i2
root.create_dataset(
    "1d.contiguous.zlib.i2",
    data=[1, 2, 3, 4],
    dtype="i2",
    chunks=(4,),
    compressor=Zlib(),
)

# 1d.contiguous.blosc.i2
root.create_dataset(
    "1d.contiguous.blosc.i2",
    data=[1, 2, 3, 4],
    dtype="i2",
    chunks=(4,),
    compressor=Blosc(),
)

# 1d.contiguous.lz4.i2
root.create_dataset(
    "1d.contiguous.lz4.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=LZ4()
)

# 1d.contiguous.zstd.i2
root.create_dataset(
    "1d.contiguous.zstd.i2",
    data=[1, 2, 3, 4],
    dtype="i2",
    chunks=(4,),
    compressor=Zstd(),
)

# 1d.contiguous.raw.i2
root.create_dataset(
    "1d.contiguous.raw.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(4,), compressor=None
)


# 1d.contiguous.i4
root.create_dataset("1d.contiguous.i4", data=[1, 2, 3, 4], dtype="i4", chunks=(4,))

# 1d.contiguous.u1
root.create_dataset("1d.contiguous.u1", data=[255, 0, 255, 0], dtype="u1", chunks=(4,))

# 1d.contiguous.<f4
root.create_dataset(
    "1d.contiguous.f4.le", data=[-1000.5, 0, 1000.5, 0], dtype="<f4", chunks=(4,)
)

# 1d.contiguous.>f4
root.create_dataset(
    "1d.contiguous.f4.be", data=[-1000.5, 0, 1000.5, 0], dtype=">f4", chunks=(4,)
)

# 1d.contiguous.f8
root.create_dataset(
    "1d.contiguous.f8", data=[1.5, 2.5, 3.5, 4.5], dtype="f8", chunks=(4,)
)

# 1d.contiguous.<U13
root.create_dataset(
    "1d.contiguous.U13.le", data=["a", "b", "cc", "d"], dtype="<U13", chunks=(4,)
)

# 1d.contiguous.>U13
root.create_dataset(
    "1d.contiguous.U13.be", data=["a", "b", "cc", "d"], dtype=">U13", chunks=(4,)
)

# 1d.contiguous.U7
root.create_dataset(
    "1d.contiguous.U7", data=["a", "b", "cc", "d"], dtype="U7", chunks=(4,)
)

# 1d.contiguous.S7
root.create_dataset(
    "1d.contiguous.S7", data=["a", "b", "cc", "d"], dtype="S7", chunks=(4,)
)

# 1d.contiguous.b1
root.create_dataset(
    "1d.contiguous.b1", data=[True, False, True, False], dtype="b1", chunks=(4,)
)


# 1d.chunked.i2
root.create_dataset("1d.chunked.i2", data=[1, 2, 3, 4], dtype="i2", chunks=(2,))

# 1d.chunked.ragged.i2
root.create_dataset(
    "1d.chunked.ragged.i2", data=[1, 2, 3, 4, 5], dtype="i2", chunks=(2,)
)


# 2d.contiguous.i2
root.create_dataset(
    "2d.contiguous.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(2, 2)
)

# 2d.chunked.i2
root.create_dataset("2d.chunked.i2", data=[[1, 2], [3, 4]], dtype="i2", chunks=(1, 1))

# 2d.chunked.U7
root.create_dataset(
    "2d.chunked.U7", data=[["a", "b"], ["cc", "d"]], dtype="U7", chunks=(1, 1)
)

# 2d.chunked.ragged.i2
root.create_dataset(
    "2d.chunked.ragged.i2",
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    dtype="i2",
    chunks=(2, 2),
)

# 3d.contiguous.i2
root.create_dataset(
    "3d.contiguous.i2",
    data=np.arange(27).reshape(3, 3, 3),
    dtype="i2",
    chunks=(3, 3, 3),
)

# 3d.chunked.i2
root.create_dataset(
    "3d.chunked.i2", data=np.arange(27).reshape(3, 3, 3), dtype="i2", chunks=(1, 1, 1)
)

# 3d.chunked.mixed.i2.C
root.create_dataset(
    "3d.chunked.mixed.i2.C",
    data=np.arange(27).reshape(3, 3, 3),
    dtype="i2",
    chunks=(3, 3, 1),
)

# 3d.chunked.mixed.i2.F
arr = root.create_dataset(
    "3d.chunked.mixed.i2.F",
    data=np.arange(27).reshape(3, 3, 3),
    order="F",
    dtype="i2",
    chunks=(3, 3, 1),
)


# 3d.chunked.O
data = np.array(
    [
        [["a", "aa"], ["aaa", "aaaa"]],
        [["b", "bb"], ["bbb", "bbbb"]],
    ],
    dtype=object,
)
root.create_dataset(
    "3d.chunked.O",
    data=data,
    object_codec=VLenUTF8(),
    dtype="O",
    chunks=(1, 1, 2),
)

zarr.consolidate_metadata(store)

packages/core/__tests__/consolidated.test.ts

+1
@@ -38,6 +38,7 @@ describe("openConsolidated", () => {
 	"/2d.chunked.i2" => "array",
 	"/2d.chunked.ragged.i2" => "array",
 	"/2d.contiguous.i2" => "array",
+	"/3d.chunked.O" => "array",
 	"/3d.chunked.i2" => "array",
 	"/3d.chunked.mixed.i2.C" => "array",
 	"/3d.chunked.mixed.i2.F" => "array",

packages/core/__tests__/open.test.ts

+18
@@ -291,6 +291,24 @@ describe("v2", () => {
 	});
 });

+describe("3d.chunked.O", async () => {
+	let arr = await open.v2(store.resolve("/3d.chunked.O"), {
+		kind: "array",
+	});
+	it.each([
+		[[0, 0, 0], ["a", "aa"]],
+		[[1, 0, 0], ["b", "bb"]],
+		[[0, 1, 0], ["aaa", "aaaa"]],
+		[[1, 1, 0], ["bbb", "bbbb"]],
+	])(`getChunk(%j) -> %j`, async (index, expected) => {
+		expect(await arr.getChunk(index)).toStrictEqual({
+			data: expected,
+			shape: [1, 1, 2],
+			stride: [2, 2, 1],
+		});
+	});
+});
+
 describe("3d.chunked.mixed.i2.C", async () => {
 	let arr = await open.v2(store.resolve("/3d.chunked.mixed.i2.C"), {
 		kind: "array",

packages/core/__tests__/util.test.ts

+5
@@ -86,6 +86,7 @@ describe("is_dtype", () => {
 	["uint64", false],
 	["v2:U6", false],
 	["v2:S6", false],
+	["v2:object", false],
 ])("is_dtype(%s, 'number') -> %s", (dtype, expected) => {
 	expect(is_dtype(dtype, "number")).toBe(expected);
 });
@@ -104,6 +105,7 @@
 	["uint64", false],
 	["v2:U6", false],
 	["v2:S6", false],
+	["v2:object", false],
 ])("is_dtype(%s, 'boolean') -> %s", (dtype, expected) => {
 	expect(is_dtype(dtype, "boolean")).toBe(expected);
 });
@@ -122,6 +124,7 @@
 	["uint64", true],
 	["v2:U6", false],
 	["v2:S6", false],
+	["v2:object", false],
 ])("is_dtype(%s, 'bigint') -> %s", (dtype, expected) => {
 	expect(is_dtype(dtype, "bigint")).toBe(expected);
 });
@@ -140,6 +143,7 @@
 	["uint64", false],
 	["v2:U6", true],
 	["v2:S6", true],
+	["v2:object", false],
 ])("is_dtype(%s, 'string') -> %s", (dtype, expected) => {
 	expect(is_dtype(dtype, "string")).toBe(expected);
 });
@@ -158,6 +162,7 @@
 	"uint64",
 	"v2:U6",
 	"v2:S6",
+	"v2:object",
 ])("is_dtype(%s, %s) -> true", (dtype) => {
 	expect(is_dtype(dtype, dtype)).toBe(true);
 });
