Add json2 codec (#153)
* Implement json2 codec

* Update registry

* Clean up

* format

* Update

* Implement options and errors

* Format

* Test for ensure_ascii

* Fmt

* Update

* Changeset
keller-mark authored Feb 23, 2024
1 parent 4d2fe82 commit b90f16b
Showing 5 changed files with 378 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .changeset/lovely-readers-change.md
@@ -0,0 +1,5 @@
---
"@zarrita/core": patch
---

Add json2 codec.
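
For context, this codec mirrors numcodecs' JSON codec (codec id "json2"), which serializes an object-dtype chunk as a single JSON document: the flattened items, followed by the dtype string and the chunk shape. Below is a minimal round-trip sketch based on the new tests; the relative import path is the one used by the test file and would differ outside this package.

// A minimal sketch, using the same relative import as the test file below.
import { JsonCodec } from "../src/codecs/json2.js";

// numcodecs lays an object chunk out as: [...items, dtype, shape].
const bytes = new TextEncoder().encode(`["a","b","c","|O",[3]]`);

const codec = new JsonCodec({ encoding: "utf-8" });
const chunk = codec.decode(bytes); // { data: ["a", "b", "c"], shape: [3], stride: [1] }

// encode() is the inverse: it re-appends "|O" and the shape, then JSON-stringifies.
const roundTripped = codec.encode(chunk);
console.log(new TextDecoder().decode(roundTripped)); // ["a","b","c","|O",[3]]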
184 changes: 184 additions & 0 deletions packages/core/__tests__/json2.test.ts
@@ -0,0 +1,184 @@
import { describe, expect, test } from "vitest";

import { JsonCodec } from "../src/codecs/json2.js";

describe("JsonCodec", () => {
	test("can decode", () => {
		// from numcodecs.json import JSON
		// import numpy as np
		// json_codec = JSON()
		// json_codec.encode(np.array(['ASC1', 'ASC2', 'END', 'GABA1', 'GABA2', 'MG', 'NSC', 'ODC1', 'OPC', 'Unclassified', 'exCA1', 'exCA3', 'exDG', 'exPFC1', 'exPFC2'], dtype=object))
		const encodedStr =
			`["ASC1","ASC2","END","GABA1","GABA2","MG","NSC","ODC1","OPC","Unclassified","exCA1","exCA3","exDG","exPFC1","exPFC2","|O",[15]]`;
		const encodedBytes = new TextEncoder().encode(encodedStr);
		const jsonCodec = new JsonCodec({ encoding: "utf-8" });
		const decodedResult = jsonCodec.decode(encodedBytes);
		expect(decodedResult).toStrictEqual({
			data: [
				"ASC1",
				"ASC2",
				"END",
				"GABA1",
				"GABA2",
				"MG",
				"NSC",
				"ODC1",
				"OPC",
				"Unclassified",
				"exCA1",
				"exCA3",
				"exDG",
				"exPFC1",
				"exPFC2",
			],
			shape: [15],
			stride: [1],
		});
	});
	test("can encode", () => {
		const encodedStr =
			`["ASC1","ASC2","END","GABA1","GABA2","MG","NSC","ODC1","OPC","Unclassified","exCA1","exCA3","exDG","exPFC1","exPFC2","|O",[15]]`;
		const encodedBytes = new TextEncoder().encode(encodedStr);

		const chunk = {
			data: [
				"ASC1",
				"ASC2",
				"END",
				"GABA1",
				"GABA2",
				"MG",
				"NSC",
				"ODC1",
				"OPC",
				"Unclassified",
				"exCA1",
				"exCA3",
				"exDG",
				"exPFC1",
				"exPFC2",
			],
			shape: [15],
			stride: [1],
		};
		const jsonCodec = new JsonCodec({ encoding: "utf-8" });
		const encodedResult = jsonCodec.encode(chunk);
		expect(encodedResult).toStrictEqual(encodedBytes);
	});

	test("throws on decode when !strict", () => {
		const encodedStr = `["A","B","C","|O",[3]]`;
		const encodedBytes = new TextEncoder().encode(encodedStr);
		const jsonCodec = new JsonCodec({ strict: false });
		expect(() => jsonCodec.decode(encodedBytes)).toThrowError();
	});

test("throws on encode with non-supported encoding", () => {
const chunk = {
data: ["A", "B", "C"],
shape: [3],
stride: [1],
};
const jsonCodec = new JsonCodec({ check_circular: false });
expect(() => jsonCodec.encode(chunk)).toThrowError();
});
test("throws on encode with !check_circular", () => {
const chunk = {
data: ["A", "B", "C"],
shape: [3],
stride: [1],
};
const jsonCodec = new JsonCodec({ check_circular: false });
expect(() => jsonCodec.encode(chunk)).toThrowError();
});
test("throws on encode with check_circular and circular reference", () => {
let data: any[] = ["A", null];
data[1] = data;
const chunk = {
data,
shape: [2],
stride: [1],
};
const jsonCodec = new JsonCodec({ check_circular: true });
expect(() => jsonCodec.encode(chunk)).toThrowError();
});
test("supports !allow_nan", () => {
const chunk = {
data: [1, 2, NaN],
shape: [3],
stride: [1],
};
const jsonCodec = new JsonCodec({ allow_nan: false });
expect(() => jsonCodec.encode(chunk)).toThrowError();
});
test("supports sort_keys", () => {
const chunk = {
data: [{ "1": 1, "3": 3, "2": 2 }],
shape: [1],
stride: [1],
};
const jsonCodec = new JsonCodec({ sort_keys: true });
const decodedChunk = jsonCodec.decode(jsonCodec.encode(chunk));
expect(Object.keys(decodedChunk.data[0])).toEqual(["1", "2", "3"]);
});
test("supports ensure_ascii", () => {
const chunk = {
data: ["£"],
shape: [1],
stride: [1],
};
const jsonCodec = new JsonCodec({ ensure_ascii: true });
const encodedChunk = jsonCodec.encode(chunk);
const decodedChunk = jsonCodec.decode(encodedChunk);
expect(decodedChunk.data).toEqual(["£"]);
expect(Array.from(encodedChunk)).toEqual([
91,
34,
92,
117,
48,
48,
97,
51,
34,
44,
34,
124,
79,
34,
44,
91,
49,
93,
93,
]);
});
test("supports !ensure_ascii", () => {
const chunk = {
data: ["£"],
shape: [1],
stride: [1],
};
const jsonCodec = new JsonCodec({ ensure_ascii: false });
const encodedChunk = jsonCodec.encode(chunk);
const decodedChunk = jsonCodec.decode(encodedChunk);
expect(decodedChunk.data).toEqual(["£"]);
expect(Array.from(encodedChunk)).toEqual([
91,
34,
194,
163,
34,
44,
34,
124,
79,
34,
44,
91,
49,
93,
93,
]);
});
});
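
The two ensure_ascii expectations are easier to read decoded; here is a quick sketch of what those expected byte arrays spell out (just TextDecoder over the literal bytes from the tests above).

// The ensure_ascii: true expectation escapes "£" to its code point.
const asciiBytes = new Uint8Array([
	91, 34, 92, 117, 48, 48, 97, 51, 34, 44, 34, 124, 79, 34, 44, 91, 49, 93, 93,
]);
console.log(new TextDecoder().decode(asciiBytes)); // ["\u00a3","|O",[1]]

// The ensure_ascii: false expectation keeps "£" as raw UTF-8 (0xC2 0xA3).
const utf8Bytes = new Uint8Array([
	91, 34, 194, 163, 34, 44, 34, 124, 79, 34, 44, 91, 49, 93, 93,
]);
console.log(new TextDecoder().decode(utf8Bytes)); // ["£","|O",[1]]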
4 changes: 3 additions & 1 deletion packages/core/src/codecs.ts
@@ -5,6 +5,7 @@ import { TransposeCodec } from "./codecs/transpose.js";
 import { BytesCodec } from "./codecs/bytes.js";
 import { Crc32cCodec } from "./codecs/crc32c.js";
 import { VLenUTF8 } from "./codecs/vlen-utf8.js";
+import { JsonCodec } from "./codecs/json2.js";

 type ChunkMetadata<D extends DataType> = {
 	data_type: D;
@@ -30,7 +31,8 @@ function create_default_registry(): Map<
 		.set("transpose", () => TransposeCodec)
 		.set("bytes", () => BytesCodec)
 		.set("crc32c", () => Crc32cCodec)
-		.set("vlen-utf8", () => VLenUTF8);
+		.set("vlen-utf8", () => VLenUTF8)
+		.set("json2", () => JsonCodec);
 }

 export const registry = create_default_registry();
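
The registry maps a codec id to a lazy loader that returns the codec class; an instance is then built through the class's static fromConfig. Below is a rough sketch of that lookup, assuming loaders may be synchronous or asynchronous; the actual resolution call site elsewhere in @zarrita/core may differ.

// Hypothetical resolution path; the real call site lives elsewhere in @zarrita/core.
const loader = registry.get("json2");
if (!loader) throw new Error("unknown codec: json2");
const codecClass = await loader(); // await also tolerates synchronous loaders
const codec = codecClass.fromConfig({ encoding: "utf-8" });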
185 changes: 185 additions & 0 deletions packages/core/src/codecs/json2.ts
@@ -0,0 +1,185 @@
// Adapted from https://github.com/hms-dbmi/vizarr/blob/5b0e3ea6fbb42d19d0e38e60e49bb73d1aca0693/src/utils.ts#L26
import type { Chunk, ObjectType } from "../metadata.js";
import { get_strides, json_decode_object } from "../util.js";

type EncoderConfig = {
	encoding?: "utf-8";
	skipkeys?: boolean;
	ensure_ascii?: boolean;
	check_circular?: boolean;
	allow_nan?: boolean;
	sort_keys?: boolean;
	indent?: number;
	separators?: [string, string];
};
type DecoderConfig = {
	strict?: boolean;
};

type JsonCodecConfig = EncoderConfig & DecoderConfig;

// Reference: https://stackoverflow.com/a/21897413
function throw_on_nan_replacer(_key: string | number, value: any): any {
	if (value !== value) {
		throw new Error(
			"JsonCodec allow_nan is false but NaN was encountered during encoding.",
		);
	}

	if (value === Infinity) {
		throw new Error(
			"JsonCodec allow_nan is false but Infinity was encountered during encoding.",
		);
	}

	if (value === -Infinity) {
		throw new Error(
			"JsonCodec allow_nan is false but -Infinity was encountered during encoding.",
		);
	}
	return value;
}

// Reference: https://gist.github.com/davidfurlong/463a83a33b70a3b6618e97ec9679e490
function sort_keys_replacer(_key: string | number, value: any): any {
	return value instanceof Object && !(value instanceof Array)
		? Object.keys(value)
			.sort()
			.reduce((sorted: any, key: string | number) => {
				sorted[key] = value[key];
				return sorted;
			}, {})
		: value;
}

export class JsonCodec {
	kind = "array_to_bytes";

	#encoder_config: EncoderConfig;
	#decoder_config: DecoderConfig;

	constructor(
		public configuration: JsonCodecConfig,
	) {
		// Reference: https://github.com/zarr-developers/numcodecs/blob/0878717a3613d91a453fe3d3716aa9c67c023a8b/numcodecs/json.py#L36
		const {
			encoding = "utf-8",
			skipkeys = false,
			ensure_ascii = true,
			check_circular = true,
			allow_nan = true,
			sort_keys = true,
			indent,
			strict = true,
		} = configuration;

		let separators = configuration.separators;
		if (!separators) {
			// ensure separators are explicitly specified, and consistent behaviour across
			// Python versions, and most compact representation if indent is None
			if (!indent) {
				separators = [",", ":"];
			} else {
				separators = [", ", ": "];
			}
		}

		this.#encoder_config = {
			encoding,
			skipkeys,
			ensure_ascii,
			check_circular,
			allow_nan,
			indent,
			separators,
			sort_keys,
		};
		this.#decoder_config = { strict };
	}
	static fromConfig(
		configuration: JsonCodecConfig,
	) {
		return new JsonCodec(configuration);
	}

	encode(buf: Chunk<ObjectType>): Uint8Array {
		const {
			indent,
			encoding,
			ensure_ascii,
			check_circular,
			allow_nan,
			sort_keys,
		} = this.#encoder_config;
		if (encoding !== "utf-8") {
			throw new Error("JsonCodec does not yet support non-utf-8 encoding.");
		}
		const replacer_functions: Function[] = [];
		if (!check_circular) {
			// By default, for JSON.stringify,
			// a TypeError will be thrown if one attempts to encode an object with circular references
			throw new Error(
				"JsonCodec does not yet support skipping the check for circular references during encoding.",
			);
		}
		if (!allow_nan) {
			// Throw if NaN/Infinity/-Infinity are encountered during encoding.
			replacer_functions.push(throw_on_nan_replacer);
		}
		if (sort_keys) {
			// We can ensure keys are sorted but not really the opposite since
			// there is no guarantee of key ordering in JS.
			replacer_functions.push(sort_keys_replacer);
		}

		const items = Array.from(buf.data);
		items.push("|O");
		items.push(buf.shape);

		let replacer = undefined;
		if (replacer_functions.length) {
			replacer = function (key: string | number, value: any): any {
				let new_value = value;
				replacer_functions.forEach((sub_replacer) => {
					new_value = sub_replacer(key, new_value);
				});
				return new_value;
			};
		}
		let json_str = JSON.stringify(items, replacer, indent);

		if (ensure_ascii) {
			// If ensure_ascii is true (the default), the output is guaranteed
			// to have all incoming non-ASCII characters escaped.
			// If ensure_ascii is false, these characters will be output as-is.
			// Reference: https://stackoverflow.com/a/31652607
			json_str = json_str.replace(/[\u007F-\uFFFF]/g, function (chr) {
				const full_str = "0000" + chr.charCodeAt(0).toString(16);
				const sub_str = full_str.substring(full_str.length - 4);
				return "\\u" + sub_str;
			});
		}
		return new TextEncoder().encode(json_str);
	}
	decode(bytes: Uint8Array): Chunk<ObjectType> {
		const { strict } = this.#decoder_config;
		if (!strict) {
			// (i.e., allowing control characters inside strings)
			throw new Error(
				"JsonCodec does not yet support non-strict decoding.",
			);
		}
		const items = json_decode_object(bytes);
		const shape = items.pop();
		items.pop(); // Pop off dtype (unused)
		if (!shape) {
			// 0-d case
			throw new Error("0D not implemented for JsonCodec.");
		} else {
			const stride = get_strides(shape, "C");
			const data = items;
			return { data, shape, stride };
		}
	}
}
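
A short sketch of how two of the encoder options above interact, using the class exactly as defined in this file; the values in the comments follow from the replacer logic.

// sort_keys (default true) normalizes object key order; allow_nan: false rejects NaN/Infinity.
const codec = new JsonCodec({ sort_keys: true, allow_nan: false });

const ok = codec.encode({ data: [{ b: 2, a: 1 }], shape: [1], stride: [1] });
console.log(new TextDecoder().decode(ok)); // [{"a":1,"b":2},"|O",[1]]

// NaN in the data trips throw_on_nan_replacer.
try {
	codec.encode({ data: [Number.NaN], shape: [1], stride: [1] });
} catch (err) {
	console.log((err as Error).message); // JsonCodec allow_nan is false but NaN was encountered during encoding.
}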
