Skip to content

Commit e617d51

Browse files
Add in_channel twine decoder
Co-authored-by: Simon Cruanes <simon.cruanes.2007@m4x.org>
2 parents ea28b4e + daf9428 commit e617d51

File tree

7 files changed

+223
-65
lines changed

7 files changed

+223
-65
lines changed

src/leb128/imandrakit_leb128.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ module Decode = struct
3232
let res = ref 0L in
3333
let continue = ref true in
3434

35-
let off = ref off in
35+
let off = ref (sl.off + off) in
3636
let n_consumed = ref 0 in
3737

3838
while !continue do

src/twine/decode.ml

+132-34
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,107 @@ module Slice = Byte_slice
44
module LEB128 = Imandrakit_leb128.Decode
55

66
type cached = ..
7-
type cached += Miss
87

98
module type CACHE_KEY = sig
109
type elt
1110
type cached += C of elt
1211
end
1312

14-
type t = {
15-
sl: slice;
16-
cache: cached array;
17-
mutable hmap: Hmap.t;
13+
type t =
14+
| Decode : {
15+
st: 'st;
16+
ops: 'st ops;
17+
cache: cached Int_tbl.t;
18+
mutable hmap: Hmap.t;
19+
}
20+
-> t
21+
22+
and 'st ops = {
23+
length: 'st -> int;
24+
read_char: 'st -> int -> char;
25+
read_int32: 'st -> int -> int32;
26+
read_int64: 'st -> int -> int64;
27+
read_blob: 'st -> int -> int -> slice;
28+
read_leb128: 'st -> int -> int64 * offset;
1829
}
1930

31+
let st_len t =
32+
let (Decode { st; ops; _ }) = t in
33+
ops.length st
34+
35+
let read_char t offset =
36+
let (Decode { st; ops; _ }) = t in
37+
ops.read_char st offset
38+
39+
let read_int32 t offset =
40+
let (Decode { st; ops; _ }) = t in
41+
ops.read_int32 st offset
42+
43+
let read_int64 t offset =
44+
let (Decode { st; ops; _ }) = t in
45+
ops.read_int64 st offset
46+
47+
let read_blob t offset length =
48+
let (Decode { st; ops; _ }) = t in
49+
ops.read_blob st offset length
50+
51+
let read_leb128 t offset =
52+
let (Decode { st; ops; _ }) = t in
53+
ops.read_leb128 st offset
54+
55+
let slice_ops : slice ops =
56+
{
57+
length = (fun (st : slice) -> Slice.len st);
58+
read_char = (fun (st : slice) (offset : int) -> Slice.get st offset);
59+
read_int32 =
60+
(fun (st : slice) (offset : int) ->
61+
Bytes.get_int32_le st.bs (st.off + offset));
62+
read_int64 =
63+
(fun (st : slice) (offset : int) ->
64+
Bytes.get_int64_le st.bs (st.off + offset));
65+
read_blob =
66+
(fun (st : slice) (offset : int) (length : int) ->
67+
Slice.sub st offset (min length (Slice.len st - st.off - offset)));
68+
read_leb128 = (fun (st : slice) (offset : int) -> LEB128.u64 st offset);
69+
}
70+
71+
let in_channel_ops : in_channel ops =
72+
{
73+
length = (fun (st : in_channel) -> Int64.to_int (In_channel.length st));
74+
read_char =
75+
(fun (st : in_channel) (offset : int) ->
76+
In_channel.seek st (Int64.of_int offset);
77+
Option.get (In_channel.input_char st));
78+
read_int32 =
79+
(fun (st : in_channel) (offset : int) ->
80+
let bs = Bytes.create 4 in
81+
In_channel.seek st (Int64.of_int offset);
82+
let _ = In_channel.really_input st bs 0 4 in
83+
Bytes.get_int32_le bs 0);
84+
read_int64 =
85+
(fun (st : in_channel) (offset : int) ->
86+
let bs = Bytes.create 8 in
87+
In_channel.seek st (Int64.of_int offset);
88+
let _ = In_channel.really_input st bs 0 8 in
89+
Bytes.get_int64_le bs 0);
90+
read_blob =
91+
(fun (st : in_channel) (offset : int) (length : int) ->
92+
let remaining = Int64.to_int (In_channel.length st) - offset in
93+
let length = min length remaining in
94+
let bs = Bytes.create length in
95+
In_channel.seek st (Int64.of_int offset);
96+
let _ = In_channel.really_input st bs 0 length in
97+
Slice.create bs);
98+
read_leb128 =
99+
(fun (st : in_channel) (offset : int) ->
100+
let remaining = Int64.to_int (In_channel.length st) - offset in
101+
let length = min 16 remaining in
102+
let bs = Bytes.create length in
103+
In_channel.seek st (Int64.of_int offset);
104+
let _ = In_channel.really_input st bs 0 length in
105+
LEB128.u64 (Slice.create bs) 0);
106+
}
107+
20108
type cstor_index = int [@@deriving show]
21109

22110
type cursor = {
@@ -30,16 +118,26 @@ let show_cursor (self : cursor) =
30118

31119
let pp_cursor = Fmt.of_to_string show_cursor
32120

33-
let[@inline] create sl : t =
34-
{ sl; cache = Array.make sl.len Miss; hmap = Hmap.empty }
121+
let[@inline] create (st : 'st) (ops : 'st ops) : t =
122+
Decode { st; ops; cache = Int_tbl.create 32; hmap = Hmap.empty }
123+
124+
let[@inline] of_slice s : t = create s slice_ops
125+
let[@inline] of_string s : t = create (Slice.of_string s) slice_ops
126+
let[@inline] of_in_channel c : t = create c in_channel_ops
127+
128+
let[@inline] hmap_set self k v =
129+
match self with
130+
| Decode d -> d.hmap <- Hmap.add k v d.hmap
35131

36-
let[@inline] of_string s = create @@ Slice.of_string s
37-
let[@inline] hmap_set self k v = self.hmap <- Hmap.add k v self.hmap
38-
let[@inline] hmap_get self k = Hmap.find k self.hmap
132+
let[@inline] hmap_get self k =
133+
match self with
134+
| Decode d -> Hmap.find k d.hmap
39135

40136
let hmap_transfer d1 ~into:d2 : unit =
41-
d2.hmap <-
42-
Hmap.fold (fun (Hmap.B (k, v)) h2 -> Hmap.add k v h2) d1.hmap d2.hmap
137+
match d1, d2 with
138+
| Decode d1, Decode d2 ->
139+
d2.hmap <-
140+
Hmap.fold (fun (Hmap.B (k, v)) h2 -> Hmap.add k v h2) d1.hmap d2.hmap
43141

44142
type 'a decoder = t -> offset -> 'a
45143
type num_bytes_consumed = int
@@ -77,7 +175,7 @@ let invalid_first_byte_ msg ~offset ~high ~low =
77175
failf "Decode: invalid first byte %d/%d at %d: %s" high low offset msg
78176

79177
let[@inline] get_char_ (self : t) (offset : offset) : int =
80-
Char.code (Slice.get self.sl offset)
178+
Char.code @@ read_char self offset
81179

82180
let[@inline] get_high (c : int) : int = (c land 0b1111_0000) lsr 4
83181
let[@inline] get_low (c : int) : int = c land 0b0000_1111
@@ -87,7 +185,7 @@ let[@inline] get_int64_ self offset ~low : int64 * num_bytes_consumed =
87185
if low < 15 then
88186
Int64.of_int low, 0
89187
else (
90-
let i, n = LEB128.u64 self.sl (offset + 1) in
188+
let i, n = read_leb128 self (offset + 1) in
91189
Int64.add i 15L, n
92190
)
93191

@@ -106,13 +204,9 @@ let get_special_ offset ~high ~low : Value.t =
106204
let[@inline] get_float_ (self : t) offset ~low : float * num_bytes_consumed =
107205
let isf32 = low = 0 in
108206
if isf32 then
109-
( Bytes.get_int32_le self.sl.bs (self.sl.off + offset + 1)
110-
|> Int32.float_of_bits,
111-
4 )
207+
read_int32 self (offset + 1) |> Int32.float_of_bits, 4
112208
else
113-
( Bytes.get_int64_le self.sl.bs (self.sl.off + offset + 1)
114-
|> Int64.float_of_bits,
115-
8 )
209+
read_int64 self (offset + 1) |> Int64.float_of_bits, 8
116210

117211
(** Number of bytes to skip *)
118212
let skip_float_ ~low : int =
@@ -173,11 +267,11 @@ let read ?(auto_deref = true) (self : t) (offset : offset) : Value.t =
173267
Value.Float f
174268
| 4 ->
175269
let len, size_len = get_int_truncate_ self offset ~low in
176-
let s = Slice.sub self.sl (offset + 1 + size_len) len in
270+
let s = read_blob self (offset + 1 + size_len) len in
177271
Value.String s
178272
| 5 ->
179273
let len, size_len = get_int_truncate_ self offset ~low in
180-
let s = Slice.sub self.sl (offset + 1 + size_len) len in
274+
let s = read_blob self (offset + 1 + size_len) len in
181275
Value.Blob s
182276
| 6 ->
183277
let len, size_len = get_int_truncate_ self offset ~low in
@@ -208,7 +302,8 @@ let read ?(auto_deref = true) (self : t) (offset : offset) : Value.t =
208302
let idx_cstor, size_idx_cstor = get_int_truncate_ self offset ~low in
209303
let offset_after_n = offset + 1 + size_idx_cstor in
210304
let num_items, size_num_items =
211-
LEB128.uint_truncate self.sl offset_after_n
305+
let sl = read_blob self offset_after_n 16 in
306+
LEB128.uint_truncate sl 0
212307
in
213308
let c : cursor =
214309
{
@@ -314,7 +409,7 @@ let string_slice self offset =
314409
let low = get_low c in
315410
if high <> 4 then fail_decode_type_ ~what:"string" offset;
316411
let len, size_len = get_int_truncate_ self offset ~low in
317-
Slice.sub self.sl (offset + 1 + size_len) len
412+
read_blob self (offset + 1 + size_len) len
318413

319414
let string self offset = Slice.contents @@ string_slice self offset
320415

@@ -325,7 +420,7 @@ let blob_slice self offset =
325420
let low = get_low c in
326421
if high <> 5 then fail_decode_type_ ~what:"blob" offset;
327422
let len, size_len = get_int_truncate_ self offset ~low in
328-
Slice.sub self.sl (offset + 1 + size_len) len
423+
read_blob self (offset + 1 + size_len) len
329424

330425
let blob self offset = Slice.contents @@ blob_slice self offset
331426

@@ -380,7 +475,8 @@ let cstor self offset =
380475
let idx_cstor, size_idx_cstor = get_int_truncate_ self offset ~low in
381476
let offset_after_n = offset + 1 + size_idx_cstor in
382477
let num_items, size_num_items =
383-
LEB128.uint_truncate self.sl offset_after_n
478+
let sl = read_blob self offset_after_n 16 in
479+
LEB128.uint_truncate sl 0
384480
in
385481
let c : cursor =
386482
{
@@ -405,15 +501,16 @@ let[@inline] ref_for (self : t) offset : _ offset_for =
405501
Offset_for (ref_ self offset)
406502

407503
let get_entrypoint (self : t) : offset =
408-
assert (Slice.len self.sl > 0);
409-
let last = Slice.len self.sl - 1 in
410-
let offset = Char.code @@ Slice.get self.sl last in
504+
let len = st_len self in
505+
assert (len > 0);
506+
let last = len - 1 in
507+
let offset = Char.code @@ read_char self last in
411508
last - offset - 1
412509

413510
let read_entrypoint (self : t) : Value.t =
414511
read self @@ deref_rec self @@ get_entrypoint self
415512

416-
let decode_string ?(init = ignore) (d : _ decoder) (s : string) =
513+
let decode_string ?(init = ignore) (d : 'a decoder) (s : string) : 'a =
417514
let self = of_string s in
418515
init self;
419516
let off = deref_rec self @@ get_entrypoint self in
@@ -544,12 +641,13 @@ let with_cache (type a) (key : a cache_key) (dec : a decoder) : a decoder =
544641
dec st off
545642
else (
546643
(* go through the cache *)
547-
match st.cache.(off) with
548-
| K.C v -> v
549-
| Miss ->
644+
let (Decode { cache; _ }) = st in
645+
match Int_tbl.find cache off with
646+
| exception Not_found ->
550647
let v = dec st off in
551-
st.cache.(off) <- K.C v;
648+
Int_tbl.add cache off (K.C v);
552649
v
650+
| K.C v -> v
553651
| _ -> (* weird collision, just don't cache… *) dec st off
554652
)
555653

src/twine/decode.mli

+25-28
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ open Types
22

33
type t
44

5-
val create : slice -> t
5+
val of_slice : slice -> t
66
val of_string : string -> t
7+
val of_in_channel : in_channel -> t
78
val hmap_set : t -> 'a Hmap.key -> 'a -> unit
89
val hmap_get : t -> 'a Hmap.key -> 'a option
910

@@ -74,12 +75,12 @@ module Dict_cursor : sig
7475
end
7576

7677
val deref_rec : offset decoder
77-
(** Given any value, follow pointers until a non-pointer value is reached,
78-
a return its address. *)
78+
(** Given any value, follow pointers until a non-pointer value is reached, a
79+
return its address. *)
7980

8081
val read : ?auto_deref:bool -> Value.t decoder
8182
(** Read a value of any kind.
82-
@param auto_deref if true (default), follow pointers implicitly *)
83+
@param auto_deref if true (default), follow pointers implicitly *)
8384

8485
val null : unit decoder
8586
val bool : bool decoder
@@ -116,39 +117,35 @@ val decode_string : ?init:(t -> unit) -> 'a decoder -> string -> 'a
116117

117118
(** {2 Caching}
118119
119-
Caching is used to reflect the sharing of values
120-
embedded in a Twine slice, into the decoded values.
121-
It means that, for a given type, if values of this type
122-
are encoded with sharing (e.g. a graph-heavy term representation),
123-
then with caching we can decode the values to OCaml values
124-
that also have sharing.
125-
*)
120+
Caching is used to reflect the sharing of values embedded in a Twine slice,
121+
into the decoded values. It means that, for a given type, if values of this
122+
type are encoded with sharing (e.g. a graph-heavy term representation), then
123+
with caching we can decode the values to OCaml values that also have
124+
sharing. *)
126125

127126
type 'a cache_key
128127
(** Generative key used to cache values during decoding *)
129128

130129
val create_cache_key : unit -> _ cache_key
131130
(** Generate a new (generative) cache key for a type.
132131
133-
{b NOTE} this should be called only at module toplevel, as a constant,
134-
not dynamically inside a function:
135-
[let key: foo value_pack.Deser.cache_key = value_pack.Deser.create_cache_key ();;].
136-
Indeed, this is generative, so creating multiple keys for a type
132+
{b NOTE} this should be called only at module toplevel, as a constant, not
133+
dynamically inside a function:
134+
[let key: foo value_pack.Deser.cache_key = value_pack.Deser.create_cache_key
135+
();;]. Indeed, this is generative, so creating multiple keys for a type
137136
will result in sub-par performance or non-existent caching. *)
138137

139138
val with_cache : 'a cache_key -> 'a decoder -> 'a decoder
140-
(** [with_cache key dec] is the same decoder as [dec] but
141-
it uses [key] to retrieve values directly from
142-
an internal table for entries/values that have already
143-
been decoded in the past. This means that a value that was
144-
encoded with a lot of sharing (e.g in a graph, or a large
145-
string using {!Ser.add_string}) will be decoded only once.
146-
*)
139+
(** [with_cache key dec] is the same decoder as [dec] but it uses [key] to
140+
retrieve values directly from an internal table for entries/values that have
141+
already been decoded in the past. This means that a value that was encoded
142+
with a lot of sharing (e.g in a graph, or a large string using
143+
{!Ser.add_string}) will be decoded only once. *)
147144

148145
val add_cache : 'a decoder ref -> unit
149-
(** [add_cache dec_ref] modifies the decoder so it uses a new cache key.
150-
It is the same as:
151-
{[
152-
let key = create_cache_key ()
153-
let () = dec_ref := with_cache key !dec_ref
154-
]} *)
146+
(** [add_cache dec_ref] modifies the decoder so it uses a new cache key. It is
147+
the same as:
148+
{[
149+
let key = create_cache_key ()
150+
let () = dec_ref := with_cache key !dec_ref
151+
]} *)

src/twine/dump.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ let default_string_ellipsis_threshold = 30
149149

150150
let dump_slice ?(string_ellipsis_threshold = default_string_ellipsis_threshold)
151151
(sl : slice) : string =
152-
let dec = Decode.create sl in
152+
let dec = Decode.of_slice sl in
153153
let st = { offset = Int_map.empty; dec; string_ellipsis_threshold } in
154154
dump_rec st (Decode.get_entrypoint dec);
155155
let buf = Buffer.create 32 in

test/twine/dune

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
(tests
2-
(names t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12)
2+
(names t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t_file_1)
33
(libraries imandrakit.twine hex ppx_deriving.runtime unix)
44
(package imandrakit)
55
(preprocess

test/twine/t_file_1.expected

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
t_out: { T_file_1.a = 43; b = (Types.Offset_for 13);
2+
c = (Types.Offset_for 13) }
3+
t_in : { T_file_1.a = 43; b = (Types.Offset_for 13);
4+
c = (Types.Offset_for 13) }
5+
a=43 b=XYZ c=XYZ
6+
00000000: 4c55 4e55 5345 4420 5354 5546 4643 5859 LUNUSED STUFFCXY
7+
00000001: 5a63 1f1c e6e7 04 Zc.....

0 commit comments

Comments
 (0)