-// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0.
 
 use std::{
     fmt::Debug,
@@ -7,7 +7,7 @@ use std::{
 
 use lru::LruCache;
 use parquet::file::metadata::FileMetaData;
-use snafu::{OptionExt, ResultExt};
+use snafu::{ensure, OptionExt, ResultExt};
 
 use crate::sst::{
     meta_data::{DecodeCustomMetaData, KvMetaDataNotFound, ParquetMetaDataRef, Result},
@@ -39,14 +39,24 @@ impl MetaData {
         let kv_metas = file_meta_data
             .key_value_metadata()
             .context(KvMetaDataNotFound)?;
-        let kv_meta = kv_metas
-            .iter()
-            .find(|kv| kv.key == encoding::META_KEY)
-            .context(KvMetaDataNotFound)?;
+
+        ensure!(!kv_metas.is_empty(), KvMetaDataNotFound);
+        let mut other_kv_metas = Vec::with_capacity(kv_metas.len() - 1);
+        let mut custom_kv_meta = None;
+        for kv_meta in kv_metas {
+            // Remove our extended custom meta data from the parquet metadata for small
+            // memory consumption in the cache.
+            if kv_meta.key == encoding::META_KEY {
+                custom_kv_meta = Some(kv_meta);
+            } else {
+                other_kv_metas.push(kv_meta.clone());
+            }
+        }
 
         let custom = {
+            let custom_kv_meta = custom_kv_meta.context(KvMetaDataNotFound)?;
             let mut sst_meta =
-                encoding::decode_sst_meta_data(kv_meta).context(DecodeCustomMetaData)?;
+                encoding::decode_sst_meta_data(custom_kv_meta).context(DecodeCustomMetaData)?;
             if ignore_sst_filter {
                 sst_meta.parquet_filter = None;
             }
@@ -56,13 +66,17 @@ impl MetaData {
 
         // let's build a new parquet metadata without the extended key value
         // metadata.
+        let other_kv_metas = if other_kv_metas.is_empty() {
+            None
+        } else {
+            Some(other_kv_metas)
+        };
         let parquet = {
             let thin_file_meta_data = FileMetaData::new(
                 file_meta_data.version(),
                 file_meta_data.num_rows(),
                 file_meta_data.created_by().map(|v| v.to_string()),
-                // Remove the key value metadata.
-                None,
+                other_kv_metas,
                 file_meta_data.schema_descr_ptr(),
                 file_meta_data.column_orders().cloned(),
             );
@@ -111,3 +125,153 @@ impl MetaCache {
         self.cache.write().unwrap().put(key, value);
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::{fs::File, path::Path, sync::Arc};
+
+    use arrow::{
+        array::UInt64Builder,
+        datatypes::{DataType, Field, Schema},
+        record_batch::RecordBatch,
+    };
+    use bytes::Bytes;
+    use common_types::{
+        column_schema::Builder as ColumnSchemaBuilder,
+        schema::Builder as CustomSchemaBuilder,
+        time::{TimeRange, Timestamp},
+    };
+    use parquet::{arrow::ArrowWriter, file::footer};
+    use parquet_ext::ParquetMetaData;
+
+    use super::MetaData;
+    use crate::sst::parquet::{encoding, meta_data::ParquetMetaData as CustomParquetMetaData};
+
+    fn check_parquet_meta_data(original: &ParquetMetaData, processed: &ParquetMetaData) {
+        assert_eq!(original.page_indexes(), processed.page_indexes());
+        assert_eq!(original.offset_indexes(), processed.offset_indexes());
+        assert_eq!(original.num_row_groups(), processed.num_row_groups());
+        assert_eq!(original.row_groups(), processed.row_groups());
+
+        let original_file_md = original.file_metadata();
+        let processed_file_md = processed.file_metadata();
+        assert_eq!(original_file_md.num_rows(), processed_file_md.num_rows());
+        assert_eq!(original_file_md.version(), processed_file_md.version());
+        assert_eq!(
+            original_file_md.created_by(),
+            processed_file_md.created_by()
+        );
+        assert_eq!(original_file_md.schema(), processed_file_md.schema());
+        assert_eq!(
+            original_file_md.schema_descr(),
+            processed_file_md.schema_descr()
+        );
+        assert_eq!(
+            original_file_md.schema_descr_ptr(),
+            processed_file_md.schema_descr_ptr()
+        );
+        assert_eq!(
+            original_file_md.column_orders(),
+            processed_file_md.column_orders()
+        );
+
+        if let Some(kv_metas) = original_file_md.key_value_metadata() {
+            let processed_kv_metas = processed_file_md.key_value_metadata().unwrap();
+            assert_eq!(kv_metas.len(), processed_kv_metas.len() + 1);
+            let mut idx_for_processed = 0;
+            for kv in kv_metas {
+                if kv.key == encoding::META_KEY {
+                    continue;
+                }
+                assert_eq!(kv, &processed_kv_metas[idx_for_processed]);
+                idx_for_processed += 1;
+            }
+        } else {
+            assert!(processed_file_md.key_value_metadata().is_none());
+        }
+    }
+
+    fn write_parquet_file_with_metadata(
+        parquet_file_path: &Path,
+        custom_meta_data: &CustomParquetMetaData,
+    ) {
+        let tsid_array = {
+            let mut builder = UInt64Builder::new();
+            builder.append_value(10);
+            builder.append_null();
+            builder.append_value(11);
+            builder.finish()
+        };
+        let timestamp_array = {
+            let mut builder = UInt64Builder::new();
+            builder.append_value(1000);
+            builder.append_null();
+            builder.append_value(1001);
+            builder.finish()
+        };
+        let file = File::create(parquet_file_path).unwrap();
+        let schema = Schema::new(vec![
+            Field::new("tsid", DataType::UInt64, true),
+            Field::new("timestamp", DataType::UInt64, true),
+        ]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![Arc::new(tsid_array), Arc::new(timestamp_array)],
+        )
+        .unwrap();
+        let mut writer = ArrowWriter::try_new(file, batch.schema(), None).unwrap();
+
+        let encoded_meta_data = encoding::encode_sst_meta_data(custom_meta_data.clone()).unwrap();
+        writer.append_key_value_metadata(encoded_meta_data);
+
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
+
+    #[test]
+    fn test_arrow_meta_data() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let parquet_file_path = temp_dir.path().join("test_arrow_meta_data.par");
+        let schema = {
+            let tsid_column_schema = ColumnSchemaBuilder::new(
+                "tsid".to_string(),
+                common_types::datum::DatumKind::UInt64,
+            )
+            .build()
+            .unwrap();
+            let timestamp_column_schema = ColumnSchemaBuilder::new(
+                "timestamp".to_string(),
+                common_types::datum::DatumKind::Timestamp,
+            )
+            .build()
+            .unwrap();
+            CustomSchemaBuilder::new()
+                .auto_increment_column_id(true)
+                .add_key_column(tsid_column_schema)
+                .unwrap()
+                .add_key_column(timestamp_column_schema)
+                .unwrap()
+                .build()
+                .unwrap()
+        };
+        let custom_meta_data = CustomParquetMetaData {
+            min_key: Bytes::from_static(&[0, 1]),
+            max_key: Bytes::from_static(&[2, 2]),
+            time_range: TimeRange::new_unchecked(Timestamp::new(0), Timestamp::new(10)),
+            max_sequence: 1001,
+            schema,
+            parquet_filter: None,
+            collapsible_cols_idx: vec![],
+        };
+        write_parquet_file_with_metadata(parquet_file_path.as_path(), &custom_meta_data);
+
+        let parquet_file = File::open(parquet_file_path.as_path()).unwrap();
+        let parquet_meta_data = footer::parse_metadata(&parquet_file).unwrap();
+
+        let meta_data = MetaData::try_new(&parquet_meta_data, false).unwrap();
+
+        assert_eq!(**meta_data.custom(), custom_meta_data);
+        check_parquet_meta_data(&parquet_meta_data, meta_data.parquet());
+    }
+}
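
For reference, the heart of the change above is how `MetaData::try_new` now partitions the parquet key-value metadata: the CeresDB-specific entry stored under `encoding::META_KEY` is decoded into the custom SST metadata, while every other entry is preserved in the thinned-down `FileMetaData` that goes into the cache. The standalone sketch below illustrates only that partitioning step; `KeyValue`, `META_KEY`, and `split_kv_metas` are simplified stand-ins invented for illustration, not the project's or the parquet crate's actual types and functions.

// Minimal sketch of the kv-metadata split, assuming a simplified KeyValue type.
#[derive(Clone)]
struct KeyValue {
    key: String,
    value: Option<String>,
}

// Hypothetical key under which the custom SST metadata would be stored.
const META_KEY: &str = "custom:meta";

// Separate the custom entry from the rest; the remaining entries become
// `None` when nothing else is left, mirroring the empty-vec handling above.
fn split_kv_metas(kv_metas: &[KeyValue]) -> (Option<&KeyValue>, Option<Vec<KeyValue>>) {
    let mut custom_kv_meta = None;
    let mut other_kv_metas = Vec::with_capacity(kv_metas.len().saturating_sub(1));
    for kv_meta in kv_metas {
        if kv_meta.key == META_KEY {
            custom_kv_meta = Some(kv_meta);
        } else {
            other_kv_metas.push(kv_meta.clone());
        }
    }
    let other_kv_metas = if other_kv_metas.is_empty() {
        None
    } else {
        Some(other_kv_metas)
    };
    (custom_kv_meta, other_kv_metas)
}

fn main() {
    let kv_metas = vec![
        KeyValue { key: META_KEY.to_string(), value: Some("encoded sst meta".to_string()) },
        KeyValue { key: "writer.note".to_string(), value: None },
    ];
    let (custom_kv_meta, other_kv_metas) = split_kv_metas(&kv_metas);
    assert!(custom_kv_meta.is_some());
    assert_eq!(other_kv_metas.unwrap().len(), 1);
}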