From f0c75b98fd9982450322cf9290ffab0b02ab4ce4 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 29 May 2024 14:20:30 +0200 Subject: [PATCH] [T2] Wide column metadata improvements 1. Make `ColumnMetaData.type` optional 2. Make `ColumnMetaData.path_in_schema` optional 3. Add `ColumnMetaData.schema_index`. This is the ordinal in `FileMetaData.schema` this column corresponds to. This allows sparse representation of columns in a rowgroup. --- src/main/thrift/parquet.thrift | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index c928ad66b..c34511caf 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -490,7 +490,7 @@ enum Encoding { // GROUP_VAR_INT = 1; /** - * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + * DEPRECATED: Dictionary encoding. The values in the dictionary are encoded in the * plain type. * in a data page use RLE_DICTIONARY instead. * in a Dictionary page use PLAIN instead @@ -772,15 +772,25 @@ struct PageEncodingStats { * Description for column metadata */ struct ColumnMetaData { - /** Type of this column **/ - 1: required Type type + /** + * DEPRECATED: can be found in SchemaElement + * + * Writers MUST NOT omit this field until 2025-10-01. + * Readers MUST ignore this field before 2025-10-01. + */ + 1: optional Type type /** Set of all encodings used for this column. The purpose is to validate * whether we can decode those pages. **/ 2: required list<Encoding> encodings - /** Path in schema **/ - 3: required list<string> path_in_schema + /** + * DEPRECATED: can be found in SchemaElement + * + * Writers MUST NOT omit this field until 2025-10-01. + * Readers MUST ignore this field before 2025-10-01. + */ + 3: optional list<string> path_in_schema /** Compression codec **/ 4: required CompressionCodec codec @@ -833,6 +843,13 @@ struct ColumnMetaData { * filter pushdown.
*/ 16: optional SizeStatistics size_statistics; + + /** + * The index into FileMetaData.schema (list<SchemaElement>) for this column. + * This implies that ColumnMetaData can be sparse in a rowgroup, if for example + * a column does not have any data pages in a rowgroup. + */ + 17: optional i32 schema_index; } struct EncryptionWithFooterKey {