Skip to content

Commit 4880fc2

Browse files
zouxiang1993tanruixiang
authored andcommitted
feat: add more details about the sst in sst-metadata tool (apache#1019)
## Rationale More details about the sst are needed for troubleshooting problems. ## Detailed Changes - Output some statistics about the file; - Output compression information; ## Test Plan Check the output of sst-metadata tool. --------- Co-authored-by: Ruixiang Tan <tanruixiang0104@gmail.com>
1 parent 91ecd60 commit 4880fc2

File tree

1 file changed

+63
-1
lines changed

1 file changed

+63
-1
lines changed

tools/src/bin/sst-metadata.rs

+63-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
//! A cli to query sst meta data
44
5-
use std::sync::Arc;
5+
use std::{collections::HashMap, sync::Arc};
66

77
use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter};
88
use anyhow::{Context, Result};
@@ -36,6 +36,34 @@ struct Args {
3636
page_indexes: bool,
3737
}
3838

39+
/// Running totals accumulated across every SST file the tool visits.
///
/// All counters start at zero via `Default`. The size fields hold the raw
/// sums; they are only passed through `as_mb` when the summary is rendered
/// (presumably bytes converted to megabytes; confirm against `as_mb`).
#[derive(Debug, Default)]
struct FileStatistics {
    /// Number of files folded into these totals.
    file_count: u64,
    /// Sum of each file's `object_meta.size`.
    size: usize,
    /// Sum of the per-file `metadata_size` values.
    metadata_size: usize,
    /// Sum of the per-file `kv_size` values.
    kv_size: usize,
    /// Sum of the per-file `filter_size` values.
    filter_size: usize,
    /// Sum of the row counts reported by each file's parquet metadata.
    row_num: i64,
}
48+
49+
impl ToString for FileStatistics {
50+
fn to_string(&self) -> String {
51+
format!("FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}",
52+
self.file_count,
53+
as_mb(self.size),
54+
as_mb(self.metadata_size),
55+
as_mb(self.kv_size),
56+
as_mb(self.filter_size),
57+
self.row_num)
58+
}
59+
}
60+
61+
/// Per-column size totals, keyed by field name in the caller's map.
///
/// Both counters start at zero via `Default` and grow by the values the
/// parquet column-chunk metadata reports for each row group.
#[derive(Debug, Default)]
struct FieldStatistics {
    /// Sum of `compressed_size()` over every column chunk of this field.
    compressed_size: i64,
    /// Sum of `uncompressed_size()` over every column chunk of this field.
    uncompressed_size: i64,
}
66+
3967
fn new_runtime(thread_num: usize) -> Runtime {
4068
runtime::Builder::default()
4169
.thread_name("sst-metadata")
@@ -99,6 +127,8 @@ async fn run(args: Args) -> Result<()> {
99127
.cmp(&b.1.custom().time_range.inclusive_start())
100128
});
101129

130+
let mut file_stats = FileStatistics::default();
131+
let mut field_stats_map = HashMap::new();
102132
for (object_meta, sst_metadata, metadata_size, kv_size) in metas {
103133
let ObjectMeta { location, size, .. } = &object_meta;
104134
let custom_meta = sst_metadata.custom();
@@ -114,6 +144,27 @@ async fn run(args: Args) -> Result<()> {
114144
.unwrap_or(0);
115145
let file_metadata = parquet_meta.file_metadata();
116146
let row_num = file_metadata.num_rows();
147+
148+
file_stats.file_count += 1;
149+
file_stats.size += object_meta.size;
150+
file_stats.metadata_size += metadata_size;
151+
file_stats.kv_size += kv_size;
152+
file_stats.filter_size += filter_size;
153+
file_stats.row_num += row_num;
154+
155+
let fields = file_metadata.schema().get_fields();
156+
for (_, row_group) in parquet_meta.row_groups().iter().enumerate() {
157+
for i in 0..fields.len() {
158+
let column_meta = row_group.column(i);
159+
let field_name = fields.get(i).unwrap().get_basic_info().name().to_string();
160+
let mut field_stats = field_stats_map
161+
.entry(field_name)
162+
.or_insert(FieldStatistics::default());
163+
field_stats.compressed_size += column_meta.compressed_size();
164+
field_stats.uncompressed_size += column_meta.uncompressed_size();
165+
}
166+
}
167+
117168
if verbose {
118169
println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}, custom_meta:{custom_meta:?}");
119170
} else {
@@ -127,6 +178,17 @@ async fn run(args: Args) -> Result<()> {
127178
}
128179
}
129180

181+
println!("{}", file_stats.to_string());
182+
println!("FieldStatistics: ");
183+
for (k, v) in field_stats_map.iter() {
184+
println!(
185+
"{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}",
186+
k,
187+
as_mb(v.compressed_size as usize),
188+
as_mb(v.uncompressed_size as usize),
189+
v.uncompressed_size as f64 / v.compressed_size as f64
190+
);
191+
}
130192
Ok(())
131193
}
132194

0 commit comments

Comments
 (0)