
Commit f43c9fc

Revert "feat: add page indexes for metadata (#1027)"

This reverts commit 1fd7857.

1 parent 865f9e6 commit f43c9fc
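
For context, the reverted feature loaded parquet metadata together with page indexes, so row selection could prune reads at page granularity. Below is a minimal sketch of that behavior, reconstructed from the removed `meta_with_page_indexes` shown later in this diff; the generic reader parameter is an assumption of the sketch (the original took the now-removed `parquet_ext::reader::ObjectStoreReader`):

use std::sync::Arc;

use parquet::{
    arrow::{
        arrow_reader::ArrowReaderOptions, async_reader::AsyncFileReader,
        ParquetRecordBatchStreamBuilder,
    },
    errors::{ParquetError, Result},
    file::metadata::ParquetMetaData,
};

/// Sketch of the reverted helper: re-open the file with page indexes
/// enabled so the returned metadata carries column/offset indexes.
pub async fn meta_with_page_indexes<R>(reader: R) -> Result<Arc<ParquetMetaData>>
where
    R: AsyncFileReader + Send + 'static,
{
    let read_options = ArrowReaderOptions::new().with_page_index(true);
    let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, read_options)
        .await
        .map_err(|e| ParquetError::General(format!("failed to build page indexes, err:{e}")))?;
    Ok(builder.metadata().clone())
}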

8 files changed: +102 -184 lines

Cargo.lock

-2 (generated file; diff not rendered)

analytic_engine/src/sst/parquet/async_reader.rs

+81 -33
@@ -30,14 +30,17 @@ use datafusion::{
         metrics::ExecutionPlanMetricsSet,
     },
 };
-use futures::{Stream, StreamExt};
+use futures::{future::BoxFuture, FutureExt, Stream, StreamExt, TryFutureExt};
 use log::{debug, error};
 use object_store::{ObjectStoreRef, Path};
 use parquet::{
-    arrow::{arrow_reader::RowSelection, ParquetRecordBatchStreamBuilder, ProjectionMask},
+    arrow::{
+        arrow_reader::RowSelection, async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder,
+        ProjectionMask,
+    },
     file::metadata::RowGroupMetaData,
 };
-use parquet_ext::{meta_data::ChunkReader, reader::ObjectStoreReader};
+use parquet_ext::meta_data::ChunkReader;
 use snafu::ResultExt;
 use table_engine::predicate::PredicateRef;
 use tokio::sync::mpsc::{self, Receiver, Sender};
@@ -278,23 +281,13 @@ impl<'a> Reader<'a> {
 
         let mut streams = Vec::with_capacity(target_row_group_chunks.len());
         for chunk in target_row_group_chunks {
-            let object_store_reader = ObjectStoreReader::new(
-                self.store.clone(),
-                self.path.clone(),
-                parquet_metadata.clone(),
-            );
+            let object_store_reader =
+                ObjectStoreReader::new(self.store.clone(), self.path.clone(), meta_data.clone());
             let mut builder = ParquetRecordBatchStreamBuilder::new(object_store_reader)
                 .await
                 .with_context(|| ParquetError)?;
-
             let row_selection =
                 self.build_row_selection(arrow_schema.clone(), &chunk, parquet_metadata)?;
-
-            debug!(
-                "Build row selection for file path:{}, result:{row_selection:?}, page indexes:{}",
-                self.path,
-                parquet_metadata.page_indexes().is_some()
-            );
             if let Some(selection) = row_selection {
                 builder = builder.with_row_selection(selection);
             };
@@ -347,32 +340,18 @@ impl<'a> Reader<'a> {
         Ok(file_size)
     }
 
-    async fn load_meta_data_from_storage(&self, ignore_sst_filter: bool) -> Result<MetaData> {
+    async fn load_meta_data_from_storage(&self) -> Result<parquet_ext::ParquetMetaData> {
         let file_size = self.load_file_size().await?;
         let chunk_reader_adapter = ChunkReaderAdapter::new(self.path, self.store);
 
-        let (parquet_meta_data, _) =
+        let (meta_data, _) =
             parquet_ext::meta_data::fetch_parquet_metadata(file_size, &chunk_reader_adapter)
                 .await
                 .with_context(|| FetchAndDecodeSstMeta {
                     file_path: self.path.to_string(),
                 })?;
 
-        let object_store_reader = parquet_ext::reader::ObjectStoreReader::new(
-            self.store.clone(),
-            self.path.clone(),
-            Arc::new(parquet_meta_data),
-        );
-
-        let parquet_meta_data = parquet_ext::meta_data::meta_with_page_indexes(object_store_reader)
-            .await
-            .with_context(|| DecodePageIndexes {
-                file_path: self.path.to_string(),
-            })?;
-
-        MetaData::try_new(&parquet_meta_data, ignore_sst_filter)
-            .box_err()
-            .context(DecodeSstMeta)
+        Ok(meta_data)
     }
 
     fn need_update_cache(&self) -> bool {
@@ -396,8 +375,12 @@ impl<'a> Reader<'a> {
         let empty_predicate = self.predicate.exprs().is_empty();
 
         let meta_data = {
+            let parquet_meta_data = self.load_meta_data_from_storage().await?;
+
             let ignore_sst_filter = avoid_update_cache && empty_predicate;
-            self.load_meta_data_from_storage(ignore_sst_filter).await?
+            MetaData::try_new(&parquet_meta_data, ignore_sst_filter)
+                .box_err()
+                .context(DecodeSstMeta)?
         };
 
         if avoid_update_cache || self.meta_cache.is_none() {
@@ -430,6 +413,71 @@ impl<'a> Drop for Reader<'a> {
     }
 }
 
+#[derive(Clone)]
+struct ObjectStoreReader {
+    storage: ObjectStoreRef,
+    path: Path,
+    meta_data: MetaData,
+    begin: Instant,
+}
+
+impl ObjectStoreReader {
+    fn new(storage: ObjectStoreRef, path: Path, meta_data: MetaData) -> Self {
+        Self {
+            storage,
+            path,
+            meta_data,
+            begin: Instant::now(),
+        }
+    }
+}
+
+impl Drop for ObjectStoreReader {
+    fn drop(&mut self) {
+        debug!(
+            "ObjectStoreReader dropped, path:{}, elapsed:{:?}",
+            &self.path,
+            self.begin.elapsed()
+        );
+    }
+}
+
+impl AsyncFileReader for ObjectStoreReader {
+    fn get_bytes(&mut self, range: Range<usize>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> {
+        self.storage
+            .get_range(&self.path, range)
+            .map_err(|e| {
+                parquet::errors::ParquetError::General(format!(
+                    "Failed to fetch range from object store, err:{e}"
+                ))
+            })
+            .boxed()
+    }
+
+    fn get_byte_ranges(
+        &mut self,
+        ranges: Vec<Range<usize>>,
+    ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>> {
+        async move {
+            self.storage
+                .get_ranges(&self.path, &ranges)
+                .map_err(|e| {
+                    parquet::errors::ParquetError::General(format!(
+                        "Failed to fetch ranges from object store, err:{e}"
+                    ))
+                })
+                .await
+        }
+        .boxed()
+    }
+
+    fn get_metadata(
+        &mut self,
+    ) -> BoxFuture<'_, parquet::errors::Result<Arc<parquet::file::metadata::ParquetMetaData>>> {
+        Box::pin(async move { Ok(self.meta_data.parquet().clone()) })
+    }
+}
+
 pub struct ChunkReaderAdapter<'a> {
     path: &'a Path,
     store: &'a ObjectStoreRef,
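
A usage sketch for the restored reader (hypothetical `count_rows` helper; `ObjectStoreReader`, `MetaData`, `ObjectStoreRef`, and `Path` are the items in the diff above, and `StreamExt`/`ParquetRecordBatchStreamBuilder` are already imported by this file): because `get_metadata` serves the cached footer, the stream builder only fetches data pages from object storage.

// Hypothetical helper, assuming the imports shown in the diff above.
async fn count_rows(
    store: ObjectStoreRef,
    path: Path,
    meta_data: MetaData,
) -> parquet::errors::Result<usize> {
    // get_metadata() returns meta_data.parquet() without touching storage.
    let reader = ObjectStoreReader::new(store, path, meta_data);
    let mut stream = ParquetRecordBatchStreamBuilder::new(reader).await?.build()?;

    let mut rows = 0;
    while let Some(batch) = stream.next().await {
        rows += batch?.num_rows();
    }
    Ok(rows)
}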

analytic_engine/src/sst/reader.rs

+17 -23
@@ -15,70 +15,64 @@ pub mod error {
     #[derive(Debug, Snafu)]
     #[snafu(visibility(pub))]
     pub enum Error {
-        #[snafu(display("Try to read again, path:{path}.\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))]
         ReadAgain { backtrace: Backtrace, path: String },
 
-        #[snafu(display("Fail to read persisted file, path:{path}, err:{source}"))]
+        #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))]
         ReadPersist { path: String, source: GenericError },
 
-        #[snafu(display("Failed to decode record batch, err:{source}"))]
+        #[snafu(display("Failed to decode record batch, err:{}", source))]
         DecodeRecordBatch { source: GenericError },
 
         #[snafu(display(
-            "Failed to decode sst meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}",
+            "Failed to decode sst meta data, file_path:{}, err:{}.\nBacktrace:\n{:?}",
+            file_path,
+            source,
+            backtrace
         ))]
         FetchAndDecodeSstMeta {
             file_path: String,
             source: parquet::errors::ParquetError,
             backtrace: Backtrace,
         },
 
-        #[snafu(display(
-            "Failed to decode page indexes for meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}",
-        ))]
-        DecodePageIndexes {
-            file_path: String,
-            source: parquet::errors::ParquetError,
-            backtrace: Backtrace,
-        },
-
-        #[snafu(display("Failed to decode sst meta data, err:{source}"))]
+        #[snafu(display("Failed to decode sst meta data, err:{}", source))]
         DecodeSstMeta { source: GenericError },
 
-        #[snafu(display("Sst meta data is not found.\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))]
         SstMetaNotFound { backtrace: Backtrace },
 
-        #[snafu(display("Fail to projection, err:{source}"))]
+        #[snafu(display("Fail to projection, err:{}", source))]
         Projection { source: GenericError },
 
-        #[snafu(display("Sst meta data is empty.\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))]
         EmptySstMeta { backtrace: Backtrace },
 
-        #[snafu(display("Invalid schema, err:{source}"))]
+        #[snafu(display("Invalid schema, err:{}", source))]
         InvalidSchema { source: common_types::schema::Error },
 
-        #[snafu(display("Meet a datafusion error, err:{source}\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Meet a datafusion error, err:{}\nBacktrace:\n{}", source, backtrace))]
         DataFusionError {
             source: datafusion::error::DataFusionError,
             backtrace: Backtrace,
         },
 
-        #[snafu(display("Meet a object store error, err:{source}\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Meet a object store error, err:{}\nBacktrace:\n{}", source, backtrace))]
         ObjectStoreError {
             source: object_store::ObjectStoreError,
             backtrace: Backtrace,
         },
 
-        #[snafu(display("Meet a parquet error, err:{source}\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Meet a parquet error, err:{}\nBacktrace:\n{}", source, backtrace))]
         ParquetError {
             source: parquet::errors::ParquetError,
             backtrace: Backtrace,
         },
 
-        #[snafu(display("Other kind of error:{source}"))]
+        #[snafu(display("Other kind of error:{}", source))]
         Other { source: GenericError },
 
-        #[snafu(display("Other kind of error, msg:{msg}.\nBacktrace:\n{backtrace}"))]
+        #[snafu(display("Other kind of error, msg:{}.\nBacktrace:\n{}", msg, backtrace))]
         OtherNoCause { msg: String, backtrace: Backtrace },
     }
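
The display changes above are purely stylistic: the commit swaps inline format captures (`{path}`) back to explicit positional arguments. Both render identically; a minimal sketch with a hypothetical error type:

use snafu::Snafu;

#[derive(Debug, Snafu)]
pub enum DemoError {
    // Positional style, as restored by this commit; equivalent to the
    // inline-capture form #[snafu(display("... path:{path}, err:{source}"))].
    #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))]
    ReadPersist {
        path: String,
        source: Box<dyn std::error::Error + Send + Sync>,
    },
}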

components/parquet_ext/Cargo.toml

-2
@@ -17,8 +17,6 @@ async-trait = { workspace = true }
 bytes = { workspace = true }
 common_util = { workspace = true }
 datafusion = { workspace = true }
-futures = { workspace = true }
 log = { workspace = true }
-object_store = { workspace = true }
 parquet = { workspace = true }
 tokio = { workspace = true }

components/parquet_ext/src/lib.rs

-1
@@ -2,7 +2,6 @@
 
 pub mod meta_data;
 pub mod prune;
-pub mod reader;
 pub mod reverse_reader;
 #[cfg(test)]
 pub mod tests;

components/parquet_ext/src/meta_data.rs

+1 -22
@@ -1,18 +1,15 @@
 // Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0.
 
-use std::{ops::Range, sync::Arc};
+use std::ops::Range;
 
 use async_trait::async_trait;
 use bytes::Bytes;
 use common_util::error::GenericResult;
 use parquet::{
-    arrow::{arrow_reader::ArrowReaderOptions, ParquetRecordBatchStreamBuilder},
     errors::{ParquetError, Result},
     file::{footer, metadata::ParquetMetaData},
 };
 
-use crate::reader::ObjectStoreReader;
-
 #[async_trait]
 pub trait ChunkReader: Sync + Send {
     async fn get_bytes(&self, range: Range<usize>) -> GenericResult<Bytes>;
@@ -68,21 +65,3 @@ pub async fn fetch_parquet_metadata(
 
     footer::decode_metadata(&metadata_bytes).map(|v| (v, metadata_len))
 }
-
-/// Build page indexes for meta data
-///
-/// TODO: Currently there is no method to build page indexes for meta data in
-/// `parquet`, maybe we can write a issue in `arrow-rs` .
-pub async fn meta_with_page_indexes(
-    object_store_reader: ObjectStoreReader,
-) -> Result<Arc<ParquetMetaData>> {
-    let read_options = ArrowReaderOptions::new().with_page_index(true);
-    let builder =
-        ParquetRecordBatchStreamBuilder::new_with_options(object_store_reader, read_options)
-            .await
-            .map_err(|e| {
-                let err_msg = format!("failed to build page indexes in metadata, err:{e}");
-                ParquetError::General(err_msg)
-            })?;
-    Ok(builder.metadata().clone())
-}
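
As a sketch of the retained footer-only path: any implementor of the `ChunkReader` trait kept above can drive `fetch_parquet_metadata`. Here a hypothetical in-memory buffer stands in for object storage (assuming the file size is passed as the buffer length):

// Hypothetical in-memory ChunkReader, for illustration only.
struct MemChunkReader(Bytes);

#[async_trait]
impl ChunkReader for MemChunkReader {
    async fn get_bytes(&self, range: Range<usize>) -> GenericResult<Bytes> {
        // Bytes::slice panics on an out-of-bounds range; acceptable in a sketch.
        Ok(self.0.slice(range))
    }
}

// Usage, assuming `buf: Bytes` holds an entire parquet file:
// let (meta, footer_len) =
//     fetch_parquet_metadata(buf.len(), &MemChunkReader(buf.clone())).await?;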
