Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 8c2f601

Browse files
committedMay 23, 2023
refactor: find new columns to improve write performance
1 parent dcae0e0 commit 8c2f601

File tree

2 files changed

+95
-41
lines changed

2 files changed

+95
-41
lines changed
 

‎proxy/src/write.rs

+92-39
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use std::{
1010

1111
use bytes::Bytes;
1212
use ceresdbproto::storage::{
13-
storage_service_client::StorageServiceClient, value, RouteRequest, WriteRequest,
13+
storage_service_client::StorageServiceClient, value, RouteRequest, Value, WriteRequest,
1414
WriteResponse as WriteResponsePB, WriteSeriesEntry, WriteTableRequest,
1515
};
1616
use cluster::config::SchemaConfig;
@@ -34,7 +34,7 @@ use query_engine::executor::Executor as QueryExecutor;
3434
use query_frontend::{
3535
frontend::{Context as FrontendContext, Frontend},
3636
plan::{AlterTableOperation, AlterTablePlan, InsertPlan, Plan},
37-
planner::build_schema_from_write_table_request,
37+
planner::{build_column_schema, try_get_data_type_from_value},
3838
provider::CatalogMetaProvider,
3939
};
4040
use router::endpoint::Endpoint;
@@ -43,7 +43,7 @@ use table_engine::table::TableRef;
4343
use tonic::transport::Channel;
4444

4545
use crate::{
46-
error::{ErrNoCause, ErrWithCause, InternalNoCause, Result},
46+
error::{ErrNoCause, ErrWithCause, Internal, InternalNoCause, Result},
4747
forward::{ForwardResult, ForwarderRef},
4848
Context, Proxy,
4949
};
@@ -320,7 +320,7 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
320320
.map(|endpoint| (router.table, endpoint.into()))
321321
})
322322
.filter(|router| !self.forwarder.is_local_endpoint(&router.1))
323-
.collect::<HashMap<_, _>>();
323+
.collect::<BTreeMap<_, _>>();
324324

325325
// No table need to be forwarded.
326326
if forwarded_table_routes.is_empty() {
@@ -477,14 +477,6 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
477477
code: StatusCode::BAD_REQUEST,
478478
})?;
479479
let schema = req_ctx.database;
480-
let schema_config = self
481-
.schema_config_provider
482-
.schema_config(&schema)
483-
.box_err()
484-
.with_context(|| ErrWithCause {
485-
code: StatusCode::INTERNAL_SERVER_ERROR,
486-
msg: format!("Fail to fetch schema config, schema_name:{schema}"),
487-
})?;
488480

489481
debug!(
490482
"Local write begin, catalog:{catalog}, schema:{schema}, request_id:{request_id}, first_table:{:?}, num_tables:{}",
@@ -503,7 +495,7 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
503495
};
504496

505497
let plan_vec = self
506-
.write_request_to_insert_plan(req.table_requests, schema_config, write_context)
498+
.write_request_to_insert_plan(req.table_requests, write_context)
507499
.await?;
508500

509501
let mut success = 0;
@@ -522,7 +514,6 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
522514
async fn write_request_to_insert_plan(
523515
&self,
524516
table_requests: Vec<WriteTableRequest>,
525-
schema_config: Option<&SchemaConfig>,
526517
write_context: WriteContext,
527518
) -> Result<Vec<InsertPlan>> {
528519
let mut plan_vec = Vec::with_capacity(table_requests.len());
@@ -534,7 +525,6 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
534525
deadline,
535526
auto_create_table,
536527
} = write_context;
537-
let schema_config = schema_config.cloned().unwrap_or_default();
538528
for write_table_req in table_requests {
539529
let table_name = &write_table_req.table;
540530
self.maybe_open_partition_table_if_not_exist(&catalog, &schema, table_name)
@@ -555,7 +545,7 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
555545
// * Currently, the decision to add columns is made at the request level, not at
556546
// the row level, so the cost is relatively small.
557547
let table_schema = table.schema();
558-
let columns = find_new_columns(&table_schema, &schema_config, &write_table_req)?;
548+
let columns = find_new_columns(&table_schema, &write_table_req)?;
559549
if !columns.is_empty() {
560550
self.execute_add_columns_plan(
561551
request_id,
@@ -668,32 +658,95 @@ impl<Q: QueryExecutor + 'static> Proxy<Q> {
668658

669659
fn find_new_columns(
670660
schema: &Schema,
671-
schema_config: &SchemaConfig,
672-
write_req: &WriteTableRequest,
661+
write_table_req: &WriteTableRequest,
673662
) -> Result<Vec<ColumnSchema>> {
674-
let new_schema = build_schema_from_write_table_request(schema_config, write_req)
663+
let WriteTableRequest {
664+
table,
665+
field_names,
666+
tag_names,
667+
entries: write_entries,
668+
} = write_table_req;
669+
670+
let mut columns: BTreeMap<_, ColumnSchema> = BTreeMap::new();
671+
for write_entry in write_entries {
672+
// parse tags
673+
for tag in &write_entry.tags {
674+
let name_index = tag.name_index as usize;
675+
ensure!(
676+
name_index < tag_names.len(),
677+
InternalNoCause {
678+
msg: format!(
679+
"Tag {tag:?} is not found in tag_names:{tag_names:?}, table:{table}",
680+
),
681+
}
682+
);
683+
684+
let tag_name = &tag_names[name_index];
685+
686+
build_column(&mut columns, schema, tag_name, &tag.value, true)?;
687+
}
688+
689+
// parse fields
690+
for field_group in &write_entry.field_groups {
691+
for field in &field_group.fields {
692+
let field_index = field.name_index as usize;
693+
ensure!(
694+
field_index < field_names.len(),
695+
InternalNoCause {
696+
msg: format!(
697+
"Field {field:?} is not found in field_names:{field_names:?}, table:{table}",
698+
),
699+
}
700+
);
701+
if (field.name_index as usize) < field_names.len() {
702+
let field_name = &field_names[field.name_index as usize];
703+
build_column(&mut columns, schema, field_name, &field.value, false)?;
704+
}
705+
}
706+
}
707+
}
708+
709+
Ok(columns.into_iter().map(|v| v.1).collect())
710+
}
711+
712+
fn build_column<'a>(
713+
columns: &mut BTreeMap<&'a str, ColumnSchema>,
714+
schema: &Schema,
715+
name: &'a str,
716+
value: &Option<Value>,
717+
is_tag: bool,
718+
) -> Result<()> {
719+
// Skip adding columns, the following cases:
720+
// 1. Field already exists.
721+
// 2. The new column has been added.
722+
if schema.index_of(name).is_some() || columns.get(name).is_some() {
723+
return Ok(());
724+
}
725+
726+
let column_value = value
727+
.as_ref()
728+
.with_context(|| InternalNoCause {
729+
msg: format!("Field value is needed, field:{name}"),
730+
})?
731+
.value
732+
.as_ref()
733+
.with_context(|| InternalNoCause {
734+
msg: format!("Field value type is not supported, field:{name}"),
735+
})?;
736+
737+
let data_type = try_get_data_type_from_value(column_value)
675738
.box_err()
676-
.context(ErrWithCause {
677-
code: StatusCode::INTERNAL_SERVER_ERROR,
678-
msg: "Build schema from write table request failed",
739+
.context(Internal {
740+
msg: "Failed to get data type",
679741
})?;
680742

681-
let columns = new_schema.columns();
682-
let old_columns = schema.columns();
683-
684-
// find new columns:
685-
// 1. timestamp column can't be a new column;
686-
// 2. column not in old schema is a new column.
687-
let new_columns = columns
688-
.iter()
689-
.enumerate()
690-
.filter(|(idx, column)| {
691-
*idx != new_schema.timestamp_index()
692-
&& !old_columns.iter().any(|c| c.name == column.name)
693-
})
694-
.map(|(_, column)| column.clone())
695-
.collect();
696-
Ok(new_columns)
743+
let column_schema = build_column_schema(name, data_type, is_tag)
744+
.box_err()
745+
.context(Internal {
746+
msg: "Failed to build column schema",
747+
})?;
748+
columns.insert(name, column_schema);
749+
Ok(())
697750
}
698751

699752
fn write_table_request_to_insert_plan(
@@ -802,7 +855,7 @@ fn write_entry_to_rows(
802855
}
803856

804857
// Fill fields.
805-
let mut field_name_index: HashMap<String, usize> = HashMap::new();
858+
let mut field_name_index: BTreeMap<String, usize> = BTreeMap::new();
806859
for (i, field_group) in write_series_entry.field_groups.into_iter().enumerate() {
807860
// timestamp
808861
let timestamp_index_in_schema = schema.timestamp_index();

‎query_frontend/src/planner.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ impl<'a, P: MetaProvider> Planner<'a, P> {
363363
}
364364
}
365365

366-
fn build_column_schema(
366+
pub fn build_column_schema(
367367
column_name: &str,
368368
data_type: DatumKind,
369369
is_tag: bool,
@@ -537,7 +537,7 @@ fn ensure_data_type_compatible(
537537
Ok(())
538538
}
539539

540-
fn try_get_data_type_from_value(value: &PbValue) -> Result<DatumKind> {
540+
pub fn try_get_data_type_from_value(value: &PbValue) -> Result<DatumKind> {
541541
match value {
542542
PbValue::Float64Value(_) => Ok(DatumKind::Double),
543543
PbValue::StringValue(_) => Ok(DatumKind::String),
@@ -555,6 +555,7 @@ fn try_get_data_type_from_value(value: &PbValue) -> Result<DatumKind> {
555555
PbValue::VarbinaryValue(_) => Ok(DatumKind::Varbinary),
556556
}
557557
}
558+
558559
/// A planner wraps the datafusion's logical planner, and delegate sql like
559560
/// select/explain to datafusion's planner.
560561
pub(crate) struct PlannerDelegate<'a, P: MetaProvider> {

0 commit comments

Comments
 (0)
Please sign in to comment.