Skip to content

Commit fb034b0

Browse files
authored
Port codegened arrow serialization to arrow1 (#8208)
* Follows rerun-io/rerun#8206 * Part of rerun-io/rerun#3741 ## Changes To implement nullable unions, we have a `_null_marker: Null` variant in all our unions. This means all our unions are nullable. Previously we would only mark a struct field as nullable if it was declared as such in the `.fbs` file, but `arrow-rs` complains about this. So with this PR, if a struct field refers to a union type, that struct field will be marked as `nullable: true` in the datatype (in Rust, Python and C++).
1 parent 1202bd4 commit fb034b0

File tree

215 files changed

+3742
-3055
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

215 files changed

+3742
-3055
lines changed

crates/build/re_types_builder/src/codegen/cpp/mod.rs

+8-6
Original file line numberDiff line numberDiff line change
@@ -1729,12 +1729,12 @@ fn quote_fill_arrow_array_builder(
17291729
// C-style enum, encoded as a sparse arrow union
17301730
ObjectClass::Enum => {
17311731
quote! {
1732-
#parameter_check
1733-
ARROW_RETURN_NOT_OK(#builder->Reserve(static_cast<int64_t>(num_elements)));
1734-
for (size_t elem_idx = 0; elem_idx < num_elements; elem_idx += 1) {
1735-
const auto variant = elements[elem_idx];
1736-
ARROW_RETURN_NOT_OK(#builder->Append(static_cast<uint8_t>(variant)));
1737-
}
1732+
#parameter_check
1733+
ARROW_RETURN_NOT_OK(#builder->Reserve(static_cast<int64_t>(num_elements)));
1734+
for (size_t elem_idx = 0; elem_idx < num_elements; elem_idx += 1) {
1735+
const auto variant = elements[elem_idx];
1736+
ARROW_RETURN_NOT_OK(#builder->Append(static_cast<uint8_t>(variant)));
1737+
}
17381738
}
17391739
}
17401740

@@ -2482,6 +2482,7 @@ fn quote_arrow_field_type(
24822482
let name = &field.name;
24832483
let datatype = quote_arrow_datatype(&field.typ, objects, includes, false);
24842484
let is_nullable = field.is_nullable || field.typ == Type::Unit; // null type is always nullable
2485+
let is_nullable = is_nullable || field.typ.is_union(objects); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable
24852486

24862487
quote! {
24872488
arrow::field(#name, #datatype, #is_nullable)
@@ -2496,6 +2497,7 @@ fn quote_arrow_elem_type(
24962497
let typ: Type = elem_type.clone().into();
24972498
let datatype = quote_arrow_datatype(&typ, objects, includes, false);
24982499
let is_nullable = typ == Type::Unit; // null type must be nullable
2500+
let is_nullable = is_nullable || elem_type.is_union(objects); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable
24992501
quote! {
25002502
arrow::field("item", #datatype, #is_nullable)
25012503
}

crates/build/re_types_builder/src/codegen/python/mod.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -2550,7 +2550,8 @@ fn quote_arrow_field(field: &Field) -> String {
25502550
} = field;
25512551

25522552
let datatype = quote_arrow_datatype(data_type);
2553-
let is_nullable = if *is_nullable { "True" } else { "False" };
2553+
let is_nullable = *is_nullable || matches!(data_type.to_logical_type(), DataType::Union { .. }); // Rerun unions always has a `_null_marker: null` variant, so they are always nullable
2554+
let is_nullable = if is_nullable { "True" } else { "False" };
25542555
let metadata = quote_metadata_map(metadata);
25552556

25562557
format!(r#"pa.field("{name}", {datatype}, nullable={is_nullable}, metadata={metadata})"#)

crates/build/re_types_builder/src/codegen/rust/api.rs

+11-7
Original file line numberDiff line numberDiff line change
@@ -958,13 +958,13 @@ fn quote_trait_impls_for_datatype_or_component(
958958

959959
let quoted_serializer = if let Some(forwarded_type) = forwarded_type.as_ref() {
960960
quote! {
961-
fn to_arrow2_opt<'a>(
961+
fn to_arrow_opt<'a>(
962962
data: impl IntoIterator<Item = Option<impl Into<::std::borrow::Cow<'a, Self>>>>,
963-
) -> SerializationResult<Box<dyn arrow2::array::Array>>
963+
) -> SerializationResult<arrow::array::ArrayRef>
964964
where
965965
Self: Clone + 'a,
966966
{
967-
#forwarded_type::to_arrow2_opt(data.into_iter().map(|datum| {
967+
#forwarded_type::to_arrow_opt(data.into_iter().map(|datum| {
968968
datum.map(|datum| match datum.into() {
969969
::std::borrow::Cow::Borrowed(datum) => ::std::borrow::Cow::Borrowed(&datum.0),
970970
::std::borrow::Cow::Owned(datum) => ::std::borrow::Cow::Owned(datum.0),
@@ -978,9 +978,9 @@ fn quote_trait_impls_for_datatype_or_component(
978978

979979
quote! {
980980
// NOTE: Don't inline this, this gets _huge_.
981-
fn to_arrow2_opt<'a>(
981+
fn to_arrow_opt<'a>(
982982
data: impl IntoIterator<Item = Option<impl Into<::std::borrow::Cow<'a, Self>>>>,
983-
) -> SerializationResult<Box<dyn arrow2::array::Array>>
983+
) -> SerializationResult<arrow::array::ArrayRef>
984984
where
985985
Self: Clone + 'a
986986
{
@@ -989,10 +989,14 @@ fn quote_trait_impls_for_datatype_or_component(
989989

990990
#![allow(clippy::wildcard_imports)]
991991
#![allow(clippy::manual_is_variant_and)]
992-
use arrow::datatypes::*;
993-
use arrow2::array::*;
992+
use arrow::{array::*, buffer::*, datatypes::*};
994993
use ::re_types_core::{Loggable as _, ResultExt as _};
995994

995+
#[allow(unused)]
996+
fn as_array_ref<T: Array + 'static>(t: T) -> ArrayRef {
997+
std::sync::Arc::new(t) as ArrayRef
998+
}
999+
9961000
Ok(#quoted_serializer)
9971001
}
9981002
}

crates/build/re_types_builder/src/codegen/rust/arrow.rs

+46-7
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ use quote::quote;
55
// ---
66

77
/// Wrapper around a borrowed arrow2 `DataType` that implements `quote::ToTokens`.
///
/// `(Datatype, is_recursive)`
8+
///
9+
/// If `is_recursive` is set to `true`,
10+
/// then the generated code will often be shorter, as it will
11+
/// defer to calling `arrow_datatype()` on the inner type.
812
pub struct ArrowDataTypeTokenizer<'a>(pub &'a ::arrow2::datatypes::DataType, pub bool);
913

1014
impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
@@ -31,18 +35,18 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
3135
DataType::LargeUtf8 => quote!(DataType::LargeUtf8),
3236

3337
DataType::List(field) => {
34-
let field = ArrowFieldTokenizer(field);
38+
let field = ArrowFieldTokenizer::new(field);
3539
quote!(DataType::List(std::sync::Arc::new(#field)))
3640
}
3741

3842
DataType::FixedSizeList(field, length) => {
39-
let field = ArrowFieldTokenizer(field);
43+
let field = ArrowFieldTokenizer::new(field);
4044
let length = Literal::usize_unsuffixed(*length);
4145
quote!(DataType::FixedSizeList(std::sync::Arc::new(#field), #length))
4246
}
4347

4448
DataType::Union(fields, types, mode) => {
45-
let fields = fields.iter().map(ArrowFieldTokenizer);
49+
let fields = fields.iter().map(ArrowFieldTokenizer::new);
4650
let mode = match mode {
4751
UnionMode::Dense => quote!(UnionMode::Dense),
4852
UnionMode::Sparse => quote!(UnionMode::Sparse),
@@ -66,18 +70,20 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
6670
}
6771

6872
DataType::Struct(fields) => {
69-
let fields = fields.iter().map(ArrowFieldTokenizer);
73+
let fields = fields.iter().map(ArrowFieldTokenizer::new);
7074
quote!(DataType::Struct(Fields::from(vec![ #(#fields,)* ])))
7175
}
7276

7377
DataType::Extension(fqname, datatype, _metadata) => {
7478
if *recursive {
79+
// TODO(emilk): if the logical datatype is a primitive, then we can just use it directly
80+
// so we get shorter generated code.
7581
let fqname_use = quote_fqname_as_type_path(fqname);
7682
quote!(<#fqname_use>::arrow_datatype())
7783
} else {
7884
let datatype = ArrowDataTypeTokenizer(datatype.to_logical_type(), false);
7985
quote!(#datatype)
80-
// TODO(cmc): Bring back extensions once we've fully replaced `arrow2-convert`!
86+
// TODO(#3741): Bring back extensions once we've fully replaced `arrow2-convert`!
8187
// let datatype = ArrowDataTypeTokenizer(datatype, false);
8288
// let metadata = OptionTokenizer(metadata.as_ref());
8389
// quote!(DataType::Extension(#fqname.to_owned(), Box::new(#datatype), #metadata))
@@ -90,16 +96,30 @@ impl quote::ToTokens for ArrowDataTypeTokenizer<'_> {
9096
}
9197
}
9298

93-
pub struct ArrowFieldTokenizer<'a>(pub &'a ::arrow2::datatypes::Field);
99+
/// Wrapper around a borrowed arrow2 `Field` that implements `quote::ToTokens`.
///
/// Construct it with [`ArrowFieldTokenizer::new`]; the wrapped field is private.
pub struct ArrowFieldTokenizer<'a> {
100+
field: &'a ::arrow2::datatypes::Field,
101+
}
102+
103+
impl<'a> ArrowFieldTokenizer<'a> {
104+
/// Creates a tokenizer that borrows `field` for the lifetime `'a`.
pub fn new(field: &'a ::arrow2::datatypes::Field) -> Self {
105+
Self { field }
106+
}
107+
}
94108

95109
impl quote::ToTokens for ArrowFieldTokenizer<'_> {
96110
fn to_tokens(&self, tokens: &mut TokenStream) {
111+
let Self { field } = self;
97112
let arrow2::datatypes::Field {
98113
name,
99114
data_type,
100115
is_nullable,
101116
metadata,
102-
} = &self.0;
117+
} = field;
118+
119+
// Unions in Rerun always have a `_null_markers` arm, so all unions are nullable,
120+
// whether they are specified as such or not.
121+
let is_nullable =
122+
*is_nullable || matches!(field.data_type.to_logical_type(), DataType::Union { .. });
103123

104124
let datatype = ArrowDataTypeTokenizer(data_type, true);
105125

@@ -164,3 +184,22 @@ pub fn is_backed_by_arrow_buffer(typ: &DataType) -> bool {
164184
| DataType::Float64
165185
)
166186
}
187+
188+
/// Returns the tokens for the `<Name>Type` identifier (e.g. `Int8Type`) matching `datatype`.
///
/// Only the null, boolean, integer and float datatypes listed in the match are handled.
///
/// # Panics
///
/// Hits `unimplemented!` for any datatype not listed in the match.
pub fn quoted_arrow_primitive_type(datatype: &DataType) -> TokenStream {
189+
match datatype {
190+
DataType::Null => quote!(NullType),
191+
DataType::Boolean => quote!(BooleanType),
192+
DataType::Int8 => quote!(Int8Type),
193+
DataType::Int16 => quote!(Int16Type),
194+
DataType::Int32 => quote!(Int32Type),
195+
DataType::Int64 => quote!(Int64Type),
196+
DataType::UInt8 => quote!(UInt8Type),
197+
DataType::UInt16 => quote!(UInt16Type),
198+
DataType::UInt32 => quote!(UInt32Type),
199+
DataType::UInt64 => quote!(UInt64Type),
200+
DataType::Float16 => quote!(Float16Type),
201+
DataType::Float32 => quote!(Float32Type),
202+
DataType::Float64 => quote!(Float64Type),
203+
// Anything else is not supported by this lookup.
_ => unimplemented!("Not a primitive type: {datatype:#?}"),
204+
}
205+
}

0 commit comments

Comments
 (0)