From 226b73f875db51654d7bc1dbced517f5b2334bb6 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Mon, 24 Feb 2025 14:11:43 +0000 Subject: [PATCH 01/12] feat: fuse dict + runend decoding --- Cargo.lock | 1 + encodings/dict/Cargo.toml | 1 + encodings/dict/benches/dict_compress.rs | 30 +++++++++++++- encodings/dict/src/array.rs | 48 +++++++++++++---------- encodings/dict/src/compute/mod.rs | 36 +++++++++++++++-- encodings/runend/src/compute/mod.rs | 6 +++ encodings/runend/src/compute/take.rs | 1 + encodings/runend/src/compute/take_from.rs | 31 +++++++++++++++ vortex-array/src/compute/mod.rs | 2 + vortex-array/src/compute/take_from.rs | 32 +++++++++++++++ vortex-array/src/vtable/compute.rs | 6 ++- 11 files changed, 168 insertions(+), 26 deletions(-) create mode 100644 encodings/runend/src/compute/take_from.rs create mode 100644 vortex-array/src/compute/take_from.rs diff --git a/Cargo.lock b/Cargo.lock index 7841d84789..6f263c6509 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5424,6 +5424,7 @@ dependencies = [ "vortex-error", "vortex-fsst", "vortex-mask", + "vortex-runend", "vortex-scalar", ] diff --git a/encodings/dict/Cargo.toml b/encodings/dict/Cargo.toml index 096eac4a13..117af01d18 100644 --- a/encodings/dict/Cargo.toml +++ b/encodings/dict/Cargo.toml @@ -39,6 +39,7 @@ workspace = true divan = { workspace = true } rand = { workspace = true } vortex-array = { workspace = true, features = ["test-harness"] } +vortex-runend = { workspace = true } [[bench]] name = "dict_compress" diff --git a/encodings/dict/benches/dict_compress.rs b/encodings/dict/benches/dict_compress.rs index 5243fab3b0..09072a70c1 100644 --- a/encodings/dict/benches/dict_compress.rs +++ b/encodings/dict/benches/dict_compress.rs @@ -3,10 +3,14 @@ use divan::Bencher; use rand::distr::{Distribution, StandardUniform}; use vortex_array::Array; -use vortex_array::arrays::{VarBinArray, VarBinViewArray}; +use vortex_array::arrays::{PrimitiveArray, VarBinArray, VarBinViewArray}; +use vortex_array::validity::Validity::NonNullable; +use vortex_buffer::buffer; +use vortex_dict::DictArray; use vortex_dict::builders::dict_encode; use vortex_dict::test::{gen_primitive_for_dict, gen_varbin_words}; use vortex_dtype::NativePType; +use vortex_runend::RunEndArray; fn main() { divan::main(); @@ -92,3 +96,27 @@ fn decode_varbinview(bencher: Bencher, (len, unique_values): (usize, usize)) { .with_inputs(|| dict.clone()) .bench_values(|dict| dict.to_canonical()); } + +#[divan::bench(args = &[ + 100, + 1000, + 10_000, + 100_000, +])] +#[allow(clippy::cast_possible_truncation)] +fn decode_dict_with_runend_codes(bencher: Bencher, length: usize) { + let ends = PrimitiveArray::new( + buffer![(length / 3) as u32, (length / 2) as u32, length as u32], + NonNullable, + ) + .into_array(); + + let codes = PrimitiveArray::new(buffer![0u32, 1, 2], NonNullable).into_array(); + let runend_codes = RunEndArray::try_new(ends, codes).unwrap(); + let dict_values = PrimitiveArray::new(buffer![100u32, 200, 300], NonNullable).into_array(); + let dict_array = DictArray::try_new(runend_codes.into_array(), dict_values).unwrap(); + + bencher + .with_inputs(|| dict_array.clone()) + .bench_refs(|array| array.to_canonical()) +} diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index b142368118..4145729c02 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -103,28 +103,36 @@ impl ArrayCanonicalImpl for DictArray { take(&canonical_values, self.codes())?.to_canonical() } DType::Primitive(ptype, _) - // TODO(alex): handle nullable codes & values - if *ptype != PType::F16 - && self.codes().all_valid()? - && self.values().all_valid()? => - { - let codes = self.codes().to_primitive()?; - let values = self.values().to_primitive()?; - - match_each_unsigned_integer_ptype!(codes.ptype(), |$C| { - match_each_native_simd_ptype!(values.ptype(), |$V| { - // SIMD types larger than the SIMD register size are beneficial for - // performance as this leads to better instruction level parallelism. - let decoded = dict_decode_typed_primitive::<$C, $V, 64>( - codes.as_slice(), - values.as_slice(), - self.dtype().nullability(), - ); - decoded.to_canonical() - }) + if *ptype != PType::F16 + && self.codes().all_valid()? + && self.values().all_valid()? => + { + // TODO(alex): handle nullable codes & values + let codes = self.codes().to_primitive()?; + let values = self.values().to_primitive()?; + + match_each_unsigned_integer_ptype!(codes.ptype(), |$C| { + match_each_native_simd_ptype!(values.ptype(), |$V| { + // SIMD types larger than the SIMD register size are beneficial for + // performance as this leads to better instruction level parallelism. + let decoded = dict_decode_typed_primitive::<$C, $V, 64>( + codes.as_slice(), + values.as_slice(), + self.dtype().nullability(), + ); + decoded.to_canonical() + }) }) + } + _ => { + if let Some(take_from_fn) = self.codes().vtable().take_from_fn() { + if let Some(array) = take_from_fn.take_from(self.codes(), self.values())? { + return array.to_canonical(); + } } - _ => take(self.values(), self.codes())?.to_canonical() + + take(self.values(), self.codes())?.to_canonical() + } } } diff --git a/encodings/dict/src/compute/mod.rs b/encodings/dict/src/compute/mod.rs index 892b0b75f7..710201569f 100644 --- a/encodings/dict/src/compute/mod.rs +++ b/encodings/dict/src/compute/mod.rs @@ -58,9 +58,12 @@ impl ScalarAtFn<&DictArray> for DictEncoding { impl TakeFn<&DictArray> for DictEncoding { fn take(&self, array: &DictArray, indices: &dyn Array) -> VortexResult { - // Dict - // codes: 0 0 1 - // dict: a b c d e f g h + if let Some(take_from_fn) = indices.vtable().take_from_fn() { + if let Some(array) = take_from_fn.take_from(indices, array.values())? { + return Ok(array); + } + } + let codes = take(array.codes(), indices)?; DictArray::try_new(codes, array.values().clone()).map(|a| a.into_array()) } @@ -87,10 +90,12 @@ mod test { use vortex_array::arrays::{ConstantArray, PrimitiveArray, VarBinArray, VarBinViewArray}; use vortex_array::compute::test_harness::test_mask; use vortex_array::compute::{Operator, compare, scalar_at, slice}; - use vortex_array::{Array, ArrayRef, ToCanonical}; + use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical}; + use vortex_buffer::buffer; use vortex_dtype::{DType, Nullability}; use vortex_scalar::Scalar; + use crate::DictArray; use crate::builders::dict_encode; #[test] @@ -231,4 +236,27 @@ mod test { .unwrap(); test_mask(&array); } + + #[test] + fn test_dict_array_runend_decode() { + let run_end_codes = vortex_runend::RunEndArray::try_new( + buffer![2u32, 7, 10].into_array(), + buffer![0u32, 2, 1].into_array(), + ) + .unwrap(); + + let dict_values = buffer![100u32, 200, 300].into_array(); + let dict_array = DictArray::try_new(run_end_codes.into_array(), dict_values).unwrap(); + let canonical = dict_array.to_canonical().unwrap(); + + let expected = buffer![100u32, 100, 300, 300, 300, 300, 300, 200, 200, 200] + .into_array() + .to_canonical() + .unwrap(); + + assert_eq!( + canonical.into_primitive().unwrap().as_slice::(), + expected.into_primitive().unwrap().as_slice::() + ); + } } diff --git a/encodings/runend/src/compute/mod.rs b/encodings/runend/src/compute/mod.rs index 37b52ca234..2c34893318 100644 --- a/encodings/runend/src/compute/mod.rs +++ b/encodings/runend/src/compute/mod.rs @@ -6,10 +6,12 @@ mod invert; mod scalar_at; mod slice; pub(crate) mod take; +mod take_from; use vortex_array::Array; use vortex_array::compute::{ BinaryNumericFn, CompareFn, FillNullFn, FilterFn, InvertFn, ScalarAtFn, SliceFn, TakeFn, + TakeFromFn, }; use vortex_array::vtable::ComputeVTable; @@ -47,6 +49,10 @@ impl ComputeVTable for RunEndEncoding { fn take_fn(&self) -> Option<&dyn TakeFn<&dyn Array>> { Some(self) } + + fn take_from_fn(&self) -> Option<&dyn TakeFromFn<&dyn Array>> { + Some(self) + } } #[cfg(test)] diff --git a/encodings/runend/src/compute/take.rs b/encodings/runend/src/compute/take.rs index 0c0ba15e7b..620b469192 100644 --- a/encodings/runend/src/compute/take.rs +++ b/encodings/runend/src/compute/take.rs @@ -25,6 +25,7 @@ impl TakeFn<&RunEndArray> for RunEndEncoding { }) .collect::>>()? }); + take_indices_unchecked(array, &checked_indices) } } diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs new file mode 100644 index 0000000000..a0db0f2ba7 --- /dev/null +++ b/encodings/runend/src/compute/take_from.rs @@ -0,0 +1,31 @@ +use vortex_array::compute::{TakeFromFn, take}; +use vortex_array::{Array, ArrayRef}; +use vortex_dtype::DType; +use vortex_error::VortexResult; + +use crate::{RunEndArray, RunEndEncoding}; + +impl TakeFromFn<&RunEndArray> for RunEndEncoding { + fn take_from( + &self, + indices: &RunEndArray, + array: &dyn Array, + ) -> VortexResult> { + // Only `Primitve` and `Bool` are valid run-end value types. + if !matches!(array.dtype(), DType::Primitive(_, _) | DType::Bool(_)) { + return Ok(None); + } + + // Order the values to prepare for runend decoding. + let shuffled = take(array, indices.values())?; + + let ree_array = RunEndArray::with_offset_and_length( + indices.ends().clone(), + shuffled, + indices.offset(), + indices.len(), + )?; + + Ok(Some(ree_array.into_array())) + } +} diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index dd0e592062..1e31f1c882 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -29,6 +29,7 @@ pub use search_sorted::*; pub use slice::{SliceFn, slice}; pub use sum::*; pub use take::{TakeFn, take, take_into}; +pub use take_from::{TakeFromFn, take_from}; pub use to_arrow::*; mod between; @@ -49,6 +50,7 @@ mod search_sorted; mod slice; mod sum; mod take; +mod take_from; mod to_arrow; #[cfg(feature = "test-harness")] diff --git a/vortex-array/src/compute/take_from.rs b/vortex-array/src/compute/take_from.rs new file mode 100644 index 0000000000..fe7a929beb --- /dev/null +++ b/vortex-array/src/compute/take_from.rs @@ -0,0 +1,32 @@ +use vortex_error::{VortexExpect, VortexResult, vortex_bail}; + +use crate::encoding::Encoding; +use crate::{Array, ArrayRef}; + +pub trait TakeFromFn { + fn take_from(&self, indices: A, array: &dyn Array) -> VortexResult>; +} + +impl TakeFromFn<&dyn Array> for E +where + E: for<'a> TakeFromFn<&'a E::Array>, +{ + fn take_from(&self, indices: &dyn Array, array: &dyn Array) -> VortexResult> { + let indices = indices + .as_any() + .downcast_ref::() + .vortex_expect("Failed to downcast array"); + + TakeFromFn::take_from(self, indices, array) + } +} + +pub fn take_from(indices: &dyn Array, array: &dyn Array) -> VortexResult> { + let taken = indices + .vtable() + .take_from_fn() + .map(|f| f.take_from(indices, array)) + .unwrap_or_else(|| vortex_bail!(NotImplemented: "take_from", array.encoding()))?; + + Ok(taken) +} diff --git a/vortex-array/src/vtable/compute.rs b/vortex-array/src/vtable/compute.rs index 0a40dad53b..f44125b2e2 100644 --- a/vortex-array/src/vtable/compute.rs +++ b/vortex-array/src/vtable/compute.rs @@ -2,7 +2,7 @@ use crate::Array; use crate::compute::{ BetweenFn, BinaryBooleanFn, BinaryNumericFn, CastFn, CompareFn, FillForwardFn, FillNullFn, FilterFn, InvertFn, IsConstantFn, LikeFn, MaskFn, MinMaxFn, ScalarAtFn, SearchSortedFn, - SearchSortedUsizeFn, SliceFn, SumFn, TakeFn, ToArrowFn, + SearchSortedUsizeFn, SliceFn, SumFn, TakeFn, TakeFromFn, ToArrowFn, }; /// VTable for dispatching compute functions to Vortex encodings. @@ -133,6 +133,10 @@ pub trait ComputeVTable { None } + fn take_from_fn(&self) -> Option<&dyn TakeFromFn<&dyn Array>> { + None + } + /// Convert the array to an Arrow array of the given type. /// /// See: [ToArrowFn]. From 7a55fd93a140ab6849472bb26d1127d43a0ca069 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 14:33:53 +0000 Subject: [PATCH 02/12] typo --- encodings/runend/src/compute/take_from.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index a0db0f2ba7..a19e59c0fe 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -11,7 +11,7 @@ impl TakeFromFn<&RunEndArray> for RunEndEncoding { indices: &RunEndArray, array: &dyn Array, ) -> VortexResult> { - // Only `Primitve` and `Bool` are valid run-end value types. + // Only `Primitive` and `Bool` are valid run-end value types. if !matches!(array.dtype(), DType::Primitive(_, _) | DType::Bool(_)) { return Ok(None); } From 4d6cfdafdeafe3d63f59e6ff468df023805428a0 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 14:38:40 +0000 Subject: [PATCH 03/12] docs --- encodings/runend/src/compute/take_from.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index a19e59c0fe..01fed738d8 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -6,6 +6,18 @@ use vortex_error::VortexResult; use crate::{RunEndArray, RunEndEncoding}; impl TakeFromFn<&RunEndArray> for RunEndEncoding { + /// Takes values from the source array using run-end encoded indices. + /// + /// # Arguments + /// + /// * `indices` - The run-end encoded array containing the indices + /// * `array` - The source array to take values from + /// + /// # Returns + /// + /// * `Ok(Some(array))` - If successful + /// * `Ok(None)` - If the source array has an unsupported dtype + /// fn take_from( &self, indices: &RunEndArray, From 2e6d0de21b614216b13c0d0dd2ff90ed07c8ba06 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 15:21:50 +0000 Subject: [PATCH 04/12] address comments --- encodings/dict/benches/dict_compress.rs | 24 ---------------- encodings/dict/src/compute/mod.rs | 6 ---- encodings/runend/benches/run_end_compress.rs | 30 ++++++++++++++++++++ encodings/runend/src/compute/take_from.rs | 2 +- vortex-array/src/compute/mod.rs | 2 +- vortex-array/src/compute/take.rs | 7 +++++ vortex-array/src/compute/take_from.rs | 12 +------- 7 files changed, 40 insertions(+), 43 deletions(-) diff --git a/encodings/dict/benches/dict_compress.rs b/encodings/dict/benches/dict_compress.rs index 09072a70c1..6335775f09 100644 --- a/encodings/dict/benches/dict_compress.rs +++ b/encodings/dict/benches/dict_compress.rs @@ -96,27 +96,3 @@ fn decode_varbinview(bencher: Bencher, (len, unique_values): (usize, usize)) { .with_inputs(|| dict.clone()) .bench_values(|dict| dict.to_canonical()); } - -#[divan::bench(args = &[ - 100, - 1000, - 10_000, - 100_000, -])] -#[allow(clippy::cast_possible_truncation)] -fn decode_dict_with_runend_codes(bencher: Bencher, length: usize) { - let ends = PrimitiveArray::new( - buffer![(length / 3) as u32, (length / 2) as u32, length as u32], - NonNullable, - ) - .into_array(); - - let codes = PrimitiveArray::new(buffer![0u32, 1, 2], NonNullable).into_array(); - let runend_codes = RunEndArray::try_new(ends, codes).unwrap(); - let dict_values = PrimitiveArray::new(buffer![100u32, 200, 300], NonNullable).into_array(); - let dict_array = DictArray::try_new(runend_codes.into_array(), dict_values).unwrap(); - - bencher - .with_inputs(|| dict_array.clone()) - .bench_refs(|array| array.to_canonical()) -} diff --git a/encodings/dict/src/compute/mod.rs b/encodings/dict/src/compute/mod.rs index 710201569f..7e91da1ba9 100644 --- a/encodings/dict/src/compute/mod.rs +++ b/encodings/dict/src/compute/mod.rs @@ -58,12 +58,6 @@ impl ScalarAtFn<&DictArray> for DictEncoding { impl TakeFn<&DictArray> for DictEncoding { fn take(&self, array: &DictArray, indices: &dyn Array) -> VortexResult { - if let Some(take_from_fn) = indices.vtable().take_from_fn() { - if let Some(array) = take_from_fn.take_from(indices, array.values())? { - return Ok(array); - } - } - let codes = take(array.codes(), indices)?; DictArray::try_new(codes, array.values().clone()).map(|a| a.into_array()) } diff --git a/encodings/runend/benches/run_end_compress.rs b/encodings/runend/benches/run_end_compress.rs index 0c783c2897..9b2ec25f27 100644 --- a/encodings/runend/benches/run_end_compress.rs +++ b/encodings/runend/benches/run_end_compress.rs @@ -55,3 +55,33 @@ fn decompress(bencher: Bencher, (length, run_step): (usize, usize)) { .with_inputs(|| runend_array.to_array()) .bench_values(|array| array.to_canonical().unwrap()); } + +#[divan::bench(args = BENCH_ARGS)] +fn take_from_primitive(bencher: Bencher, (array_size, run_length): (usize, usize)) { + let source_array = PrimitiveArray::from_iter(0..array_size as i32).into_array(); + + // Create run-end indices that select values with uniform run lengths + let num_runs = array_size / run_length; + let runs = (0..num_runs).collect::>(); + + // Create ends array - each run is run_length long + let ends = runs + .iter() + .map(|&i| ((i + 1) * run_length) as u64) + .collect::>(); + + // Create values array - each run selects a value + let values = runs + .iter() + .map(|&i| ((i * run_length) / 2) as u32) + .collect::>(); + + let ends_array = PrimitiveArray::from_iter(ends).into_array(); + let values_array = PrimitiveArray::from_iter(values).into_array(); + + let indices = RunEndArray::try_new(ends_array, values_array).unwrap(); + + bencher + .with_inputs(|| (&indices, &source_array)) + .bench_refs(|(indices, array)| take(indices, array).unwrap()); +} diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index 01fed738d8..47f92f047e 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -23,7 +23,7 @@ impl TakeFromFn<&RunEndArray> for RunEndEncoding { indices: &RunEndArray, array: &dyn Array, ) -> VortexResult> { - // Only `Primitive` and `Bool` are valid run-end value types. + // Only `Primitive` and `Bool` are valid run-end value types. - TODO: Support additional DTypes if !matches!(array.dtype(), DType::Primitive(_, _) | DType::Bool(_)) { return Ok(None); } diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index 1e31f1c882..37e870a020 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -29,7 +29,7 @@ pub use search_sorted::*; pub use slice::{SliceFn, slice}; pub use sum::*; pub use take::{TakeFn, take, take_into}; -pub use take_from::{TakeFromFn, take_from}; +pub use take_from::TakeFromFn; pub use to_arrow::*; mod between; diff --git a/vortex-array/src/compute/take.rs b/vortex-array/src/compute/take.rs index d60be89638..2e025bba8e 100644 --- a/vortex-array/src/compute/take.rs +++ b/vortex-array/src/compute/take.rs @@ -180,6 +180,13 @@ fn derive_take_stats(arr: &dyn Array) -> StatsSet { } fn take_impl(array: &dyn Array, indices: &dyn Array) -> VortexResult { + // First look for a TakeFrom specialized on the indices. + if let Some(take_from_fn) = indices.vtable().take_from_fn() { + if let Some(arr) = take_from_fn.take_from(indices, array)? { + return Ok(arr); + } + } + // If TakeFn defined for the encoding, delegate to TakeFn. // If we know from stats that indices are all valid, we can avoid all bounds checks. if let Some(take_fn) = array.vtable().take_fn() { diff --git a/vortex-array/src/compute/take_from.rs b/vortex-array/src/compute/take_from.rs index fe7a929beb..79fb4e001a 100644 --- a/vortex-array/src/compute/take_from.rs +++ b/vortex-array/src/compute/take_from.rs @@ -1,4 +1,4 @@ -use vortex_error::{VortexExpect, VortexResult, vortex_bail}; +use vortex_error::{VortexExpect, VortexResult}; use crate::encoding::Encoding; use crate::{Array, ArrayRef}; @@ -20,13 +20,3 @@ where TakeFromFn::take_from(self, indices, array) } } - -pub fn take_from(indices: &dyn Array, array: &dyn Array) -> VortexResult> { - let taken = indices - .vtable() - .take_from_fn() - .map(|f| f.take_from(indices, array)) - .unwrap_or_else(|| vortex_bail!(NotImplemented: "take_from", array.encoding()))?; - - Ok(taken) -} From 0288b97cebe84139b2c09c62f1c4e3c913715000 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 15:49:37 +0000 Subject: [PATCH 05/12] move & adjust benchmark --- encodings/runend/benches/run_end_compress.rs | 41 ++++++++------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/encodings/runend/benches/run_end_compress.rs b/encodings/runend/benches/run_end_compress.rs index 9b2ec25f27..2451f2c481 100644 --- a/encodings/runend/benches/run_end_compress.rs +++ b/encodings/runend/benches/run_end_compress.rs @@ -4,6 +4,7 @@ use divan::Bencher; use itertools::repeat_n; use vortex_array::Array; use vortex_array::arrays::PrimitiveArray; +use vortex_array::compute::take; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_runend::RunEndArray; @@ -39,7 +40,7 @@ fn compress(bencher: Bencher, (length, run_step): (usize, usize)) { } #[divan::bench(args = BENCH_ARGS)] -fn decompress(bencher: Bencher, (length, run_step): (usize, usize)) { +fn decompress_to_canonical(bencher: Bencher, (length, run_step): (usize, usize)) { let values = PrimitiveArray::new( (0..=length) .step_by(run_step) @@ -57,31 +58,21 @@ fn decompress(bencher: Bencher, (length, run_step): (usize, usize)) { } #[divan::bench(args = BENCH_ARGS)] -fn take_from_primitive(bencher: Bencher, (array_size, run_length): (usize, usize)) { - let source_array = PrimitiveArray::from_iter(0..array_size as i32).into_array(); - - // Create run-end indices that select values with uniform run lengths - let num_runs = array_size / run_length; - let runs = (0..num_runs).collect::>(); - - // Create ends array - each run is run_length long - let ends = runs - .iter() - .map(|&i| ((i + 1) * run_length) as u64) - .collect::>(); - - // Create values array - each run selects a value - let values = runs - .iter() - .map(|&i| ((i * run_length) / 2) as u32) - .collect::>(); - - let ends_array = PrimitiveArray::from_iter(ends).into_array(); - let values_array = PrimitiveArray::from_iter(values).into_array(); +fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { + let values = PrimitiveArray::new( + (0..=length) + .step_by(run_step) + .enumerate() + .flat_map(|(idx, x)| repeat_n(idx as u64, x)) + .collect::>(), + Validity::NonNullable, + ); - let indices = RunEndArray::try_new(ends_array, values_array).unwrap(); + let source_array = PrimitiveArray::from_iter(0..length as i32).into_array(); + let (ends, values) = runend_encode(&values).unwrap(); + let runend_array = RunEndArray::try_new(ends.into_array(), values).unwrap(); bencher - .with_inputs(|| (&indices, &source_array)) - .bench_refs(|(indices, array)| take(indices, array).unwrap()); + .with_inputs(|| (runend_array.to_array(), source_array.clone())) + .bench_refs(|(indices, array)| take(array, indices).unwrap()); } From 20b810f53ceb135f215fb4374442ce094b466177 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 15:55:36 +0000 Subject: [PATCH 06/12] gets called through take --- encodings/dict/src/array.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 4145729c02..da6aa83005 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -124,15 +124,7 @@ impl ArrayCanonicalImpl for DictArray { }) }) } - _ => { - if let Some(take_from_fn) = self.codes().vtable().take_from_fn() { - if let Some(array) = take_from_fn.take_from(self.codes(), self.values())? { - return array.to_canonical(); - } - } - - take(self.values(), self.codes())?.to_canonical() - } + _ => take(self.values(), self.codes())?.to_canonical(), } } From 204036dbd3eedb70efb26ffa01195326020aaa5f Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 16:05:23 +0000 Subject: [PATCH 07/12] unused imports --- Cargo.lock | 1 - encodings/dict/Cargo.toml | 1 - encodings/dict/benches/dict_compress.rs | 6 +----- encodings/dict/src/compute/mod.rs | 27 +------------------------ 4 files changed, 2 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f263c6509..7841d84789 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5424,7 +5424,6 @@ dependencies = [ "vortex-error", "vortex-fsst", "vortex-mask", - "vortex-runend", "vortex-scalar", ] diff --git a/encodings/dict/Cargo.toml b/encodings/dict/Cargo.toml index 117af01d18..096eac4a13 100644 --- a/encodings/dict/Cargo.toml +++ b/encodings/dict/Cargo.toml @@ -39,7 +39,6 @@ workspace = true divan = { workspace = true } rand = { workspace = true } vortex-array = { workspace = true, features = ["test-harness"] } -vortex-runend = { workspace = true } [[bench]] name = "dict_compress" diff --git a/encodings/dict/benches/dict_compress.rs b/encodings/dict/benches/dict_compress.rs index 6335775f09..5243fab3b0 100644 --- a/encodings/dict/benches/dict_compress.rs +++ b/encodings/dict/benches/dict_compress.rs @@ -3,14 +3,10 @@ use divan::Bencher; use rand::distr::{Distribution, StandardUniform}; use vortex_array::Array; -use vortex_array::arrays::{PrimitiveArray, VarBinArray, VarBinViewArray}; -use vortex_array::validity::Validity::NonNullable; -use vortex_buffer::buffer; -use vortex_dict::DictArray; +use vortex_array::arrays::{VarBinArray, VarBinViewArray}; use vortex_dict::builders::dict_encode; use vortex_dict::test::{gen_primitive_for_dict, gen_varbin_words}; use vortex_dtype::NativePType; -use vortex_runend::RunEndArray; fn main() { divan::main(); diff --git a/encodings/dict/src/compute/mod.rs b/encodings/dict/src/compute/mod.rs index 7e91da1ba9..5b31228346 100644 --- a/encodings/dict/src/compute/mod.rs +++ b/encodings/dict/src/compute/mod.rs @@ -84,12 +84,10 @@ mod test { use vortex_array::arrays::{ConstantArray, PrimitiveArray, VarBinArray, VarBinViewArray}; use vortex_array::compute::test_harness::test_mask; use vortex_array::compute::{Operator, compare, scalar_at, slice}; - use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical}; - use vortex_buffer::buffer; + use vortex_array::{Array, ArrayRef, ToCanonical}; use vortex_dtype::{DType, Nullability}; use vortex_scalar::Scalar; - use crate::DictArray; use crate::builders::dict_encode; #[test] @@ -230,27 +228,4 @@ mod test { .unwrap(); test_mask(&array); } - - #[test] - fn test_dict_array_runend_decode() { - let run_end_codes = vortex_runend::RunEndArray::try_new( - buffer![2u32, 7, 10].into_array(), - buffer![0u32, 2, 1].into_array(), - ) - .unwrap(); - - let dict_values = buffer![100u32, 200, 300].into_array(); - let dict_array = DictArray::try_new(run_end_codes.into_array(), dict_values).unwrap(); - let canonical = dict_array.to_canonical().unwrap(); - - let expected = buffer![100u32, 100, 300, 300, 300, 300, 300, 200, 200, 200] - .into_array() - .to_canonical() - .unwrap(); - - assert_eq!( - canonical.into_primitive().unwrap().as_slice::(), - expected.into_primitive().unwrap().as_slice::() - ); - } } From ab588d61e16956b917260d97b0f0a8a0d24da350 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 16:13:07 +0000 Subject: [PATCH 08/12] clippy --- encodings/runend/benches/run_end_compress.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/encodings/runend/benches/run_end_compress.rs b/encodings/runend/benches/run_end_compress.rs index 2451f2c481..6adcd5c60a 100644 --- a/encodings/runend/benches/run_end_compress.rs +++ b/encodings/runend/benches/run_end_compress.rs @@ -58,6 +58,7 @@ fn decompress_to_canonical(bencher: Bencher, (length, run_step): (usize, usize)) } #[divan::bench(args = BENCH_ARGS)] +#[allow(clippy::cast_possible_truncation)] fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { let values = PrimitiveArray::new( (0..=length) @@ -68,7 +69,7 @@ fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { Validity::NonNullable, ); - let source_array = PrimitiveArray::from_iter(0..length as i32).into_array(); + let source_array = PrimitiveArray::from_iter(0..(length as i32)).into_array(); let (ends, values) = runend_encode(&values).unwrap(); let runend_array = RunEndArray::try_new(ends.into_array(), values).unwrap(); From 153b967c7ea1d79358eb663bb18b6c51960d308b Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 17:31:16 +0000 Subject: [PATCH 09/12] clarify comment --- encodings/runend/src/compute/take_from.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index 47f92f047e..e90809b82d 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -28,12 +28,14 @@ impl TakeFromFn<&RunEndArray> for RunEndEncoding { return Ok(None); } - // Order the values to prepare for runend decoding. - let shuffled = take(array, indices.values())?; + // Transform the run-end encoding from storing indices to storing values + // by taking values from `array` at positions specified in `indices.values()`. + let transformed = take(array, indices.values())?; + // Create a new run-end array now containing the values instead of indices. let ree_array = RunEndArray::with_offset_and_length( indices.ends().clone(), - shuffled, + transformed, indices.offset(), indices.len(), )?; From 6d462180a0e223bf60f4882b907da3ba4dcccaa6 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 18:36:12 +0000 Subject: [PATCH 10/12] fix benchmark --- encodings/runend/benches/run_end_compress.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/encodings/runend/benches/run_end_compress.rs b/encodings/runend/benches/run_end_compress.rs index 6adcd5c60a..8b7af6c102 100644 --- a/encodings/runend/benches/run_end_compress.rs +++ b/encodings/runend/benches/run_end_compress.rs @@ -61,7 +61,7 @@ fn decompress_to_canonical(bencher: Bencher, (length, run_step): (usize, usize)) #[allow(clippy::cast_possible_truncation)] fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { let values = PrimitiveArray::new( - (0..=length) + (0..length) .step_by(run_step) .enumerate() .flat_map(|(idx, x)| repeat_n(idx as u64, x)) @@ -74,6 +74,6 @@ fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { let runend_array = RunEndArray::try_new(ends.into_array(), values).unwrap(); bencher - .with_inputs(|| (runend_array.to_array(), source_array.clone())) - .bench_refs(|(indices, array)| take(array, indices).unwrap()); + .with_inputs(|| (source_array.clone(), runend_array.to_array())) + .bench_refs(|(array, indices)| take(array, indices).unwrap()); } From df42ec4cc9baef98ab5aeb503fe78bfdda877d96 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 20:36:55 +0000 Subject: [PATCH 11/12] docs: shorten docs --- encodings/runend/src/compute/take_from.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index e90809b82d..037afdf26a 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -10,8 +10,8 @@ impl TakeFromFn<&RunEndArray> for RunEndEncoding { /// /// # Arguments /// - /// * `indices` - The run-end encoded array containing the indices - /// * `array` - The source array to take values from + /// * `indices` - Run-end encoded indices + /// * `array` - Array to take values from /// /// # Returns /// From b538a217d900d4c42143b9d9a57605c4186686f3 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Wed, 26 Feb 2025 22:44:24 +0000 Subject: [PATCH 12/12] rename --- encodings/runend/src/compute/take_from.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/encodings/runend/src/compute/take_from.rs b/encodings/runend/src/compute/take_from.rs index 037afdf26a..55e8a33e2b 100644 --- a/encodings/runend/src/compute/take_from.rs +++ b/encodings/runend/src/compute/take_from.rs @@ -11,31 +11,31 @@ impl TakeFromFn<&RunEndArray> for RunEndEncoding { /// # Arguments /// /// * `indices` - Run-end encoded indices - /// * `array` - Array to take values from + /// * `source` - Array to take values from /// /// # Returns /// - /// * `Ok(Some(array))` - If successful + /// * `Ok(Some(source))` - If successful /// * `Ok(None)` - If the source array has an unsupported dtype /// fn take_from( &self, indices: &RunEndArray, - array: &dyn Array, + source: &dyn Array, ) -> VortexResult> { // Only `Primitive` and `Bool` are valid run-end value types. - TODO: Support additional DTypes - if !matches!(array.dtype(), DType::Primitive(_, _) | DType::Bool(_)) { + if !matches!(source.dtype(), DType::Primitive(_, _) | DType::Bool(_)) { return Ok(None); } // Transform the run-end encoding from storing indices to storing values - // by taking values from `array` at positions specified in `indices.values()`. - let transformed = take(array, indices.values())?; + // by taking values from `source` at positions specified by `indices.values()`. + let values = take(source, indices.values())?; - // Create a new run-end array now containing the values instead of indices. + // Create a new run-end array containing values as values, instead of indices as values. let ree_array = RunEndArray::with_offset_and_length( indices.ends().clone(), - transformed, + values, indices.offset(), indices.len(), )?;