From 10a0e5c7ba4cd4ce3721d859b2e339c223f97e19 Mon Sep 17 00:00:00 2001 From: Geoffrey Claude Date: Fri, 9 Jan 2026 11:59:30 +0100 Subject: [PATCH] Implement Zero-Copy Reinterpretation and enable Int8/Int16 Bitmaps Build Int8 and Int16 IN-list bitmap filters by reinterpreting the input buffers as UInt8 or UInt16 with the same byte width. This avoids copying or numeric conversion while preserving signed integer equality semantics. --- .../physical-expr/src/expressions/in_list.rs | 1 + .../expressions/in_list/primitive_filter.rs | 19 +- .../src/expressions/in_list/strategy.rs | 8 +- .../src/expressions/in_list/transform.rs | 169 ++++++++++++++++++ 4 files changed, 189 insertions(+), 8 deletions(-) create mode 100644 datafusion/physical-expr/src/expressions/in_list/transform.rs diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 50ff3936937bf..be73b0a9d11be 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -41,6 +41,7 @@ mod primitive_filter; mod result; mod static_filter; mod strategy; +mod transform; use static_filter::StaticFilter; use strategy::instantiate_static_filter; diff --git a/datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs b/datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs index 0e2ee564656ac..30c51c90f73a9 100644 --- a/datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs +++ b/datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs @@ -142,6 +142,22 @@ where fn check(&self, needle: T::Native) -> bool { self.bits.get_bit(needle.as_usize()) } + + /// Check membership using a raw values slice (zero-copy path for type reinterpretation). 
+ #[inline] + pub(super) fn contains_slice( + &self, + values: &[T::Native], + nulls: Option<&NullBuffer>, + negated: bool, + ) -> BooleanArray { + build_in_list_result(values.len(), nulls, self.null_count > 0, negated, |i| { + // SAFETY: `build_in_list_result` invokes this closure for + // indices in `0..values.len()`. + let needle = unsafe { *values.get_unchecked(i) }; + self.check(needle) + }) + } } impl StaticFilter for BitmapFilter @@ -359,9 +375,6 @@ macro_rules! primitive_static_filter { }; } -// Generate specialized filters for all integer primitive types -primitive_static_filter!(Int8StaticFilter, Int8Type); -primitive_static_filter!(Int16StaticFilter, Int16Type); primitive_static_filter!(Int32StaticFilter, Int32Type); primitive_static_filter!(Int64StaticFilter, Int64Type); primitive_static_filter!(UInt32StaticFilter, UInt32Type); diff --git a/datafusion/physical-expr/src/expressions/in_list/strategy.rs b/datafusion/physical-expr/src/expressions/in_list/strategy.rs index 21b658fad0382..8a544f9ad6485 100644 --- a/datafusion/physical-expr/src/expressions/in_list/strategy.rs +++ b/datafusion/physical-expr/src/expressions/in_list/strategy.rs @@ -25,6 +25,7 @@ use datafusion_common::Result; use super::array_static_filter::ArrayStaticFilter; use super::primitive_filter::*; use super::static_filter::StaticFilter; +use super::transform::make_bitmap_filter; pub(super) fn instantiate_static_filter( in_array: ArrayRef, @@ -37,13 +38,10 @@ pub(super) fn instantiate_static_filter( _ => in_array, }; match in_array.data_type() { - // Integer primitive types - DataType::Int8 => Ok(Arc::new(Int8StaticFilter::try_new(&in_array)?)), - DataType::Int16 => Ok(Arc::new(Int16StaticFilter::try_new(&in_array)?)), + DataType::Int8 | DataType::UInt8 => make_bitmap_filter::(&in_array), + DataType::Int16 | DataType::UInt16 => make_bitmap_filter::(&in_array), DataType::Int32 => Ok(Arc::new(Int32StaticFilter::try_new(&in_array)?)), DataType::Int64 => 
Ok(Arc::new(Int64StaticFilter::try_new(&in_array)?)), - DataType::UInt8 => Ok(Arc::new(BitmapFilter::::try_new(&in_array)?)), - DataType::UInt16 => Ok(Arc::new(BitmapFilter::::try_new(&in_array)?)), DataType::UInt32 => Ok(Arc::new(UInt32StaticFilter::try_new(&in_array)?)), DataType::UInt64 => Ok(Arc::new(UInt64StaticFilter::try_new(&in_array)?)), // Float primitive types (use ordered wrappers for Hash/Eq) diff --git a/datafusion/physical-expr/src/expressions/in_list/transform.rs b/datafusion/physical-expr/src/expressions/in_list/transform.rs new file mode 100644 index 0000000000000..332ce976729bf --- /dev/null +++ b/datafusion/physical-expr/src/expressions/in_list/transform.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Type transformation utilities for InList filters. +//! +//! Some filters only depend on fixed-width value bit patterns. For those cases, +//! compatible primitive arrays can be reinterpreted to the filter's unsigned +//! storage type without copying values. 
+ +use std::mem::size_of; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray}; +use arrow::buffer::ScalarBuffer; +use arrow::datatypes::{ArrowPrimitiveType, DataType}; +use datafusion_common::{Result, exec_datafusion_err}; + +use super::primitive_filter::{BitmapFilter, BitmapFilterType}; +use super::static_filter::{StaticFilter, handle_dictionary}; + +/// Bitmap filter for signed 1-byte and 2-byte primitive arrays. +/// +/// The bitmap implementation is keyed by an unsigned primitive type (`UInt8` or +/// `UInt16`). This wrapper keeps the original array type, such as `Int8`, and +/// only reinterprets values as the unsigned type when probing the bitmap. +struct ReinterpretedBitmap { + expected_data_type: DataType, + inner: BitmapFilter, +} + +impl StaticFilter for ReinterpretedBitmap { + fn null_count(&self) -> usize { + self.inner.null_count() + } + + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + handle_dictionary!(self, v, negated); + + if v.data_type() != &self.expected_data_type { + return Err(exec_datafusion_err!( + "BitmapFilter: expected {} array, got {}", + self.expected_data_type, + v.data_type() + )); + } + + let data = v.to_data(); + let values: &[T::Native] = &data.buffer::(0)[..v.len()]; + + Ok(self.inner.contains_slice(values, data.nulls(), negated)) + } +} + +/// Views a primitive array as another primitive type with the same byte width. +/// +/// This does not convert values. It reuses the existing values buffer and +/// interprets each value's bytes as `T::Native`, preserving the null buffer. +/// The caller must check that the source and target primitive types have the +/// same width. 
+#[inline] +pub(crate) fn reinterpret_any_primitive_to( + array: &dyn Array, +) -> ArrayRef { + let data = array.to_data(); + let values = data.buffers()[0].clone(); + let buffer = ScalarBuffer::::new(values, data.offset(), data.len()); + Arc::new(PrimitiveArray::::new(buffer, array.nulls().cloned())) +} + +/// Creates a bitmap filter for 1-byte or 2-byte primitive arrays. +/// +/// Unsigned inputs use the bitmap filter directly. Signed inputs of the same +/// width are reinterpreted as the unsigned bitmap type, without copying. +pub(crate) fn make_bitmap_filter( + in_array: &ArrayRef, +) -> Result> +where + T: BitmapFilterType, +{ + if in_array.data_type() == &T::DATA_TYPE { + return Ok(Arc::new(BitmapFilter::::try_new(in_array)?)); + } + + let width = size_of::(); + if in_array.data_type().primitive_width() != Some(width) { + return Err(exec_datafusion_err!( + "BitmapFilter: expected {}-byte primitive array for {} bitmap, got {}", + width, + T::DATA_TYPE, + in_array.data_type() + )); + } + + let reinterpreted = reinterpret_any_primitive_to::(in_array.as_ref()); + let inner = BitmapFilter::::try_new(&reinterpreted)?; + Ok(Arc::new(ReinterpretedBitmap { + expected_data_type: in_array.data_type().clone(), + inner, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + use arrow::array::{ArrayRef, BooleanArray, Int8Array, Int16Array}; + use arrow::datatypes::{UInt8Type, UInt16Type}; + + #[test] + fn reinterpreted_bitmap_handles_signed_boundaries_and_slices() -> Result<()> { + let haystack: ArrayRef = Arc::new( + Int8Array::from(vec![Some(99), Some(i8::MIN), None, Some(-1), Some(42)]) + .slice(1, 3), + ); + let filter = make_bitmap_filter::(&haystack)?; + let needles = + Int8Array::from(vec![Some(7), Some(i8::MIN), Some(-1), None]).slice(1, 3); + + assert_eq!( + filter.contains(&needles, false)?, + BooleanArray::from(vec![Some(true), Some(true), None]) + ); + assert_eq!( + filter.contains(&needles, true)?, + 
BooleanArray::from(vec![Some(false), Some(false), None]) + ); + + let haystack: ArrayRef = Arc::new( + Int16Array::from(vec![ + Some(123), + Some(i16::MIN), + None, + Some(-1), + Some(i16::MAX), + ]) + .slice(1, 4), + ); + let filter = make_bitmap_filter::<UInt16Type>(&haystack)?; + let needles = + Int16Array::from(vec![Some(0), Some(i16::MIN), Some(7), Some(i16::MAX)]) + .slice(1, 3); + + assert_eq!( + filter.contains(&needles, false)?, + BooleanArray::from(vec![Some(true), None, Some(true)]) + ); + assert_eq!( + filter.contains(&needles, true)?, + BooleanArray::from(vec![Some(false), None, Some(false)]) + ); + + Ok(()) + } +}