diff --git a/Cargo.lock b/Cargo.lock index a5f233704..780dacc38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5701,7 +5701,9 @@ dependencies = [ "approx", "arrow-array", "arrow-buffer", + "arrow-ipc", "arrow-schema", + "datafusion-common", "sedona-common", "sedona-schema", "sedona-testing", diff --git a/rust/sedona-raster-functions/src/rs_pixel_functions.rs b/rust/sedona-raster-functions/src/rs_pixel_functions.rs index c6bb048bf..dcb82eaa3 100644 --- a/rust/sedona-raster-functions/src/rs_pixel_functions.rs +++ b/rust/sedona-raster-functions/src/rs_pixel_functions.rs @@ -191,7 +191,7 @@ impl SedonaScalarKernel for RsPixelAsCentroid { let grid_x = (col_x - 1) as f64 + 0.5; let grid_y = (row_y - 1) as f64 + 0.5; - let affine = AffineMatrix::from_metadata(raster.metadata()); + let affine = AffineMatrix::from_metadata(&raster.metadata()); let (wx, wy) = affine.transform(grid_x, grid_y); write_wkb_point(&mut builder, (wx, wy)) diff --git a/rust/sedona-raster-gdal/src/gdal_common.rs b/rust/sedona-raster-gdal/src/gdal_common.rs index 0c96fd1cb..2a6fad688 100644 --- a/rust/sedona-raster-gdal/src/gdal_common.rs +++ b/rust/sedona-raster-gdal/src/gdal_common.rs @@ -222,6 +222,13 @@ pub unsafe fn raster_ref_to_gdal_mem( .band(src_band_index) .map_err(|e| arrow_datafusion_err!(e))?; + if !band.is_2d() { + return exec_err!( + "GDAL backend requires a 2-dim band; got dim_names={:?}", + band.dim_names() + ); + } + if band.metadata().storage_type()? != StorageType::InDb { return Err(DataFusionError::NotImplemented( "OutDb bands are not supported in raster_to_mem_dataset".to_string(), @@ -825,4 +832,40 @@ mod tests { .unwrap(); assert!(err.to_string().contains("OutDb bands are not supported")); } + + #[test] + fn test_raster_ref_to_gdal_mem_rejects_nd_bands() { + // Build a 3-D in-db band shaped ["time","y","x"] over a 2-D raster. + // The N-D guard should fire before any GDAL call. 
+ let mut builder = RasterBuilder::new(1); + builder + .start_raster_2d(2, 2, 0.0, 2.0, 1.0, -1.0, 0.0, 0.0, None) + .unwrap(); + builder + .start_band_nd( + None, + &["time", "y", "x"], + &[3, 2, 2], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8; 3 * 2 * 2]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let raster_array = builder.finish().unwrap(); + let raster = single_raster(&raster_array); + + let err = with_gdal(|gdal| unsafe { raster_ref_to_gdal_mem(gdal, &raster, &[1]) }) + .err() + .unwrap(); + assert!( + err.to_string().contains("requires a 2-dim band"), + "got: {err}" + ); + } } diff --git a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs index 03e45621e..a9d1013c5 100644 --- a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs +++ b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs @@ -259,6 +259,14 @@ impl GDALDatasetCache { for i in 1..=num_bands { let band = bands.band(i).map_err(|e| arrow_datafusion_err!(e))?; + + if !band.is_2d() { + return exec_err!( + "GDAL backend requires 2-dim bands; got dim_names={:?}", + band.dim_names() + ); + } + let band_metadata = band.metadata(); let band_type = band_metadata.data_type()?; let gdal_type = band_data_type_to_gdal(&band_type); @@ -660,6 +668,32 @@ mod tests { path_str } + /// Two-band GeoTIFF on disk: band 1 is filled with `band1_fill`, band 2 + /// with `band2_fill`. Used to exercise `#band=2` selection end-to-end. 
+ fn create_two_band_source_tiff(temp_dir: &TempDir, band1_fill: u8, band2_fill: u8) -> String { + let path = temp_dir.path().join("two_band.tif"); + let path_str = path.to_string_lossy().to_string(); + + with_gdal(|gdal| { + let driver = gdal.get_driver_by_name("GTiff").unwrap(); + let dataset = driver + .create_with_band_type::(&path_str, 8, 8, 2) + .unwrap(); + dataset + .set_geo_transform(&[0.0, 1.0, 0.0, 8.0, 0.0, -1.0]) + .unwrap(); + for (i, fill) in [band1_fill, band2_fill].iter().enumerate() { + let band = dataset.rasterband(i + 1).unwrap(); + let mut buffer = Buffer::new((8, 8), vec![*fill; 8 * 8]); + band.write((0, 0), (8, 8), &mut buffer).unwrap(); + } + Ok(()) + }) + .unwrap(); + + path_str + } + fn build_outdb_raster(path: &str) -> arrow_array::StructArray { let mut builder = RasterBuilder::new(1); let metadata = RasterMetadata { @@ -976,4 +1010,109 @@ mod tests { assert!(key_a != key_b); } + + #[test] + fn test_provider_rejects_nd_band_in_vrt_path() { + let temp_dir = TempDir::new().unwrap(); + let path = create_source_tiff(&temp_dir); + + // Build a raster mixing one in-db 3-D band (forces N-D rejection inside + // build_vrt_from_sources) with one out-db band. 
+ let mut builder = RasterBuilder::new(1); + builder + .start_raster_2d(8, 8, 0.0, 8.0, 1.0, -1.0, 0.0, 0.0, None) + .unwrap(); + builder + .start_band_nd( + None, + &["time", "y", "x"], + &[2, 8, 8], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8; 2 * 8 * 8]); + builder.finish_band().unwrap(); + builder + .start_band_nd( + None, + &["y", "x"], + &[8, 8], + BandDataType::UInt8, + Some(&[0u8]), + Some(&path), + Some("geotiff"), + ) + .unwrap(); + builder.band_data_writer().append_value([]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let raster_struct = builder.finish().unwrap(); + let raster_array = RasterStructArray::new(&raster_struct); + let raster = raster_array.get(0).unwrap(); + let cache = Rc::new(GDALDatasetCache::try_new(4, 4).unwrap()); + + let err = with_gdal(|gdal| { + let provider = GDALDatasetProvider::new(gdal, Rc::clone(&cache)); + provider.raster_ref_to_gdal(&raster) + }) + .err() + .unwrap(); + assert!(err.to_string().contains("2-dim band"), "got: {err}"); + } + + #[test] + fn test_provider_selects_outdb_band_via_band_fragment() { + let temp_dir = TempDir::new().unwrap(); + // Source TIFF: band 1 filled with 7s, band 2 filled with 99s. + let path = create_two_band_source_tiff(&temp_dir, 7u8, 99u8); + + // Build a 1-band raster whose single band points at source band 2. 
+ let metadata = RasterMetadata { + width: 8, + height: 8, + upperleft_x: 0.0, + upperleft_y: 8.0, + scale_x: 1.0, + scale_y: -1.0, + skew_x: 0.0, + skew_y: 0.0, + }; + let mut builder = RasterBuilder::new(1); + builder.start_raster(&metadata, None).unwrap(); + builder + .start_band(BandMetadata { + nodata_value: Some(vec![0u8]), + storage_type: StorageType::OutDbRef, + datatype: BandDataType::UInt8, + outdb_url: Some(path.clone()), + outdb_band_id: Some(2), + }) + .unwrap(); + builder.band_data_writer().append_value([]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let raster_struct = builder.finish().unwrap(); + let raster_array = RasterStructArray::new(&raster_struct); + let raster = raster_array.get(0).unwrap(); + let cache = Rc::new(GDALDatasetCache::try_new(4, 4).unwrap()); + + let dataset = with_gdal(|gdal| { + let provider = GDALDatasetProvider::new(gdal, Rc::clone(&cache)); + provider.raster_ref_to_gdal(&raster) + }) + .unwrap(); + + let band = dataset + .as_dataset() + .rasterband(1) + .unwrap() + .read_as::((0, 0), (8, 8), (8, 8), None) + .unwrap(); + assert_eq!(band.data().to_vec(), vec![99u8; 8 * 8]); + } } diff --git a/rust/sedona-raster/Cargo.toml b/rust/sedona-raster/Cargo.toml index 7407a0507..37e7ecfbf 100644 --- a/rust/sedona-raster/Cargo.toml +++ b/rust/sedona-raster/Cargo.toml @@ -34,9 +34,11 @@ result_large_err = "allow" arrow-schema = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } +datafusion-common = { workspace = true } sedona-common = { workspace = true } sedona-schema = { workspace = true } [dev-dependencies] sedona-testing = { workspace = true } approx = { workspace = true } +arrow-ipc = { workspace = true } diff --git a/rust/sedona-raster/src/affine_transformation.rs b/rust/sedona-raster/src/affine_transformation.rs index ca6441e73..00ff22793 100644 --- a/rust/sedona-raster/src/affine_transformation.rs +++ b/rust/sedona-raster/src/affine_transformation.rs @@ -108,7 
+108,7 @@ pub fn rotation(raster: &dyn RasterRef) -> f64 { /// * `y` - Y coordinate in pixel space (row) #[inline] pub fn to_world_coordinate(raster: &dyn RasterRef, x: i64, y: i64) -> (f64, f64) { - AffineMatrix::from_metadata(raster.metadata()).transform(x as f64, y as f64) + AffineMatrix::from_metadata(&raster.metadata()).transform(x as f64, y as f64) } /// Performs the inverse affine transformation to convert world coordinates back to raster pixel coordinates. @@ -124,14 +124,14 @@ pub fn to_raster_coordinate( world_y: f64, ) -> Result<(i64, i64), ArrowError> { let (rx, ry) = - AffineMatrix::from_metadata(raster.metadata()).inv_transform(world_x, world_y)?; + AffineMatrix::from_metadata(&raster.metadata()).inv_transform(world_x, world_y)?; Ok((rx as i64, ry as i64)) } #[cfg(test)] mod tests { use super::*; - use crate::traits::{MetadataRef, RasterMetadata}; + use crate::traits::{BandRef, Bands, RasterMetadata}; use approx::assert_relative_eq; use std::f64::consts::FRAC_1_SQRT_2; use std::f64::consts::PI; @@ -141,14 +141,34 @@ mod tests { } impl RasterRef for TestRaster { - fn metadata(&self) -> &dyn MetadataRef { - &self.metadata + fn num_bands(&self) -> usize { + 0 + } + fn bands(&self) -> Bands<'_> { + Bands::new(self) + } + fn band(&self, index: usize) -> Result, ArrowError> { + Err(ArrowError::InvalidArgumentError(format!( + "Band index {index} is out of range: this raster has 0 bands" + ))) + } + fn band_name(&self, _index: usize) -> Option<&str> { + None } fn crs(&self) -> Option<&str> { None } - fn bands(&self) -> &dyn crate::traits::BandsRef { - unimplemented!() + fn transform(&self) -> &[f64] { + &[] + } + fn spatial_dims(&self) -> Vec<&str> { + vec![] + } + fn spatial_shape(&self) -> &[i64] { + &[] + } + fn metadata(&self) -> RasterMetadata { + self.metadata.clone() } } diff --git a/rust/sedona-raster/src/array.rs b/rust/sedona-raster/src/array.rs index 07a4bce04..bc7c2317b 100644 --- a/rust/sedona-raster/src/array.rs +++ 
b/rust/sedona-raster/src/array.rs @@ -15,445 +15,448 @@ // specific language governing permissions and limitations // under the License. +use std::borrow::Cow; + use arrow_array::{ - Array, BinaryArray, BinaryViewArray, Float64Array, ListArray, StringArray, StringViewArray, - StructArray, UInt32Array, UInt64Array, + Array, BinaryArray, BinaryViewArray, Float64Array, Int64Array, ListArray, StringArray, + StringViewArray, StructArray, UInt32Array, UInt64Array, }; use arrow_schema::ArrowError; -use crate::traits::{ - BandIterator, BandMetadataRef, BandRef, BandsRef, MetadataRef, RasterMetadata, RasterRef, -}; -use sedona_schema::raster::{ - band_indices, band_metadata_indices, metadata_indices, raster_indices, BandDataType, - StorageType, -}; +use crate::traits::{BandRef, Bands, NdBuffer, RasterRef, ViewEntry}; +use sedona_schema::raster::{band_indices, raster_indices, BandDataType}; -/// Implement MetadataRef for RasterMetadata to allow direct use with builder -impl MetadataRef for RasterMetadata { - fn width(&self) -> u64 { - self.width - } - fn height(&self) -> u64 { - self.height - } - fn upper_left_x(&self) -> f64 { - self.upperleft_x - } - fn upper_left_y(&self) -> f64 { - self.upperleft_y - } - fn scale_x(&self) -> f64 { - self.scale_x - } - fn scale_y(&self) -> f64 { - self.scale_y - } - fn skew_x(&self) -> f64 { - self.skew_x - } - fn skew_y(&self) -> f64 { - self.skew_y - } -} - -/// Implementation of MetadataRef for Arrow StructArray -struct MetadataRefImpl<'a> { - width_array: &'a UInt64Array, - height_array: &'a UInt64Array, - upper_left_x_array: &'a Float64Array, - upper_left_y_array: &'a Float64Array, - scale_x_array: &'a Float64Array, - scale_y_array: &'a Float64Array, - skew_x_array: &'a Float64Array, - skew_y_array: &'a Float64Array, - index: usize, +/// Arrow-backed implementation of BandRef for a single band within a raster. 
+/// +/// Today this handles only the canonical identity view: `view_entries` is +/// synthesised from `source_shape`, `visible_shape == source_shape`, +/// and `byte_strides` are plain C-order strides with `byte_offset = 0`. +struct BandRefImpl<'a> { + dim_names_list: &'a ListArray, + dim_names_values: &'a StringArray, + source_shape_list: &'a ListArray, + source_shape_values: &'a UInt64Array, + nodata_array: &'a BinaryArray, + outdb_uri_array: &'a StringArray, + outdb_format_array: &'a StringViewArray, + data_array: &'a BinaryViewArray, + /// Absolute row index within the flattened bands arrays + band_row: usize, + /// Resolved at construction so accessors don't re-decode the discriminant. + data_type: BandDataType, + /// Per-visible-axis view, length = ndim. Always identity today. + view_entries: Vec, + /// Visible shape, length = ndim. Equals `source_shape` today. + visible_shape: Vec, + /// Byte strides per visible axis. C-order over `source_shape` today. + byte_strides: Vec, + /// Byte offset into `data` of the visible region's `[0,...,0]` element. 
+ byte_offset: u64, } -impl<'a> MetadataRef for MetadataRefImpl<'a> { - #[inline(always)] - fn width(&self) -> u64 { - self.width_array.value(self.index) - } - - #[inline(always)] - fn height(&self) -> u64 { - self.height_array.value(self.index) +impl<'a> BandRef for BandRefImpl<'a> { + fn ndim(&self) -> usize { + self.view_entries.len() } - #[inline(always)] - fn upper_left_x(&self) -> f64 { - self.upper_left_x_array.value(self.index) + fn dim_names(&self) -> Vec<&str> { + let start = self.dim_names_list.value_offsets()[self.band_row] as usize; + let end = self.dim_names_list.value_offsets()[self.band_row + 1] as usize; + (start..end) + .map(|i| self.dim_names_values.value(i)) + .collect() } - #[inline(always)] - fn upper_left_y(&self) -> f64 { - self.upper_left_y_array.value(self.index) + fn shape(&self) -> &[u64] { + &self.visible_shape } - #[inline(always)] - fn scale_x(&self) -> f64 { - self.scale_x_array.value(self.index) + fn raw_source_shape(&self) -> &[u64] { + let start = self.source_shape_list.value_offsets()[self.band_row] as usize; + let end = self.source_shape_list.value_offsets()[self.band_row + 1] as usize; + &self.source_shape_values.values()[start..end] } - #[inline(always)] - fn scale_y(&self) -> f64 { - self.scale_y_array.value(self.index) + fn view(&self) -> &[ViewEntry] { + &self.view_entries } - #[inline(always)] - fn skew_x(&self) -> f64 { - self.skew_x_array.value(self.index) + fn data_type(&self) -> BandDataType { + self.data_type } - #[inline(always)] - fn skew_y(&self) -> f64 { - self.skew_y_array.value(self.index) + fn data(&self) -> &[u8] { + // Pre-N-D compatibility surface. Identity-view InDb bands → the + // row-major in-line buffer (zero-copy borrow into the StructArray), + // matching the pre-N-D behavior exactly. OutDb → `&[]` from the + // empty `data` column, no panic. 
Non-identity views never reach + // here — `RasterRefImpl::band()` rejects them upstream so the + // raw column bytes always equal the visible bytes for any band + // this reader produces. + self.data_array.value(self.band_row) } -} - -/// Implementation of BandMetadataRef for Arrow StructArray -struct BandMetadataRefImpl<'a> { - nodata_array: &'a BinaryArray, - storage_type_array: &'a UInt32Array, - datatype_array: &'a UInt32Array, - outdb_url_array: &'a StringArray, - outdb_band_id_array: &'a UInt32Array, - band_index: usize, -} -impl<'a> BandMetadataRef for BandMetadataRefImpl<'a> { - fn nodata_value(&self) -> Option<&[u8]> { - if self.nodata_array.is_null(self.band_index) { + fn nodata(&self) -> Option<&[u8]> { + if self.nodata_array.is_null(self.band_row) { None } else { - Some(self.nodata_array.value(self.band_index)) + Some(self.nodata_array.value(self.band_row)) } } - fn storage_type(&self) -> Result { - let value = self.storage_type_array.value(self.band_index); - let storage_type = match value { - 0 => StorageType::InDb, - 1 => StorageType::OutDbRef, - _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "Unknown storage type: {}", - value - ))) - } - }; - Ok(storage_type) - } - - fn data_type(&self) -> Result { - let value = self.datatype_array.value(self.band_index); - let band_data_type = match value { - 1 => BandDataType::UInt8, - 2 => BandDataType::UInt16, - 3 => BandDataType::Int16, - 4 => BandDataType::UInt32, - 5 => BandDataType::Int32, - 6 => BandDataType::Float32, - 7 => BandDataType::Float64, - 8 => BandDataType::UInt64, - 9 => BandDataType::Int64, - 10 => BandDataType::Int8, - _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "Unknown band data type: {}", - self.datatype_array.value(self.band_index) - ))) - } - }; - Ok(band_data_type) - } - - fn outdb_url(&self) -> Option<&str> { - if self.outdb_url_array.is_null(self.band_index) { + fn outdb_uri(&self) -> Option<&str> { + if self.outdb_uri_array.is_null(self.band_row) 
{ None } else { - Some(self.outdb_url_array.value(self.band_index)) + Some(self.outdb_uri_array.value(self.band_row)) } } - fn outdb_band_id(&self) -> Option { - if self.outdb_band_id_array.is_null(self.band_index) { + fn outdb_format(&self) -> Option<&str> { + if self.outdb_format_array.is_null(self.band_row) { None } else { - Some(self.outdb_band_id_array.value(self.band_index)) + Some(self.outdb_format_array.value(self.band_row)) } } -} -/// Implementation of BandRef for accessing individual band data -struct BandRefImpl<'a> { - band_metadata: BandMetadataRefImpl<'a>, - band_data: &'a [u8], -} + fn is_indb(&self) -> bool { + !self.data_array.value(self.band_row).is_empty() + } -impl<'a> BandRef for BandRefImpl<'a> { - fn metadata(&self) -> &dyn BandMetadataRef { - &self.band_metadata + fn nd_buffer(&self) -> Result, ArrowError> { + if !self.is_indb() { + return Err(ArrowError::NotYetImplemented( + "OutDb byte access via nd_buffer() is not yet implemented; \ + backend-specific OutDb resolvers are tracked separately" + .to_string(), + )); + } + // shape and strides are owned by NdBuffer (see its doc comment). + // Cloning here is cheap — both vecs are O(ndim), a handful of values. + Ok(NdBuffer { + buffer: self.data_array.value(self.band_row), + shape: self.visible_shape.clone(), + strides: self.byte_strides.clone(), + offset: self.byte_offset, + data_type: self.data_type, + }) } - fn data(&self) -> &[u8] { - self.band_data + fn contiguous_data(&self) -> Result, ArrowError> { + if !self.is_indb() { + return Err(ArrowError::NotYetImplemented( + "OutDb byte access via contiguous_data() is not yet implemented; \ + backend-specific OutDb resolvers are tracked separately" + .to_string(), + )); + } + // Identity-view only today, so the data buffer is already row-major + // over the visible region. 
+ Ok(Cow::Borrowed(self.data_array.value(self.band_row))) } } -/// Implementation of BandsRef for accessing all bands in a raster -struct BandsRefImpl<'a> { +/// Arrow-backed implementation of RasterRef for a single raster row. +/// +/// Holds flat references to the underlying Arrow arrays so the impl does +/// not borrow from a `RasterStructArray` wrapper. That keeps +/// `RasterStructArray::get(&self, ...)` callable without a `&'a self` +/// constraint, which would otherwise force callers to hoist the +/// `RasterStructArray` into a `let` binding. +pub struct RasterRefImpl<'a> { + crs_array: &'a StringViewArray, + transform_list: &'a ListArray, + transform_values: &'a Float64Array, + spatial_dims_list: &'a ListArray, + spatial_dims_values: &'a StringViewArray, + spatial_shape_list: &'a ListArray, + spatial_shape_values: &'a Int64Array, bands_list: &'a ListArray, - raster_index: usize, - // Direct references to the metadata and data arrays - nodata_array: &'a BinaryArray, - storage_type_array: &'a UInt32Array, - datatype_array: &'a UInt32Array, - outdb_url_array: &'a StringArray, - outdb_band_id_array: &'a UInt32Array, + band_name_array: &'a StringArray, + band_dim_names_list: &'a ListArray, + band_dim_names_values: &'a StringArray, + band_source_shape_list: &'a ListArray, + band_source_shape_values: &'a UInt64Array, + band_datatype_array: &'a UInt32Array, + band_nodata_array: &'a BinaryArray, + band_view_list: &'a ListArray, + band_outdb_uri_array: &'a StringArray, + band_outdb_format_array: &'a StringViewArray, band_data_array: &'a BinaryViewArray, + raster_index: usize, +} + +impl<'a> RasterRefImpl<'a> { + /// Returns the raw CRS string reference with the array's lifetime. 
+ pub fn crs_str_ref(&self) -> Option<&'a str> { + if self.crs_array.is_null(self.raster_index) { + None + } else { + Some(self.crs_array.value(self.raster_index)) + } + } } -impl<'a> BandsRef for BandsRefImpl<'a> { - fn len(&self) -> usize { +impl<'a> RasterRef for RasterRefImpl<'a> { + fn num_bands(&self) -> usize { self.bands_list.value_length(self.raster_index) as usize } - /// Get a specific band by number (1-based index) - fn band(&self, number: usize) -> Result, ArrowError> { - if number == 0 { + fn bands(&self) -> Bands<'_> { + Bands::new(self) + } + + fn band(&self, index: usize) -> Result, ArrowError> { + let nbands = self.num_bands(); + if index >= nbands { return Err(ArrowError::InvalidArgumentError(format!( - "Invalid band number {number}: band numbers must be 1-based" + "Band index {index} is out of range: this raster has {nbands} bands" ))); } - // By convention, band numbers are 1-based. - // Convert to zero-based index. - let index = number - 1; - if index >= self.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Band number {} is out of range: this raster has {} bands", - number, - self.len() + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + + // Read source shape slice. + let ss_start = self.band_source_shape_list.value_offsets()[band_row] as usize; + let ss_end = self.band_source_shape_list.value_offsets()[band_row + 1] as usize; + let source_shape: &[u64] = &self.band_source_shape_values.values()[ss_start..ss_end]; + + // Reject 0-D bands at the read boundary. Schema doesn't forbid them + // outright but every consumer assumes ndim >= 1. 
+ if source_shape.is_empty() { + return Err(ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "band {band_row} has empty source_shape; ndim must be >= 1" + ), ))); } - let start = self.bands_list.value_offsets()[self.raster_index] as usize; - let band_row = start + index; + // Resolve data type up front; an unknown discriminant is a + // schema-corruption bug, not user data, so failing the band loudly + // here is appropriate. + let data_type_value = self.band_datatype_array.value(band_row); + let data_type = BandDataType::try_from_u32(data_type_value).ok_or_else(|| { + ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( + "band {band_row} has unknown data_type discriminant {data_type_value}" + ))) + })?; + + // Only the canonical identity view (null view row) is written today. + // A non-null view row would require the view → byte-stride composition + // path, which is not yet implemented. Surface it loudly here rather + // than silently rejecting the band, so callers see the standardised + // SedonaDB-internal-error framing. 
+ if !self.band_view_list.is_null(band_row) { + return Err(ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "non-null view row at band {band_row}: view composition is not yet implemented" + ), + ))); + } + let view_entries: Vec = source_shape + .iter() + .enumerate() + .map(|(i, &s)| ViewEntry { + source_axis: i as i64, + start: 0, + step: 1, + steps: s as i64, + }) + .collect(); - let band_metadata = BandMetadataRefImpl { - nodata_array: self.nodata_array, - storage_type_array: self.storage_type_array, - datatype_array: self.datatype_array, - outdb_url_array: self.outdb_url_array, - outdb_band_id_array: self.outdb_band_id_array, - band_index: band_row, - }; + let visible_shape: Vec = source_shape.to_vec(); - let band_data = self.band_data_array.value(band_row); + let dtype_size = data_type.byte_size() as i64; + let mut byte_strides = vec![0i64; source_shape.len()]; + byte_strides[source_shape.len() - 1] = dtype_size; + for k in (0..source_shape.len() - 1).rev() { + byte_strides[k] = byte_strides[k + 1] * (source_shape[k + 1] as i64); + } Ok(Box::new(BandRefImpl { - band_metadata, - band_data, + dim_names_list: self.band_dim_names_list, + dim_names_values: self.band_dim_names_values, + source_shape_list: self.band_source_shape_list, + source_shape_values: self.band_source_shape_values, + nodata_array: self.band_nodata_array, + outdb_uri_array: self.band_outdb_uri_array, + outdb_format_array: self.band_outdb_format_array, + data_array: self.band_data_array, + band_row, + data_type, + view_entries, + visible_shape, + byte_strides, + byte_offset: 0, })) } - fn iter(&self) -> Box + '_> { - Box::new(BandIteratorImpl { - bands: self, - current: 1, // Start at 1 for 1-based band numbering - }) + fn band_data_type(&self, index: usize) -> Option { + if index >= self.num_bands() { + return None; + } + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + let value = 
self.band_datatype_array.value(band_row); + BandDataType::try_from_u32(value) } -} - -/// Concrete implementation of BandIterator trait -pub struct BandIteratorImpl<'a> { - bands: &'a dyn BandsRef, - current: usize, -} -impl<'a> Iterator for BandIteratorImpl<'a> { - type Item = Box; - - fn next(&mut self) -> Option { - // current is 1-based, compare against len() + 1 - if self.current <= self.bands.len() { - let band = self.bands.band(self.current).ok(); // Convert Result to Option - self.current += 1; - band - } else { + fn band_outdb_uri(&self, index: usize) -> Option<&str> { + if index >= self.num_bands() { + return None; + } + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + if self.band_outdb_uri_array.is_null(band_row) { None + } else { + Some(self.band_outdb_uri_array.value(band_row)) } } - fn size_hint(&self) -> (usize, Option) { - // current is 1-based, so remaining calculation needs adjustment - let remaining = self.bands.len().saturating_sub(self.current - 1); - (remaining, Some(remaining)) - } -} - -impl<'a> BandIterator<'a> for BandIteratorImpl<'a> { - fn len(&self) -> usize { - // current is 1-based, so remaining calculation needs adjustment - self.bands.len().saturating_sub(self.current - 1) - } -} - -impl ExactSizeIterator for BandIteratorImpl<'_> {} - -/// Implementation of RasterRef for complete raster access -pub struct RasterRefImpl<'a> { - metadata: MetadataRefImpl<'a>, - crs: &'a StringViewArray, - bands: BandsRefImpl<'a>, -} - -impl<'a> RasterRefImpl<'a> { - /// Creates a new RasterRefImpl that provides zero-copy access to the raster at the specified index. 
- /// - /// # Arguments - /// * `raster_struct_array` - The Arrow StructArray containing raster data - /// * `raster_index` - The zero-based index of the raster to access - #[inline(always)] - pub fn new(raster_struct_array: &RasterStructArray<'a>, raster_index: usize) -> Self { - let metadata = MetadataRefImpl { - width_array: raster_struct_array.width_array, - height_array: raster_struct_array.height_array, - upper_left_x_array: raster_struct_array.upper_left_x_array, - upper_left_y_array: raster_struct_array.upper_left_y_array, - scale_x_array: raster_struct_array.scale_x_array, - scale_y_array: raster_struct_array.scale_y_array, - skew_x_array: raster_struct_array.skew_x_array, - skew_y_array: raster_struct_array.skew_y_array, - index: raster_index, - }; - - let bands = BandsRefImpl { - bands_list: raster_struct_array.bands_list, - raster_index, - nodata_array: raster_struct_array.band_nodata_array, - storage_type_array: raster_struct_array.band_storage_type_array, - datatype_array: raster_struct_array.band_datatype_array, - outdb_url_array: raster_struct_array.band_outdb_url_array, - outdb_band_id_array: raster_struct_array.band_outdb_band_id_array, - band_data_array: raster_struct_array.band_data_array, - }; - - Self { - metadata, - crs: raster_struct_array.crs, - bands, + fn band_outdb_format(&self, index: usize) -> Option<&str> { + if index >= self.num_bands() { + return None; + } + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + if self.band_outdb_format_array.is_null(band_row) { + None + } else { + Some(self.band_outdb_format_array.value(band_row)) } } - pub fn crs_str_ref(&self) -> Option<&'a str> { - if self.crs.is_null(self.bands.raster_index) { + fn band_nodata(&self, index: usize) -> Option<&[u8]> { + if index >= self.num_bands() { + return None; + } + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + if 
self.band_nodata_array.is_null(band_row) { None } else { - Some(self.crs.value(self.bands.raster_index)) + Some(self.band_nodata_array.value(band_row)) } } -} -impl<'a> RasterRef for RasterRefImpl<'a> { - #[inline(always)] - fn metadata(&self) -> &dyn MetadataRef { - &self.metadata + fn band_name(&self, index: usize) -> Option<&str> { + if index >= self.num_bands() { + return None; + } + let start = self.bands_list.value_offsets()[self.raster_index] as usize; + let band_row = start + index; + if self.band_name_array.is_null(band_row) { + None + } else { + Some(self.band_name_array.value(band_row)) + } } - #[inline(always)] fn crs(&self) -> Option<&str> { self.crs_str_ref() } - #[inline(always)] - fn bands(&self) -> &dyn BandsRef { - &self.bands + fn transform(&self) -> &[f64] { + let start = self.transform_list.value_offsets()[self.raster_index] as usize; + let end = self.transform_list.value_offsets()[self.raster_index + 1] as usize; + assert!( + end - start >= 6, + "transform list must have at least 6 elements for raster {}, got {}", + self.raster_index, + end - start + ); + &self.transform_values.values()[start..start + 6] + } + + fn spatial_dims(&self) -> Vec<&str> { + let offsets = self.spatial_dims_list.value_offsets(); + let start = offsets[self.raster_index] as usize; + let end = offsets[self.raster_index + 1] as usize; + (start..end) + .map(|i| self.spatial_dims_values.value(i)) + .collect() + } + + fn spatial_shape(&self) -> &[i64] { + let offsets = self.spatial_shape_list.value_offsets(); + let start = offsets[self.raster_index] as usize; + let end = offsets[self.raster_index + 1] as usize; + &self.spatial_shape_values.values()[start..end] } } -/// Access rasters from the Arrow StructArray +/// Access rasters from the Arrow StructArray. /// -/// This provides efficient, zero-copy access to raster data stored in Arrow format. +/// Provides efficient, zero-copy access to N-D raster data stored in Arrow format. 
pub struct RasterStructArray<'a> { raster_array: &'a StructArray, - width_array: &'a UInt64Array, - height_array: &'a UInt64Array, - upper_left_x_array: &'a Float64Array, - upper_left_y_array: &'a Float64Array, - scale_x_array: &'a Float64Array, - scale_y_array: &'a Float64Array, - skew_x_array: &'a Float64Array, - skew_y_array: &'a Float64Array, - crs: &'a StringViewArray, + // Top-level fields + crs_array: &'a StringViewArray, + transform_list: &'a ListArray, + transform_values: &'a Float64Array, + spatial_dims_list: &'a ListArray, + spatial_dims_values: &'a StringViewArray, + spatial_shape_list: &'a ListArray, + spatial_shape_values: &'a Int64Array, bands_list: &'a ListArray, - band_nodata_array: &'a BinaryArray, - band_storage_type_array: &'a UInt32Array, + // Band-level fields (flattened across all bands in all rasters) + band_name_array: &'a StringArray, + band_dim_names_list: &'a ListArray, + band_dim_names_values: &'a StringArray, + band_source_shape_list: &'a ListArray, + band_source_shape_values: &'a UInt64Array, band_datatype_array: &'a UInt32Array, - band_outdb_url_array: &'a StringArray, - band_outdb_band_id_array: &'a UInt32Array, + band_nodata_array: &'a BinaryArray, + band_view_list: &'a ListArray, + band_outdb_uri_array: &'a StringArray, + band_outdb_format_array: &'a StringViewArray, band_data_array: &'a BinaryViewArray, } impl<'a> RasterStructArray<'a> { - /// Create a new RasterStructArray from an existing StructArray + /// Create a new RasterStructArray from an existing StructArray. 
#[inline] pub fn new(raster_array: &'a StructArray) -> Self { - let crs = raster_array + // Top-level fields + let crs_array = raster_array .column(raster_indices::CRS) .as_any() .downcast_ref::() .unwrap(); - - // Extract the metadata arrays for direct access - let metadata_struct = raster_array - .column(raster_indices::METADATA) + let transform_list = raster_array + .column(raster_indices::TRANSFORM) .as_any() - .downcast_ref::() - .unwrap(); - let width_array = metadata_struct - .column(metadata_indices::WIDTH) - .as_any() - .downcast_ref::() - .unwrap(); - let height_array = metadata_struct - .column(metadata_indices::HEIGHT) - .as_any() - .downcast_ref::() - .unwrap(); - let upper_left_x_array = metadata_struct - .column(metadata_indices::UPPERLEFT_X) - .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let upper_left_y_array = metadata_struct - .column(metadata_indices::UPPERLEFT_Y) + let transform_values = transform_list + .values() .as_any() .downcast_ref::() .unwrap(); - let scale_x_array = metadata_struct - .column(metadata_indices::SCALE_X) + let spatial_dims_list = raster_array + .column(raster_indices::SPATIAL_DIMS) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let scale_y_array = metadata_struct - .column(metadata_indices::SCALE_Y) + let spatial_dims_values = spatial_dims_list + .values() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let skew_x_array = metadata_struct - .column(metadata_indices::SKEW_X) + let spatial_shape_list = raster_array + .column(raster_indices::SPATIAL_SHAPE) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let skew_y_array = metadata_struct - .column(metadata_indices::SKEW_Y) + let spatial_shape_values = spatial_shape_list + .values() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - // Extract the band arrays for direct access + // Bands list and nested struct let bands_list = raster_array .column(raster_indices::BANDS) .as_any() @@ -464,35 +467,57 @@ 
impl<'a> RasterStructArray<'a> { .as_any() .downcast_ref::() .unwrap(); - let band_metadata_struct = bands_struct - .column(band_indices::METADATA) + + // Band-level fields + let band_name_array = bands_struct + .column(band_indices::NAME) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let band_nodata_array = band_metadata_struct - .column(band_metadata_indices::NODATAVALUE) + let band_dim_names_list = bands_struct + .column(band_indices::DIM_NAMES) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let band_storage_type_array = band_metadata_struct - .column(band_metadata_indices::STORAGE_TYPE) + let band_dim_names_values = band_dim_names_list + .values() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); - let band_datatype_array = band_metadata_struct - .column(band_metadata_indices::DATATYPE) + let band_source_shape_list = bands_struct + .column(band_indices::SOURCE_SHAPE) + .as_any() + .downcast_ref::() + .unwrap(); + let band_source_shape_values = band_source_shape_list + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let band_datatype_array = bands_struct + .column(band_indices::DATA_TYPE) .as_any() .downcast_ref::() .unwrap(); - let band_outdb_url_array = band_metadata_struct - .column(band_metadata_indices::OUTDB_URL) + let band_nodata_array = bands_struct + .column(band_indices::NODATA) + .as_any() + .downcast_ref::() + .unwrap(); + let band_view_list = bands_struct + .column(band_indices::VIEW) + .as_any() + .downcast_ref::() + .unwrap(); + let band_outdb_uri_array = bands_struct + .column(band_indices::OUTDB_URI) .as_any() .downcast_ref::() .unwrap(); - let band_outdb_band_id_array = band_metadata_struct - .column(band_metadata_indices::OUTDB_BAND_ID) + let band_outdb_format_array = bands_struct + .column(band_indices::OUTDB_FORMAT) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let band_data_array = bands_struct .column(band_indices::DATA) @@ -502,38 +527,41 @@ impl<'a> RasterStructArray<'a> 
{ Self { raster_array, - width_array, - height_array, - upper_left_x_array, - upper_left_y_array, - scale_x_array, - scale_y_array, - skew_x_array, - skew_y_array, - crs, + crs_array, + transform_list, + transform_values, + spatial_dims_list, + spatial_dims_values, + spatial_shape_list, + spatial_shape_values, bands_list, - band_nodata_array, - band_storage_type_array, + band_name_array, + band_dim_names_list, + band_dim_names_values, + band_source_shape_list, + band_source_shape_values, band_datatype_array, - band_outdb_url_array, - band_outdb_band_id_array, + band_nodata_array, + band_view_list, + band_outdb_uri_array, + band_outdb_format_array, band_data_array, } } - /// Get the total number of rasters in the array + /// Get the total number of rasters in the array. #[inline(always)] pub fn len(&self) -> usize { self.raster_array.len() } - /// Check if the array is empty + /// Check if the array is empty. #[inline(always)] pub fn is_empty(&self) -> bool { self.raster_array.is_empty() } - /// Get a specific raster by index without consuming the iterator + /// Get a specific raster by index. 
#[inline(always)] pub fn get(&self, index: usize) -> Result, ArrowError> { if index >= self.raster_array.len() { @@ -541,10 +569,31 @@ impl<'a> RasterStructArray<'a> { "Invalid raster index: {index}" ))); } - - Ok(RasterRefImpl::new(self, index)) + Ok(RasterRefImpl { + crs_array: self.crs_array, + transform_list: self.transform_list, + transform_values: self.transform_values, + spatial_dims_list: self.spatial_dims_list, + spatial_dims_values: self.spatial_dims_values, + spatial_shape_list: self.spatial_shape_list, + spatial_shape_values: self.spatial_shape_values, + bands_list: self.bands_list, + band_name_array: self.band_name_array, + band_dim_names_list: self.band_dim_names_list, + band_dim_names_values: self.band_dim_names_values, + band_source_shape_list: self.band_source_shape_list, + band_source_shape_values: self.band_source_shape_values, + band_datatype_array: self.band_datatype_array, + band_nodata_array: self.band_nodata_array, + band_view_list: self.band_view_list, + band_outdb_uri_array: self.band_outdb_uri_array, + band_outdb_format_array: self.band_outdb_format_array, + band_data_array: self.band_data_array, + raster_index: index, + }) } + /// Check if a raster at the given index is null. 
#[inline(always)] pub fn is_null(&self, index: usize) -> bool { self.raster_array.is_null(index) @@ -556,9 +605,14 @@ mod tests { use super::*; use crate::builder::RasterBuilder; use crate::traits::{BandMetadata, RasterMetadata}; - use arrow_schema::DataType; - use sedona_schema::raster::{BandDataType, StorageType}; + use arrow_array::{ArrayRef, ListArray, StructArray, UInt32Array, UInt64Array}; + use arrow_buffer::{OffsetBuffer, ScalarBuffer}; + use arrow_schema::{DataType, Fields}; + use sedona_schema::raster::{ + band_indices, raster_indices, BandDataType, RasterSchema, StorageType, + }; use sedona_testing::rasters::generate_test_rasters; + use std::sync::Arc; #[test] fn test_array_basic_functionality() { @@ -691,6 +745,7 @@ mod tests { .iter() .enumerate() .map(|(i, band)| { + let band = band.unwrap(); assert_eq!(band.data()[0], i as u8); band.data()[0] }) @@ -708,44 +763,34 @@ mod tests { assert!(rasters.is_null(1)); } - /// Test that `data_type()` and `storage_type()` return `Err` for invalid values - /// instead of panicking. - #[test] - fn test_invalid_band_metadata_returns_err() { - use arrow_buffer::{OffsetBuffer, ScalarBuffer}; - use sedona_schema::raster::RasterSchema; - use std::sync::Arc; - - // Build a valid single-band raster first + /// Build a single-raster, single-band raster StructArray with the + /// canonical identity view. Used as the baseline input to the surgery + /// helpers below; callers replace one band-level column to simulate + /// schema corruption on non-view fields. 
+ fn build_identity_raster() -> StructArray { let mut builder = RasterBuilder::new(1); - let metadata = RasterMetadata { - width: 2, - height: 2, - upperleft_x: 0.0, - upperleft_y: 0.0, - scale_x: 1.0, - scale_y: -1.0, - skew_x: 0.0, - skew_y: 0.0, - }; - builder.start_raster(&metadata, None).unwrap(); - let band_meta = BandMetadata { - nodata_value: None, - storage_type: StorageType::InDb, - datatype: BandDataType::UInt8, - outdb_url: None, - outdb_band_id: None, - }; - builder.start_band(band_meta).unwrap(); - builder.band_data_writer().append_value([1u8; 4]); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x"], &[3], None) + .unwrap(); + builder + .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8, 1, 2]); builder.finish_band().unwrap(); builder.finish_raster().unwrap(); - let valid_array = builder.finish().unwrap(); + builder.finish().unwrap() + } - // Extract original columns from the valid raster - let metadata_col = valid_array.column(raster_indices::METADATA).clone(); - let crs_col = valid_array.column(raster_indices::CRS).clone(); - let bands_list = valid_array + /// Replace a single column of the bands struct, then rebuild the bands + /// list and the top-level raster struct. Schema-shape preserving — this + /// only swaps the array data, never the field type. 
+ fn replace_band_column( + array: &StructArray, + column_index: usize, + new_column: ArrayRef, + ) -> StructArray { + let bands_list = array .column(raster_indices::BANDS) .as_any() .downcast_ref::() @@ -755,75 +800,323 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let orig_band_meta_struct = bands_struct - .column(band_indices::METADATA) - .as_any() - .downcast_ref::() - .unwrap(); - let band_data_col = bands_struct.column(band_indices::DATA).clone(); - // Build tampered band metadata with invalid storage_type=99 and datatype=99 - let DataType::Struct(band_metadata_fields) = RasterSchema::band_metadata_type() else { - panic!("Expected struct type for band metadata"); + let mut columns: Vec = bands_struct.columns().to_vec(); + columns[column_index] = new_column; + let DataType::Struct(band_fields) = RasterSchema::band_type() else { + unreachable!("band_type must be Struct") }; - let tampered_band_metadata = StructArray::new( - band_metadata_fields, - vec![ - orig_band_meta_struct - .column(band_metadata_indices::NODATAVALUE) - .clone(), - Arc::new(UInt32Array::from(vec![99u32])), // invalid storage_type - Arc::new(UInt32Array::from(vec![99u32])), // invalid datatype - orig_band_meta_struct - .column(band_metadata_indices::OUTDB_URL) - .clone(), - orig_band_meta_struct - .column(band_metadata_indices::OUTDB_BAND_ID) - .clone(), - ], - None, - ); + let new_bands_struct = + StructArray::new(band_fields, columns, bands_struct.nulls().cloned()); - // Rebuild band struct - let DataType::Struct(band_fields) = RasterSchema::band_type() else { - panic!("Expected struct type for band"); + let DataType::List(bands_field) = RasterSchema::bands_type() else { + unreachable!("bands_type must be List") }; - let tampered_band_struct = StructArray::new( - band_fields, - vec![Arc::new(tampered_band_metadata), band_data_col], - None, + let new_bands_list = ListArray::new( + bands_field, + bands_list.offsets().clone(), + Arc::new(new_bands_struct), + 
bands_list.nulls().cloned(), ); - // Rebuild bands list - let DataType::List(band_field) = RasterSchema::bands_type() else { - panic!("Expected list type for bands"); + let mut top_columns: Vec = array.columns().to_vec(); + top_columns[raster_indices::BANDS] = Arc::new(new_bands_list); + let raster_fields = RasterSchema::fields(); + StructArray::new( + Fields::from(raster_fields.to_vec()), + top_columns, + array.nulls().cloned(), + ) + } + + // bad data_type discriminant + + #[test] + fn band_and_band_data_type_surface_corruption_for_unknown_discriminant() { + let array = build_identity_raster(); + let bad_dtype: ArrayRef = Arc::new(UInt32Array::from(vec![0xFFu32])); + let mutated = replace_band_column(&array, band_indices::DATA_TYPE, bad_dtype); + let rasters = RasterStructArray::new(&mutated); + let r = rasters.get(0).unwrap(); + // band() surfaces the corruption through the standardized + // SedonaDB-internal-error message routed via ArrowError::ExternalError. + // `Box` isn't `Debug`, so unwrap_err doesn't compile — + // pull the error out via `.err().unwrap()` on the `Option` side. + let err = r.band(0).err().unwrap(); + assert!(err.to_string().contains("SedonaDB internal error")); + assert!(err.to_string().contains("data_type discriminant")); + // band_data_type retains its `Option` fast-path shape — corrupt + // discriminant collapses to None for consistency with the existing + // accessor's contract. + assert!(r.band_data_type(0).is_none()); + } + + // empty source_shape + + #[test] + fn band_surfaces_internal_error_when_source_shape_is_empty() { + let array = build_identity_raster(); + // Replace source_shape with a single empty list row. 
+ let DataType::List(ss_field) = RasterSchema::source_shape_type() else { + unreachable!() }; - let tampered_bands_list = ListArray::new( - band_field, - OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 1])), - Arc::new(tampered_band_struct), + let empty_source_shape = ListArray::new( + ss_field, + OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 0])), + Arc::new(UInt64Array::from(Vec::::new())), None, ); + let mutated = replace_band_column( + &array, + band_indices::SOURCE_SHAPE, + Arc::new(empty_source_shape), + ); + let rasters = RasterStructArray::new(&mutated); + let err = rasters.get(0).unwrap().band(0).err().unwrap(); + assert!(err.to_string().contains("SedonaDB internal error")); + assert!(err.to_string().contains("empty source_shape")); + } - // Rebuild the top-level raster struct - let tampered_raster = StructArray::new( - RasterSchema::fields(), - vec![metadata_col, crs_col, Arc::new(tampered_bands_list)], - None, + // direct fast-path tests + + #[test] + fn raster_ref_fast_paths_return_expected_values() { + // Single 2-band raster: band 0 has explicit values for nodata, + // outdb_uri, outdb_format; band 1 has all-nullable fields null. 
+ let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + builder + .start_band_nd( + Some("a"), + &["y", "x"], + &[2, 3], + BandDataType::UInt16, + Some(&[0xFFu8, 0xFE]), + Some("s3://bucket/a.tif"), + Some("GTiff"), + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 12]); + builder.finish_band().unwrap(); + builder + .start_band_nd( + Some("b"), + &["y", "x"], + &[2, 3], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 24]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + // Bounds: out-of-range indices yield None on every fast path. + assert!(r.band_data_type(2).is_none()); + assert!(r.band_outdb_uri(2).is_none()); + assert!(r.band_outdb_format(2).is_none()); + assert!(r.band_nodata(2).is_none()); + + // Band 0 — non-null values. + assert_eq!(r.band_data_type(0), Some(BandDataType::UInt16)); + assert_eq!(r.band_outdb_uri(0), Some("s3://bucket/a.tif")); + assert_eq!(r.band_outdb_format(0), Some("GTiff")); + assert_eq!(r.band_nodata(0), Some(&[0xFFu8, 0xFE][..])); + + // Band 1 — null fields. + assert_eq!(r.band_data_type(1), Some(BandDataType::Float32)); + assert!(r.band_outdb_uri(1).is_none()); + assert!(r.band_outdb_format(1).is_none()); + assert!(r.band_nodata(1).is_none()); + + // Cross-check against the BandRef slow path. + let band0 = r.band(0).unwrap(); + assert_eq!(band0.data_type(), BandDataType::UInt16); + assert_eq!(band0.outdb_uri(), Some("s3://bucket/a.tif")); + assert_eq!(band0.outdb_format(), Some("GTiff")); + assert_eq!(band0.nodata(), Some(&[0xFFu8, 0xFE][..])); + + // bands() view: 1-based band(N), len, is_empty, iter — same shape as + // pre-N-D callers expect. 
Exercise via the concrete type and via a + // `&dyn RasterRef` to confirm both dispatch paths work. + let bands = r.bands(); + assert_eq!(bands.len(), 2); + assert!(!bands.is_empty()); + assert_eq!(bands.band(1).unwrap().data_type(), BandDataType::UInt16); + assert_eq!(bands.band(2).unwrap().data_type(), BandDataType::Float32); + assert!(bands.band(0).is_err()); // 0 is invalid (1-based) + assert!(bands.band(3).is_err()); // out of range + assert_eq!(bands.iter().count(), 2); + let dyn_r: &dyn RasterRef = &r; + assert_eq!(dyn_r.bands().len(), 2); + + // metadata() shim: concrete RasterMetadata/BandMetadata values. + let m = r.metadata(); + assert_eq!(m.width(), 3); + assert_eq!(m.height(), 2); + assert_eq!(m.upper_left_x(), 0.0); + assert_eq!(m.scale_x(), 1.0); + let b0 = r.band(0).unwrap(); + let bm0 = b0.metadata(); + assert_eq!(bm0.data_type().unwrap(), BandDataType::UInt16); + assert_eq!( + bm0.storage_type().unwrap(), + sedona_schema::raster::StorageType::InDb ); + assert_eq!(bm0.nodata_value(), Some(&[0xFFu8, 0xFE][..])); + // Band 0 is InDb (has bytes), so outdb_* are hidden via the shim + // even though the row carries an outdb_uri hint. + assert!(bm0.outdb_url().is_none()); + assert!(bm0.outdb_band_id().is_none()); + } - // Read back and verify that data_type() and storage_type() return Err - let rasters = RasterStructArray::new(&tampered_raster); - let raster = rasters.get(0).unwrap(); - let band = raster.bands().band(1).unwrap(); - let band_meta = band.metadata(); + // multi-band, multi-raster identity - let storage_err = band_meta.storage_type().unwrap_err(); - assert!(storage_err.to_string().contains("Unknown storage type: 99")); + #[test] + fn multi_raster_identity_views() { + // Two rasters with multiple identity bands each. Exercises the + // `bands_list.value_offsets()` routing for every per-band lookup — + // a naive reader that forgets to add the per-raster offset would + // hand back data from the wrong band. 
+ let mut builder = RasterBuilder::new(2); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + + // Raster 0: three identity bands. + builder + .start_raster_nd(&transform, &["x"], &[3], None) + .unwrap(); + builder + .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![10u8, 20, 30]); + builder.finish_band().unwrap(); + builder + .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![40u8, 50, 60]); + builder.finish_band().unwrap(); + builder + .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![100u8, 101, 102]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + // Raster 1: two identity bands of a different shape. + builder + .start_raster_nd(&transform, &["x"], &[4], None) + .unwrap(); + builder + .start_band_nd(None, &["x"], &[4], BandDataType::UInt8, None, None, None) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![42u8, 43, 44, 45]); + builder.finish_band().unwrap(); + builder + .start_band_nd(None, &["x"], &[4], BandDataType::UInt8, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![1u8, 2, 3, 4]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); - let data_type_err = band_meta.data_type().unwrap_err(); - assert!(data_type_err - .to_string() - .contains("Unknown band data type: 99")); + let r0 = rasters.get(0).unwrap(); + assert_eq!(r0.num_bands(), 3); + assert_eq!(r0.band(0).unwrap().shape(), &[3]); + assert_eq!( + &*r0.band(0).unwrap().contiguous_data().unwrap(), + &[10u8, 20, 30] + ); + assert_eq!(r0.band(1).unwrap().shape(), &[3]); + assert_eq!( + &*r0.band(1).unwrap().contiguous_data().unwrap(), + &[40u8, 50, 60] + ); + 
assert_eq!(r0.band(2).unwrap().shape(), &[3]); + assert_eq!( + &*r0.band(2).unwrap().contiguous_data().unwrap(), + &[100u8, 101, 102] + ); + + let r1 = rasters.get(1).unwrap(); + assert_eq!(r1.num_bands(), 2); + assert_eq!(r1.band(0).unwrap().shape(), &[4]); + assert_eq!( + &*r1.band(0).unwrap().contiguous_data().unwrap(), + &[42u8, 43, 44, 45] + ); + assert_eq!(r1.band(1).unwrap().shape(), &[4]); + assert_eq!( + &*r1.band(1).unwrap().contiguous_data().unwrap(), + &[1u8, 2, 3, 4] + ); + + // Fast paths must honour the same offsets. + assert_eq!(r0.band_data_type(1), Some(BandDataType::UInt8)); + assert_eq!(r1.band_data_type(0), Some(BandDataType::UInt8)); + assert_eq!(r1.band_data_type(1), Some(BandDataType::UInt8)); + } + + // null raster row, fast path + + #[test] + fn null_raster_row_fast_paths_return_none_after_non_null() { + // A non-null raster precedes the null one, so the underlying flat + // band arrays are non-empty. A naive fast path that forgets the + // bands_list.value_offsets() routing would return *raster 0's* + // band 0 metadata when asked for raster 1's band 0 — a real bug + // that a single-null-raster fixture cannot detect. + let mut builder = RasterBuilder::new(2); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x"], &[3], None) + .unwrap(); + builder + .start_band_nd( + Some("a"), + &["x"], + &[3], + BandDataType::UInt16, + Some(&[0xFFu8, 0xFE]), + Some("s3://bucket/a.tif"), + Some("GTiff"), + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 6]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + builder.append_null().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + + // Sanity: raster 0 still resolves correctly. 
+ let r0 = rasters.get(0).unwrap(); + assert_eq!(r0.band_data_type(0), Some(BandDataType::UInt16)); + assert_eq!(r0.band_outdb_uri(0), Some("s3://bucket/a.tif")); + + // Raster 1 is null with zero bands. Every per-band lookup is + // out of range — `band()` surfaces an out-of-range error, + // the fast-path accessors return None. + assert!(rasters.is_null(1)); + let r1 = rasters.get(1).unwrap(); + assert_eq!(r1.num_bands(), 0); + assert!(r1.band(0).is_err()); + assert!(r1.band_data_type(0).is_none()); + assert!(r1.band_outdb_uri(0).is_none()); + assert!(r1.band_outdb_format(0).is_none()); + assert!(r1.band_nodata(0).is_none()); } } diff --git a/rust/sedona-raster/src/builder.rs b/rust/sedona-raster/src/builder.rs index 3db236cb4..31d018de4 100644 --- a/rust/sedona-raster/src/builder.rs +++ b/rust/sedona-raster/src/builder.rs @@ -17,16 +17,16 @@ use arrow_array::{ builder::{ - BinaryBuilder, BinaryViewBuilder, BooleanBuilder, Float64Builder, StringBuilder, - StringViewBuilder, UInt32Builder, UInt64Builder, + ArrayBuilder, BinaryBuilder, BinaryViewBuilder, BooleanBuilder, Float64Builder, + Int64Builder, StringBuilder, StringViewBuilder, UInt32Builder, UInt64Builder, }, Array, ArrayRef, ListArray, StructArray, }; -use arrow_buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; -use sedona_schema::raster::RasterSchema; +use sedona_schema::raster::{BandDataType, RasterSchema}; use crate::traits::{BandMetadata, MetadataRef}; @@ -75,267 +75,598 @@ use crate::traits::{BandMetadata, MetadataRef}; /// // Finish the raster /// builder.finish_raster().unwrap(); /// -/// // Finish building and get the StructArray +/// // Get the final StructArray /// let raster_array = builder.finish().unwrap(); /// ``` pub struct RasterBuilder { - // Metadata fields - width: UInt64Builder, - height: UInt64Builder, - upper_left_x: Float64Builder, - upper_left_y: Float64Builder, - scale_x: 
Float64Builder, - scale_y: Float64Builder, - skew_x: Float64Builder, - skew_y: Float64Builder, - - // CRS field + // Top-level raster fields crs: StringViewBuilder, - - // Band metadata fields - band_nodata: BinaryBuilder, - band_storage_type: UInt32Builder, + transform_values: Float64Builder, + transform_offsets: Vec, + spatial_dims_values: StringViewBuilder, + spatial_dims_offsets: Vec, + spatial_shape_values: Int64Builder, + spatial_shape_offsets: Vec, + + // Band fields (flattened across all bands) + band_name: StringBuilder, + band_dim_names_values: StringBuilder, + band_dim_names_offsets: Vec, + band_shape_values: UInt64Builder, + band_shape_offsets: Vec, band_datatype: UInt32Builder, - band_outdb_url: StringBuilder, - band_outdb_band_id: UInt32Builder, - - // Band data field + band_nodata: BinaryBuilder, + // VIEW field — one entry per visible dimension per band. Stored as four + // parallel Int64 columns + a List offset vector; assembled into a + // `ListArray>` in `finish()`. + band_view_source_axis_values: Int64Builder, + band_view_start_values: Int64Builder, + band_view_step_values: Int64Builder, + band_view_steps_values: Int64Builder, + band_view_offsets: Vec, + // Per-band validity for the view list. `false` means the row is null — + // the canonical representation of an identity view. `true` means the row + // carries an explicit view in the four parallel value builders. 
+ band_view_validity: Vec, + band_outdb_uri: StringBuilder, + band_outdb_format: StringViewBuilder, band_data: BinaryViewBuilder, // List structure tracking band_offsets: Vec, // Track where each raster's bands start/end current_band_count: i32, // Track bands in current raster - raster_validity: BooleanBuilder, // Track which rasters are null + // Current raster state (needed for start_band_2d) + current_width: u64, + current_height: u64, + + // Per-raster validation state: spatial dims/shape and recorded bands so + // finish_raster can check every band matches the top-level spatial grid. + current_spatial_dims: Vec, + current_spatial_shape: Vec, + current_raster_bands: Vec<(Vec, Vec)>, + + // Track band_data count at the start of each band for finish_band validation + band_data_count_at_start: usize, + + raster_validity: BooleanBuilder, } impl RasterBuilder { - /// Create a new raster builder with the specified capacity + /// Create a new raster builder with the specified capacity. pub fn new(capacity: usize) -> Self { Self { - // Metadata builders - width: UInt64Builder::with_capacity(capacity), - height: UInt64Builder::with_capacity(capacity), - upper_left_x: Float64Builder::with_capacity(capacity), - upper_left_y: Float64Builder::with_capacity(capacity), - scale_x: Float64Builder::with_capacity(capacity), - scale_y: Float64Builder::with_capacity(capacity), - skew_x: Float64Builder::with_capacity(capacity), - skew_y: Float64Builder::with_capacity(capacity), - - // CRS builder crs: StringViewBuilder::with_capacity(capacity), - - // Band builders - estimate some bands per raster - // The capacity is at raster level, but each raster has multiple bands and - // are large. 
We may want to add an optional parameter to control expected - // bands per raster or even band size in the future - band_nodata: BinaryBuilder::with_capacity(capacity, capacity), - band_storage_type: UInt32Builder::with_capacity(capacity), + transform_values: Float64Builder::with_capacity(capacity * 6), + transform_offsets: vec![0], + spatial_dims_values: StringViewBuilder::with_capacity(capacity * 2), + spatial_dims_offsets: vec![0], + spatial_shape_values: Int64Builder::with_capacity(capacity * 2), + spatial_shape_offsets: vec![0], + + band_name: StringBuilder::with_capacity(capacity, capacity), + band_dim_names_values: StringBuilder::with_capacity(capacity * 2, capacity * 4), + band_dim_names_offsets: vec![0], + band_shape_values: UInt64Builder::with_capacity(capacity * 2), + band_shape_offsets: vec![0], band_datatype: UInt32Builder::with_capacity(capacity), - band_outdb_url: StringBuilder::with_capacity(capacity, capacity), - band_outdb_band_id: UInt32Builder::with_capacity(capacity), + band_nodata: BinaryBuilder::with_capacity(capacity, capacity), + band_view_source_axis_values: Int64Builder::with_capacity(capacity * 2), + band_view_start_values: Int64Builder::with_capacity(capacity * 2), + band_view_step_values: Int64Builder::with_capacity(capacity * 2), + band_view_steps_values: Int64Builder::with_capacity(capacity * 2), + band_view_offsets: vec![0], + band_view_validity: Vec::with_capacity(capacity), + band_outdb_uri: StringBuilder::with_capacity(capacity, capacity), + band_outdb_format: StringViewBuilder::with_capacity(capacity), band_data: BinaryViewBuilder::with_capacity(capacity), - // List tracking band_offsets: vec![0], current_band_count: 0, + current_width: 0, + current_height: 0, + + current_spatial_dims: Vec::new(), + current_spatial_shape: Vec::new(), + current_raster_bands: Vec::new(), + + band_data_count_at_start: 0, - // Raster-level validity (keeps track of null rasters) raster_validity: BooleanBuilder::with_capacity(capacity), } } - /// 
Start a new raster with metadata and optional CRS - pub fn start_raster( + /// Start a new raster with explicit N-D parameters. + /// + /// `transform` must be a 6-element GDAL GeoTransform: + /// `[origin_x, scale_x, skew_x, origin_y, skew_y, scale_y]` + /// + /// `spatial_dims` names the raster-level spatial dimensions (today always + /// length 2, e.g. `["x","y"]`). `spatial_shape` gives their sizes in the + /// same order. Every band added to this raster must contain each name in + /// `spatial_dims` within its own `dim_names`, with matching size. + pub fn start_raster_nd( &mut self, - metadata: &dyn MetadataRef, + transform: &[f64; 6], + spatial_dims: &[&str], + spatial_shape: &[i64], crs: Option<&str>, ) -> Result<(), ArrowError> { - self.append_metadata_from_ref(metadata)?; - self.append_crs(crs)?; + if spatial_dims.len() != spatial_shape.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "spatial_dims.len() ({}) must equal spatial_shape.len() ({})", + spatial_dims.len(), + spatial_shape.len() + ))); + } + + // Transform + for &v in transform { + self.transform_values.append_value(v); + } + let next = *self.transform_offsets.last().unwrap() + 6; + self.transform_offsets.push(next); + + // Spatial dims + shape + for d in spatial_dims { + self.spatial_dims_values.append_value(d); + } + let next = *self.spatial_dims_offsets.last().unwrap() + spatial_dims.len() as i32; + self.spatial_dims_offsets.push(next); + + for &s in spatial_shape { + self.spatial_shape_values.append_value(s); + } + let next = *self.spatial_shape_offsets.last().unwrap() + spatial_shape.len() as i32; + self.spatial_shape_offsets.push(next); + + // CRS + match crs { + Some(crs_data) => self.crs.append_value(crs_data), + None => self.crs.append_null(), + } - // Reset band count for this raster self.current_band_count = 0; + self.current_spatial_dims = spatial_dims.iter().map(|s| s.to_string()).collect(); + self.current_spatial_shape = spatial_shape.to_vec(); + 
self.current_raster_bands.clear(); + // Reset legacy current_width/current_height read by start_band_2d (set + // by start_raster_2d after this call returns). Callers using this direct entry point drive + // their own shapes via start_band_nd. + self.current_width = 0; + self.current_height = 0; Ok(()) } - /// Start a new band - this must be called before writing band data - pub fn start_band(&mut self, band_metadata: BandMetadata) -> Result<(), ArrowError> { - // Append band metadata - match band_metadata.nodata_value { - Some(nodata) => self.band_nodata.append_value(&nodata), + /// Convenience: start a 2-D raster with positional geotransform parameters. + /// Sets `spatial_dims=["x","y"]` and `spatial_shape=[width, height]` and + /// builds the 6-element GDAL transform internally. The N-D entry point is + /// [`Self::start_raster_nd`]; the metadata-taking entry is + /// [`Self::start_raster`]. + #[allow(clippy::too_many_arguments)] + pub fn start_raster_2d( + &mut self, + width: u64, + height: u64, + origin_x: f64, + origin_y: f64, + scale_x: f64, + scale_y: f64, + skew_x: f64, + skew_y: f64, + crs: Option<&str>, + ) -> Result<(), ArrowError> { + let transform = [origin_x, scale_x, skew_x, origin_y, skew_y, scale_y]; + self.start_raster_nd(&transform, &["x", "y"], &[width as i64, height as i64], crs)?; + self.current_width = width; + self.current_height = height; + Ok(()) + } + + /// Start a 2-D raster from a `&dyn MetadataRef`. Matches the pre-N-D + /// signature so callers from before the refactor keep compiling without + /// changing argument lists. + pub fn start_raster( + &mut self, + metadata: &dyn MetadataRef, + crs: Option<&str>, + ) -> Result<(), ArrowError> { + self.start_raster_2d( + metadata.width(), + metadata.height(), + metadata.upper_left_x(), + metadata.upper_left_y(), + metadata.scale_x(), + metadata.scale_y(), + metadata.skew_x(), + metadata.skew_y(), + crs, + ) + } + + /// Start a new band with explicit N-D parameters.
+ /// + /// `outdb_uri` is the *location* of the external resource (scheme is + /// resolved by an `ObjectStoreRegistry`). `outdb_format` is the *format* + /// used to interpret the bytes at that location (e.g. `"geotiff"`, + /// `"zarr"`). A null `outdb_format` means the band is in-memory — the + /// band's `data` buffer is authoritative. + #[allow(clippy::too_many_arguments)] + pub fn start_band_nd( + &mut self, + name: Option<&str>, + dim_names: &[&str], + shape: &[u64], + data_type: BandDataType, + nodata: Option<&[u8]>, + outdb_uri: Option<&str>, + outdb_format: Option<&str>, + ) -> Result<(), ArrowError> { + if dim_names.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "start_band_nd: 0-dimensional bands are not supported".into(), + )); + } + if dim_names.len() != shape.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "start_band_nd: dim_names ({}) and shape ({}) must have the same length", + dim_names.len(), + shape.len(), + ))); + } + // Name + match name { + Some(n) => self.band_name.append_value(n), + None => self.band_name.append_null(), + } + + // Dim names + for dn in dim_names { + self.band_dim_names_values.append_value(dn); + } + let next = *self.band_dim_names_offsets.last().unwrap() + dim_names.len() as i32; + self.band_dim_names_offsets.push(next); + + // Shape + for &s in shape { + self.band_shape_values.append_value(s); + } + let next = *self.band_shape_offsets.last().unwrap() + shape.len() as i32; + self.band_shape_offsets.push(next); + + // Data type + self.band_datatype.append_value(data_type as u32); + + // Nodata + match nodata { + Some(nodata_bytes) => self.band_nodata.append_value(nodata_bytes), None => self.band_nodata.append_null(), } - self.band_storage_type - .append_value(band_metadata.storage_type as u32); - self.band_datatype - .append_value(band_metadata.datatype as u32); + // VIEW: canonical identity is encoded as a null list entry — no + // values appended, offset unchanged, validity bit cleared. 
+ let next = *self.band_view_offsets.last().unwrap(); + self.band_view_offsets.push(next); + self.band_view_validity.push(false); - match band_metadata.outdb_url { - Some(url) => self.band_outdb_url.append_value(&url), - None => self.band_outdb_url.append_null(), + // OutDb URI + match outdb_uri { + Some(uri) => self.band_outdb_uri.append_value(uri), + None => self.band_outdb_uri.append_null(), } - match band_metadata.outdb_band_id { - Some(band_id) => self.band_outdb_band_id.append_value(band_id), - None => self.band_outdb_band_id.append_null(), + // OutDb format + match outdb_format { + Some(format) => self.band_outdb_format.append_value(format), + None => self.band_outdb_format.append_null(), } self.current_band_count += 1; + self.band_data_count_at_start = self.band_data.len(); + + // Record this band's dims/shape for strict validation at finish_raster. + self.current_raster_bands.push(( + dim_names.iter().map(|s| s.to_string()).collect(), + shape.to_vec(), + )); Ok(()) } - /// Get direct access to the BinaryViewBuilder for writing the current band's data - /// Must be called after start_band() to write data to the current band + /// Convenience: start a 2D band with `dim_names=["y","x"]` and `shape=[height, width]`. + /// + /// Must be called after `start_raster` / `start_raster_2d` which sets + /// the current width/height. + pub fn start_band_2d( + &mut self, + data_type: BandDataType, + nodata: Option<&[u8]>, + ) -> Result<(), ArrowError> { + if self.current_width == 0 && self.current_height == 0 { + return Err(ArrowError::InvalidArgumentError( + "start_band_2d requires prior start_raster_2d (width and height are 0)".into(), + )); + } + self.start_band_nd( + None, + &["y", "x"], + &[self.current_height, self.current_width], + data_type, + nodata, + None, + None, + ) + } + + /// Start a 2-D band from a concrete [`BandMetadata`] struct. Matches + /// the pre-N-D signature so callers from before the refactor keep + /// compiling.
For OutDb bands the `outdb_url` + `outdb_band_id` are + /// recombined into the SedonaDB `#band=N` URI convention. + pub fn start_band(&mut self, metadata: BandMetadata) -> Result<(), ArrowError> { + if self.current_width == 0 && self.current_height == 0 { + return Err(ArrowError::InvalidArgumentError( + "start_band requires prior start_raster / start_raster_2d (width and height are 0)" + .into(), + )); + } + let outdb_uri = match (metadata.outdb_url.as_deref(), metadata.outdb_band_id) { + (Some(url), Some(band_id)) => Some(format!("{url}#band={band_id}")), + (Some(url), None) => Some(url.to_string()), + _ => None, + }; + self.start_band_nd( + None, + &["y", "x"], + &[self.current_height, self.current_width], + metadata.datatype, + metadata.nodata_value.as_deref(), + outdb_uri.as_deref(), + None, + ) + } + + /// Get direct access to the BinaryViewBuilder for writing the current band's data. pub fn band_data_writer(&mut self) -> &mut BinaryViewBuilder { &mut self.band_data } - /// Finish writing the current band + /// Finish writing the current band. + /// + /// Validates that exactly one data value was appended since `start_band_nd()`. pub fn finish_band(&mut self) -> Result<(), ArrowError> { - // Band data should already be written via band_data_writer - // Nothing additional needed here since we're building flat + let current_count = self.band_data.len(); + if current_count != self.band_data_count_at_start + 1 { + return Err(ArrowError::InvalidArgumentError( + format!( + "Expected exactly one band data value per band, but got {} appended since start_band_nd()", + current_count - self.band_data_count_at_start + ), + )); + } Ok(()) } - /// Finish all bands for the current raster + /// Finish all bands for the current raster. + /// + /// Strictly validates every band added since `start_raster_nd`: each name in + /// the top-level `spatial_dims` must appear in the band's own `dim_names` + /// with a size matching the corresponding entry in `spatial_shape`. 
pub fn finish_raster(&mut self) -> Result<(), ArrowError> { - // Record the end offset for this raster's bands + for (band_idx, (band_dims, band_shape)) in self.current_raster_bands.iter().enumerate() { + for (spatial_idx, spatial_dim) in self.current_spatial_dims.iter().enumerate() { + let pos = band_dims + .iter() + .position(|d| d == spatial_dim) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Band {band_idx} is missing spatial dimension {spatial_dim:?} \ + (band dim_names = {band_dims:?})" + )) + })?; + let expected = self.current_spatial_shape[spatial_idx]; + let actual = band_shape[pos] as i64; + if actual != expected { + return Err(ArrowError::InvalidArgumentError(format!( + "Band {band_idx} dimension {spatial_dim:?} has size {actual}, \ + expected {expected} from top-level spatial_shape" + ))); + } + } + } + let next_offset = self.band_offsets.last().unwrap() + self.current_band_count; self.band_offsets.push(next_offset); - self.raster_validity.append_value(true); - + self.current_raster_bands.clear(); + self.current_spatial_dims.clear(); + self.current_spatial_shape.clear(); Ok(()) } - /// Append raster metadata from a MetadataRef trait object - fn append_metadata_from_ref(&mut self, metadata: &dyn MetadataRef) -> Result<(), ArrowError> { - self.width.append_value(metadata.width()); - self.height.append_value(metadata.height()); - self.upper_left_x.append_value(metadata.upper_left_x()); - self.upper_left_y.append_value(metadata.upper_left_y()); - self.scale_x.append_value(metadata.scale_x()); - self.scale_y.append_value(metadata.scale_y()); - self.skew_x.append_value(metadata.skew_x()); - self.skew_y.append_value(metadata.skew_y()); - - Ok(()) - } - - /// Set the CRS for the current raster - pub fn append_crs(&mut self, crs: Option<&str>) -> Result<(), ArrowError> { - match crs { - Some(crs_data) => self.crs.append_value(crs_data), - None => self.crs.append_null(), + /// Append a null raster. 
+ pub fn append_null(&mut self) -> Result<(), ArrowError> { + // Transform: append 6 zeros + for _ in 0..6 { + self.transform_values.append_value(0.0); } - Ok(()) - } + let next = *self.transform_offsets.last().unwrap() + 6; + self.transform_offsets.push(next); - /// Append a null raster - pub fn append_null(&mut self) -> Result<(), ArrowError> { - // Since metadata fields are non-nullable, provide default values - self.width.append_value(0u64); - self.height.append_value(0u64); - self.upper_left_x.append_value(0.0f64); - self.upper_left_y.append_value(0.0f64); - self.scale_x.append_value(0.0f64); - self.scale_y.append_value(0.0f64); - self.skew_x.append_value(0.0f64); - self.skew_y.append_value(0.0f64); - - // Append null CRS + // Spatial dims + shape: empty list for null rasters. + let next = *self.spatial_dims_offsets.last().unwrap(); + self.spatial_dims_offsets.push(next); + let next = *self.spatial_shape_offsets.last().unwrap(); + self.spatial_shape_offsets.push(next); + + // CRS: null self.crs.append_null(); - // No bands for null raster + // No bands let current_offset = *self.band_offsets.last().unwrap(); self.band_offsets.push(current_offset); - // Mark raster as null + // Mark null self.raster_validity.append_null(); Ok(()) } - /// Finish building and return the constructed StructArray + /// Finish building and return the constructed StructArray. 
pub fn finish(mut self) -> Result { - // Build the metadata struct using the schema - let metadata_fields = if let DataType::Struct(fields) = RasterSchema::metadata_type() { - fields - } else { + // Build transform list + let transform_values = self.transform_values.finish(); + let transform_offsets = OffsetBuffer::new(ScalarBuffer::from(self.transform_offsets)); + let DataType::List(transform_field) = RasterSchema::transform_type() else { return Err(ArrowError::SchemaError( - "Expected struct type for metadata".to_string(), + "Expected list type for transform".to_string(), )); }; + let transform_list = ListArray::new( + transform_field, + transform_offsets, + Arc::new(transform_values), + None, + ); - let metadata_arrays: Vec = vec![ - Arc::new(self.width.finish()), - Arc::new(self.height.finish()), - Arc::new(self.upper_left_x.finish()), - Arc::new(self.upper_left_y.finish()), - Arc::new(self.scale_x.finish()), - Arc::new(self.scale_y.finish()), - Arc::new(self.skew_x.finish()), - Arc::new(self.skew_y.finish()), - ]; - let metadata_array = StructArray::new(metadata_fields, metadata_arrays, None); - - // Build the band metadata struct using the schema - let band_metadata_fields = - if let DataType::Struct(fields) = RasterSchema::band_metadata_type() { - fields - } else { - return Err(ArrowError::SchemaError( - "Expected struct type for band metadata".to_string(), - )); - }; + // Build spatial_dims list + let spatial_dims_values = self.spatial_dims_values.finish(); + let spatial_dims_offsets = OffsetBuffer::new(ScalarBuffer::from(self.spatial_dims_offsets)); + let DataType::List(spatial_dims_field) = RasterSchema::spatial_dims_type() else { + return Err(ArrowError::SchemaError( + "Expected list type for spatial_dims".to_string(), + )); + }; + let spatial_dims_list = ListArray::new( + spatial_dims_field, + spatial_dims_offsets, + Arc::new(spatial_dims_values), + None, + ); - let band_metadata_arrays: Vec = vec![ - Arc::new(self.band_nodata.finish()), - 
Arc::new(self.band_storage_type.finish()), - Arc::new(self.band_datatype.finish()), - Arc::new(self.band_outdb_url.finish()), - Arc::new(self.band_outdb_band_id.finish()), - ]; - let band_metadata_array = - StructArray::new(band_metadata_fields, band_metadata_arrays, None); + // Build spatial_shape list + let spatial_shape_values = self.spatial_shape_values.finish(); + let spatial_shape_offsets = + OffsetBuffer::new(ScalarBuffer::from(self.spatial_shape_offsets)); + let DataType::List(spatial_shape_field) = RasterSchema::spatial_shape_type() else { + return Err(ArrowError::SchemaError( + "Expected list type for spatial_shape".to_string(), + )); + }; + let spatial_shape_list = ListArray::new( + spatial_shape_field, + spatial_shape_offsets, + Arc::new(spatial_shape_values), + None, + ); + + // Build band dim_names nested list + let dim_names_values = self.band_dim_names_values.finish(); + let dim_names_offsets = OffsetBuffer::new(ScalarBuffer::from(self.band_dim_names_offsets)); + let DataType::List(dim_names_field) = RasterSchema::dim_names_type() else { + return Err(ArrowError::SchemaError( + "Expected list type for dim_names".to_string(), + )); + }; + let dim_names_list = ListArray::new( + dim_names_field, + dim_names_offsets, + Arc::new(dim_names_values), + None, + ); - // Build the band struct using the schema - let band_fields = if let DataType::Struct(fields) = RasterSchema::band_type() { - fields + // Build band source_shape nested list + let source_shape_values = self.band_shape_values.finish(); + let source_shape_offsets = OffsetBuffer::new(ScalarBuffer::from(self.band_shape_offsets)); + let DataType::List(source_shape_field) = RasterSchema::source_shape_type() else { + return Err(ArrowError::SchemaError( + "Expected list type for source_shape".to_string(), + )); + }; + let source_shape_list = ListArray::new( + source_shape_field, + source_shape_offsets, + Arc::new(source_shape_values), + None, + ); + + // Build band view nested list (List>). 
+ let view_source_axis = self.band_view_source_axis_values.finish(); + let view_start = self.band_view_start_values.finish(); + let view_step = self.band_view_step_values.finish(); + let view_steps = self.band_view_steps_values.finish(); + let view_offsets = OffsetBuffer::new(ScalarBuffer::from(self.band_view_offsets)); + let DataType::List(view_list_field) = RasterSchema::view_type() else { + return Err(ArrowError::SchemaError( + "Expected list type for view".to_string(), + )); + }; + let DataType::Struct(view_struct_fields) = view_list_field.data_type().clone() else { + return Err(ArrowError::SchemaError( + "Expected struct type inside view list".to_string(), + )); + }; + let view_struct = StructArray::new( + view_struct_fields, + vec![ + Arc::new(view_source_axis) as ArrayRef, + Arc::new(view_start) as ArrayRef, + Arc::new(view_step) as ArrayRef, + Arc::new(view_steps) as ArrayRef, + ], + None, + ); + let view_nulls = if self.band_view_validity.iter().all(|&b| b) { + None } else { + Some(NullBuffer::from_iter( + self.band_view_validity.iter().copied(), + )) + }; + let view_list = ListArray::new( + view_list_field, + view_offsets, + Arc::new(view_struct), + view_nulls, + ); + + // Build band struct + let DataType::Struct(band_fields) = RasterSchema::band_type() else { return Err(ArrowError::SchemaError( "Expected struct type for band".to_string(), )); }; let band_arrays: Vec = vec![ - Arc::new(band_metadata_array), + Arc::new(self.band_name.finish()), + Arc::new(dim_names_list), + Arc::new(source_shape_list), + Arc::new(self.band_datatype.finish()), + Arc::new(self.band_nodata.finish()), + Arc::new(view_list), + Arc::new(self.band_outdb_uri.finish()), + Arc::new(self.band_outdb_format.finish()), Arc::new(self.band_data.finish()), ]; - let band_struct_array = StructArray::new(band_fields, band_arrays, None); + let band_struct = StructArray::new(band_fields, band_arrays, None); - // Build the bands list array using the schema - let band_field = if let 
DataType::List(field) = RasterSchema::bands_type() { - field - } else { + // Build bands list + let DataType::List(bands_field) = RasterSchema::bands_type() else { return Err(ArrowError::SchemaError( "Expected list type for bands".to_string(), )); }; + let band_list_offsets = OffsetBuffer::new(ScalarBuffer::from(self.band_offsets)); + let bands_list = + ListArray::new(bands_field, band_list_offsets, Arc::new(band_struct), None); - let offsets = OffsetBuffer::new(ScalarBuffer::from(self.band_offsets)); - let bands_list = ListArray::new(band_field, offsets, Arc::new(band_struct_array), None); - - // Build the final raster struct using the schema + // Build top-level raster struct let raster_fields = RasterSchema::fields(); let raster_arrays: Vec = vec![ - Arc::new(metadata_array), Arc::new(self.crs.finish()), + Arc::new(transform_list), + Arc::new(spatial_dims_list), + Arc::new(spatial_shape_list), Arc::new(bands_list), ]; @@ -351,7 +682,13 @@ mod tests { use super::*; use crate::array::RasterStructArray; use crate::traits::{RasterMetadata, RasterRef}; - use sedona_schema::raster::{BandDataType, StorageType}; + use arrow_array::RecordBatch; + use arrow_ipc::reader::StreamReader; + use arrow_ipc::writer::StreamWriter; + use arrow_schema::Schema; + use sedona_schema::raster::StorageType; + use std::borrow::Cow; + use std::io::Cursor; #[test] fn test_iterator_basic_functionality() { @@ -483,6 +820,7 @@ mod tests { .iter() .enumerate() .map(|(i, band)| { + let band = band.unwrap(); assert_eq!(band.data()[0], i as u8); band.data()[0] }) @@ -533,7 +871,7 @@ mod tests { let source_raster = iterator.get(0).unwrap(); target_builder - .start_raster(source_raster.metadata(), source_raster.crs()) + .start_raster(&source_raster.metadata(), source_raster.crs()) .unwrap(); // Add new band data while preserving original metadata @@ -846,4 +1184,755 @@ mod tests { let band = result.unwrap(); assert_eq!(band.data().len(), 100); } + + #[test] + fn test_roundtrip_2d_raster() { + let mut 
builder = RasterBuilder::new(1); + builder + .start_raster_2d( + 10, + 20, + 100.0, + 200.0, + 1.0, + -2.0, + 0.25, + 0.5, + Some("EPSG:4326"), + ) + .unwrap(); + builder + .start_band_2d(BandDataType::UInt8, Some(&[255u8])) + .unwrap(); + builder.band_data_writer().append_value(vec![1u8; 200]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + assert_eq!(rasters.len(), 1); + + let r = rasters.get(0).unwrap(); + assert_eq!(r.width().unwrap(), 10); + assert_eq!(r.height().unwrap(), 20); + assert_eq!(r.transform(), &[100.0, 1.0, 0.25, 200.0, 0.5, -2.0]); + assert_eq!(r.x_dim(), "x"); + assert_eq!(r.y_dim(), "y"); + assert_eq!(r.crs(), Some("EPSG:4326")); + assert_eq!(r.num_bands(), 1); + + let band = r.band(0).unwrap(); + assert_eq!(band.ndim(), 2); + assert_eq!(band.dim_names(), vec!["y", "x"]); + assert_eq!(band.shape(), &[20, 10]); + assert_eq!(band.data_type(), BandDataType::UInt8); + assert_eq!(band.nodata(), Some(&[255u8][..])); + assert_eq!(band.contiguous_data().unwrap().len(), 200); + } + + #[test] + fn test_roundtrip_multi_band() { + let mut builder = RasterBuilder::new(1); + builder + .start_raster_2d(2, 2, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, None) + .unwrap(); + + // Band 0: UInt8 + builder + .start_band_2d(BandDataType::UInt8, Some(&[255u8])) + .unwrap(); + builder.band_data_writer().append_value([1u8, 2, 3, 4]); + builder.finish_band().unwrap(); + + // Band 1: Float32 + builder.start_band_2d(BandDataType::Float32, None).unwrap(); + let f32_data: Vec = [1.5f32, 2.5, 3.5, 4.5] + .iter() + .flat_map(|v| v.to_le_bytes()) + .collect(); + builder.band_data_writer().append_value(&f32_data); + builder.finish_band().unwrap(); + + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.num_bands(), 2); + + let b0 = r.band(0).unwrap(); + 
assert_eq!(b0.data_type(), BandDataType::UInt8); + assert_eq!(b0.nodata(), Some(&[255u8][..])); + + let b1 = r.band(1).unwrap(); + assert_eq!(b1.data_type(), BandDataType::Float32); + assert_eq!(b1.nodata(), None); + } + + #[test] + fn test_null_raster() { + let mut builder = RasterBuilder::new(2); + builder + .start_raster_2d(1, 1, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, None) + .unwrap(); + builder.start_band_2d(BandDataType::UInt8, None).unwrap(); + builder.band_data_writer().append_value([0u8]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + builder.append_null().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + assert_eq!(rasters.len(), 2); + assert!(!rasters.is_null(0)); + assert!(rasters.is_null(1)); + } + + #[test] + fn test_nd_band() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[5, 4], None) + .unwrap(); + + // 3D band: [time=3, y=4, x=5] + builder + .start_band_nd( + Some("temperature"), + &["time", "y", "x"], + &[3, 4, 5], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + let data = vec![0u8; 3 * 4 * 5 * 4]; // 3*4*5 Float32 elements + builder.band_data_writer().append_value(&data); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.band_name(0), Some("temperature")); + let band = r.band(0).unwrap(); + assert_eq!(band.ndim(), 3); + assert_eq!(band.dim_names(), vec!["time", "y", "x"]); + assert_eq!(band.shape(), &[3, 4, 5]); + assert_eq!(band.dim_size("time"), Some(3)); + assert_eq!(band.dim_size("y"), Some(4)); + assert_eq!(band.dim_size("x"), Some(5)); + assert_eq!(band.dim_size("z"), None); + + // Verify strides are standard C-order: [4*5*4, 5*4, 4] = [80, 20, 4] + let buf = band.nd_buffer().unwrap(); 
+ assert_eq!(buf.strides, &[80, 20, 4]); + assert_eq!(buf.offset, 0); + } + + #[test] + fn test_nonstandard_spatial_dim_names() { + // Zarr-style dataset with lat/lon instead of y/x + let mut builder = RasterBuilder::new(1); + let transform = [10.0, 0.01, 0.0, 50.0, 0.0, -0.01]; + builder + .start_raster_nd( + &transform, + &["longitude", "latitude"], + &[360, 180], + Some("EPSG:4326"), + ) + .unwrap(); + builder + .start_band_nd( + Some("sst"), + &["latitude", "longitude"], + &[180, 360], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + let data = vec![0u8; 180 * 360 * 4]; + builder.band_data_writer().append_value(&data); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.x_dim(), "longitude"); + assert_eq!(r.y_dim(), "latitude"); + // width = size of "longitude" dim, height = size of "latitude" dim + assert_eq!(r.width().unwrap(), 360); + assert_eq!(r.height().unwrap(), 180); + } + + #[test] + fn test_mixed_dimensionality_bands() { + // One 3D band and one 2D band in the same raster + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[64, 64], None) + .unwrap(); + + // Band 0: 3D [time=12, y=64, x=64] + builder + .start_band_nd( + Some("temperature"), + &["time", "y", "x"], + &[12, 64, 64], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + let data_3d = vec![0u8; 12 * 64 * 64 * 4]; + builder.band_data_writer().append_value(&data_3d); + builder.finish_band().unwrap(); + + // Band 1: 2D [y=64, x=64] + builder + .start_band_nd( + Some("elevation"), + &["y", "x"], + &[64, 64], + BandDataType::Float64, + None, + None, + None, + ) + .unwrap(); + let data_2d = vec![0u8; 64 * 64 * 8]; + builder.band_data_writer().append_value(&data_2d); + builder.finish_band().unwrap(); + + 
builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.num_bands(), 2); + // width/height derived from band(0) which is 3D + assert_eq!(r.width().unwrap(), 64); + assert_eq!(r.height().unwrap(), 64); + + let b0 = r.band(0).unwrap(); + assert_eq!(b0.ndim(), 3); + assert_eq!(b0.dim_names(), vec!["time", "y", "x"]); + assert_eq!(b0.shape(), &[12, 64, 64]); + assert_eq!(b0.dim_size("time"), Some(12)); + + let b1 = r.band(1).unwrap(); + assert_eq!(b1.ndim(), 2); + assert_eq!(b1.dim_names(), vec!["y", "x"]); + assert_eq!(b1.shape(), &[64, 64]); + assert_eq!(b1.dim_size("time"), None); + } + + #[test] + fn test_dim_index_lookup() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[32, 32], None) + .unwrap(); + builder + .start_band_nd( + None, + &["time", "pressure", "y", "x"], + &[6, 10, 32, 32], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + let data = vec![0u8; 6 * 10 * 32 * 32 * 4]; + builder.band_data_writer().append_value(&data); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + assert_eq!(band.dim_index("time"), Some(0)); + assert_eq!(band.dim_index("pressure"), Some(1)); + assert_eq!(band.dim_index("y"), Some(2)); + assert_eq!(band.dim_index("x"), Some(3)); + assert_eq!(band.dim_index("wavelength"), None); + + assert_eq!(band.dim_size("time"), Some(6)); + assert_eq!(band.dim_size("pressure"), Some(10)); + assert_eq!(band.dim_size("wavelength"), None); + } + + #[test] + fn test_contiguous_data_is_borrowed() { + let mut builder = RasterBuilder::new(1); + builder + .start_raster_2d(4, 4, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, None) + .unwrap(); + 
builder.start_band_2d(BandDataType::UInt8, None).unwrap(); + builder.band_data_writer().append_value([1u8; 16]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + let data = band.contiguous_data().unwrap(); + // Identity-view bands are always contiguous, so should be Cow::Borrowed + assert!(matches!(data, Cow::Borrowed(_))); + assert_eq!(data.len(), 16); + } + + #[test] + fn test_nd_buffer_strides_various_types() { + // Each raster exercises a different shape; strict spatial-grid + // validation forbids mixing bands of disagreeing spatial sizes within + // one raster. + let mut builder = RasterBuilder::new(3); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + + // Raster 0 — UInt8: element size = 1, shape [3, 4] → strides [4, 1] + builder + .start_raster_nd(&transform, &["x", "y"], &[4, 3], None) + .unwrap(); + builder + .start_band_nd( + None, + &["y", "x"], + &[3, 4], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 12]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + // Raster 1 — Float64: element size = 8, shape [2, 3, 5] → strides [120, 40, 8] + builder + .start_raster_nd(&transform, &["x", "y"], &[5, 3], None) + .unwrap(); + builder + .start_band_nd( + None, + &["z", "y", "x"], + &[2, 3, 5], + BandDataType::Float64, + None, + None, + None, + ) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8; 2 * 3 * 5 * 8]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + // Raster 2 — UInt16: element size = 2, shape [10] → strides [2]. + // Only has an "x" dim, so declare spatial_dims=["x"]. 
+ builder + .start_raster_nd(&transform, &["x"], &[10], None) + .unwrap(); + builder + .start_band_nd(None, &["x"], &[10], BandDataType::UInt16, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 20]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + + let r0 = rasters.get(0).unwrap(); + let b0 = r0.band(0).unwrap(); + assert_eq!(b0.nd_buffer().unwrap().strides, &[4, 1]); // UInt8 [3, 4] + + let r1 = rasters.get(1).unwrap(); + let b1 = r1.band(0).unwrap(); + assert_eq!(b1.nd_buffer().unwrap().strides, &[120, 40, 8]); // Float64 [2, 3, 5] + + let r2 = rasters.get(2).unwrap(); + let b2 = r2.band(0).unwrap(); + assert_eq!(b2.nd_buffer().unwrap().strides, &[2]); // UInt16 [10] + } + + #[test] + fn test_width_height_no_bands() { + // Zero-band raster — used as a "target grid" specification (GDAL warp + // pattern). Width/height come from the top-level spatial_shape, not + // band(0). 
+ let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[64, 32], None) + .unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.num_bands(), 0); + assert_eq!(r.width().unwrap(), 64); + assert_eq!(r.height().unwrap(), 32); + } + + #[test] + fn test_band_name_nullable() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[4, 4], None) + .unwrap(); + + // Named band + builder + .start_band_nd( + Some("temperature"), + &["y", "x"], + &[4, 4], + BandDataType::Float32, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 64]); + builder.finish_band().unwrap(); + + // Unnamed band (via start_band_2d which passes None for name) + builder.current_width = 4; + builder.current_height = 4; + builder.start_band_2d(BandDataType::UInt8, None).unwrap(); + builder.band_data_writer().append_value(vec![0u8; 16]); + builder.finish_band().unwrap(); + + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.band_name(0), Some("temperature")); + assert_eq!(r.band_name(1), None); // unnamed + assert_eq!(r.band_name(99), None); // out of range + } + + #[test] + fn test_spatial_dims_shape_roundtrip() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["longitude", "latitude"], &[360, 180], None) + .unwrap(); + builder + .start_band_nd( + None, + &["latitude", "longitude"], + &[180, 360], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8; 360 * 180]); + 
builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.spatial_dims(), vec!["longitude", "latitude"]); + assert_eq!(r.spatial_shape(), &[360, 180]); + assert_eq!(r.x_dim(), "longitude"); + assert_eq!(r.y_dim(), "latitude"); + assert_eq!(r.width().unwrap(), 360); + assert_eq!(r.height().unwrap(), 180); + } + + #[test] + fn test_zero_band_raster_roundtrip() { + // Zero-band rasters double as "target grid" specifications. They must + // round-trip through the builder cleanly. + let mut builder = RasterBuilder::new(1); + let transform = [10.0, 1.0, 0.0, 20.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[128, 64], Some("EPSG:3857")) + .unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + + assert_eq!(r.num_bands(), 0); + assert_eq!(r.spatial_dims(), vec!["x", "y"]); + assert_eq!(r.spatial_shape(), &[128, 64]); + assert_eq!(r.width().unwrap(), 128); + assert_eq!(r.height().unwrap(), 64); + assert_eq!(r.crs(), Some("EPSG:3857")); + } + + #[test] + fn test_band_missing_spatial_dim_errors() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[4, 4], None) + .unwrap(); + // Band is missing "y" entirely. + builder + .start_band_nd(None, &["x"], &[4], BandDataType::UInt8, None, None, None) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 4]); + builder.finish_band().unwrap(); + + let err = builder.finish_raster().unwrap_err(); + assert!( + err.to_string().contains("missing spatial dimension"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_start_band_rejects_zero_dim() { + // 0-D bands carry no spatial extent and no caller has a use for + // them. 
start_band_nd must reject an empty dim_names slice eagerly so + // the malformed band never reaches the buffer layer. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let err = builder + .start_band_nd(None, &[], &[], BandDataType::UInt8, None, None, None) + .unwrap_err(); + assert!( + err.to_string().contains("0-dimensional"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_contiguous_data_identity_via_start_band_is_borrowed() { + // Canonical identity: the row's view list is null, and the read path + // synthesises the identity view. Should still hand the underlying + // bytes back without copying. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + builder + .start_band_nd( + None, + &["y", "x"], + &[2, 3], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + let pixels: Vec = (0..6).collect(); + builder.band_data_writer().append_value(pixels.clone()); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + // Visible shape comes from the synthesised identity view. + assert_eq!(band.shape(), &[2, 3]); + assert_eq!(band.raw_source_shape(), &[2, 3]); + + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.strides, &[3, 1]); + assert_eq!(buf.offset, 0); + + let bytes = band.contiguous_data().unwrap(); + assert!(matches!(bytes, Cow::Borrowed(_))); + assert_eq!(&*bytes, pixels.as_slice()); + } + + #[test] + fn test_view_field_is_null_for_identity_band() { + // Schema invariant: identity views are stored as null list rows so + // the canonical "no slice" case costs no Arrow space. Confirm by + // poking the raw column. 
+ use arrow_array::Array; + + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[2, 2], None) + .unwrap(); + builder + .start_band_nd( + None, + &["y", "x"], + &[2, 2], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 4]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let bands_list = array + .column(sedona_schema::raster::raster_indices::BANDS) + .as_any() + .downcast_ref::() + .unwrap(); + let bands_struct = bands_list + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let view_list = bands_struct + .column(sedona_schema::raster::band_indices::VIEW) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(view_list.len(), 1); + assert!( + view_list.is_null(0), + "identity-view band should serialise as a null view row" + ); + } + + #[test] + fn test_band_spatial_dim_size_mismatch_errors() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[4, 4], None) + .unwrap(); + // Band has "x" and "y" but x-size disagrees with top-level shape. + builder + .start_band_nd( + None, + &["y", "x"], + &[4, 8], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 32]); + builder.finish_band().unwrap(); + + let err = builder.finish_raster().unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("has size 8") && msg.contains("expected 4"), + "unexpected error: {msg}" + ); + } + + #[test] + fn test_view_null_round_trips_through_arrow_ipc() { + // Schema invariant: a band built via start_band_nd serialises with a + // null view row, and the null must survive an Arrow IPC round-trip. 
+ // If a future change accidentally writes a non-null empty list + // instead, downstream readers (DuckDB, PyArrow, sedona-py) will + // disagree about whether the view is identity. + + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + builder + .start_band_nd( + None, + &["y", "x"], + &[2, 3], + BandDataType::UInt8, + None, + None, + None, + ) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 6]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let schema = Arc::new(Schema::new(vec![Arc::new(arrow_schema::Field::new( + "raster", + array.data_type().clone(), + true, + )) as arrow_schema::FieldRef])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array.clone())]).unwrap(); + + let mut buf: Vec = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buf, schema.as_ref()).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + + let cursor = Cursor::new(buf); + let reader = StreamReader::try_new(cursor, None).unwrap(); + let batches: Vec<_> = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 1); + let restored_struct = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + let bands_list = restored_struct + .column(sedona_schema::raster::raster_indices::BANDS) + .as_any() + .downcast_ref::() + .unwrap(); + let bands_struct = bands_list + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let view_list = bands_struct + .column(sedona_schema::raster::band_indices::VIEW) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(view_list.len(), 1); + assert!( + view_list.is_null(0), + "identity-view band must remain a null view row after IPC round-trip" + ); + + let rasters = RasterStructArray::new(restored_struct); + let r0 = rasters.get(0).unwrap(); + 
assert_eq!(r0.band(0).unwrap().shape(), &[2, 3]); + } } diff --git a/rust/sedona-raster/src/display.rs b/rust/sedona-raster/src/display.rs index 400658a0a..c56f075e2 100644 --- a/rust/sedona-raster/src/display.rs +++ b/rust/sedona-raster/src/display.rs @@ -84,6 +84,7 @@ impl fmt::Display for RasterDisplay<'_> { let has_outdb = bands .iter() + .filter_map(Result::ok) .any(|band| matches!(band.metadata().storage_type(), Ok(StorageType::OutDbRef))); // Write: [WxH/nbands] @ [xmin ymin xmax ymax] diff --git a/rust/sedona-raster/src/traits.rs b/rust/sedona-raster/src/traits.rs index f8541ff33..381c3c154 100644 --- a/rust/sedona-raster/src/traits.rs +++ b/rust/sedona-raster/src/traits.rs @@ -15,11 +15,65 @@ // specific language governing permissions and limitations // under the License. +use std::borrow::Cow; + use arrow_schema::ArrowError; +use sedona_schema::raster::BandDataType; + +/// View into a band's N-D data buffer with layout metadata. +/// +/// `shape`, `strides`, and `offset` describe the *visible* region in +/// byte-stride terms — they are computed by composing the band's +/// `source_shape` (the natural extent of `buffer`) with its `view` +/// (the per-axis `(source_axis, start, step, steps)` slice spec). Stride +/// can be zero (broadcast) or negative (reverse iteration), and may not be +/// C-order. Consumers that need a flat row-major buffer should use +/// `BandRef::contiguous_data()` instead. +/// +/// Only `buffer` is tied to the producer's lifetime `'a` (it can be tens of +/// MBs of pixel data and must not be copied). `shape` and `strides` are +/// owned `Vec`s — they're tiny (ndim ≤ a handful) so an allocation here is +/// negligible, and owning them lets an `NdBuffer` outlive the producer's +/// internal layout cache (e.g. cross-thread, return-by-value). 
+#[derive(Debug)] +pub struct NdBuffer<'a> { + pub buffer: &'a [u8], + pub shape: Vec, + pub strides: Vec, + pub offset: u64, + pub data_type: BandDataType, +} -use sedona_schema::raster::{BandDataType, StorageType}; +/// One per-dimension entry of a band's logical view. Describes how a +/// visible axis maps onto an axis of the underlying source buffer. +/// +/// - `source_axis`: index into the band's `source_shape` that this visible +/// axis reads from. Across a band's full view, `source_axis` values must +/// form a permutation of `0..ndim` — axis-dropping and axis-introducing +/// views are not supported today. +/// - `start`: starting index along the source axis (in elements, not bytes). +/// - `step`: stride between consecutive visible elements along the source +/// axis. `step == 0` means broadcast (the same source element is +/// exposed `steps` times); negative `step` means reverse iteration. +/// - `steps`: number of visible elements along this axis. `steps == 0` is +/// allowed (empty axis). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ViewEntry { + pub source_axis: i64, + pub start: i64, + pub step: i64, + pub steps: i64, +} -/// Metadata for a raster +/// Concrete raster metadata returned by `RasterRef::metadata()`. +/// +/// Restored from the pre-N-D schema to keep callers that pattern-match on +/// `metadata.width`, `metadata.upperleft_x`, etc. compiling. Computed +/// eagerly from `RasterRef::transform()` and `RasterRef::spatial_shape()`. +/// +/// Panics on construction (`metadata()`) if the raster lacks width or +/// height — corrupt schemas error through the `width()`/`height()` trait +/// methods directly; the metadata accessor is the convenience surface. 
#[derive(Debug, Clone)] pub struct RasterMetadata { pub width: u64, @@ -32,29 +86,10 @@ pub struct RasterMetadata { pub skew_y: f64, } -/// Metadata for a single band -#[derive(Debug, Clone)] -pub struct BandMetadata { - pub nodata_value: Option>, - pub storage_type: StorageType, - pub datatype: BandDataType, - /// URL for OutDb reference (only used when storage_type == OutDbRef) - pub outdb_url: Option, - /// Band ID within the OutDb resource (only used when storage_type == OutDbRef) - pub outdb_band_id: Option, -} - -/// Trait for accessing complete raster data -pub trait RasterRef { - /// Raster metadata accessor - fn metadata(&self) -> &dyn MetadataRef; - /// CRS accessor - fn crs(&self) -> Option<&str>; - /// Bands accessor - fn bands(&self) -> &dyn BandsRef; -} - -/// Trait for accessing raster metadata (dimensions, geotransform, bounding box, etc.) +/// Pre-N-D metadata-accessor trait. Restored so callers from before the +/// N-D refactor that write `fn foo(metadata: &dyn MetadataRef)` keep +/// compiling. `RasterMetadata` is the canonical implementer; new code +/// should reach for `RasterRef::width()? / height()?` instead. 
pub trait MetadataRef { /// Width of the raster in pixels fn width(&self) -> u64; @@ -73,61 +108,570 @@ pub trait MetadataRef { /// Y-direction skew/rotation fn skew_y(&self) -> f64; } -/// Trait for accessing all bands in a raster -pub trait BandsRef { - /// Number of bands in the raster - fn len(&self) -> usize; - /// Check if no bands are present - fn is_empty(&self) -> bool { + +impl MetadataRef for RasterMetadata { + fn width(&self) -> u64 { + self.width + } + fn height(&self) -> u64 { + self.height + } + fn upper_left_x(&self) -> f64 { + self.upperleft_x + } + fn upper_left_y(&self) -> f64 { + self.upperleft_y + } + fn scale_x(&self) -> f64 { + self.scale_x + } + fn scale_y(&self) -> f64 { + self.scale_y + } + fn skew_x(&self) -> f64 { + self.skew_x + } + fn skew_y(&self) -> f64 { + self.skew_y + } +} + +impl RasterMetadata { + pub fn width(&self) -> u64 { + self.width + } + pub fn height(&self) -> u64 { + self.height + } + pub fn upper_left_x(&self) -> f64 { + self.upperleft_x + } + pub fn upper_left_y(&self) -> f64 { + self.upperleft_y + } + pub fn scale_x(&self) -> f64 { + self.scale_x + } + pub fn scale_y(&self) -> f64 { + self.scale_y + } + pub fn skew_x(&self) -> f64 { + self.skew_x + } + pub fn skew_y(&self) -> f64 { + self.skew_y + } +} + +/// Concrete band metadata returned by `BandRef::metadata()`. +/// +/// Restored from the pre-N-D schema. The `outdb_url` and `outdb_band_id` +/// fields are eagerly parsed from the N-D `outdb_uri` (which carries a +/// `#band=N` fragment in the SedonaDB convention) so callers from the +/// pre-N-D era keep compiling against the same field names. +#[derive(Debug, Clone)] +pub struct BandMetadata { + pub nodata_value: Option>, + pub storage_type: sedona_schema::raster::StorageType, + pub datatype: BandDataType, + pub outdb_url: Option, + pub outdb_band_id: Option, +} + +impl BandMetadata { + pub fn nodata_value(&self) -> Option<&[u8]> { + self.nodata_value.as_deref() + } + /// Returns the storage type. 
Wrapped in `Result` to match main's + /// `BandMetadataRef::storage_type()` signature — our shim + /// implementation never errors, but the signature is preserved so + /// existing `matches!(band.metadata().storage_type(), Ok(...))` + /// patterns from before the N-D refactor keep compiling. + pub fn storage_type(&self) -> Result { + Ok(self.storage_type) + } + /// Returns the band data type. Wrapped in `Result` to match main's + /// `BandMetadataRef::data_type()` signature — see `storage_type()`. + pub fn data_type(&self) -> Result { + Ok(self.datatype) + } + pub fn outdb_url(&self) -> Option<&str> { + self.outdb_url.as_deref() + } + pub fn outdb_band_id(&self) -> Option { + self.outdb_band_id + } + /// Nodata value interpreted as f64. Mirrors the pre-N-D + /// `BandMetadataRef::nodata_value_as_f64()`. Uses the lossless + /// conversion (errors on i64/u64 magnitudes > 2^53) so the shim + /// surface picks up the same correctness fix as + /// `BandRef::nodata_as_f64()`. + pub fn nodata_value_as_f64(&self) -> Result, ArrowError> { + let bytes = match self.nodata_value.as_deref() { + Some(b) => b, + None => return Ok(None), + }; + nodata_bytes_to_f64_lossless(bytes, &self.datatype).map(Some) + } +} + +/// Parse the SedonaDB `#band=N` fragment out of an out-DB URI. +/// Returns `(base_url, band_id)`; band_id defaults to 1 if absent. +/// Duplicated (intentionally — and minimally) from +/// `sedona-raster-gdal::source_uri` because the shim lives in +/// `sedona-raster` and can't reach across the crate boundary. +fn split_outdb_band_fragment(uri: &str) -> (String, u32) { + if let Some(hash_pos) = uri.rfind('#') { + let (base, fragment) = uri.split_at(hash_pos); + let fragment = &fragment[1..]; // skip the '#' + if let Some(rest) = fragment.strip_prefix("band=") { + if let Ok(n) = rest.parse::() { + return (base.to_string(), n); + } + } + } + (uri.to_string(), 1) +} + +/// Iteration view over a raster's bands. Returned by `RasterRef::bands()`. 
+/// +/// Wraps a borrowed `&dyn RasterRef` and offers the `len()` / `band(1-based)` +/// / `iter()` shape that callers used before the N-D refactor. New code can +/// equivalently use `RasterRef::num_bands()` and `RasterRef::band(0-based)` +/// directly; both call patterns coexist. +pub struct Bands<'a> { + raster: &'a dyn RasterRef, +} + +impl<'a> Bands<'a> { + /// Wrap a `&dyn RasterRef` for the legacy 1-based band-access surface. + pub fn new(raster: &'a dyn RasterRef) -> Self { + Self { raster } + } +} + +impl<'a> Bands<'a> { + /// Number of bands in the raster. + pub fn len(&self) -> usize { + self.raster.num_bands() + } + + /// True iff the raster has zero bands. + pub fn is_empty(&self) -> bool { self.len() == 0 } - /// Get a specific band by number (returns Error if out of bounds) - /// By convention, band numbers are 1-based - fn band(&self, number: usize) -> Result, ArrowError>; - /// Iterator over all bands - fn iter(&self) -> Box + '_>; + + /// Look up a band by **1-based** number. Returns an error rather than + /// `None` so callers can use `?`. For 0-based access, use + /// `RasterRef::band` directly. + pub fn band(&self, number: usize) -> Result, ArrowError> { + if number == 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid band number {number}: band numbers must be 1-based" + ))); + } + self.raster.band(number - 1) + } + + /// Iterate over every band in 0-based order. Yields `Result` so that + /// a corrupt band surfaces as an error rather than being silently + /// dropped from the iteration. + pub fn iter(&self) -> impl Iterator, ArrowError>> + 'a { + let raster = self.raster; + (0..raster.num_bands()).map(move |i| raster.band(i)) + } } -/// Trait for accessing individual band data +/// Trait for accessing an N-dimensional raster (top level). +/// +/// Replaces the legacy `RasterRef` + `MetadataRef` + `BandsRef` hierarchy with +/// a single flat interface. Bands are 0-indexed. 
+pub trait RasterRef { + /// Number of bands/variables + fn num_bands(&self) -> usize; + + /// Access a band by 0-based index. Returns an `ArrowError` when the + /// index is out of range or when the underlying schema is malformed + /// (unknown data-type discriminant, corrupt view, etc.). The latter + /// cases route through `sedona_common::sedona_internal_datafusion_err!` + /// so they carry the standardised "SedonaDB internal error" framing. + fn band(&self, index: usize) -> Result, ArrowError>; + + /// 1-based band-access view used by callers from before the N-D + /// refactor. Implementers typically write `Bands::new(self)`. + fn bands(&self) -> Bands<'_>; + + /// Band name (e.g., Zarr variable name). None for unnamed bands. + fn band_name(&self, index: usize) -> Option<&str>; + + /// Fast path for band data type — reads the scalar `data_type` column + /// without materialising a full `BandRef`. UDFs that only need this + /// metadata field should prefer this over `band(i)?.data_type()`. + /// Returns None if `index` is out of range or the discriminant is invalid. + /// + /// The default implementation delegates to `band(i)`. Backends with a + /// flat columnar layout should override for the no-allocation fast path. + fn band_data_type(&self, index: usize) -> Option { + // Fast-path accessor: corrupt bands and out-of-range indices both + // collapse to `None`. Callers that need to distinguish the two + // should use `band(index)` directly. + self.band(index).ok().map(|b| b.data_type()) + } + + /// Fast path for band outdb URI — reads the `outdb_uri` column without + /// materialising a `BandRef`. Returns None if the band has no URI or + /// if `index` is out of range. + /// + /// The default implementation must allocate a `Box`; the + /// raster-array backend overrides it to read the column directly. + /// Default returns None because the borrow can't outlive the boxed band. 
+ fn band_outdb_uri(&self, index: usize) -> Option<&str> { + let _ = index; + None + } + + /// Fast path for band outdb format — reads the `outdb_format` column + /// without materialising a `BandRef`. Default returns None for the + /// same lifetime reason as `band_outdb_uri`. + fn band_outdb_format(&self, index: usize) -> Option<&str> { + let _ = index; + None + } + + /// Fast path for band nodata bytes — reads the `nodata` column without + /// materialising a `BandRef`. Default returns None for the same + /// lifetime reason as `band_outdb_uri`. + fn band_nodata(&self, index: usize) -> Option<&[u8]> { + let _ = index; + None + } + + /// CRS string (PROJJSON, WKT, or authority code). None if not set. + fn crs(&self) -> Option<&str>; + + /// 6-element affine transform in GDAL GeoTransform order: + /// `[origin_x, scale_x, skew_x, origin_y, skew_y, scale_y]` + fn transform(&self) -> &[f64]; + + /// Eagerly-computed concrete metadata view (width, height, geotransform + /// scalars). Mirrors the pre-N-D `RasterRef::metadata()` accessor. + /// + /// Panics if `spatial_shape` lacks width/height or `transform` is the + /// wrong length — those are corrupt-schema cases that error cleanly + /// through the `width()`/`height()` trait methods, but the metadata + /// accessor predates that contract and is kept infallible for caller + /// ergonomics. + fn metadata(&self) -> RasterMetadata { + let width = self + .width() + .expect("raster has no width (spatial_shape missing); use width()? for error handling"); + let height = self + .height() + .expect("raster has no height; use height()? 
for error handling"); + let t = self.transform(); + if t.len() != 6 { + panic!("transform must be 6 elements, got {}", t.len()); + } + RasterMetadata { + width, + height, + upperleft_x: t[0], + scale_x: t[1], + skew_x: t[2], + upperleft_y: t[3], + skew_y: t[4], + scale_y: t[5], + } + } + + /// Spatial dimension names, in order (today `["x","y"]`; a future Z phase + /// would extend to `["x","y","z"]`). Every band must contain each of these + /// names in its own `dim_names`, with matching sizes. + fn spatial_dims(&self) -> Vec<&str>; + + /// Spatial dimension sizes, in the same order as `spatial_dims`. Today + /// `[width, height]`. + fn spatial_shape(&self) -> &[i64]; + + /// Name of the X spatial dimension (e.g., "x", "lon", "easting"). + fn x_dim(&self) -> &str { + let dims = self.spatial_dims(); + dims.into_iter().next().unwrap_or("x") + } + + /// Name of the Y spatial dimension (e.g., "y", "lat", "northing"). + fn y_dim(&self) -> &str { + let dims = self.spatial_dims(); + dims.into_iter().nth(1).unwrap_or("y") + } + + /// Width in pixels — size of the X spatial dimension from the top-level + /// `spatial_shape`. Errors if `spatial_shape` is empty or the X size is + /// negative; both are invariant violations rather than legitimate "no + /// value" states. + fn width(&self) -> Result { + let shape = self.spatial_shape(); + let Some(&v) = shape.first() else { + return Err(ArrowError::InvalidArgumentError( + "raster has no width (spatial_shape is empty)".to_string(), + )); + }; + if v < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "raster width must be non-negative, got {v}" + ))); + } + Ok(v as u64) + } + + /// Height in pixels — size of the Y spatial dimension from the top-level + /// `spatial_shape`. Errors if `spatial_shape` has fewer than two entries + /// or the Y size is negative. 
+ fn height(&self) -> Result { + let shape = self.spatial_shape(); + let Some(&v) = shape.get(1) else { + return Err(ArrowError::InvalidArgumentError(format!( + "raster has no height (spatial_shape has {} entries, need >= 2)", + shape.len() + ))); + }; + if v < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "raster height must be non-negative, got {v}" + ))); + } + Ok(v as u64) + } + + /// Look up a band by name. Returns an error if no band has that + /// name or if the matching band is malformed. + fn band_by_name(&self, name: &str) -> Result, ArrowError> { + let i = (0..self.num_bands()) + .find(|&i| self.band_name(i) == Some(name)) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!("Band with name '{name}' not found")) + })?; + self.band(i) + } +} + +/// Trait for accessing a single band/variable within an N-D raster. +/// +/// This is the consumer interface. Implementations handle storage details +/// Two data access paths: +/// - `contiguous_data()` — flat row-major bytes for consumers that don't need +/// stride awareness (most RS_* functions, GDAL boundary, serialization). +/// - `nd_buffer()` — raw buffer + shape + strides + offset for stride-aware +/// consumers (numpy zero-copy views, Arrow FFI) that want to avoid copies. pub trait BandRef { - /// Band metadata accessor - fn metadata(&self) -> &dyn BandMetadataRef; - /// Raw band data as bytes (zero-copy access) - fn data(&self) -> &[u8]; -} - -/// Trait for accessing individual band metadata -pub trait BandMetadataRef { - /// No-data value as raw bytes (None if null) - fn nodata_value(&self) -> Option<&[u8]>; - /// Storage type (InDb, OutDbRef, etc) - fn storage_type(&self) -> Result; - /// Band data type (UInt8, Float32, etc.) 
- fn data_type(&self) -> Result; - /// OutDb URL (only used when storage_type == OutDbRef) - fn outdb_url(&self) -> Option<&str>; - /// OutDb band ID (only used when storage_type == OutDbRef) - fn outdb_band_id(&self) -> Option; - - /// No-data value interpreted as f64. + // -- Dimension metadata -- + + /// Number of dimensions in this band + fn ndim(&self) -> usize; + + /// Dimension names in order (e.g., `["time", "y", "x"]`) + fn dim_names(&self) -> Vec<&str>; + + /// Visible shape — size of each dimension in the band's view, in + /// `dim_names` order. Derived from `view`: `[v.steps for v in view]`. + /// This is what almost all consumers want; use `raw_source_shape()` only + /// when you need to address into the raw `data` buffer (e.g. FFI). + fn shape(&self) -> &[u64]; + + /// **Internal/FFI-only.** Natural C-order extent of the band's + /// underlying `data` buffer, indexed by *source* axis (not visible + /// axis). Almost every consumer wants `shape()` instead — that is the + /// region the band exposes, and is what you compare against + /// `spatial_shape`, iterate over for pixels, and compose further views + /// against. The two only agree when the band's view is the identity; + /// any slice, broadcast, or permutation makes them diverge. + /// + /// Use this only when you need to index directly into the raw `data` + /// bytes (e.g. Arrow C Data Interface, numpy zero-copy views) and you + /// also handle `view()` and the byte-stride layout from `nd_buffer()`. + fn raw_source_shape(&self) -> &[u64]; + + /// Per-visible-dimension view entries describing how the band's + /// visible axes map onto its `source_shape`. `view().len() == ndim()`. + /// See `ViewEntry` for per-entry semantics. 
+ fn view(&self) -> &[ViewEntry]; + + /// Size of a named dimension (None if doesn't exist) + fn dim_size(&self, name: &str) -> Option { + let idx = self.dim_index(name)?; + Some(self.shape()[idx]) + } + + /// Index of a named dimension (None if doesn't exist) + fn dim_index(&self, name: &str) -> Option { + self.dim_names().iter().position(|n| *n == name) + } + + /// True iff this band is shaped exactly like a legacy 2-D raster band: + /// `dim_names == ["y", "x"]` and the view is the identity over the + /// band's `raw_source_shape` (no slice, no broadcast, no permutation). + /// + /// GDAL-backed SQL functions use this to refuse N-D bands cleanly while + /// they wait for an MDArray-aware port. + fn is_2d(&self) -> bool { + let dims = self.dim_names(); + if dims.len() != 2 || dims[0] != "y" || dims[1] != "x" { + return false; + } + let view = self.view(); + let source_shape = self.raw_source_shape(); + if view.len() != 2 || source_shape.len() != 2 { + return false; + } + view.iter().enumerate().all(|(i, v)| { + v.source_axis as usize == i + && v.start == 0 + && v.step == 1 + && v.steps >= 0 + && v.steps as u64 == source_shape[i] + }) + } + + // -- Band metadata -- + + /// Data type for all elements in this band + fn data_type(&self) -> BandDataType; + + /// Nodata value as raw bytes (None if not set) + fn nodata(&self) -> Option<&[u8]>; + + /// OutDb URI — location of the external resource (e.g. + /// `"s3://bucket/file.tif"`, `"file:///…"`, `"mem://…"`). None for + /// in-memory bands. Scheme resolution is delegated to an + /// `ObjectStoreRegistry`; it does *not* imply a format. + fn outdb_uri(&self) -> Option<&str> { + None + } + + /// OutDb format — how to interpret the bytes at `outdb_uri` + /// (e.g. `"geotiff"`, `"zarr"`). None means in-memory — the band's + /// `contiguous_data()` / `nd_buffer()` is authoritative. + fn outdb_format(&self) -> Option<&str> { + None + } + + /// True if this band's bytes live in the `data` buffer (in-database). 
+ /// False if the bytes must be fetched from `outdb_uri` (out-of-database). + /// + /// The discriminator is whether the `data` buffer is non-empty — + /// `outdb_uri` and `outdb_format` are orthogonal location/format hints + /// that may be set on either kind of band. + fn is_indb(&self) -> bool { + // Default: materialize via nd_buffer and check buffer emptiness. + // Concrete impls should override with a direct buffer check. + self.nd_buffer().is_ok_and(|b| !b.buffer.is_empty()) + } + + /// Eagerly-computed concrete band metadata. Mirrors the pre-N-D + /// `BandRef::metadata()` accessor. + /// + /// `outdb_url` and `outdb_band_id` are parsed from `outdb_uri()`'s + /// SedonaDB `#band=N` fragment convention so callers that pattern-match + /// on those fields keep compiling. + fn metadata(&self) -> BandMetadata { + let is_indb = self.is_indb(); + // Match the pre-N-D contract: outdb_url / outdb_band_id are only + // populated when storage_type is OutDbRef. The current schema lets + // the URI hint coexist with InDb data; this surface hides that. + let (outdb_url, outdb_band_id) = if !is_indb { + match self.outdb_uri() { + Some(uri) => { + let (base, band) = split_outdb_band_fragment(uri); + (Some(base), Some(band)) + } + None => (None, None), + } + } else { + (None, None) + }; + BandMetadata { + nodata_value: self.nodata().map(|b| b.to_vec()), + storage_type: if is_indb { + sedona_schema::raster::StorageType::InDb + } else { + sedona_schema::raster::StorageType::OutDbRef + }, + datatype: self.data_type(), + outdb_url, + outdb_band_id, + } + } + + // -- Data access -- + + /// Raw backing buffer + visible-region layout. Triggers load for lazy + /// impls. The returned `NdBuffer` describes the band's view in + /// byte-stride terms — `shape` is the visible shape, `strides` and + /// `offset` are computed by composing the view with the source's + /// natural C-order byte strides. Strides may be zero (broadcast) or + /// negative (reverse iteration). 
+ fn nd_buffer(&self) -> Result, ArrowError>; + + /// Contiguous row-major bytes covering the *visible* region. Zero-copy + /// (`Cow::Borrowed`) when the view is full identity over a C-order + /// source buffer; copies into a new buffer when the view slices, + /// broadcasts, or permutes. Most RS_* functions use this. + fn contiguous_data(&self) -> Result, ArrowError>; + + /// Pre-N-D compatibility shim: raw row-major bytes for InDb, + /// identity-view bands. Panics on anything else (OutDb, non-identity + /// view, or a `contiguous_data` error) — corresponds to main's + /// infallible `BandRef::data() -> &[u8]` which only ever ran against + /// identity-view InDb bands. + fn data(&self) -> &[u8] { + // Compatibility shim: returns the same bytes pre-N-D callers expect + // from `BandRef::data() -> &[u8]`. Delegates to `contiguous_data()` + // so identity-view bands surface the borrowed in-line bytes, + // matching the pre-N-D behavior exactly. View-materialized + // (`Cow::Owned`) bands can't be returned through `&[u8]` because + // the owned `Vec` would die at the end of this call — implementers + // that need view-materialized bytes via `data()` must override and + // anchor the materialized buffer on `Self`; other consumers should + // reach for `contiguous_data()` directly. + match self + .contiguous_data() + .expect("BandRef::data() requires an in-db band with bytes") + { + Cow::Borrowed(b) => b, + Cow::Owned(_) => panic!( + "BandRef::data() can't return view-materialized bytes; \ + use contiguous_data() for sliced/permuted bands" + ), + } + } + + /// Nodata value interpreted as f64. /// /// Returns `Ok(None)` when no nodata value is defined, `Ok(Some(f64))` on - /// success, or an error when the raw bytes have an unexpected length for - /// the band's data type. 
- fn nodata_value_as_f64(&self) -> Result, ArrowError> { - let bytes = match self.nodata_value() { + /// success, or an error when the raw bytes have an unexpected length **or** + /// when the nodata value cannot be represented exactly in `f64`. + /// + /// 64-bit integer bands (`Int64`, `UInt64`) error rather than silently + /// rounding when the magnitude exceeds 2^53 — values outside + /// `[-9_007_199_254_740_992, 9_007_199_254_740_992]` can't round-trip + /// through `f64` and a rounded sentinel can collide with a real pixel + /// value. Use `nodata()` directly to recover the exact bytes when full + /// integer precision matters (e.g. when nodata is the type's extreme + /// value like `0xFF…FF`). + fn nodata_as_f64(&self) -> Result, ArrowError> { + let bytes = match self.nodata() { Some(b) => b, None => return Ok(None), }; - let dt = self.data_type()?; - nodata_bytes_to_f64(bytes, &dt).map(Some) + nodata_bytes_to_f64_lossless(bytes, &self.data_type()).map(Some) } } /// Convert raw nodata bytes to f64 given a [`BandDataType`]. /// /// The bytes are expected to be in little-endian order and exactly match the -/// byte size of the data type. +/// byte size of the data type. Internal helper for the lossless wrapper; +/// non-i64/u64 callers reach for `nodata_bytes_to_f64_lossless` instead. fn nodata_bytes_to_f64(bytes: &[u8], dt: &BandDataType) -> Result { macro_rules! read_le { ($t:ty, $n:expr) => {{ @@ -173,12 +717,49 @@ fn nodata_bytes_to_f64(bytes: &[u8], dt: &BandDataType) -> Result: Iterator> { - fn len(&self) -> usize; - /// Check if there are no more bands - fn is_empty(&self) -> bool { - self.len() == 0 +/// Convert raw nodata bytes to f64, erroring on lossy conversion. +/// +/// Like [`nodata_bytes_to_f64`] but rejects 64-bit integer values whose +/// magnitude exceeds 2^53, since they can't round-trip through `f64`. +/// Callers that interpret nodata as a sentinel (e.g. 
UDFs that compare +/// pixel == nodata) should prefer this over the lossy variant — a rounded +/// `0xFFFF_FFFF_FFFF_FFFE` sentinel can silently collide with a real +/// pixel value. +pub fn nodata_bytes_to_f64_lossless(bytes: &[u8], dt: &BandDataType) -> Result { + match dt { + BandDataType::UInt64 => { + let arr: [u8; 8] = bytes.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Invalid nodata byte length for UInt64: expected 8, got {}", + bytes.len() + )) + })?; + let v = u64::from_le_bytes(arr); + if v > (1u64 << 53) { + return Err(ArrowError::InvalidArgumentError(format!( + "UInt64 nodata value {v} cannot be represented exactly as f64 \ + (magnitude > 2^53); use the raw nodata bytes instead" + ))); + } + Ok(v as f64) + } + BandDataType::Int64 => { + let arr: [u8; 8] = bytes.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Invalid nodata byte length for Int64: expected 8, got {}", + bytes.len() + )) + })?; + let v = i64::from_le_bytes(arr); + if v.unsigned_abs() > (1u64 << 53) { + return Err(ArrowError::InvalidArgumentError(format!( + "Int64 nodata value {v} cannot be represented exactly as f64 \ + (magnitude > 2^53); use the raw nodata bytes instead" + ))); + } + Ok(v as f64) + } + _ => nodata_bytes_to_f64(bytes, dt), } } @@ -217,4 +798,195 @@ mod tests { let result = nodata_bytes_to_f64(&[1, 2, 3], &BandDataType::Float64); assert!(result.is_err()); } + + #[test] + fn test_nodata_bytes_to_f64_lossless_int64_within_mantissa() { + // Boundary: 2^53 is the largest magnitude that round-trips exactly. 
+ let safe = 1i64 << 53; + let val = nodata_bytes_to_f64_lossless(&safe.to_le_bytes(), &BandDataType::Int64).unwrap(); + assert_eq!(val as i64, safe); + + let neg_safe = -(1i64 << 53); + let val = + nodata_bytes_to_f64_lossless(&neg_safe.to_le_bytes(), &BandDataType::Int64).unwrap(); + assert_eq!(val as i64, neg_safe); + } + + #[test] + fn test_nodata_bytes_to_f64_lossless_int64_errors_above_mantissa() { + let big = (1i64 << 53) + 1; + let err = + nodata_bytes_to_f64_lossless(&big.to_le_bytes(), &BandDataType::Int64).unwrap_err(); + assert!( + err.to_string().contains("Int64 nodata value"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_nodata_bytes_to_f64_lossless_uint64_sentinel_errors() { + // The common sentinel 0xFFFF_FFFF_FFFF_FFFF is exactly the case the + // review flagged: lossy variant silently rounds to a value that can + // collide with a real pixel; lossless variant errors. + let sentinel = u64::MAX; + let err = nodata_bytes_to_f64_lossless(&sentinel.to_le_bytes(), &BandDataType::UInt64) + .unwrap_err(); + assert!( + err.to_string().contains("UInt64 nodata value"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_nodata_bytes_to_f64_lossless_delegates_for_smaller_types() { + // Non-64-bit types pass through to nodata_bytes_to_f64 unchanged. 
+ let val = nodata_bytes_to_f64_lossless(&[42], &BandDataType::UInt8).unwrap(); + assert_eq!(val, 42.0); + let val = nodata_bytes_to_f64_lossless(&[0xFE], &BandDataType::Int8).unwrap(); + assert_eq!(val, -2.0); + } + + #[test] + fn test_split_outdb_band_fragment_with_band() { + let (base, n) = split_outdb_band_fragment("s3://bucket/file.tif#band=42"); + assert_eq!(base, "s3://bucket/file.tif"); + assert_eq!(n, 42); + } + + #[test] + fn test_split_outdb_band_fragment_without_band_defaults_to_1() { + let (base, n) = split_outdb_band_fragment("s3://bucket/file.tif"); + assert_eq!(base, "s3://bucket/file.tif"); + assert_eq!(n, 1); + } + + #[test] + fn test_split_outdb_band_fragment_malformed_fragment_defaults_to_1() { + // `#band=abc` is malformed; treat the whole string as the base URL. + let (base, n) = split_outdb_band_fragment("s3://bucket/file.tif#band=abc"); + assert_eq!(base, "s3://bucket/file.tif#band=abc"); + assert_eq!(n, 1); + } + + fn ve(source_axis: i64, start: i64, step: i64, steps: i64) -> ViewEntry { + ViewEntry { + source_axis, + start, + step, + steps, + } + } + + /// Minimal `BandRef` stub: only the inputs `is_2d` actually inspects + /// (`dim_names`, `view`, `raw_source_shape`) carry meaningful values; + /// every other method returns a placeholder we never read. 
+ struct StubBand { + dim_names: Vec, + source_shape: Vec, + shape: Vec, + view: Vec, + } + + impl BandRef for StubBand { + fn ndim(&self) -> usize { + self.dim_names.len() + } + fn dim_names(&self) -> Vec<&str> { + self.dim_names.iter().map(String::as_str).collect() + } + fn shape(&self) -> &[u64] { + &self.shape + } + fn raw_source_shape(&self) -> &[u64] { + &self.source_shape + } + fn view(&self) -> &[ViewEntry] { + &self.view + } + fn data_type(&self) -> BandDataType { + BandDataType::UInt8 + } + fn nodata(&self) -> Option<&[u8]> { + None + } + fn nd_buffer(&self) -> Result, ArrowError> { + unimplemented!("not used in is_2d tests") + } + fn contiguous_data(&self) -> Result, ArrowError> { + unimplemented!("not used in is_2d tests") + } + } + + fn band(dims: &[&str], source_shape: &[u64], view: &[ViewEntry]) -> StubBand { + let shape = view.iter().map(|v| v.steps as u64).collect(); + StubBand { + dim_names: dims.iter().map(|s| (*s).to_string()).collect(), + source_shape: source_shape.to_vec(), + shape, + view: view.to_vec(), + } + } + + #[test] + fn is_2d_identity_yx_is_true() { + let b = band(&["y", "x"], &[4, 5], &[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); + assert!(b.is_2d()); + } + + #[test] + fn is_2d_identity_3d_is_false() { + let b = band( + &["time", "y", "x"], + &[3, 4, 5], + &[ve(0, 0, 1, 3), ve(1, 0, 1, 4), ve(2, 0, 1, 5)], + ); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_identity_1d_is_false() { + let b = band(&["x"], &[5], &[ve(0, 0, 1, 5)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_yx_with_slice_view_is_false() { + // Same dim_names but the y-axis is sliced — view is not the identity. 
+ let b = band(&["y", "x"], &[4, 5], &[ve(0, 1, 1, 2), ve(1, 0, 1, 5)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_yx_with_step_two_is_false() { + let b = band(&["y", "x"], &[4, 5], &[ve(0, 0, 2, 2), ve(1, 0, 1, 5)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_yx_with_broadcast_is_false() { + let b = band(&["y", "x"], &[4, 5], &[ve(0, 0, 0, 4), ve(1, 0, 1, 5)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_permuted_xy_is_false() { + // dim_names are swapped — not the legacy 2D shape, even though the + // view per-axis is the identity. + let b = band(&["x", "y"], &[5, 4], &[ve(0, 0, 1, 5), ve(1, 0, 1, 4)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_yx_with_transposed_source_axes_is_false() { + // dim_names are ["y","x"] but the view permutes the source axes, + // so the band exposes y-then-x out of an x-then-y source. + let b = band(&["y", "x"], &[5, 4], &[ve(1, 0, 1, 4), ve(0, 0, 1, 5)]); + assert!(!b.is_2d()); + } + + #[test] + fn is_2d_yx_other_dim_names_is_false() { + let b = band(&["lat", "lon"], &[4, 5], &[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); + assert!(!b.is_2d()); + } } diff --git a/rust/sedona-schema/src/raster.rs b/rust/sedona-schema/src/raster.rs index b5b8745c4..436baf8dd 100644 --- a/rust/sedona-schema/src/raster.rs +++ b/rust/sedona-schema/src/raster.rs @@ -16,34 +16,54 @@ // under the License. use arrow_schema::{DataType, Field, FieldRef, Fields}; -/// Schema for storing raster data in Apache Arrow format. -/// Utilizing nested structs and lists to represent raster metadata and bands. +/// Schema for storing N-dimensional raster data in Apache Arrow format. +/// +/// Each raster has a CRS, an affine transform, a list of spatial dimension +/// names (`spatial_dims`) and sizes (`spatial_shape`), and a list of bands. +/// Each band is an N-D chunk with named dimensions, a `source_shape` +/// describing the natural extent of its underlying buffer, and a `view` +/// describing the visible region of that buffer. 
+/// +/// `spatial_dims` + `spatial_shape` are the raster-level source of truth for +/// the spatial grid — today length 2 (`["x","y"]`, `[width, height]`), +/// Z-ready for a future 3D phase. All bands must contain every name in +/// `spatial_dims` in their own `dim_names`, with the band's *visible* size +/// for that dim matching `spatial_shape`. +/// +/// 2D rasters are represented as bands with `dim_names=["y","x"]` and +/// `source_shape=[height, width]`. #[derive(Debug, PartialEq, Clone)] pub struct RasterSchema; + impl RasterSchema { /// Returns the top-level fields for the raster schema structure. pub fn fields() -> Fields { Fields::from(vec![ - Field::new(column::METADATA, Self::metadata_type(), false), Field::new(column::CRS, Self::crs_type(), true), // Optional: may be inferred from data + Field::new(column::TRANSFORM, Self::transform_type(), false), + Field::new(column::SPATIAL_DIMS, Self::spatial_dims_type(), false), + Field::new(column::SPATIAL_SHAPE, Self::spatial_shape_type(), false), Field::new(column::BANDS, Self::bands_type(), true), ]) } - /// Raster metadata schema - pub fn metadata_type() -> DataType { - DataType::Struct(Fields::from(vec![ - // Raster dimensions - Field::new(column::WIDTH, DataType::UInt64, false), - Field::new(column::HEIGHT, DataType::UInt64, false), - // Geospatial transformation parameters - Field::new(column::UPPERLEFT_X, DataType::Float64, false), - Field::new(column::UPPERLEFT_Y, DataType::Float64, false), - Field::new(column::SCALE_X, DataType::Float64, false), - Field::new(column::SCALE_Y, DataType::Float64, false), - Field::new(column::SKEW_X, DataType::Float64, false), - Field::new(column::SKEW_Y, DataType::Float64, false), - ])) + /// Affine transform schema — 6-element GDAL GeoTransform: + /// `[origin_x, scale_x, skew_x, origin_y, skew_y, scale_y]` + pub fn transform_type() -> DataType { + DataType::List(FieldRef::new(Field::new("item", DataType::Float64, false))) + } + + /// Spatial dimension names schema — list 
of `Utf8View` strings, one per + /// spatial axis. Today always `["x","y"]`; becomes `["x","y","z"]` if a + /// future phase adds Z support. + pub fn spatial_dims_type() -> DataType { + DataType::List(FieldRef::new(Field::new("item", DataType::Utf8View, false))) + } + + /// Spatial shape schema — list of `Int64` sizes in the same order as + /// `spatial_dims`. Today `[width, height]`. + pub fn spatial_shape_type() -> DataType { + DataType::List(FieldRef::new(Field::new("item", DataType::Int64, false))) } /// Bands list schema @@ -55,29 +75,59 @@ impl RasterSchema { ))) } - /// Individual band schema + /// Individual band schema — flattened N-D band with dimension metadata. + /// + /// Out-of-band ("outdb") bands carry two orthogonal identifiers: + /// - `outdb_uri` is the *location* (what scheme/registry to dispatch to, + /// e.g. `s3://bucket/file.tif`, `file:///…`, `mem://…`). + /// - `outdb_format` is the *format* (how to interpret the bytes, e.g. + /// `"geotiff"`, `"zarr"`). Null format means in-memory — the band's + /// `data` buffer is authoritative. 
pub fn band_type() -> DataType { DataType::Struct(Fields::from(vec![ - Field::new(column::METADATA, Self::band_metadata_type(), false), - Field::new(column::DATA, Self::band_data_type(), false), + Field::new(column::NAME, DataType::Utf8, true), + Field::new(column::DIM_NAMES, Self::dim_names_type(), false), + Field::new(column::SOURCE_SHAPE, Self::source_shape_type(), false), + Field::new(column::DATATYPE, DataType::UInt32, false), + Field::new(column::NODATA, DataType::Binary, true), + Field::new(column::VIEW, Self::view_type(), true), + Field::new(column::OUTDB_URI, DataType::Utf8, true), + Field::new(column::OUTDB_FORMAT, DataType::Utf8View, true), + Field::new(column::DATA, DataType::BinaryView, false), ])) } - /// Band metadata schema - pub fn band_metadata_type() -> DataType { - DataType::Struct(Fields::from(vec![ - Field::new(column::NODATAVALUE, DataType::Binary, true), // Optional: null means no nodata value specified - Field::new(column::STORAGE_TYPE, DataType::UInt32, false), - Field::new(column::DATATYPE, DataType::UInt32, false), - // OutDb reference fields - only used when storage_type == OutDbRef - Field::new(column::OUTDB_URL, DataType::Utf8, true), - Field::new(column::OUTDB_BAND_ID, DataType::UInt32, true), - ])) + /// Dimension names list type + pub fn dim_names_type() -> DataType { + DataType::List(FieldRef::new(Field::new("item", DataType::Utf8, false))) } - /// Band data schema - stores the actual raster pixel data as a binary blob - pub fn band_data_type() -> DataType { - DataType::BinaryView + /// Source shape list type — the natural C-order extent of the band's + /// `data` buffer (or outdb-resolved source) per dimension. The *visible* + /// shape exposed to consumers is derived from `view`: + /// `[entry.steps for entry in view]`. + pub fn source_shape_type() -> DataType { + DataType::List(FieldRef::new(Field::new("item", DataType::UInt64, false))) + } + + /// View list type — one entry per dimension in the band's *visible* + /// order. 
Each entry is a `(source_axis, start, step, steps)` quadruple + /// describing how the visible axis maps onto the band's source shape. + /// The field is nullable: a null view denotes the identity view + /// `[(i, 0, 1, source_shape[i]) for i in 0..ndim]` and is the canonical + /// representation for any band whose data has not been sliced. See + /// `RasterSchema` doc for full semantics. + pub fn view_type() -> DataType { + DataType::List(FieldRef::new(Field::new( + "item", + DataType::Struct(Fields::from(vec![ + Field::new("source_axis", DataType::Int64, false), + Field::new("start", DataType::Int64, false), + Field::new("step", DataType::Int64, false), + Field::new("steps", DataType::Int64, false), + ])), + false, + ))) } /// Coordinate Reference System (CRS) schema - stores CRS as JSON string (PROJ or WKT format) @@ -102,6 +152,10 @@ pub enum BandDataType { Float64 = 7, UInt64 = 8, Int64 = 9, + // Int8 was added after the original 1-7 set (PR #589) and after the + // 64-bit additions at 8-9. The discriminants are an Arrow-column + // contract for the `band.data_type` UInt32 column — reordering would + // silently misinterpret existing raster data, so new variants append. Int8 = 10, } @@ -116,6 +170,23 @@ impl BandDataType { } } + /// Try to convert from a u32 discriminant value. + pub fn try_from_u32(value: u32) -> Option { + match value { + 1 => Some(BandDataType::UInt8), + 2 => Some(BandDataType::UInt16), + 3 => Some(BandDataType::Int16), + 4 => Some(BandDataType::UInt32), + 5 => Some(BandDataType::Int32), + 6 => Some(BandDataType::Float32), + 7 => Some(BandDataType::Float64), + 8 => Some(BandDataType::UInt64), + 9 => Some(BandDataType::Int64), + 10 => Some(BandDataType::Int8), + _ => None, + } + } + /// Java/Sedona-compatible pixel type name (e.g. `"UNSIGNED_8BITS"`). pub fn pixel_type_name(&self) -> &'static str { match self { @@ -134,24 +205,18 @@ impl BandDataType { } } -/// Storage strategy for raster band data within Apache Arrow arrays. 
-/// -/// This enum defines how raster data is physically stored and accessed: +/// Where a band's pixel data lives. /// -/// **InDb**: Raster data is embedded directly in the Arrow array as binary blobs. -/// - Self-contained, no external dependencies, fast access for small-medium rasters -/// - Increases Arrow array size, memory usage grows and copy times increase with raster size -/// - Best for: Tiles, thumbnails, processed results, small rasters (<10MB per band) -/// -/// **OutDbRef**: Raster data is stored externally with references in the Arrow array. -/// - Keeps Arrow arrays lightweight, supports massive rasters, enables lazy loading -/// - Requires external storage management, potential for broken references -/// - Best for: Large satellite imagery, time series data, cloud-native workflows -/// - Supported backends: S3, GCS, Azure Blob, local filesystem, HTTP endpoints +/// Restored from the pre-N-D schema to keep downstream code that pattern- +/// matches on `StorageType::InDb` / `StorageType::OutDbRef` compiling. +/// The current N-D schema discriminates via `BandRef::is_indb()` (true ↔ +/// `InDb`, false ↔ `OutDbRef`); this enum is the shim over that. #[repr(u16)] #[derive(Clone, Debug, PartialEq, Eq, Hash, Copy)] pub enum StorageType { + /// Band data is materialized into the raster row's `data` Arrow column. InDb = 0, + /// Band data lives outside the row and is referenced by `outdb_uri`. OutDbRef = 1, } @@ -160,62 +225,55 @@ pub enum StorageType { /// /// Using compile-time constants avoids string lookups and provides type safety /// when accessing nested struct fields in Arrow arrays. 
-pub mod metadata_indices { - pub const WIDTH: usize = 0; - pub const HEIGHT: usize = 1; - pub const UPPERLEFT_X: usize = 2; - pub const UPPERLEFT_Y: usize = 3; - pub const SCALE_X: usize = 4; - pub const SCALE_Y: usize = 5; - pub const SKEW_X: usize = 6; - pub const SKEW_Y: usize = 7; -} - -pub mod band_metadata_indices { - pub const NODATAVALUE: usize = 0; - pub const STORAGE_TYPE: usize = 1; - pub const DATATYPE: usize = 2; - pub const OUTDB_URL: usize = 3; - pub const OUTDB_BAND_ID: usize = 4; +pub mod raster_indices { + pub const CRS: usize = 0; + pub const TRANSFORM: usize = 1; + pub const SPATIAL_DIMS: usize = 2; + pub const SPATIAL_SHAPE: usize = 3; + pub const BANDS: usize = 4; } pub mod band_indices { - pub const METADATA: usize = 0; - pub const DATA: usize = 1; + pub const NAME: usize = 0; + pub const DIM_NAMES: usize = 1; + pub const SOURCE_SHAPE: usize = 2; + pub const DATA_TYPE: usize = 3; + pub const NODATA: usize = 4; + pub const VIEW: usize = 5; + pub const OUTDB_URI: usize = 6; + pub const OUTDB_FORMAT: usize = 7; + pub const DATA: usize = 8; } -pub mod raster_indices { - pub const METADATA: usize = 0; - pub const CRS: usize = 1; - pub const BANDS: usize = 2; +/// Field indices within the `view` struct (`(source_axis, start, step, steps)`). +pub mod band_view_indices { + pub const SOURCE_AXIS: usize = 0; + pub const START: usize = 1; + pub const STEP: usize = 2; + pub const STEPS: usize = 3; } /// Column name constants used throughout the raster schema definition. /// These string constants ensure consistency across schema creation and field access. 
pub mod column { - pub const METADATA: &str = "metadata"; + // Top-level raster fields + pub const CRS: &str = "crs"; + pub const TRANSFORM: &str = "transform"; + pub const SPATIAL_DIMS: &str = "spatial_dims"; + pub const SPATIAL_SHAPE: &str = "spatial_shape"; pub const BANDS: &str = "bands"; pub const BAND: &str = "band"; - pub const DATA: &str = "data"; - - // Raster metadata fields - pub const WIDTH: &str = "width"; - pub const HEIGHT: &str = "height"; - pub const UPPERLEFT_X: &str = "upperleft_x"; - pub const UPPERLEFT_Y: &str = "upperleft_y"; - pub const SCALE_X: &str = "scale_x"; - pub const SCALE_Y: &str = "scale_y"; - pub const SKEW_X: &str = "skew_x"; - pub const SKEW_Y: &str = "skew_y"; - // Raster CRS field - pub const CRS: &str = "crs"; - // Band metadata fields - pub const NODATAVALUE: &str = "nodata_value"; - pub const STORAGE_TYPE: &str = "storage_type"; + // Band fields + pub const NAME: &str = "name"; + pub const DIM_NAMES: &str = "dim_names"; + pub const SOURCE_SHAPE: &str = "source_shape"; pub const DATATYPE: &str = "data_type"; - pub const OUTDB_URL: &str = "outdb_url"; - pub const OUTDB_BAND_ID: &str = "outdb_band_id"; + pub const NODATA: &str = "nodata"; + pub const VIEW: &str = "view"; + pub const OUTDB_URI: &str = "outdb_uri"; + pub const OUTDB_FORMAT: &str = "outdb_format"; + pub const DATA: &str = "data"; } #[cfg(test)] @@ -225,10 +283,12 @@ mod tests { #[test] fn test_raster_schema_fields() { let fields = RasterSchema::fields(); - assert_eq!(fields.len(), 3); - assert_eq!(fields[0].name(), column::METADATA); - assert_eq!(fields[1].name(), column::CRS); - assert_eq!(fields[2].name(), column::BANDS); + assert_eq!(fields.len(), 5); + assert_eq!(fields[0].name(), column::CRS); + assert_eq!(fields[1].name(), column::TRANSFORM); + assert_eq!(fields[2].name(), column::SPATIAL_DIMS); + assert_eq!(fields[3].name(), column::SPATIAL_SHAPE); + assert_eq!(fields[4].name(), column::BANDS); } /// Comprehensive test to verify all hard-coded indices match 
the actual schema. @@ -238,128 +298,90 @@ mod tests { fn test_hardcoded_indices_match_schema() { // Test raster-level indices let raster_fields = RasterSchema::fields(); - assert_eq!(raster_fields.len(), 3, "Expected exactly 3 raster fields"); - assert_eq!( - raster_fields[raster_indices::METADATA].name(), - column::METADATA, - "Raster metadata index mismatch" - ); + assert_eq!(raster_fields.len(), 5, "Expected exactly 5 raster fields"); assert_eq!( raster_fields[raster_indices::CRS].name(), column::CRS, "Raster CRS index mismatch" ); + assert_eq!( + raster_fields[raster_indices::TRANSFORM].name(), + column::TRANSFORM, + "Raster TRANSFORM index mismatch" + ); + assert_eq!( + raster_fields[raster_indices::SPATIAL_DIMS].name(), + column::SPATIAL_DIMS, + "Raster SPATIAL_DIMS index mismatch" + ); + assert_eq!( + raster_fields[raster_indices::SPATIAL_SHAPE].name(), + column::SPATIAL_SHAPE, + "Raster SPATIAL_SHAPE index mismatch" + ); assert_eq!( raster_fields[raster_indices::BANDS].name(), column::BANDS, "Raster BANDS index mismatch" ); - // Test metadata indices - let metadata_type = RasterSchema::metadata_type(); - if let DataType::Struct(metadata_fields) = metadata_type { - assert_eq!( - metadata_fields.len(), - 8, - "Expected exactly 8 metadata fields" - ); - assert_eq!( - metadata_fields[metadata_indices::WIDTH].name(), - column::WIDTH, - "Metadata width index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::HEIGHT].name(), - column::HEIGHT, - "Metadata height index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::UPPERLEFT_X].name(), - column::UPPERLEFT_X, - "Metadata upperleft_x index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::UPPERLEFT_Y].name(), - column::UPPERLEFT_Y, - "Metadata upperleft_y index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::SCALE_X].name(), - column::SCALE_X, - "Metadata scale_x index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::SCALE_Y].name(), - 
column::SCALE_Y, - "Metadata scale_y index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::SKEW_X].name(), - column::SKEW_X, - "Metadata skew_x index mismatch" - ); - assert_eq!( - metadata_fields[metadata_indices::SKEW_Y].name(), - column::SKEW_Y, - "Metadata skew_y index mismatch" - ); - } else { - panic!("Expected Struct type for metadata"); - } - - // Test band metadata indices - let band_metadata_type = RasterSchema::band_metadata_type(); - if let DataType::Struct(band_metadata_fields) = band_metadata_type { + // Test band indices + let band_type = RasterSchema::band_type(); + if let DataType::Struct(band_fields) = band_type { + assert_eq!(band_fields.len(), 9, "Expected exactly 9 band fields"); + assert_eq!(band_fields[band_indices::NAME].name(), column::NAME); assert_eq!( - band_metadata_fields.len(), - 5, - "Expected exactly 5 band metadata fields" + band_fields[band_indices::DIM_NAMES].name(), + column::DIM_NAMES ); assert_eq!( - band_metadata_fields[band_metadata_indices::NODATAVALUE].name(), - column::NODATAVALUE, - "Band metadata nodatavalue index mismatch" + band_fields[band_indices::SOURCE_SHAPE].name(), + column::SOURCE_SHAPE ); assert_eq!( - band_metadata_fields[band_metadata_indices::STORAGE_TYPE].name(), - column::STORAGE_TYPE, - "Band metadata storage_type index mismatch" + band_fields[band_indices::DATA_TYPE].name(), + column::DATATYPE ); - assert_eq!( - band_metadata_fields[band_metadata_indices::DATATYPE].name(), - column::DATATYPE, - "Band metadata datatype index mismatch" + assert_eq!(band_fields[band_indices::NODATA].name(), column::NODATA); + assert_eq!(band_fields[band_indices::VIEW].name(), column::VIEW); + assert!( + band_fields[band_indices::VIEW].is_nullable(), + "view field must be nullable — null encodes the identity view" ); assert_eq!( - band_metadata_fields[band_metadata_indices::OUTDB_URL].name(), - column::OUTDB_URL, - "Band metadata outdb_url index mismatch" + band_fields[band_indices::OUTDB_URI].name(), + 
column::OUTDB_URI ); assert_eq!( - band_metadata_fields[band_metadata_indices::OUTDB_BAND_ID].name(), - column::OUTDB_BAND_ID, - "Band metadata outdb_band_id index mismatch" + band_fields[band_indices::OUTDB_FORMAT].name(), + column::OUTDB_FORMAT ); + assert_eq!(band_fields[band_indices::DATA].name(), column::DATA); } else { - panic!("Expected Struct type for band metadata"); + panic!("Expected Struct type for band"); } + } - // Test band indices - let band_type = RasterSchema::band_type(); - if let DataType::Struct(band_fields) = band_type { - assert_eq!(band_fields.len(), 2, "Expected exactly 2 band fields"); - assert_eq!( - band_fields[band_indices::METADATA].name(), - column::METADATA, - "Band metadata index mismatch" - ); - assert_eq!( - band_fields[band_indices::DATA].name(), - column::DATA, - "Band data index mismatch" - ); - } else { - panic!("Expected Struct type for band"); + #[test] + fn test_view_type_struct_shape() { + // The view struct must have exactly 4 Int64 fields in the order + // expected by band_view_indices. 
+ let DataType::List(item_field) = RasterSchema::view_type() else { + panic!("Expected List type for view"); + }; + let DataType::Struct(view_fields) = item_field.data_type() else { + panic!("Expected Struct type inside view list"); + }; + assert_eq!(view_fields.len(), 4); + assert_eq!( + view_fields[band_view_indices::SOURCE_AXIS].name(), + "source_axis" + ); + assert_eq!(view_fields[band_view_indices::START].name(), "start"); + assert_eq!(view_fields[band_view_indices::STEP].name(), "step"); + assert_eq!(view_fields[band_view_indices::STEPS].name(), "steps"); + for f in view_fields.iter() { + assert_eq!(f.data_type(), &DataType::Int64); } } @@ -377,6 +399,48 @@ mod tests { assert_eq!(BandDataType::Float64.byte_size(), 8); } + #[test] + fn test_band_data_type_try_from_u32() { + assert_eq!(BandDataType::try_from_u32(1), Some(BandDataType::UInt8)); + assert_eq!(BandDataType::try_from_u32(2), Some(BandDataType::UInt16)); + assert_eq!(BandDataType::try_from_u32(3), Some(BandDataType::Int16)); + assert_eq!(BandDataType::try_from_u32(4), Some(BandDataType::UInt32)); + assert_eq!(BandDataType::try_from_u32(5), Some(BandDataType::Int32)); + assert_eq!(BandDataType::try_from_u32(6), Some(BandDataType::Float32)); + assert_eq!(BandDataType::try_from_u32(7), Some(BandDataType::Float64)); + assert_eq!(BandDataType::try_from_u32(8), Some(BandDataType::UInt64)); + assert_eq!(BandDataType::try_from_u32(9), Some(BandDataType::Int64)); + assert_eq!(BandDataType::try_from_u32(10), Some(BandDataType::Int8)); + assert_eq!(BandDataType::try_from_u32(0), None); + assert_eq!(BandDataType::try_from_u32(11), None); + assert_eq!(BandDataType::try_from_u32(u32::MAX), None); + } + + #[test] + fn test_band_data_type_roundtrip_u32() { + // Verify that discriminant → try_from_u32 round-trips for all variants + let all_types = [ + BandDataType::UInt8, + BandDataType::UInt16, + BandDataType::Int16, + BandDataType::UInt32, + BandDataType::Int32, + BandDataType::Float32, + BandDataType::Float64, + 
BandDataType::UInt64, + BandDataType::Int64, + BandDataType::Int8, + ]; + for dt in all_types { + let value = dt as u32; + assert_eq!( + BandDataType::try_from_u32(value), + Some(dt), + "Round-trip failed for {dt:?} (discriminant {value})" + ); + } + } + #[test] fn test_band_data_type_pixel_type_name() { assert_eq!(BandDataType::UInt8.pixel_type_name(), "UNSIGNED_8BITS");