diff --git a/Cargo.lock b/Cargo.lock index c3bf04d04..2331e62ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6130,10 +6130,14 @@ version = "0.4.0" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-schema", "criterion", "datafusion-common", + "datafusion-expr", "lru 0.18.0", "sedona-common", + "sedona-expr", + "sedona-functions", "sedona-gdal", "sedona-raster", "sedona-schema", diff --git a/docs/reference/sql/rs_frompath.qmd b/docs/reference/sql/rs_frompath.qmd new file mode 100644 index 000000000..7d3855c9d --- /dev/null +++ b/docs/reference/sql/rs_frompath.qmd @@ -0,0 +1,40 @@ +--- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +title: RS_FromPath +description: Creates an out-of-database raster from a raster file path. +kernels: + - returns: raster + args: + - name: path + type: string +--- + +## Description + +Loads raster metadata from the file at `path` and returns a raster whose bands +reference the source file as out-db bands. + +This is useful when you want to work with rasters stored on disk without copying +their pixel data into the raster value itself. 
+ +## Examples + +```sql +SELECT RS_BandPath(RS_FromPath('../../../submodules/sedona-testing/data/raster/test4.tiff')); +``` diff --git a/rust/sedona-raster-gdal/Cargo.toml b/rust/sedona-raster-gdal/Cargo.toml index 5dba31c98..d0fe9b9f7 100644 --- a/rust/sedona-raster-gdal/Cargo.toml +++ b/rust/sedona-raster-gdal/Cargo.toml @@ -33,9 +33,13 @@ result_large_err = "allow" [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } datafusion-common = { workspace = true } +datafusion-expr = { workspace = true } lru = { workspace = true } sedona-common = { workspace = true } +sedona-expr = { workspace = true } +sedona-functions = { workspace = true } sedona-gdal = { workspace = true } sedona-raster = { workspace = true } sedona-schema = { workspace = true } @@ -46,3 +50,8 @@ sedona-gdal = { workspace = true, features = ["gdal-sys"] } sedona-testing = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } + +[[bench]] +harness = false +name = "rs_frompath" +path = "benches/rs_frompath.rs" diff --git a/rust/sedona-raster-gdal/benches/rs_frompath.rs b/rust/sedona-raster-gdal/benches/rs_frompath.rs new file mode 100644 index 000000000..65291eb4a --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_frompath.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_FromPath UDF. +//! +//! RS_FromPath creates out-db rasters from file paths, so these benchmarks use +//! raster fixtures from the `sedona-testing` test module rather than synthetic input. + +use std::{hint::black_box, sync::Arc}; + +use arrow_array::{ArrayRef, StringArray}; +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use datafusion_expr::ScalarUDF; +use sedona_schema::datatypes::SedonaType; +use sedona_testing::{data::test_raster, testers::ScalarUdfTester}; + +const SMALL_RASTER_FIXTURES: &[&str] = &[ + "test1.tiff", + "test2.tif", + "test3.tif", + "test4.tiff", + "test5.tiff", +]; + +fn raster_path_array(names: &[&str], rows: usize) -> ArrayRef { + assert!( + !names.is_empty(), + "benchmark fixture list must not be empty" + ); + + let paths = names + .iter() + .map(|name| test_raster(name).unwrap()) + .collect::>(); + + let values = (0..rows) + .map(|index| paths[index % paths.len()].as_str()) + .collect::>(); + + Arc::new(StringArray::from(values)) +} + +fn bench_rs_frompath(c: &mut Criterion) { + let udf: ScalarUDF = sedona_raster_gdal::rs_frompath_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![SedonaType::Arrow(DataType::Utf8)]); + + let single_small = raster_path_array(&["test4.tiff"], 1); + let mixed_small = raster_path_array(SMALL_RASTER_FIXTURES, SMALL_RASTER_FIXTURES.len()); + let batched_small = raster_path_array(SMALL_RASTER_FIXTURES, 256); + + let mut group = c.benchmark_group("rs_frompath"); + + 
group.throughput(Throughput::Elements(single_small.len() as u64)); + group.bench_with_input( + BenchmarkId::new("fixtures", "single_small"), + &single_small, + |b, input| b.iter(|| black_box(tester.invoke_array(input.clone()).unwrap())), + ); + + group.throughput(Throughput::Elements(mixed_small.len() as u64)); + group.bench_with_input( + BenchmarkId::new("fixtures", "mixed_small"), + &mixed_small, + |b, input| b.iter(|| black_box(tester.invoke_array(input.clone()).unwrap())), + ); + + group.throughput(Throughput::Elements(batched_small.len() as u64)); + group.bench_with_input( + BenchmarkId::new("fixtures", "batched_small"), + &batched_small, + |b, input| b.iter(|| black_box(tester.invoke_array(input.clone()).unwrap())), + ); + + group.finish(); +} + +criterion_group!(benches, bench_rs_frompath); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/src/gdal_common.rs b/rust/sedona-raster-gdal/src/gdal_common.rs index 2a6fad688..072b0bdef 100644 --- a/rust/sedona-raster-gdal/src/gdal_common.rs +++ b/rust/sedona-raster-gdal/src/gdal_common.rs @@ -21,6 +21,7 @@ use sedona_gdal::gdal::Gdal; use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY, GDAL_OF_VERBOSE_ERROR}; use sedona_gdal::geo_transform::GeoTransform; use sedona_gdal::mem::MemDatasetBuilder; +use sedona_gdal::raster::rasterband::RasterBand; use sedona_gdal::raster::types::DatasetOptions; use sedona_gdal::raster::types::GdalDataType; @@ -268,37 +269,11 @@ pub unsafe fn raster_ref_to_gdal_mem( .band(src_band_index) .map_err(|e| arrow_datafusion_err!(e))?; let band_metadata = band.metadata(); - let band_type = band_metadata.data_type()?; if let Some(nodata_bytes) = band_metadata.nodata_value() { let raster_band = dataset .rasterband(dst_band_index) .map_err(convert_gdal_err)?; - match band_type { - BandDataType::UInt64 => { - let nodata_bytes: [u8; 8] = nodata_bytes.try_into().map_err(|_| { - exec_datafusion_err!("Invalid nodata byte length for UInt64") - })?; - let nodata = 
u64::from_le_bytes(nodata_bytes); - raster_band - .set_no_data_value_u64(Some(nodata)) - .map_err(convert_gdal_err)?; - } - BandDataType::Int64 => { - let nodata_bytes: [u8; 8] = nodata_bytes.try_into().map_err(|_| { - exec_datafusion_err!("Invalid nodata byte length for Int64") - })?; - let nodata = i64::from_le_bytes(nodata_bytes); - raster_band - .set_no_data_value_i64(Some(nodata)) - .map_err(convert_gdal_err)?; - } - _ => { - let nodata = bytes_to_f64(nodata_bytes, &band_type)?; - raster_band - .set_no_data_value(Some(nodata)) - .map_err(convert_gdal_err)?; - } - } + set_band_nodata_from_bytes(&raster_band, Some(nodata_bytes))?; } } @@ -320,6 +295,54 @@ pub fn nodata_bytes_to_f64(nodata_bytes: Option<&[u8]>, band_type: &BandDataType bytes_to_f64(bytes, band_type).ok() } +/// Read a GDAL band's nodata value into a byte vector using the band's native type. +pub fn band_nodata_to_bytes(band: &RasterBand<'_>) -> Result>> { + let band_type = gdal_to_band_data_type(band.band_type())?; + + Ok(match band_type { + BandDataType::UInt64 => band + .no_data_value_u64() + .map(|nodata| nodata.to_le_bytes().to_vec()), + BandDataType::Int64 => band + .no_data_value_i64() + .map(|nodata| nodata.to_le_bytes().to_vec()), + _ => band + .no_data_value() + .map(|nodata| nodata_f64_to_bytes(nodata, &band_type)), + }) +} + +/// Set a GDAL band's nodata value from stored bytes using the band's native type. 
+///
+/// The band's native type is looked up from GDAL itself (via
+/// `gdal_to_band_data_type`) and decides which setter is used:
+///
+/// * `UInt64` / `Int64` bands decode the payload as a little-endian 64-bit
+///   integer and call the dedicated `u64` / `i64` setter, so the value is not
+///   routed through `f64`.
+/// * Every other band type decodes the payload with `bytes_to_f64` and calls
+///   the `f64` setter.
+///
+/// When `nodata_bytes` is `None`, `None` is forwarded to the matching setter.
+///
+/// # Errors
+///
+/// Fails when the GDAL band type cannot be mapped to a `BandDataType`, when a
+/// 64-bit band receives a payload that is not exactly eight bytes long, when
+/// `bytes_to_f64` rejects the payload, or when the GDAL setter reports an error.
+pub fn set_band_nodata_from_bytes(
+    band: &RasterBand<'_>,
+    nodata_bytes: Option<&[u8]>,
+) -> Result<()> {
+    let native_type = gdal_to_band_data_type(band.band_type())?;
+
+    match native_type {
+        BandDataType::UInt64 => {
+            // Decode first so a malformed payload is reported before GDAL is touched.
+            let value = nodata_bytes
+                .map(|raw| {
+                    <[u8; 8]>::try_from(raw)
+                        .map(u64::from_le_bytes)
+                        .map_err(|_| exec_datafusion_err!("Invalid nodata byte length for UInt64"))
+                })
+                .transpose()?;
+            band.set_no_data_value_u64(value).map_err(convert_gdal_err)
+        }
+        BandDataType::Int64 => {
+            let value = nodata_bytes
+                .map(|raw| {
+                    <[u8; 8]>::try_from(raw)
+                        .map(i64::from_le_bytes)
+                        .map_err(|_| exec_datafusion_err!("Invalid nodata byte length for Int64"))
+                })
+                .transpose()?;
+            band.set_no_data_value_i64(value).map_err(convert_gdal_err)
+        }
+        other_type => {
+            // All remaining band types share the f64-based setter.
+            let value = nodata_bytes
+                .map(|raw| bytes_to_f64(raw, &other_type))
+                .transpose()?;
+            band.set_no_data_value(value).map_err(convert_gdal_err)
+        }
+    }
+}
+
 /// Convert a f64 nodata value into a byte vector appropriate for the given band type.
pub fn nodata_f64_to_bytes(nodata: f64, band_type: &BandDataType) -> Vec { match band_type { diff --git a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs index a9d1013c5..f8f878677 100644 --- a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs +++ b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs @@ -34,7 +34,8 @@ use sedona_schema::raster::{BandDataType, StorageType}; use crate::gdal_common::{ band_data_type_to_gdal, bytes_to_f64, convert_gdal_err, normalize_outdb_source_path, - open_gdal_dataset, raster_ref_to_gdal_empty, raster_ref_to_gdal_mem, ToGdalGeoTransform, + open_gdal_dataset, raster_ref_to_gdal_empty, raster_ref_to_gdal_mem, + set_band_nodata_from_bytes, ToGdalGeoTransform, }; /// A GDAL dataset constructed from a `RasterRef`. @@ -281,32 +282,7 @@ impl GDALDatasetCache { let vrt_band = vrt.rasterband(i).map_err(convert_gdal_err)?; if let Some(nodata_bytes) = band_metadata.nodata_value() { - match band_type { - BandDataType::UInt64 => { - let nodata_bytes: [u8; 8] = nodata_bytes.try_into().map_err(|_| { - exec_datafusion_err!("Invalid nodata byte length for UInt64") - })?; - let nodata = u64::from_le_bytes(nodata_bytes); - vrt_band - .set_no_data_value_u64(Some(nodata)) - .map_err(convert_gdal_err)?; - } - BandDataType::Int64 => { - let nodata_bytes: [u8; 8] = nodata_bytes.try_into().map_err(|_| { - exec_datafusion_err!("Invalid nodata byte length for Int64") - })?; - let nodata = i64::from_le_bytes(nodata_bytes); - vrt_band - .set_no_data_value_i64(Some(nodata)) - .map_err(convert_gdal_err)?; - } - _ => { - let nodata = bytes_to_f64(nodata_bytes, &band_type)?; - vrt_band - .set_no_data_value(nodata) - .map_err(convert_gdal_err)?; - } - } + set_band_nodata_from_bytes(&vrt_band, Some(nodata_bytes))?; } match band_metadata.storage_type()? 
{ diff --git a/rust/sedona-raster-gdal/src/lib.rs b/rust/sedona-raster-gdal/src/lib.rs index 8e8c871fb..360320b56 100644 --- a/rust/sedona-raster-gdal/src/lib.rs +++ b/rust/sedona-raster-gdal/src/lib.rs @@ -25,13 +25,14 @@ //! - GDAL datatype and nodata conversion helpers //! - path normalization for GDAL VSI-backed raster sources -// Temporary until https://github.com/apache/sedona-db/issues/804 is resolved. -#[allow(dead_code)] +pub mod register; + mod gdal_common; // Temporary until https://github.com/apache/sedona-db/issues/804 is resolved. #[allow(dead_code)] mod gdal_dataset_provider; +mod rs_frompath; mod utils; #[cfg(test)] @@ -42,4 +43,5 @@ pub use gdal_common::{ band_data_type_to_gdal, bytes_to_f64, gdal_to_band_data_type, gdal_type_byte_size, nodata_bytes_to_f64, nodata_f64_to_bytes, }; -pub use utils::{append_as_indb_raster, dataset_to_indb_raster}; +pub use rs_frompath::rs_frompath_udf; +pub use utils::{append_as_indb_raster, append_as_outdb_raster, dataset_to_indb_raster}; diff --git a/rust/sedona-raster-gdal/src/register.rs b/rust/sedona-raster-gdal/src/register.rs new file mode 100644 index 000000000..4db9cf56d --- /dev/null +++ b/rust/sedona-raster-gdal/src/register.rs @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use sedona_expr::function_set::FunctionSet; + +/// Export the set of GDAL-backed functions defined in this crate. +pub fn default_function_set() -> FunctionSet { + let mut function_set = FunctionSet::new(); + function_set.insert_scalar_udf(crate::rs_frompath::rs_frompath_udf()); + function_set +} diff --git a/rust/sedona-raster-gdal/src/rs_frompath.rs b/rust/sedona-raster-gdal/src/rs_frompath.rs new file mode 100644 index 000000000..1bf0f3f19 --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_frompath.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path. 
+ +use std::sync::Arc; + +use arrow_array::Array; +use arrow_schema::DataType; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_functions::executor::WkbBytesExecutor; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::utils::append_as_outdb_raster; + +pub fn rs_frompath_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![Arc::new(RsFromPath)], + Volatility::Volatile, + ) +} + +#[derive(Debug)] +pub(crate) struct RsFromPath; + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result> { + ArgMatcher::new(vec![ArgMatcher::is_string()], RASTER).match_args(args) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + let executor = WkbBytesExecutor::new(arg_types, args); + + let paths = args[0] + .cast_to(&DataType::Utf8, None)? 
+ .into_array_of_size(executor.num_iterations())?; + let path_array = as_string_array(&paths)?; + + let mut builder = RasterBuilder::new(path_array.len()); + for path_opt in path_array { + if let Some(path) = path_opt { + append_as_outdb_raster(gdal, path, &mut builder)?; + } else { + builder.append_null()?; + } + } + + let result: Arc = Arc::new(builder.finish()?); + executor.finish(result) + }) + } + + fn invoke_batch( + &self, + _arg_types: &[SedonaType], + _args: &[ColumnarValue], + ) -> Result { + sedona_internal_err!("Should not be called because invoke_batch_from_args() is implemented") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{StringArray, StructArray}; + use datafusion_common::cast::as_struct_array; + use datafusion_common::ScalarValue; + use datafusion_expr::ScalarUDFImpl; + use sedona_raster::array::RasterStructArray; + use sedona_raster::traits::RasterRef; + use sedona_testing::data::test_raster; + + #[test] + fn test_rs_from_path_udf_name() { + assert_eq!(rs_frompath_udf().name(), "rs_frompath"); + } + + fn assert_raster_dimensions( + result: &ColumnarValue, + expected_len: usize, + width: u64, + height: u64, + ) { + fn assert_struct_array_dimensions( + struct_arr: &StructArray, + expected_len: usize, + width: u64, + height: u64, + ) { + let raster_array = RasterStructArray::new(struct_arr); + assert_eq!(raster_array.len(), expected_len); + + for idx in 0..expected_len { + let raster = raster_array.get(idx).unwrap(); + assert_eq!(raster.metadata().width(), width); + assert_eq!(raster.metadata().height(), height); + } + } + + match result { + ColumnarValue::Array(arr) => { + let struct_arr = as_struct_array(arr).unwrap(); + assert_struct_array_dimensions(struct_arr, expected_len, width, height); + } + ColumnarValue::Scalar(ScalarValue::Struct(struct_arr)) => { + assert_struct_array_dimensions(struct_arr, expected_len, width, height); + } + other => panic!("Unexpected result: {other:?}"), + } + } + + #[test] + fn 
test_invoke_rs_from_path() { + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + let paths = Arc::new(StringArray::from(vec![path.as_str()])); + let input = ColumnarValue::Array(paths); + + let kernel = RsFromPath; + let result = kernel + .invoke_batch_from_args(&[], &[input], &SedonaType::Arrow(DataType::Null), 0, None) + .expect("Should invoke successfully"); + + assert_raster_dimensions(&result, 1, 10, 10); + + let scalar_input = ColumnarValue::Scalar(ScalarValue::Utf8(Some(path.clone()))); + let scalar_result = kernel + .invoke_batch_from_args( + &[], + &[scalar_input], + &SedonaType::Arrow(DataType::Null), + 0, + None, + ) + .expect("Should invoke successfully for scalar path"); + + assert_raster_dimensions(&scalar_result, 1, 10, 10); + + let multi_paths = Arc::new(StringArray::from(vec![path.as_str(), path.as_str()])); + let multi_result = kernel + .invoke_batch_from_args( + &[], + &[ColumnarValue::Array(multi_paths)], + &SedonaType::Arrow(DataType::Null), + 0, + None, + ) + .expect("Should invoke successfully for multiple paths"); + + assert_raster_dimensions(&multi_result, 2, 10, 10); + + let empty_paths = Arc::new(StringArray::from(Vec::<&str>::new())); + let empty_result = kernel + .invoke_batch_from_args( + &[], + &[ColumnarValue::Array(empty_paths)], + &SedonaType::Arrow(DataType::Null), + 0, + None, + ) + .expect("Should invoke successfully for empty paths"); + + match empty_result { + ColumnarValue::Array(arr) => { + let struct_arr = as_struct_array(&arr).unwrap(); + assert_eq!(struct_arr.len(), 0); + } + other => panic!("Expected empty array result, got {other:?}"), + } + } + + #[test] + fn test_invoke_rs_from_path_propagates_nulls() { + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + let input = + ColumnarValue::Array(Arc::new(StringArray::from(vec![Some(path.as_str()), None]))); + + let result = RsFromPath + .invoke_batch_from_args(&[], &[input], &SedonaType::Arrow(DataType::Null), 0, None) + 
.expect("Should invoke successfully for null-containing input"); + + match result { + ColumnarValue::Array(arr) => { + let struct_arr = as_struct_array(&arr).unwrap(); + assert_eq!(struct_arr.len(), 2); + assert!(!struct_arr.is_null(0)); + assert!(struct_arr.is_null(1)); + + let raster_array = RasterStructArray::new(struct_arr); + let raster = raster_array.get(0).unwrap(); + assert_eq!(raster.metadata().width(), 10); + assert_eq!(raster.metadata().height(), 10); + } + other => panic!("Expected array result, got {other:?}"), + } + } + + #[test] + fn test_invoke_rs_from_path_invalid_path_errors() { + let missing_path = "/definitely/missing/rs_from_path_test.tif"; + let input = ColumnarValue::Scalar(ScalarValue::Utf8(Some(missing_path.to_string()))); + + let err = RsFromPath + .invoke_batch_from_args(&[], &[input], &SedonaType::Arrow(DataType::Null), 0, None) + .expect_err("Missing path should return an error"); + + let err_message = err.to_string(); + assert!(err_message.contains(&format!( + "Failed to open raster file '{}' (GDAL path '{}')", + missing_path, missing_path + ))); + } + + #[test] + fn test_invoke_rs_from_path_scalar_ignores_num_rows_for_shape() { + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + let result = RsFromPath + .invoke_batch_from_args( + &[], + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(path)))], + &SedonaType::Arrow(DataType::Null), + 32, + None, + ) + .expect("Should invoke successfully for scalar path with larger num_rows"); + + assert!(matches!(result, ColumnarValue::Scalar(_))); + assert_raster_dimensions(&result, 1, 10, 10); + } +} diff --git a/rust/sedona-raster-gdal/src/utils.rs b/rust/sedona-raster-gdal/src/utils.rs index 30f543071..65f193111 100644 --- a/rust/sedona-raster-gdal/src/utils.rs +++ b/rust/sedona-raster-gdal/src/utils.rs @@ -22,13 +22,19 @@ use arrow_buffer::Buffer; use datafusion_common::error::Result; use datafusion_common::exec_datafusion_err; use sedona_gdal::dataset::Dataset; +use 
sedona_gdal::gdal::Gdal; +use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY}; +use sedona_gdal::raster::types::DatasetOptions; use sedona_gdal::spatial_ref::SpatialRef; use sedona_raster::builder::RasterBuilder; use sedona_raster::traits::BandMetadata; -use sedona_schema::raster::{BandDataType, StorageType}; +use sedona_schema::raster::StorageType; -use crate::gdal_common::{gdal_to_band_data_type, RasterMetadataFromGdalGeoTransform}; +use crate::gdal_common::{ + band_nodata_to_bytes, gdal_to_band_data_type, normalize_outdb_source_path, + RasterMetadataFromGdalGeoTransform, +}; /// Append a GDAL dataset as a single in-db raster to the provided [`RasterBuilder`]. pub fn append_as_indb_raster(dataset: &Dataset, builder: &mut RasterBuilder) -> Result<()> { @@ -59,18 +65,7 @@ pub fn append_as_indb_raster(dataset: &Dataset, builder: &mut RasterBuilder) -> let band_data_type = gdal_to_band_data_type(gdal_type) .map_err(|_| exec_datafusion_err!("Unsupported band data type: {:?}", gdal_type))?; - // Get nodata value - let nodata_bytes = match band_data_type { - BandDataType::UInt64 => band - .no_data_value_u64() - .map(|no_data| no_data.to_le_bytes().to_vec()), - BandDataType::Int64 => band - .no_data_value_i64() - .map(|no_data| no_data.to_le_bytes().to_vec()), - _ => band - .no_data_value() - .map(|no_data| crate::gdal_common::nodata_f64_to_bytes(no_data, &band_data_type)), - }; + let nodata_bytes = band_nodata_to_bytes(&band)?; let band_metadata = BandMetadata { nodata_value: nodata_bytes, @@ -109,6 +104,68 @@ pub fn append_as_indb_raster(dataset: &Dataset, builder: &mut RasterBuilder) -> Ok(()) } +/// Append a raster source path as a single out-db raster to the provided [`RasterBuilder`]. 
+///
+/// The file is opened read-only through GDAL (after `normalize_outdb_source_path`
+/// has been applied to `path`). Only metadata is read: raster size,
+/// geotransform, CRS, and per-band data type and nodata value. Each band is
+/// recorded with `StorageType::OutDbRef`, the caller-supplied `path` as its
+/// out-db URL, its 1-based GDAL band index, and an empty data buffer.
+///
+/// All GDAL metadata is collected before `builder` is modified, so a failure
+/// while opening the file or inspecting a band returns without having started
+/// a raster on the builder. Errors raised by the builder itself can still
+/// occur after the raster has been started.
+///
+/// A CRS that cannot be read or converted to PROJJSON is treated as absent
+/// rather than as an error.
+///
+/// # Errors
+///
+/// Returns an error when the file cannot be opened as a raster, the
+/// geotransform or a band cannot be read, a band uses an unsupported data
+/// type, a band index does not fit in `u32`, or the builder rejects the data.
+pub fn append_as_outdb_raster(gdal: &Gdal, path: &str, builder: &mut RasterBuilder) -> Result<()> {
+    let gdal_path = normalize_outdb_source_path(path);
+    let dataset = gdal
+        .open_ex_with_options(
+            &gdal_path,
+            DatasetOptions {
+                open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY,
+                ..Default::default()
+            },
+        )
+        .map_err(|e| {
+            exec_datafusion_err!(
+                "Failed to open raster file '{}' (GDAL path '{}'): {}",
+                path,
+                gdal_path,
+                e
+            )
+        })?;
+
+    let (width, height) = dataset.raster_size();
+    let geotransform = dataset
+        .geo_transform()
+        .map_err(|e| exec_datafusion_err!("Failed to get geotransform: {}", e))?;
+    let metadata = geotransform.to_raster_metadata(width, height);
+
+    // Best effort: a missing or unconvertible CRS yields a raster without CRS.
+    let crs = dataset
+        .spatial_ref()
+        .ok()
+        .and_then(|sr: SpatialRef| sr.to_projjson().ok());
+
+    // Gather every band's metadata up front so that a GDAL failure cannot leave
+    // the caller's builder with a half-written raster.
+    let bands = (1..=dataset.raster_count())
+        .map(|band_idx| -> Result<BandMetadata> {
+            let band = dataset
+                .rasterband(band_idx)
+                .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_idx, e))?;
+
+            let gdal_type = band.band_type();
+            let datatype = gdal_to_band_data_type(gdal_type)
+                .map_err(|_| exec_datafusion_err!("Unsupported band data type: {:?}", gdal_type))?;
+
+            // Checked conversion instead of a silently truncating `as` cast.
+            let outdb_band_id = u32::try_from(band_idx)
+                .map_err(|_| exec_datafusion_err!("Band index {} does not fit in u32", band_idx))?;
+
+            Ok(BandMetadata {
+                nodata_value: band_nodata_to_bytes(&band)?,
+                storage_type: StorageType::OutDbRef,
+                datatype,
+                outdb_url: Some(path.to_string()),
+                outdb_band_id: Some(outdb_band_id),
+            })
+        })
+        .collect::<Result<Vec<_>>>()?;
+
+    builder.start_raster(&metadata, crs.as_deref())?;
+    for band_metadata in bands {
+        builder.start_band(band_metadata)?;
+        // Out-db bands reference the source file, so no pixel bytes are stored.
+        builder.band_data_writer().append_value([]);
+        builder.finish_band()?;
+    }
+    builder.finish_raster()?;
+
+    Ok(())
+}
+
 /// Materialize a single GDAL dataset as an in-db raster `StructArray`.
pub fn dataset_to_indb_raster(dataset: &Dataset) -> Result { let mut builder = RasterBuilder::new(1); @@ -121,7 +178,7 @@ pub fn dataset_to_indb_raster(dataset: &Dataset) -> Result { #[cfg(test)] mod tests { - use super::{append_as_indb_raster, dataset_to_indb_raster}; + use super::{append_as_indb_raster, append_as_outdb_raster, dataset_to_indb_raster}; use arrow_array::StructArray; use datafusion_common::exec_datafusion_err; @@ -134,6 +191,7 @@ mod tests { use sedona_raster::builder::RasterBuilder; use sedona_raster::traits::RasterRef; use sedona_schema::raster::{BandDataType, StorageType}; + use sedona_testing::data::test_raster; use tempfile::TempDir; use crate::gdal_common::with_gdal; @@ -153,6 +211,12 @@ mod tests { dataset_to_indb_raster(&dataset) } + fn load_as_outdb_raster(gdal: &Gdal, path: &str) -> datafusion_common::Result { + let mut builder = RasterBuilder::new(1); + append_as_outdb_raster(gdal, path, &mut builder)?; + builder.finish().map_err(Into::into) + } + fn write_uint64_tiff(gdal: &Gdal, path: &str, nodata: u64, data: Vec) { let driver = gdal.get_driver_by_name("GTiff").unwrap(); let dataset = driver.create_with_band_type::(path, 2, 2, 1).unwrap(); @@ -273,6 +337,75 @@ mod tests { assert_eq!(band.data(), [1u8, 2, 3, 4, 5, 6]); } + #[test] + fn append_as_outdb_raster_reads_single_band_geotiff() { + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + let raster = with_gdal(|gdal| load_as_outdb_raster(gdal, &path)).unwrap(); + let raster_struct = RasterStructArray::new(&raster); + assert_eq!(raster_struct.len(), 1); + + let raster = raster_struct.get(0).unwrap(); + assert_eq!(raster.metadata().width(), 10); + assert_eq!(raster.metadata().height(), 10); + assert!(raster.crs().is_some()); + + let band = raster.bands().band(1).unwrap(); + assert_eq!( + band.metadata().storage_type().unwrap(), + StorageType::OutDbRef + ); + assert!(band.metadata().outdb_url().unwrap().contains("test4.tiff")); + } + + #[test] + fn 
append_as_outdb_raster_preserves_uint64_nodata() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("uint64.tif"); + let path_str = path.to_string_lossy().to_string(); + let nodata = 9_007_199_254_740_993u64; + + with_gdal(|gdal| { + write_uint64_tiff(gdal, &path_str, nodata, vec![1, 2, 3, 4]); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + + let raster = with_gdal(|gdal| load_as_outdb_raster(gdal, &path_str)).unwrap(); + let raster_struct = RasterStructArray::new(&raster); + let raster = raster_struct.get(0).unwrap(); + let band = raster.bands().band(1).unwrap(); + + assert_eq!( + band.metadata().nodata_value().unwrap(), + nodata.to_le_bytes() + ); + } + + #[test] + fn append_as_outdb_raster_preserves_int64_nodata() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("int64.tif"); + let path_str = path.to_string_lossy().to_string(); + let nodata = -9_007_199_254_740_993i64; + + with_gdal(|gdal| { + write_int64_tiff(gdal, &path_str, nodata, vec![-1, -2, -3, -4]); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + + let raster = with_gdal(|gdal| load_as_outdb_raster(gdal, &path_str)).unwrap(); + let raster_struct = RasterStructArray::new(&raster); + let raster = raster_struct.get(0).unwrap(); + let band = raster.bands().band(1).unwrap(); + + assert_eq!( + band.metadata().nodata_value().unwrap(), + nodata.to_le_bytes() + ); + } + #[test] fn dataset_to_indb_raster_preserves_uint64_nodata_and_data() { let temp_dir = TempDir::new().unwrap(); diff --git a/rust/sedona/src/context.rs b/rust/sedona/src/context.rs index eb88e9414..5cf4d653a 100644 --- a/rust/sedona/src/context.rs +++ b/rust/sedona/src/context.rs @@ -233,6 +233,8 @@ impl SedonaContext { Arc::new(RandomGeometryFunction::default()), ); + out.register_function_set(sedona_raster_gdal::register::default_function_set()); + // Always register default function set 
out.register_function_set(sedona_functions::register::default_function_set());