paleolimbot commented on code in PR #831: URL: https://github.com/apache/sedona-db/pull/831#discussion_r3234788810
########## rust/sedona-raster-gdal/src/rs_frompath.rs: ########## @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path. + +use std::sync::Arc; + +use arrow_array::Array; +use arrow_schema::DataType; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::utils::append_as_outdb_raster; + +pub fn rs_frompath_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![Arc::new(RsFromPath)], + Volatility::Volatile, + ) +} + +#[derive(Debug)] +pub(crate) struct RsFromPath; + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> { + ArgMatcher::new(vec![ArgMatcher::is_string()], RASTER).match_args(args) + } + + fn invoke_batch_from_args( + &self, + _arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result<ColumnarValue> { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + + let paths = args[0].cast_to(&DataType::Utf8, None)?.into_array(1)?; + let path_array = as_string_array(&paths)?; + + let len = path_array.len(); + let mut builder = RasterBuilder::new(len); + for i in 0..len { + if path_array.is_null(i) { + builder.append_null()?; + } else { + let path = path_array.value(i); + append_as_outdb_raster(gdal, path, &mut builder)?; + } + } Review Comment: This works, although our iteration elsewhere is generally `for item_opt in path_array { ... }` ########## rust/sedona-raster-gdal/src/rs_frompath.rs: ########## @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path. + +use std::sync::Arc; + +use arrow_array::Array; +use arrow_schema::DataType; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::utils::append_as_outdb_raster; + +pub fn rs_frompath_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![Arc::new(RsFromPath)], + Volatility::Volatile, + ) +} + +#[derive(Debug)] +pub(crate) struct RsFromPath; + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> { + ArgMatcher::new(vec![ArgMatcher::is_string()], RASTER).match_args(args) + } + + fn invoke_batch_from_args( + &self, + _arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result<ColumnarValue> { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + + let paths = args[0].cast_to(&DataType::Utf8, None)?.into_array(1)?; Review Comment: ```suggestion let executor = WkbBytesExecutor::new(arg_types, args); let paths = args[0].cast_to(&DataType::Utf8, None)?.into_array(executor.num_iterations())?; ``` The rest of this loop suggests this can handle both scalars and columns but I think as written this will fail for anything except a scalar or one row table ########## rust/sedona-raster-gdal/src/rs_frompath.rs: ########## @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path. + +use std::sync::Arc; + +use arrow_array::Array; +use arrow_schema::DataType; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::utils::append_as_outdb_raster; + +pub fn rs_frompath_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![Arc::new(RsFromPath)], + Volatility::Volatile, + ) +} + +#[derive(Debug)] +pub(crate) struct RsFromPath; + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> { + ArgMatcher::new(vec![ArgMatcher::is_string()], RASTER).match_args(args) + } + + fn invoke_batch_from_args( + &self, + _arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result<ColumnarValue> { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + + let paths = args[0].cast_to(&DataType::Utf8, None)?.into_array(1)?; + let path_array = as_string_array(&paths)?; + + let len = path_array.len(); + let mut builder = RasterBuilder::new(len); + for i in 0..len { + if path_array.is_null(i) { + builder.append_null()?; + } else { + let path = path_array.value(i); + append_as_outdb_raster(gdal, path, &mut builder)?; + } + } + + let result: Arc<dyn Array> = Arc::new(builder.finish()?); + + match &args[0] { + ColumnarValue::Scalar(_) => { + let scalar = datafusion_common::ScalarValue::try_from_array(&result, 0)?; + Ok(ColumnarValue::Scalar(scalar)) + } + ColumnarValue::Array(_) => Ok(ColumnarValue::Array(result)), + } Review Comment: ```suggestion executor.finish(result) ``` ########## rust/sedona-raster-gdal/src/rs_frompath.rs: ########## @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path. + +use std::sync::Arc; + +use arrow_array::Array; +use arrow_schema::DataType; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::utils::append_as_outdb_raster; + +pub fn rs_frompath_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![Arc::new(RsFromPath)], + Volatility::Volatile, + ) +} + +#[derive(Debug)] +pub(crate) struct RsFromPath; + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> { + ArgMatcher::new(vec![ArgMatcher::is_string()], RASTER).match_args(args) + } + + fn invoke_batch_from_args( + &self, + _arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result<ColumnarValue> { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + + let paths = args[0].cast_to(&DataType::Utf8, None)?.into_array(1)?; + let path_array = as_string_array(&paths)?; + + let len = path_array.len(); + let mut builder = RasterBuilder::new(len); + for i in 0..len { + if path_array.is_null(i) { + builder.append_null()?; + } else { + let path = path_array.value(i); + append_as_outdb_raster(gdal, path, &mut builder)?; + } + } + + let result: Arc<dyn Array> = Arc::new(builder.finish()?); + + match &args[0] { + ColumnarValue::Scalar(_) => { + let scalar = datafusion_common::ScalarValue::try_from_array(&result, 0)?; + Ok(ColumnarValue::Scalar(scalar)) + } + ColumnarValue::Array(_) => Ok(ColumnarValue::Array(result)), + } + }) + } + + fn invoke_batch( + &self, + _arg_types: &[SedonaType], + _args: &[ColumnarValue], + ) -> Result<ColumnarValue> { + sedona_internal_err!("Should not be called because invoke_batch_from_args() is implemented") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::StringArray; + use datafusion_common::cast::{as_struct_array, as_uint64_array}; + use datafusion_common::ScalarValue; + use datafusion_expr::ScalarUDFImpl; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::raster::{metadata_indices, raster_indices}; + use sedona_testing::data::test_raster; + + #[test] + fn test_rs_from_path_udf_name() { + assert_eq!(rs_frompath_udf().name(), "rs_frompath"); + } + + fn assert_raster_dimensions( + result: &ColumnarValue, + expected_len: usize, + width: u64, + height: u64, + ) { + match result { Review Comment: Not your problem for this PR, but we really need support for rasters in the `ScalarUdfTester` so that these tests can be more compact. In the meantime this could be moved to `sedona_testing::rasters` which has a few asserters along these lines. ########## rust/sedona/src/context.rs: ########## @@ -232,6 +221,10 @@ impl SedonaContext { Arc::new(RandomGeometryFunction::default()), ); + for udf in sedona_raster_gdal::all_gdal_udfs() { + out.ctx.register_udf(udf.into()); Review Comment: Can this use the same pattern as the other crates that provide scalar functions? (e.g., `out.register_scalar_kernels(sedona_raster_gdal::register::scalar_kernels().into_iter())?;`). I suppose if we make it an async UDF we'll need the pattern you have here. ########## rust/sedona-raster-gdal/Cargo.toml: ########## @@ -31,13 +31,18 @@ rust-version.workspace = true result_large_err = "allow" [dependencies] +arrow = { workspace = true } Review Comment: Can we use a targeted subcrate for whatever uses this import? ########## rust/sedona-raster-gdal/src/utils.rs: ########## @@ -109,6 +104,68 @@ pub fn append_as_indb_raster(dataset: &Dataset, builder: &mut RasterBuilder) -> Ok(()) } +/// Append a raster source path as a single out-db raster to the provided [`RasterBuilder`]. +pub fn append_as_outdb_raster(gdal: &Gdal, path: &str, builder: &mut RasterBuilder) -> Result<()> { + let gdal_path = normalize_outdb_source_path(path); + let dataset = gdal + .open_ex_with_options( + &gdal_path, + DatasetOptions { + open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY, + ..Default::default() + }, + ) Review Comment: I am guessing this does blocking IO. Should we make any attempt to mitigate this or implement `RS_FromPath` as an async UDF to get the auto batch size adjustment from DataFusion for these types of functions? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
