paleolimbot commented on code in PR #787:
URL: https://github.com/apache/sedona-db/pull/787#discussion_r3164256192


##########
rust/sedona-raster-gdal/src/gdal_common.rs:
##########
@@ -0,0 +1,752 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::errors::GdalError;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY, 
GDAL_OF_VERBOSE_ERROR};
+use sedona_gdal::mem::MemDatasetBuilder;
+use sedona_gdal::raster::types::DatasetOptions;
+use sedona_gdal::raster::types::GdalDataType;
+
+use sedona_raster::traits::RasterRef;
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use datafusion_common::{
+    arrow_datafusion_err, exec_datafusion_err, exec_err, DataFusionError, 
Result,
+};
+
+/// Execute a closure with a reference to the global [`Gdal`] handle,
+/// converting initialization errors to [`DataFusionError`].
+pub(crate) fn with_gdal<F, R>(f: F) -> Result<R>
+where
+    F: FnOnce(&Gdal) -> Result<R>,
+{
+    match sedona_gdal::global::with_global_gdal(f) {
+        Ok(inner_result) => inner_result,
+        Err(init_err) => Err(DataFusionError::External(Box::new(init_err))),
+    }
+}
+
+/// Converts a BandDataType to the corresponding GDAL data type.
+pub fn band_data_type_to_gdal(band_type: &BandDataType) -> GdalDataType {
+    match band_type {
+        BandDataType::UInt8 => GdalDataType::UInt8,
+        BandDataType::Int8 => GdalDataType::Int8,
+        BandDataType::UInt16 => GdalDataType::UInt16,
+        BandDataType::Int16 => GdalDataType::Int16,
+        BandDataType::UInt32 => GdalDataType::UInt32,
+        BandDataType::Int32 => GdalDataType::Int32,
+        BandDataType::UInt64 => GdalDataType::UInt64,
+        BandDataType::Int64 => GdalDataType::Int64,
+        BandDataType::Float32 => GdalDataType::Float32,
+        BandDataType::Float64 => GdalDataType::Float64,
+    }
+}
+
+/// Converts a GDAL data type to the corresponding BandDataType.
+pub fn gdal_to_band_data_type(gdal_type: GdalDataType) -> Result<BandDataType> 
{
+    match gdal_type {
+        GdalDataType::UInt8 => Ok(BandDataType::UInt8),
+        GdalDataType::Int8 => Ok(BandDataType::Int8),
+        GdalDataType::UInt16 => Ok(BandDataType::UInt16),
+        GdalDataType::Int16 => Ok(BandDataType::Int16),
+        GdalDataType::UInt32 => Ok(BandDataType::UInt32),
+        GdalDataType::Int32 => Ok(BandDataType::Int32),
+        GdalDataType::UInt64 => Ok(BandDataType::UInt64),
+        GdalDataType::Int64 => Ok(BandDataType::Int64),
+        GdalDataType::Float32 => Ok(BandDataType::Float32),
+        GdalDataType::Float64 => Ok(BandDataType::Float64),
+        _ => Err(DataFusionError::NotImplemented(format!(
+            "GDAL data type {:?} is not supported",
+            gdal_type
+        ))),
+    }
+}
+
+/// Returns the byte size of a GDAL data type.
+pub fn gdal_type_byte_size(gdal_type: GdalDataType) -> usize {
+    gdal_type.byte_size()
+}
+
+/// Interprets bytes according to the band data type and returns the value as 
`f64`.
+///
+/// Returns an error if `bytes` does not have the expected length for 
`band_type`.
+pub fn bytes_to_f64(bytes: &[u8], band_type: &BandDataType) -> Result<f64> {
+    macro_rules! read_le_f64 {
+        ($t:ty, $n:expr) => {{
+            let arr: [u8; $n] = bytes.try_into().map_err(|_| {
+                exec_datafusion_err!(
+                    "Invalid byte slice length for type {}, expected: {}, 
actual: {}",
+                    stringify!($t),
+                    $n,
+                    bytes.len()
+                )
+            })?;
+            Ok(<$t>::from_le_bytes(arr) as f64)
+        }};
+    }
+
+    match band_type {
+        BandDataType::UInt8 => {
+            if bytes.len() != 1 {
+                return exec_err!(
+                    "Invalid byte length for UInt8: expected 1, got {}",
+                    bytes.len()
+                );
+            }
+            Ok(bytes[0] as f64)
+        }
+        BandDataType::Int8 => {
+            if bytes.len() != 1 {
+                return exec_err!(
+                    "Invalid byte length for Int8: expected 1, got {}",
+                    bytes.len()
+                );
+            }
+            Ok(bytes[0] as i8 as f64)
+        }
+        BandDataType::UInt16 => read_le_f64!(u16, 2),
+        BandDataType::Int16 => read_le_f64!(i16, 2),
+        BandDataType::UInt32 => read_le_f64!(u32, 4),
+        BandDataType::Int32 => read_le_f64!(i32, 4),
+        BandDataType::UInt64 => read_le_f64!(u64, 8),
+        BandDataType::Int64 => read_le_f64!(i64, 8),
+        BandDataType::Float32 => read_le_f64!(f32, 4),
+        BandDataType::Float64 => read_le_f64!(f64, 8),
+    }
+}
+
+/// Convert [GdalError] to [DataFusionError]
+pub(crate) fn convert_gdal_err(e: GdalError) -> DataFusionError {
+    DataFusionError::External(Box::new(e))
+}
+
+/// This function creates a GDAL dataset backed by the MEM driver that directly
+/// references the band data stored in the [RasterRef]. No data copying occurs 
-
+/// the GDAL bands point to the same memory as the data buffer held by 
[RasterRef].
+///
+/// # Arguments
+/// * `raster` - The RasterRef value
+/// * `band_indices` - The indices of the bands to include in the GDAL dataset 
(1-based)
+///
+/// # Returns
+/// A [`Dataset`] that provides access to the GDAL dataset.
+///
+/// # Errors
+/// Returns an error if:
+/// - Any band uses OutDb storage
+/// - GDAL driver operations fail
+/// - Accessing RasterRef fails
+pub unsafe fn raster_ref_to_gdal_mem<R: RasterRef + ?Sized>(
+    gdal: &Gdal,
+    raster: &R,
+    band_indices: &[usize],
+) -> Result<Dataset> {
+    let metadata = raster.metadata();
+    let bands = raster.bands();
+
+    let width = metadata.width() as usize;
+    let height = metadata.height() as usize;
+
+    // Create internal MEM dataset via sedona-gdal shim to avoid open dataset 
list contention.
+    let mut mem_ds_builder = MemDatasetBuilder::new(width, height);
+
+    // Add bands with DATAPOINTER option (zero-copy)
+    //
+    // Note: GDALAddBand always appends a new band, so the destination band 
index
+    // is sequential (1..=band_indices.len()), even if the source band indices 
are
+    // sparse (e.g. [1, 3]).
+    for &src_band_index in band_indices.iter() {
+        let band = bands
+            .band(src_band_index)
+            .map_err(|e| arrow_datafusion_err!(e))?;
+
+        if band.metadata().storage_type()? != StorageType::InDb {
+            return Err(DataFusionError::NotImplemented(
+                "OutDb bands are not supported in 
raster_to_mem_dataset".to_string(),
+            ));
+        }
+
+        let band_metadata = band.metadata();
+        let band_type = band_metadata.data_type()?;
+        let gdal_type = band_data_type_to_gdal(&band_type);
+        let band_data = band.data();
+        let data_ptr = band_data.as_ptr();
+        unsafe {
+            mem_ds_builder = mem_ds_builder.add_band(gdal_type, data_ptr as 
*mut u8);
+        }
+    }
+
+    let dataset = unsafe {
+        mem_ds_builder
+            .build(gdal)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?
+    };
+
+    // GDAL geotransform: [origin_x, pixel_width, rotation_x, origin_y, 
rotation_y, pixel_height]
+    let geotransform = [
+        metadata.upper_left_x(),
+        metadata.scale_x(),
+        metadata.skew_x(),
+        metadata.upper_left_y(),
+        metadata.skew_y(),
+        metadata.scale_y(),
+    ];
+
+    dataset
+        .set_geo_transform(&geotransform)
+        .map_err(convert_gdal_err)?;
+
+    // Set projection/CRS if available
+    if let Some(crs) = raster.crs() {
+        dataset.set_projection(crs).map_err(convert_gdal_err)?;
+    }
+
+    for (dst_band_index, &src_band_index) in band_indices.iter().enumerate() {
+        let dst_band_index = dst_band_index + 1;
+        let band = bands
+            .band(src_band_index)
+            .map_err(|e| arrow_datafusion_err!(e))?;
+        let band_metadata = band.metadata();
+        let band_type = band_metadata.data_type()?;
+        if let Some(nodata_bytes) = band_metadata.nodata_value() {
+            let raster_band = dataset
+                .rasterband(dst_band_index)
+                .map_err(convert_gdal_err)?;
+            match band_type {
+                BandDataType::UInt64 => {
+                    let nodata_bytes: [u8; 8] = 
nodata_bytes.try_into().map_err(|_| {
+                        exec_datafusion_err!("Invalid nodata byte length for 
UInt64")
+                    })?;
+                    let nodata = u64::from_le_bytes(nodata_bytes);
+                    raster_band
+                        .set_no_data_value_u64(Some(nodata))
+                        .map_err(convert_gdal_err)?;
+                }
+                BandDataType::Int64 => {
+                    let nodata_bytes: [u8; 8] = 
nodata_bytes.try_into().map_err(|_| {
+                        exec_datafusion_err!("Invalid nodata byte length for 
Int64")
+                    })?;
+                    let nodata = i64::from_le_bytes(nodata_bytes);
+                    raster_band
+                        .set_no_data_value_i64(Some(nodata))
+                        .map_err(convert_gdal_err)?;
+                }
+                _ => {
+                    let nodata = bytes_to_f64(nodata_bytes, &band_type)?;
+                    raster_band
+                        .set_no_data_value(Some(nodata))
+                        .map_err(convert_gdal_err)?;
+                }
+            }
+        }
+    }
+
+    Ok(dataset)
+}
+
+pub fn raster_ref_to_gdal_empty<R: RasterRef + ?Sized>(gdal: &Gdal, raster: 
&R) -> Result<Dataset> {
+    unsafe {
+        // SAFETY: raster_ref_to_gdal_mem is safe to call with an empty band 
list. The
+        // returned dataset will have zero bands and references no external 
memory.
+        raster_ref_to_gdal_mem(gdal, raster, &[])
+    }
+}
+
+/// Interpret optional nodata bytes according to the band data type and return 
an Option<f64>.
+/// Returns `None` if `nodata_bytes` is `None` or cannot be parsed for the 
given type.
+pub fn nodata_bytes_to_f64(nodata_bytes: Option<&[u8]>, band_type: 
&BandDataType) -> Option<f64> {
+    let bytes = nodata_bytes?;
+    bytes_to_f64(bytes, band_type).ok()
+}
+
+/// Convert a f64 nodata value into a byte vector appropriate for the given 
band type.
+pub fn nodata_f64_to_bytes(nodata: f64, band_type: &BandDataType) -> Vec<u8> {
+    match band_type {
+        BandDataType::UInt8 => vec![(nodata as u8)],
+        BandDataType::Int8 => (nodata as i8).to_le_bytes().to_vec(),
+        BandDataType::UInt16 => (nodata as u16).to_le_bytes().to_vec(),
+        BandDataType::Int16 => (nodata as i16).to_le_bytes().to_vec(),
+        BandDataType::UInt32 => (nodata as u32).to_le_bytes().to_vec(),
+        BandDataType::Int32 => (nodata as i32).to_le_bytes().to_vec(),
+        BandDataType::UInt64 => (nodata as u64).to_le_bytes().to_vec(),
+        BandDataType::Int64 => (nodata as i64).to_le_bytes().to_vec(),
+        BandDataType::Float32 => (nodata as f32).to_le_bytes().to_vec(),
+        BandDataType::Float64 => nodata.to_le_bytes().to_vec(),
+    }
+}
+
+/// Open an out-db raster source as a GDAL dataset, normalizing the URL to a 
GDAL VSI path if needed.
+pub fn open_gdal_dataset(gdal: &Gdal, url: &str, open_options: 
Option<&[&str]>) -> Result<Dataset> {
+    let normalized_url = normalize_outdb_source_path(url);
+    gdal.open_ex_with_options(
+        &normalized_url,
+        DatasetOptions {
+            open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY | 
GDAL_OF_VERBOSE_ERROR,
+            open_options,
+            ..Default::default()
+        },
+    )
+    .map_err(convert_gdal_err)
+}
+
+/// Normalize out-db raster URLs to GDAL VSI paths.
+///
+/// Supported translations:
+/// - `s3://bucket/key` -> `/vsis3/bucket/key`
+/// - `s3a://bucket/key` -> `/vsis3/bucket/key`

Review Comment:
   No need to do it here, but `{rest}.zip` -> `/vsizip/{rest}.zip` is also useful.



##########
rust/sedona-raster-gdal/src/gdal_common.rs:
##########
@@ -0,0 +1,752 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::errors::GdalError;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY, 
GDAL_OF_VERBOSE_ERROR};
+use sedona_gdal::mem::MemDatasetBuilder;
+use sedona_gdal::raster::types::DatasetOptions;
+use sedona_gdal::raster::types::GdalDataType;
+
+use sedona_raster::traits::RasterRef;
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use datafusion_common::{
+    arrow_datafusion_err, exec_datafusion_err, exec_err, DataFusionError, 
Result,
+};
+
+/// Execute a closure with a reference to the global [`Gdal`] handle,
+/// converting initialization errors to [`DataFusionError`].
+pub(crate) fn with_gdal<F, R>(f: F) -> Result<R>
+where
+    F: FnOnce(&Gdal) -> Result<R>,
+{
+    match sedona_gdal::global::with_global_gdal(f) {
+        Ok(inner_result) => inner_result,
+        Err(init_err) => Err(DataFusionError::External(Box::new(init_err))),
+    }
+}
+
+/// Converts a BandDataType to the corresponding GDAL data type.
+pub fn band_data_type_to_gdal(band_type: &BandDataType) -> GdalDataType {
+    match band_type {
+        BandDataType::UInt8 => GdalDataType::UInt8,
+        BandDataType::Int8 => GdalDataType::Int8,
+        BandDataType::UInt16 => GdalDataType::UInt16,
+        BandDataType::Int16 => GdalDataType::Int16,
+        BandDataType::UInt32 => GdalDataType::UInt32,
+        BandDataType::Int32 => GdalDataType::Int32,
+        BandDataType::UInt64 => GdalDataType::UInt64,
+        BandDataType::Int64 => GdalDataType::Int64,
+        BandDataType::Float32 => GdalDataType::Float32,
+        BandDataType::Float64 => GdalDataType::Float64,
+    }
+}
+
+/// Converts a GDAL data type to the corresponding BandDataType.
+pub fn gdal_to_band_data_type(gdal_type: GdalDataType) -> Result<BandDataType> 
{
+    match gdal_type {
+        GdalDataType::UInt8 => Ok(BandDataType::UInt8),
+        GdalDataType::Int8 => Ok(BandDataType::Int8),
+        GdalDataType::UInt16 => Ok(BandDataType::UInt16),
+        GdalDataType::Int16 => Ok(BandDataType::Int16),
+        GdalDataType::UInt32 => Ok(BandDataType::UInt32),
+        GdalDataType::Int32 => Ok(BandDataType::Int32),
+        GdalDataType::UInt64 => Ok(BandDataType::UInt64),
+        GdalDataType::Int64 => Ok(BandDataType::Int64),
+        GdalDataType::Float32 => Ok(BandDataType::Float32),
+        GdalDataType::Float64 => Ok(BandDataType::Float64),
+        _ => Err(DataFusionError::NotImplemented(format!(
+            "GDAL data type {:?} is not supported",
+            gdal_type
+        ))),
+    }
+}
+
+/// Returns the byte size of a GDAL data type.
+pub fn gdal_type_byte_size(gdal_type: GdalDataType) -> usize {
+    gdal_type.byte_size()
+}
+
+/// Interprets bytes according to the band data type and returns the value as 
`f64`.
+///
+/// Returns an error if `bytes` does not have the expected length for 
`band_type`.
+pub fn bytes_to_f64(bytes: &[u8], band_type: &BandDataType) -> Result<f64> {
+    macro_rules! read_le_f64 {
+        ($t:ty, $n:expr) => {{
+            let arr: [u8; $n] = bytes.try_into().map_err(|_| {
+                exec_datafusion_err!(
+                    "Invalid byte slice length for type {}, expected: {}, 
actual: {}",
+                    stringify!($t),
+                    $n,
+                    bytes.len()
+                )
+            })?;
+            Ok(<$t>::from_le_bytes(arr) as f64)
+        }};
+    }
+
+    match band_type {
+        BandDataType::UInt8 => {
+            if bytes.len() != 1 {
+                return exec_err!(
+                    "Invalid byte length for UInt8: expected 1, got {}",
+                    bytes.len()
+                );
+            }
+            Ok(bytes[0] as f64)
+        }
+        BandDataType::Int8 => {
+            if bytes.len() != 1 {
+                return exec_err!(
+                    "Invalid byte length for Int8: expected 1, got {}",
+                    bytes.len()
+                );
+            }
+            Ok(bytes[0] as i8 as f64)
+        }
+        BandDataType::UInt16 => read_le_f64!(u16, 2),
+        BandDataType::Int16 => read_le_f64!(i16, 2),
+        BandDataType::UInt32 => read_le_f64!(u32, 4),
+        BandDataType::Int32 => read_le_f64!(i32, 4),
+        BandDataType::UInt64 => read_le_f64!(u64, 8),
+        BandDataType::Int64 => read_le_f64!(i64, 8),

Review Comment:
   Maybe remove these two arms (`UInt64` and `Int64`), since casting 64-bit 
integers to `f64` is lossy and you already handle/test them separately via the 
dedicated GDAL setters?



##########
rust/sedona-raster-gdal/src/lib.rs:
##########
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! GDAL integration foundations for Apache SedonaDB raster types.
+//!
+//! This crate provides the lower-level utilities used by future GDAL-backed
+//! raster functions:
+//!
+//! - in-db raster to GDAL MEM dataset conversion
+//! - out-db and mixed raster to GDAL VRT dataset conversion
+//! - GDAL datatype and nodata conversion helpers
+//! - path normalization for GDAL VSI-backed raster sources
+
+#[allow(dead_code)]
+mod gdal_common;
+#[allow(dead_code)]
+mod gdal_dataset_provider;

Review Comment:
   Do you still need these `#[allow(dead_code)]` attributes after the pub use? 
If you do, perhaps add a comment with a link to the PR or issue that will remove 
them (or just make the modules pub).
   
   ```suggestion
   mod gdal_common;
   mod gdal_dataset_provider;
   ```



##########
rust/sedona-raster-gdal/src/gdal_dataset_provider.rs:
##########
@@ -0,0 +1,993 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::convert::TryInto;
+use std::{cell::RefCell, marker::PhantomData, num::NonZeroUsize, rc::Rc};
+
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{
+    arrow_datafusion_err, exec_datafusion_err, exec_err, DataFusionError, 
Result,
+};
+
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::geo_transform::{GeoTransform, GeoTransformEx};
+use sedona_gdal::raster::types::GdalDataType;
+
+use sedona_common::SedonaOptions;
+use sedona_raster::traits::RasterRef;
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use crate::gdal_common::{
+    band_data_type_to_gdal, bytes_to_f64, convert_gdal_err, 
normalize_outdb_source_path,
+    open_gdal_dataset, raster_ref_to_gdal_empty, raster_ref_to_gdal_mem,
+};
+
+/// A GDAL dataset constructed from a `RasterRef`.
+///
+/// This struct is designed to keep any backing GDAL datasets alive for as 
long as
+/// the returned `dataset` might reference them.
+///
+/// Field semantics by raster storage layout:
+///
+/// 1) **In-db bands only**
+///    - `dataset`: a GDAL **MEM** dataset containing all bands.
+///    - `gdal_mem_source`: `None` (the MEM dataset is already stored in 
`dataset`).
+///    - `_gdal_outdb_sources`: empty.
+///
+/// 2) **Out-db bands only**
+///    - `dataset`: a GDAL **VRT** dataset sized like the target raster, with 
each VRT band
+///      sourcing from an external dataset band.
+///    - `gdal_mem_source`: `None`.
+///    - `_gdal_outdb_sources`: contains the opened external GDAL datasets 
(kept alive via `Rc`).
+///      (There may be duplicates if multiple bands reference the same URL; 
that is fine.)
+///
+/// 3) **Mixed in-db + out-db bands**
+///    - `dataset`: a GDAL **VRT** dataset with band order matching the target 
raster.
+///      In-db bands source from a MEM dataset; out-db bands source from 
external datasets.
+///    - `gdal_mem_source`: `Some(MEM dataset)` containing only the in-db 
bands, in the same order
+///      as they appear in the target raster.
+///    - `_gdal_outdb_sources`: contains the opened external GDAL datasets 
(kept alive via `Rc`).
+pub(crate) struct RasterDataset<'a> {
+    /// The dataset to use for further GDAL operations.
+    dataset: Rc<Dataset>,
+    /// A MEM dataset holding in-db band data when `dataset` is a VRT that 
references it.
+    _gdal_mem_source: Option<Rc<Dataset>>,
+    /// External datasets referenced by the VRT; kept alive for the lifetime 
of this struct.
+    _gdal_outdb_sources: Vec<Rc<Dataset>>,
+    /// Binds this dataset's lifetime to the borrowed source raster.
+    _source_raster: PhantomData<&'a dyn RasterRef>,
+}
+
+impl<'a> RasterDataset<'a> {
+    /// Return a reference to the underlying GDAL dataset.
+    pub(crate) fn as_dataset(&self) -> &Dataset {
+        &self.dataset
+    }
+}
+
+thread_local! {
+    /// Thread-local lazily-initialized `GDALDatasetCache`.
+    static TL_GDAL_DATASET_CACHE: RefCell<Option<Rc<GDALDatasetCache>>> = 
const { RefCell::new(None) };
+}
+
+const DEFAULT_GDAL_SOURCE_CACHE_SIZE: usize = 32;
+const DEFAULT_GDAL_VRT_CACHE_SIZE: usize = 32;
+
+pub(crate) fn configure_thread_local_options(
+    gdal: &Gdal,
+    config_options: Option<&ConfigOptions>,
+) -> Result<()> {
+    // Set frequently requested GDAL config options as thread-local options to 
eliminate the
+    // need for acquiring configs from global config or environment variable, 
which is very
+    // likely to result in heavy contention in multi-threaded environments.
+    let cpl_debug_enabled = config_options

Review Comment:
   It is probably a good pattern to always do this within some context like 
`with_thread_local_options()` that takes care to reset the values on exit, 
particularly when borrowing the shared object from something like rasterio 
which may be doing the same thing. During execution we'll often have our own 
threads from tokio but I think constant folding happens on the main Python 
thread. We can follow up on this if it's difficult to implement here.



##########
rust/sedona-raster-gdal/src/utils.rs:
##########
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utility functions for loading raster data via GDAL.
+
+use arrow_array::StructArray;
+use datafusion_common::error::Result;
+use datafusion_common::exec_datafusion_err;
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY};
+use sedona_gdal::raster::types::DatasetOptions;
+use sedona_gdal::spatial_ref::SpatialRef;
+
+use sedona_raster::builder::RasterBuilder;
+use sedona_raster::traits::{BandMetadata, RasterMetadata};
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use crate::gdal_common::{gdal_to_band_data_type, nodata_f64_to_bytes};
+
+/// Load a raster from any GDAL-openable path as an in-db raster `StructArray`.
+///
+/// The `path` can be a regular file path, a `/vsimem/` memory path,
+/// a `/vsicurl/` URL, or any other GDAL virtual filesystem path.
+pub fn load_as_indb_raster(gdal: &Gdal, path: &str) -> Result<StructArray> {
+    // Open dataset from path
+    let dataset = gdal
+        .open_ex_with_options(

Review Comment:
   Should we file a follow-up issue to add tests for this?



##########
rust/sedona-raster-gdal/src/gdal_dataset_provider.rs:
##########
@@ -0,0 +1,993 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::convert::TryInto;
+use std::{cell::RefCell, marker::PhantomData, num::NonZeroUsize, rc::Rc};
+
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{
+    arrow_datafusion_err, exec_datafusion_err, exec_err, DataFusionError, 
Result,
+};
+
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::geo_transform::{GeoTransform, GeoTransformEx};
+use sedona_gdal::raster::types::GdalDataType;
+
+use sedona_common::SedonaOptions;
+use sedona_raster::traits::RasterRef;
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use crate::gdal_common::{
+    band_data_type_to_gdal, bytes_to_f64, convert_gdal_err, 
normalize_outdb_source_path,
+    open_gdal_dataset, raster_ref_to_gdal_empty, raster_ref_to_gdal_mem,
+};
+
+/// A GDAL dataset constructed from a `RasterRef`.
+///
+/// This struct is designed to keep any backing GDAL datasets alive for as 
long as
+/// the returned `dataset` might reference them.
+///
+/// Field semantics by raster storage layout:
+///
+/// 1) **In-db bands only**
+///    - `dataset`: a GDAL **MEM** dataset containing all bands.
+///    - `gdal_mem_source`: `None` (the MEM dataset is already stored in 
`dataset`).
+///    - `_gdal_outdb_sources`: empty.
+///
+/// 2) **Out-db bands only**
+///    - `dataset`: a GDAL **VRT** dataset sized like the target raster, with 
each VRT band
+///      sourcing from an external dataset band.
+///    - `gdal_mem_source`: `None`.
+///    - `_gdal_outdb_sources`: contains the opened external GDAL datasets 
(kept alive via `Rc`).
+///      (There may be duplicates if multiple bands reference the same URL; 
that is fine.)
+///
+/// 3) **Mixed in-db + out-db bands**
+///    - `dataset`: a GDAL **VRT** dataset with band order matching the target 
raster.
+///      In-db bands source from a MEM dataset; out-db bands source from 
external datasets.
+///    - `gdal_mem_source`: `Some(MEM dataset)` containing only the in-db 
bands, in the same order
+///      as they appear in the target raster.
+///    - `_gdal_outdb_sources`: contains the opened external GDAL datasets 
(kept alive via `Rc`).
+pub(crate) struct RasterDataset<'a> {
+    /// The dataset to use for further GDAL operations.
+    dataset: Rc<Dataset>,
+    /// A MEM dataset holding in-db band data when `dataset` is a VRT that 
references it.
+    _gdal_mem_source: Option<Rc<Dataset>>,
+    /// External datasets referenced by the VRT; kept alive for the lifetime 
of this struct.
+    _gdal_outdb_sources: Vec<Rc<Dataset>>,
+    /// Binds this dataset's lifetime to the borrowed source raster.
+    _source_raster: PhantomData<&'a dyn RasterRef>,
+}
+
+impl<'a> RasterDataset<'a> {
+    /// Return a reference to the underlying GDAL dataset.
+    pub(crate) fn as_dataset(&self) -> &Dataset {
+        &self.dataset
+    }
+}
+
+thread_local! {
+    /// Thread-local lazily-initialized `GDALDatasetCache`.
+    static TL_GDAL_DATASET_CACHE: RefCell<Option<Rc<GDALDatasetCache>>> = 
const { RefCell::new(None) };
+}
+
+const DEFAULT_GDAL_SOURCE_CACHE_SIZE: usize = 32;
+const DEFAULT_GDAL_VRT_CACHE_SIZE: usize = 32;
+
+pub(crate) fn configure_thread_local_options(
+    gdal: &Gdal,
+    config_options: Option<&ConfigOptions>,
+) -> Result<()> {
+    // Set frequently requested GDAL config options as thread-local options to 
eliminate the
+    // need for acquiring configs from global config or environment variable, 
which is very
+    // likely to result in heavy contention in multi-threaded environments.
+    let cpl_debug_enabled = config_options
+        .and_then(|c| {
+            c.extensions
+                .get::<SedonaOptions>()
+                .map(|opts| opts.gdal.cpl_debug)
+        })
+        .unwrap_or(false);
+    let cpl_debug_value = if cpl_debug_enabled { "ON" } else { "OFF" };
+
+    let thread_local_options = [
+        ("CPL_DEBUG", cpl_debug_value),
+        ("OSR_DEFAULT_AXIS_MAPPING_STRATEGY", "AUTHORITY_COMPLIANT"),
+        ("GDAL_VALIDATE_CREATION_OPTIONS", "YES"),
+        ("CHECK_WITH_INVERT_PROJ", "NO"),
+        ("GDAL_FORCE_CACHING", "NO"),
+        ("GDAL_ENABLE_READ_WRITE_MUTEX", "YES"),
+    ];
+
+    for (key, value) in thread_local_options {
+        gdal.set_thread_local_config_option(key, value)
+            .map_err(|e| DataFusionError::External(Box::new(e)))?;
+    }
+    Ok(())
+}
+
+/// Get or create the thread-local `GDALDatasetCache`.
+pub(crate) fn thread_local_cache() -> Result<Rc<GDALDatasetCache>> {
+    TL_GDAL_DATASET_CACHE.with(|cell| {
+        let mut opt = cell.borrow_mut();
+        if let Some(rc) = opt.as_ref() {
+            Ok(Rc::clone(rc))
+        } else {

Review Comment:
   I hope there is a future where we can move some of these caches to be 
session scoped (this feels like it could potentially be large in some cases).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to