adriangb commented on code in PR #21291:
URL: https://github.com/apache/datafusion/pull/21291#discussion_r3074211973


##########
datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::types::extension::DFExtensionType;
+use arrow::datatypes::DataType;
+use arrow_schema::extension::{ExtensionType, FixedShapeTensor};
+
+/// Defines the extension type logic for the canonical 
`arrow.fixed_shape_tensor` extension type.

Review Comment:
   I think all of these should link to the spec page for these canonical 
extension types.



##########
datafusion-examples/examples/extension_types/temperature.rs:
##########
@@ -143,46 +135,61 @@ pub struct TemperatureExtensionType {
 }
 
 impl TemperatureExtensionType {
+    /// The name of the extension type.
+    pub const NAME: &'static str = "custom.temperature";
+
     /// Creates a new [`TemperatureExtensionType`].
-    pub fn new(storage_type: DataType, temperature_unit: TemperatureUnit) -> 
Self {
-        Self {
-            storage_type,
-            temperature_unit,
+    pub fn try_new(
+        storage_type: &DataType,
+        temperature_unit: TemperatureUnit,
+    ) -> Result<Self, ArrowError> {
+        match storage_type {
+            DataType::Float32 | DataType::Float64 => {}
+            _ => {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Invalid data type: {storage_type} for temperature type, 
expected Float32 or Float64",
+                )));
+            }
         }
+
+        let result = Self {
+            storage_type: storage_type.clone(),
+            temperature_unit,
+        };
+        Ok(result)
     }
 }
 
-/// Implementation of [`ExtensionType`] for [`TemperatureExtensionType`].
-///
-/// This implements the arrow-rs trait for reading, writing, and validating 
extension types.
-impl ExtensionType for TemperatureExtensionType {
-    /// Arrow extension type name that is stored in the `ARROW:extension:name` 
field.
-    const NAME: &'static str = "custom.temperature";
-    type Metadata = TemperatureUnit;
-
-    fn metadata(&self) -> &Self::Metadata {
-        &self.temperature_unit
-    }
+/// Represents the unit of a temperature reading.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TemperatureUnit {
+    Celsius,
+    Fahrenheit,
+    Kelvin,
+}
 
+impl TemperatureUnit {
     /// Arrow extension type metadata is encoded as a string and stored using 
the
     /// `ARROW:extension:metadata` key. As we only store the name of the unit, 
a simple string
     /// suffices. Extension types can store more complex metadata using 
serialization formats like
     /// JSON.
-    fn serialize_metadata(&self) -> Option<String> {
-        let s = match self.temperature_unit {
+    pub fn serialize(self) -> String {
+        let result = match self {
             TemperatureUnit::Celsius => "celsius",
             TemperatureUnit::Fahrenheit => "fahrenheit",
             TemperatureUnit::Kelvin => "kelvin",
         };
-        Some(s.to_string())
+        result.to_owned()
     }
+}
 
-    /// Inverse operation of [`Self::serialize_metadata`]. This creates the 
[`TemperatureUnit`]
-    /// value from the serialized string.
-    fn deserialize_metadata(
-        metadata: Option<&str>,
-    ) -> std::result::Result<Self::Metadata, ArrowError> {
-        match metadata {
+/// Inverse operation of [`TemperatureUnit::serialize`]. This creates the 
[`TemperatureUnit`]
+/// value from the serialized string.
+impl TryFrom<Option<&str>> for TemperatureUnit {

Review Comment:
   I personally would rather have a `fn deserialize()` method; I think it makes 
the intention clearer. I often find that blanket `TryFrom<str-like thing>` 
impls are confusing and could have multiple meanings (e.g. deserialize vs. 
parse from CLI args).



##########
datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::types::extension::DFExtensionType;
+use arrow::datatypes::DataType;
+use arrow_schema::extension::{ExtensionType, FixedShapeTensor};
+
+/// Defines the extension type logic for the canonical 
`arrow.fixed_shape_tensor` extension type.
+///
+/// See [`DFExtensionType`] for information on DataFusion's extension type 
mechanism.
+#[derive(Debug, Clone)]
+pub struct DFFixedShapeTensor {
+    inner: FixedShapeTensor,
+    /// The storage type of the tensor.
+    ///
+    /// While we could reconstruct the storage type from the inner 
[`FixedShapeTensor`], we may
+    /// choose a different name for the field within the 
[`DataType::FixedSizeList`] which can
+    /// cause problems down the line (e.g., checking for equality).
+    storage_type: DataType,
+}
+
+impl DFFixedShapeTensor {
+    /// Creates a new [`DFFixedShapeTensor`], validating that the storage type 
is compatible with
+    /// the extension type.
+    pub fn try_new(
+        data_type: &DataType,
+        metadata: <FixedShapeTensor as ExtensionType>::Metadata,
+    ) -> Result<Self> {
+        Ok(Self {
+            inner: <FixedShapeTensor as ExtensionType>::try_new(data_type, 
metadata)?,
+            storage_type: data_type.clone(),

Review Comment:
   I will say this is a bit unwieldy. I don't have a better proposal but wanted 
to note it.



##########
datafusion-examples/examples/extension_types/temperature.rs:
##########
@@ -143,46 +135,61 @@ pub struct TemperatureExtensionType {
 }
 
 impl TemperatureExtensionType {
+    /// The name of the extension type.
+    pub const NAME: &'static str = "custom.temperature";
+
     /// Creates a new [`TemperatureExtensionType`].
-    pub fn new(storage_type: DataType, temperature_unit: TemperatureUnit) -> 
Self {
-        Self {
-            storage_type,
-            temperature_unit,
+    pub fn try_new(
+        storage_type: &DataType,
+        temperature_unit: TemperatureUnit,
+    ) -> Result<Self, ArrowError> {
+        match storage_type {
+            DataType::Float32 | DataType::Float64 => {}
+            _ => {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Invalid data type: {storage_type} for temperature type, 
expected Float32 or Float64",
+                )));

Review Comment:
   Makes sense to verify the type and error here 👍🏻 



##########
datafusion/common/src/types/canonical_extensions/opaque.rs:
##########
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::types::extension::DFExtensionType;
+use arrow::datatypes::DataType;
+use arrow_schema::extension::{ExtensionType, Opaque};
+
+/// Defines the extension type logic for the canonical `arrow.opaque` 
extension type.
+///
+/// See [`DFExtensionType`] for information on DataFusion's extension type 
mechanism.

Review Comment:
   In addition to a link to the canonical extension types (e.g. 
https://arrow.apache.org/docs/format/CanonicalExtensions.html#opaque), it might 
be nice to copy a summary of the type in here for discoverability.



##########
datafusion/common/src/types/canonical_extensions/bool8.rs:
##########
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::error::_internal_err;
+use crate::types::extension::DFExtensionType;
+use arrow::array::{Array, Int8Array};
+use arrow::datatypes::DataType;
+use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, 
FormatResult};
+use arrow_schema::extension::{Bool8, ExtensionType};
+use std::fmt::Write;
+
+/// Defines the extension type logic for the canonical `arrow.bool8` extension 
type.
+///
+/// See [`DFExtensionType`] for information on DataFusion's extension type 
mechanism.
+#[derive(Debug, Clone)]
+pub struct DFBool8(Bool8);
+
+impl DFBool8 {
+    /// Creates a new [`DFBool8`], validating that the storage type is 
compatible with the
+    /// extension type.
+    pub fn try_new(
+        data_type: &DataType,
+        metadata: <Bool8 as ExtensionType>::Metadata,
+    ) -> Result<Self> {
+        Ok(Self(<Bool8 as ExtensionType>::try_new(
+            data_type, metadata,

Review Comment:
   Should the supported data types be checked here as well? Maybe add to the 
docstring what `data_type` represents (it seems `DFExtensionType::storage_type` 
is hardcoded below so it's not immediately obvious why this is passed into the 
constructor).



##########
datafusion/common/src/types/canonical_extensions/bool8.rs:
##########
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::error::_internal_err;
+use crate::types::extension::DFExtensionType;
+use arrow::array::{Array, Int8Array};
+use arrow::datatypes::DataType;
+use arrow::util::display::{ArrayFormatter, DisplayIndex, FormatOptions, 
FormatResult};
+use arrow_schema::extension::{Bool8, ExtensionType};
+use std::fmt::Write;
+
+/// Defines the extension type logic for the canonical `arrow.bool8` extension 
type.
+///
+/// See [`DFExtensionType`] for information on DataFusion's extension type 
mechanism.
+#[derive(Debug, Clone)]
+pub struct DFBool8(Bool8);
+
+impl DFBool8 {
+    /// Creates a new [`DFBool8`], validating that the storage type is 
compatible with the
+    /// extension type.
+    pub fn try_new(
+        data_type: &DataType,
+        metadata: <Bool8 as ExtensionType>::Metadata,
+    ) -> Result<Self> {
+        Ok(Self(<Bool8 as ExtensionType>::try_new(
+            data_type, metadata,
+        )?))
+    }
+}
+
+impl DFExtensionType for DFBool8 {
+    fn storage_type(&self) -> DataType {
+        DataType::Int8
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        self.0.serialize_metadata()
+    }
+
+    fn create_array_formatter<'fmt>(
+        &self,
+        array: &'fmt dyn Array,
+        options: &FormatOptions<'fmt>,
+    ) -> Result<Option<ArrayFormatter<'fmt>>> {
+        if array.data_type() != &DataType::Int8 {
+            return _internal_err!("Wrong array type for Bool8");
+        }
+
+        let display_index = Bool8ValueDisplayIndex {
+            array: array.as_any().downcast_ref().unwrap(),
+            null_str: options.null(),
+        };
+        Ok(Some(ArrayFormatter::new(
+            Box::new(display_index),
+            options.safe(),
+        )))
+    }
+}
+
+/// Pretty printer for binary UUID values.

Review Comment:
   ```suggestion
   /// Pretty printer for binary bool8 values.
   ```
   
   ?



##########
datafusion/common/src/types/canonical_extensions/fixed_shape_tensor.rs:
##########
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::Result;
+use crate::types::extension::DFExtensionType;
+use arrow::datatypes::DataType;
+use arrow_schema::extension::{ExtensionType, FixedShapeTensor};
+
+/// Defines the extension type logic for the canonical 
`arrow.fixed_shape_tensor` extension type.
+///
+/// See [`DFExtensionType`] for information on DataFusion's extension type 
mechanism.
+#[derive(Debug, Clone)]
+pub struct DFFixedShapeTensor {
+    inner: FixedShapeTensor,
+    /// The storage type of the tensor.
+    ///
+    /// While we could reconstruct the storage type from the inner 
[`FixedShapeTensor`], we may
+    /// choose a different name for the field within the 
[`DataType::FixedSizeList`] which can
+    /// cause problems down the line (e.g., checking for equality).
+    storage_type: DataType,
+}
+
+impl DFFixedShapeTensor {
+    /// Creates a new [`DFFixedShapeTensor`], validating that the storage type 
is compatible with
+    /// the extension type.
+    pub fn try_new(
+        data_type: &DataType,
+        metadata: <FixedShapeTensor as ExtensionType>::Metadata,
+    ) -> Result<Self> {
+        Ok(Self {
+            inner: <FixedShapeTensor as ExtensionType>::try_new(data_type, 
metadata)?,
+            storage_type: data_type.clone(),
+        })
+    }
+}
+
+impl DFExtensionType for DFFixedShapeTensor {
+    fn storage_type(&self) -> DataType {
+        self.storage_type.clone()
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        self.inner.serialize_metadata()
+    }

Review Comment:
   Are there cases where the metadata depends on the storage type?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to