tschwarzinger commented on code in PR #21714:
URL: https://github.com/apache/datafusion/pull/21714#discussion_r3159959219
##########
datafusion/sql/tests/common/mod.rs:
##########
@@ -343,12 +343,6 @@ impl TypePlanner for CustomTypePlanner {
sql_type: &sqlparser::ast::DataType,
) -> Result<Option<FieldRef>> {
match sql_type {
- sqlparser::ast::DataType::Uuid => Ok(Some(Arc::new(
Review Comment:
🥳
##########
datafusion/sql/src/planner.rs:
##########
@@ -641,6 +641,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// If no type_planner can handle this type, use the default conversion
match sql_type {
+ // Canonical Arrow extension types
+ SQLDataType::Uuid => Ok(Arc::new(
+ Field::new("", DataType::FixedSizeBinary(16),
true).with_metadata(
+ HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.uuid".to_string(),
+ )]),
+ ),
+ )),
+ SQLDataType::JSON | SQLDataType::JSONB => Ok(Arc::new(
Review Comment:
I am no expert in SQL data types but are we sure that `SQLDataType::JSONB`
is the same thing as the regular JSON?
Maybe this would require another extension type (in a separate issue) that
uses `DataType::Binary`?
##########
datafusion/sql/src/statement.rs:
##########
@@ -530,11 +531,26 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
.iter()
.zip(input_fields)
.map(|(field, input_field)| {
- cast(
- col(input_field.name()),
- field.data_type().clone(),
- )
- .alias(field.name())
+ let target_field = Arc::new(
+ Field::new(
+ field.name(),
+ field.data_type().clone(),
+ field.is_nullable(),
+ )
+
.with_metadata(field.metadata().clone()),
+ );
+ let metadata =
+
FieldMetadata::new_from_field(field.as_ref());
+ let alias_metadata = if
metadata.is_empty() {
+ None
+ } else {
+ Some(metadata)
+ };
+ Expr::Cast(Cast::new_from_field(
+ Box::new(col(input_field.name())),
+ target_field,
Review Comment:
Maybe I am missing it but what is the difference of `field` and
`target_field` here? Can't we just use
```rust
// Ideally we could also avoid the `FieldMetadata` but I think that's not
possible with the current API.
let metadata =
FieldMetadata::new_from_field(field.as_ref());
let alias_metadata = if metadata.is_empty() {
None
} else {
Some(metadata)
};
Expr::Cast(Cast::new_from_field(
Box::new(col(input_field.name())),
Arc::clone(field),
))
.alias_with_metadata(field.name(), alias_metadata)
```
##########
datafusion/sqllogictest/test_files/sql_extension_types.slt:
##########
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for built-in mapping of SQL types to Arrow canonical extension types.
+# See: https://arrow.apache.org/docs/format/CanonicalExtensions.html
+
+#######
+# UUID Extension Type
+#######
+
+# CAST to UUID preserves extension metadata
+query ?T
+SELECT CAST(arrow_cast(X'00010203040506070809000102030506',
'FixedSizeBinary(16)') AS UUID),
+ arrow_metadata(CAST(arrow_cast(X'00010203040506070809000102030506',
'FixedSizeBinary(16)') AS UUID), 'ARROW:extension:name');
+----
+00010203040506070809000102030506 arrow.uuid
+
+# CREATE TABLE with UUID column preserves extension metadata through VALUES
+statement ok
+CREATE TABLE test_uuid (
+ value UUID
+) AS VALUES
+ (NULL);
+
+query TTT
+DESCRIBE test_uuid;
+----
+value FixedSizeBinary(16) YES
+
+query T
+SELECT arrow_metadata(value, 'ARROW:extension:name') FROM test_uuid;
+----
+arrow.uuid
+
+statement ok
+DROP TABLE test_uuid;
+
+# UUID column with non-null binary value
+statement ok
+CREATE TABLE test_uuid2 (
+ id UUID
+) AS VALUES
+ (CAST(arrow_cast(X'00010203040506070809000102030506',
'FixedSizeBinary(16)') AS UUID));
+
+query T
+SELECT arrow_metadata(id, 'ARROW:extension:name') FROM test_uuid2;
+----
+arrow.uuid
+
+statement ok
+DROP TABLE test_uuid2;
+
+#######
+# JSON Extension Type
+#######
+
+# CAST to JSON preserves extension metadata
+query TT
+SELECT CAST('{"a": 1}' AS JSON),
+ arrow_metadata(CAST('{"a": 1}' AS JSON), 'ARROW:extension:name');
+----
+{"a": 1} arrow.json
+
+# CREATE TABLE with JSON column preserves extension metadata
+statement ok
+CREATE TABLE test_json (
+ data JSON
+) AS VALUES
+ (NULL);
+
+query TTT
+DESCRIBE test_json;
+----
+data Utf8 YES
+
+query T
+SELECT arrow_metadata(data, 'ARROW:extension:name') FROM test_json;
+----
+arrow.json
+
+statement ok
+DROP TABLE test_json;
+
+# JSON column with non-null value
+statement ok
+CREATE TABLE test_json2 (
+ data JSON
+) AS VALUES
+ (CAST('{"hello": "world"}' AS JSON));
+
+query T
+SELECT arrow_metadata(data, 'ARROW:extension:name') FROM test_json2;
+----
+arrow.json
+
+statement ok
+DROP TABLE test_json2;
+
+#######
+# JSONB Extension Type (maps to arrow.json)
Review Comment:
See remark above
##########
datafusion/sql/src/planner.rs:
##########
@@ -641,6 +641,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// If no type_planner can handle this type, use the default conversion
match sql_type {
+ // Canonical Arrow extension types
+ SQLDataType::Uuid => Ok(Arc::new(
+ Field::new("", DataType::FixedSizeBinary(16),
true).with_metadata(
+ HashMap::from([(
+ "ARROW:extension:name".to_string(),
Review Comment:
I feel like we should Arrow's constants here
##########
datafusion/sql/src/planner.rs:
##########
@@ -641,6 +641,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// If no type_planner can handle this type, use the default conversion
match sql_type {
+ // Canonical Arrow extension types
+ SQLDataType::Uuid => Ok(Arc::new(
+ Field::new("", DataType::FixedSizeBinary(16),
true).with_metadata(
+ HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.uuid".to_string(),
+ )]),
+ ),
+ )),
+ SQLDataType::JSON | SQLDataType::JSONB => Ok(Arc::new(
+ Field::new("", DataType::Utf8,
true).with_metadata(HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.json".to_string(),
+ )])),
Review Comment:
I saw that arrow-rs' Json type rejects empty metadata:
So we maybe should use an empty string as the extension type metadata.
The relevant code:
```rust
fn deserialize_metadata(metadata: Option<&str>) ->
Result<Self::Metadata, ArrowError> {
const ERR: &str = "Json extension type metadata is either an empty
string or a JSON string with an empty object";
metadata
.map_or_else(
|| Err(ArrowError::InvalidArgumentError(ERR.to_owned())),
|metadata| {
match metadata {
// Empty string
"" => Ok(None),
value => serde_json::from_str::<Empty>(value)
.map(Option::Some)
.map_err(|_|
ArrowError::InvalidArgumentError(ERR.to_owned())),
}
},
)
.map(JsonMetadata)
}
```
##########
datafusion/sqllogictest/test_files/sql_extension_types.slt:
##########
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
Review Comment:
I think it would be great if we register the extension types for these tests.
I've prepared a follow-up PR for that:
https://github.com/apache/datafusion/compare/main...tschwarzinger:datafusion:extension-types-in-slt-tests-2?expand=1
The difference would be that i) we would test the extension type system and
ii) get prettier formatting for values:
```text
# CAST to UUID preserves extension metadata
query ?T
SELECT CAST(arrow_cast(X'00010203040506070809000102030506',
'FixedSizeBinary(16)') AS UUID),
arrow_metadata(CAST(arrow_cast(X'00010203040506070809000102030506',
'FixedSizeBinary(16)') AS UUID), 'ARROW:extension:name');
----
00010203-0405-0607-0809-000102030506 arrow.uuid
```
But I think we can do that in a follow-up.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]