This is an automated email from the ASF dual-hosted git repository.
vhs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 00a406682edb feat(sync): Map VECTOR type to binary for metastore sync
support (#18480)
00a406682edb is described below
commit 00a406682edb0d803705a60756d7e3ef00a9c34b
Author: voonhous <[email protected]>
AuthorDate: Sun Apr 12 23:18:56 2026 +0800
feat(sync): Map VECTOR type to binary for metastore sync support (#18480)
* fix(sync): Map VECTOR type to binary in Hive, Spark, and BigQuery sync
(#18343)
- VECTOR columns were not handled in sync schema converters, causing
UnsupportedOperationException when syncing tables with vector columns to
external metastores.
- Map VECTOR to its underlying physical type (binary/BYTES) so engines like
Trino can read via Hive metastore.
* Remove unused test
---
.../hudi/gcp/bigquery/BigQuerySchemaResolver.java | 1 +
.../hudi/gcp/bigquery/TestBigQuerySchemaResolver.java | 14 ++++++++++++++
.../java/org/apache/hudi/hive/util/HiveSchemaUtil.java | 1 +
.../org/apache/hudi/hive/TestSparkSchemaUtils.java | 18 ++++++++++++++++++
.../org/apache/hudi/hive/util/TestHiveSchemaUtil.java | 4 +++-
.../apache/hudi/sync/common/util/SparkSchemaUtils.java | 1 +
6 files changed, 38 insertions(+), 1 deletion(-)
diff --git
a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySchemaResolver.java
b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySchemaResolver.java
index 6c88593ffbab..e32921d7052c 100644
---
a/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySchemaResolver.java
+++
b/hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySchemaResolver.java
@@ -140,6 +140,7 @@ public class BigQuerySchemaResolver {
break;
case BYTES:
case FIXED:
+ case VECTOR:
standardSQLTypeName = StandardSQLTypeName.BYTES;
break;
case DECIMAL:
diff --git
a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySchemaResolver.java
b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySchemaResolver.java
index 9f93f622afee..10ddfeef5632 100644
---
a/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySchemaResolver.java
+++
b/hudi-gcp/src/test/java/org/apache/hudi/gcp/bigquery/TestBigQuerySchemaResolver.java
@@ -280,4 +280,18 @@ public class TestBigQuerySchemaResolver {
BigQuerySchemaResolver resolver = new BigQuerySchemaResolver(metaClient ->
mockTableSchemaResolver);
Assertions.assertEquals(PRIMITIVE_TYPES_BQ_SCHEMA,
resolver.getTableSchema(mockMetaClient, Collections.emptyList()));
}
+
+ @Test
+ void convertSchema_vectorField() {
+ HoodieSchema input = HoodieSchema.createRecord("testRecord", null, null,
false, Arrays.asList(
+ HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+ HoodieSchemaField.of("embedding", HoodieSchema.createVector(128))
+ ));
+
+ Schema expected = Schema.of(
+ Field.newBuilder("id",
StandardSQLTypeName.INT64).setMode(Field.Mode.REQUIRED).build(),
+ Field.newBuilder("embedding",
StandardSQLTypeName.BYTES).setMode(Field.Mode.REQUIRED).build());
+
+ Assertions.assertEquals(expected, SCHEMA_RESOLVER.convertSchema(input));
+ }
}
diff --git
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/util/HiveSchemaUtil.java
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/util/HiveSchemaUtil.java
index 2741cdfbfe19..64933d962d22 100644
---
a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/util/HiveSchemaUtil.java
+++
b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/util/HiveSchemaUtil.java
@@ -225,6 +225,7 @@ public class HiveSchemaUtil {
case BYTES:
case UUID:
case FIXED:
+ case VECTOR:
return BINARY_TYPE_NAME;
case DATE:
return "DATE";
diff --git
a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestSparkSchemaUtils.java
b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestSparkSchemaUtils.java
index 0026fb71a417..db585ddcefeb 100644
---
a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestSparkSchemaUtils.java
+++
b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestSparkSchemaUtils.java
@@ -194,4 +194,22 @@ public class TestSparkSchemaUtils {
StructType blobStruct = (StructType) sparkSchema.fields()[1].dataType();
assertEquals(3, blobStruct.fields().length);
}
+
+ @Test
+ public void testConvertSchemaWithVectorField() {
+ HoodieSchema schema = HoodieSchema.createRecord("root", null, null, false,
Arrays.asList(
+ HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT),
null, null),
+ HoodieSchemaField.of("embedding", HoodieSchema.createVector(128),
null, null)
+ ));
+
+ String sparkJson = SparkSchemaUtils.convertToSparkSchemaJson(schema);
+ assertNotNull(sparkJson);
+ assertFalse(sparkJson.isEmpty());
+
+ StructType sparkSchema = (StructType) StructType.fromJson(sparkJson);
+ assertEquals(2, sparkSchema.fields().length);
+ assertEquals("id", sparkSchema.fields()[0].name());
+ assertEquals("embedding", sparkSchema.fields()[1].name());
+ assertInstanceOf(BinaryType$.class, sparkSchema.fields()[1].dataType());
+ }
}
diff --git
a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/util/TestHiveSchemaUtil.java
b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/util/TestHiveSchemaUtil.java
index c79ef417fa47..53698fc056be 100644
---
a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/util/TestHiveSchemaUtil.java
+++
b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/util/TestHiveSchemaUtil.java
@@ -148,7 +148,8 @@ public class TestHiveSchemaUtil {
HoodieSchemaField.of("time_millis_field",
HoodieSchema.createTimeMillis()),
HoodieSchemaField.of("time_micros_field",
HoodieSchema.createTimeMicros()),
HoodieSchemaField.of("decimal_field",
HoodieSchema.createDecimal(10, 2)),
- HoodieSchemaField.of("uuid_field",
HoodieSchema.create(HoodieSchemaType.UUID))
+ HoodieSchemaField.of("uuid_field",
HoodieSchema.create(HoodieSchemaType.UUID)),
+ HoodieSchemaField.of("vector_field",
HoodieSchema.createVector(128))
)
);
@@ -172,6 +173,7 @@ public class TestHiveSchemaUtil {
expected.put("`time_micros_field`", "bigint");
expected.put("`decimal_field`", "DECIMAL(10 , 2)");
expected.put("`uuid_field`", "binary");
+ expected.put("`vector_field`", "binary");
assertEquals(expected, actual);
}
}
diff --git
a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SparkSchemaUtils.java
b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SparkSchemaUtils.java
index 51198f1ae7c5..2a809dc80ee6 100644
---
a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SparkSchemaUtils.java
+++
b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/util/SparkSchemaUtils.java
@@ -56,6 +56,7 @@ public class SparkSchemaUtils {
return "\"string\"";
case BYTES:
case FIXED:
+ case VECTOR:
return "\"binary\"";
case DATE:
return "\"date\"";