This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9df72a96f3 [Feature](multi-catalog) Support hadoop viewfs. (#24168)
9df72a96f3 is described below
commit 9df72a96f39261b45edd15fd12db9546d7e7f321
Author: Qi Chen <[email protected]>
AuthorDate: Wed Sep 13 00:20:12 2023 +0800
[Feature](multi-catalog) Support hadoop viewfs. (#24168)
### Feature
Support hadoop viewfs.

### Test
- Regression tests:
  - hive viewfs test.
  - tvf viewfs test.
- Broker load tested manually, both through a broker process and via direct HDFS access (see the sketch below).
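
A hypothetical sketch of what the manual broker-load test exercises (label, table, broker name, and paths are illustrative, not taken from the actual test run):

```sql
-- Load from a viewfs:// path; the mount-table properties map /ns1 onto the
-- underlying HDFS nameservice. All names below are placeholders.
LOAD LABEL example_db.label_viewfs_test
(
    DATA INFILE("viewfs://my-cluster/ns1/path/to/data.csv")
    INTO TABLE example_tbl
    COLUMNS TERMINATED BY ","
)
WITH BROKER "broker_name"
(
    "username" = "hadoop",
    "fs.defaultFS" = "viewfs://my-cluster",
    "fs.viewfs.mounttable.my-cluster.link./ns1" = "hdfs://your-nameservice/",
    "fs.viewfs.mounttable.my-cluster.homedir" = "/ns1"
);
```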
---
be/src/io/fs/hdfs_file_system.cpp | 8 +--
docs/en/docs/lakehouse/multi-catalog/hive.md | 22 ++++++++
docs/zh-CN/docs/lakehouse/multi-catalog/hive.md | 22 ++++++++
.../org/apache/doris/analysis/StorageBackend.java | 6 ++-
.../java/org/apache/doris/common/FeConstants.java | 1 +
.../org/apache/doris/fs/FileSystemFactory.java | 3 +-
.../doris/planner/external/FileQueryScanNode.java | 2 +
.../doris/broker/hdfs/FileSystemManager.java | 3 +-
.../external_table_p2/hive/test_viewfs_hive.out | 35 +++++++++++++
.../data/external_table_p2/tvf/test_tvf_p2.out | 3 ++
.../external_table_p2/hive/test_viewfs_hive.groovy | 59 ++++++++++++++++++++++
.../external_table_p2/tvf/test_tvf_p2.groovy | 9 ++++
12 files changed, 163 insertions(+), 10 deletions(-)
diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 5294890324..1e71ade934 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -138,7 +138,7 @@ HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, const std::string
_hdfs_params(hdfs_params),
_fs_handle(nullptr),
_profile(profile) {
- if (_hdfs_params.__isset.fs_name) {
+ if (fs_name.empty() && _hdfs_params.__isset.fs_name) {
_fs_name = _hdfs_params.fs_name;
} else {
_fs_name = fs_name;
@@ -509,11 +509,7 @@ Status HdfsFileSystemCache::get_connection(const THdfsParams& hdfs_params,
uint64 HdfsFileSystemCache::_hdfs_hash_code(const THdfsParams& hdfs_params,
const std::string& fs_name) {
uint64 hash_code = 0;
- if (hdfs_params.__isset.fs_name) {
- hash_code += Fingerprint(hdfs_params.fs_name);
- } else {
- hash_code += Fingerprint(fs_name);
- }
+ hash_code += Fingerprint(fs_name);
if (hdfs_params.__isset.user) {
hash_code += Fingerprint(hdfs_params.user);
}
diff --git a/docs/en/docs/lakehouse/multi-catalog/hive.md b/docs/en/docs/lakehouse/multi-catalog/hive.md
index 87e562ee88..afa388321c 100644
--- a/docs/en/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/en/docs/lakehouse/multi-catalog/hive.md
@@ -95,6 +95,28 @@ Please place the `krb5.conf` file and `keytab` authentication file under all `BE
The value of `hive.metastore.kerberos.principal` needs to be consistent with the property of the same name of the connected hive metastore, which can be obtained from `hive-site.xml`.
+### Hive On ViewFS
+
+```sql
+CREATE CATALOG hive PROPERTIES (
+ 'type'='hms',
+ 'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
+ 'hadoop.username' = 'hive',
+ 'dfs.nameservices'='your-nameservice',
+ 'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
+ 'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
+ 'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
+ 'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
+ 'fs.defaultFS' = 'viewfs://your-cluster',
+ 'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
+ 'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
+);
+```
+
+The viewfs-related parameters can be added to the catalog configuration as above, or to `conf/core-site.xml`.
+
+For how viewfs works and how to configure its parameters, please refer to the relevant Hadoop documentation, for example: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html
+
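+The same mount-table properties also apply when reading files directly with the `hdfs()` table-valued function; a minimal sketch, assuming the placeholder cluster and nameservice names above:
+
+```sql
+-- Illustrative only: cluster name, nameservice, and file path are placeholders.
+SELECT * FROM hdfs(
+    "uri" = "viewfs://your-cluster/ns1/path/to/file.parquet",
+    "format" = "parquet",
+    "fs.defaultFS" = "viewfs://your-cluster",
+    "fs.viewfs.mounttable.your-cluster.link./ns1" = "hdfs://your-nameservice/",
+    "fs.viewfs.mounttable.your-cluster.homedir" = "/ns1"
+);
+```
+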
### Hive On JuiceFS
Data is stored in JuiceFS, examples are as follows:
diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
index 3242426184..e582d5e907 100644
--- a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
@@ -94,6 +94,28 @@ CREATE CATALOG hive PROPERTIES (
Please place the `krb5.conf` file and the `keytab` authentication file on all `BE` and `FE` nodes; the `keytab` file path must be consistent with the configuration, and the `krb5.conf` file is placed at `/etc/krb5.conf` by default.
The value of `hive.metastore.kerberos.principal` needs to be consistent with the property of the same name of the connected hive metastore, which can be obtained from `hive-site.xml`.
+### Hive On ViewFS
+
+```sql
+CREATE CATALOG hive PROPERTIES (
+ 'type'='hms',
+ 'hive.metastore.uris' = 'thrift://172.0.0.1:9083',
+ 'hadoop.username' = 'hive',
+ 'dfs.nameservices'='your-nameservice',
+ 'dfs.ha.namenodes.your-nameservice'='nn1,nn2',
+ 'dfs.namenode.rpc-address.your-nameservice.nn1'='172.21.0.2:8088',
+ 'dfs.namenode.rpc-address.your-nameservice.nn2'='172.21.0.3:8088',
+ 'dfs.client.failover.proxy.provider.your-nameservice'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider',
+ 'fs.defaultFS' = 'viewfs://your-cluster',
+ 'fs.viewfs.mounttable.your-cluster.link./ns1' = 'hdfs://your-nameservice/',
+ 'fs.viewfs.mounttable.your-cluster.homedir' = '/ns1'
+);
+```
+
+The viewfs-related parameters can be added to the catalog configuration as above, or to `conf/core-site.xml`.
+
+For how viewfs works and how to configure its parameters, please refer to the relevant Hadoop documentation, for example: https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ViewFs.html
+
### Hive On JuiceFS
Data is stored in JuiceFS; an example is as follows:
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
index ada22ad301..f3d7f7e49f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/StorageBackend.java
@@ -50,6 +50,7 @@ public class StorageBackend implements ParseNode {
if (!schema.equalsIgnoreCase("bos")
&& !schema.equalsIgnoreCase("afs")
&& !schema.equalsIgnoreCase("hdfs")
+ && !schema.equalsIgnoreCase("viewfs")
&& !schema.equalsIgnoreCase("ofs")
&& !schema.equalsIgnoreCase("obs")
&& !schema.equalsIgnoreCase("oss")
@@ -58,8 +59,9 @@ public class StorageBackend implements ParseNode {
&& !schema.equalsIgnoreCase("gfs")
&& !schema.equalsIgnoreCase("jfs")
&& !schema.equalsIgnoreCase("gs")) {
- throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'afs://' , 'bos://',"
- + " 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://' or 'jfs://' path.");
+ throw new AnalysisException("Invalid broker path. please use valid 'hdfs://', 'viewfs://', 'afs://',"
+ + " 'bos://', 'ofs://', 'obs://', 'oss://', 's3a://', 'cosn://', 'gfs://', 'gs://'"
+ + " or 'jfs://' path.");
}
} else if (type == StorageBackend.StorageType.S3 && !schema.equalsIgnoreCase("s3")) {
throw new AnalysisException("Invalid export path. please use valid 's3://' path.");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java b/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
index f7fb0348dd..e9e853eb9f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/FeConstants.java
@@ -88,6 +88,7 @@ public class FeConstants {
public static String FS_PREFIX_GFS = "gfs";
public static String FS_PREFIX_JFS = "jfs";
public static String FS_PREFIX_HDFS = "hdfs";
+ public static String FS_PREFIX_VIEWFS = "viewfs";
public static String FS_PREFIX_FILE = "file";
public static final String INTERNAL_DB_NAME = "__internal_schema";
public static String TEMP_MATERIZLIZE_DVIEW_PREFIX = "internal_tmp_materialized_view_";
diff --git a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
index c2a070cdfa..3837a7eb95 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/fs/FileSystemFactory.java
@@ -65,7 +65,8 @@ public class FileSystemFactory {
} else {
fsType = FileSystemType.S3;
}
- } else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)) {
+ } else if (location.startsWith(FeConstants.FS_PREFIX_HDFS) || location.startsWith(FeConstants.FS_PREFIX_GFS)
+ || location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
fsType = FileSystemType.DFS;
} else if (location.startsWith(FeConstants.FS_PREFIX_OFS) || location.startsWith(FeConstants.FS_PREFIX_COSN)) {
// ofs:// and cosn:// use the same underlying file system: Tencent Cloud HDFS, aka CHDFS)) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
index d59c6bec7d..00188dc50b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java
@@ -484,6 +484,8 @@ public abstract class FileQueryScanNode extends FileScanNode {
return Optional.of(TFileType.FILE_S3);
} else if (location.startsWith(FeConstants.FS_PREFIX_HDFS)) {
return Optional.of(TFileType.FILE_HDFS);
+ } else if (location.startsWith(FeConstants.FS_PREFIX_VIEWFS)) {
+ return Optional.of(TFileType.FILE_HDFS);
} else if (location.startsWith(FeConstants.FS_PREFIX_COSN)) {
return Optional.of(TFileType.FILE_HDFS);
} else if (location.startsWith(FeConstants.FS_PREFIX_FILE)) {
diff --git a/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java b/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
index d25947e33b..997cf0cb2c 100644
--- a/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
+++ b/fs_brokers/apache_hdfs_broker/src/main/java/org/apache/doris/broker/hdfs/FileSystemManager.java
@@ -66,6 +66,7 @@ public class FileSystemManager {
.getLogger(FileSystemManager.class.getName());
// supported scheme
private static final String HDFS_SCHEME = "hdfs";
+ private static final String VIEWFS_SCHEME = "viewfs";
private static final String S3A_SCHEME = "s3a";
private static final String KS3_SCHEME = "ks3";
private static final String CHDFS_SCHEME = "ofs";
@@ -210,7 +211,7 @@ public class FileSystemManager {
"invalid path. scheme is null");
}
BrokerFileSystem brokerFileSystem = null;
- if (scheme.equals(HDFS_SCHEME)) {
+ if (scheme.equals(HDFS_SCHEME) || scheme.equals(VIEWFS_SCHEME)) {
brokerFileSystem = getDistributedFileSystem(path, properties);
} else if (scheme.equals(S3A_SCHEME)) {
brokerFileSystem = getS3AFileSystem(path, properties);
diff --git a/regression-test/data/external_table_p2/hive/test_viewfs_hive.out b/regression-test/data/external_table_p2/hive/test_viewfs_hive.out
new file mode 100644
index 0000000000..37e1df0ba0
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_viewfs_hive.out
@@ -0,0 +1,35 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !viewfs --
+1 Tom 48 \N male
+2 Jerry 35 \N male
+3 Frank 25 \N male
+4 Ada 22 \N female
+
+-- !viewfs_partition1 --
+1 Tom 48 \N male 20230101
+2 Jerry 35 \N male 20230101
+3 Frank 25 \N male 20230201
+4 Ada 22 \N female 20230201
+
+-- !viewfs_partition2 --
+1 Tom 48 \N male 20230101
+2 Jerry 35 \N male 20230101
+
+-- !viewfs_partition3 --
+3 Frank 25 \N male 20230201
+4 Ada 22 \N female 20230201
+
+-- !viewfs_mixed_partition1 --
+1 Tom 48 \N male 20230101
+2 Jerry 35 \N male 20230101
+3 Frank 25 \N male 20230201
+4 Ada 22 \N female 20230201
+
+-- !viewfs_mixed_partition2 --
+1 Tom 48 \N male 20230101
+2 Jerry 35 \N male 20230101
+
+-- !viewfs_mixed_partition3 --
+3 Frank 25 \N male 20230201
+4 Ada 22 \N female 20230201
+
diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
index cb7239f2ff..86f3f43f2d 100644
--- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
+++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out
@@ -42,3 +42,6 @@
-- !row_cross_pages --
25001 25001 25001
+-- !viewfs --
+25001 25001 25001
+
diff --git a/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy b/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy
new file mode 100644
index 0000000000..9ccea773ed
--- /dev/null
+++ b/regression-test/suites/external_table_p2/hive/test_viewfs_hive.groovy
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_viewfs_hive", "p2,external,hive,external_remote,external_remote_hive") {
+
+ String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+ String nameNodeHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String hdfsPort = context.config.otherConfigs.get("extHdfsPort")
+ String catalog_name = "test_viewfs_hive"
+
+ sql """drop catalog if exists ${catalog_name};"""
+
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ 'type'='hms',
+ 'hadoop.username' = 'hadoop',
+ 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}',
+ 'fs.viewfs.mounttable.my-cluster.link./ns1' = 'hdfs://${nameNodeHost}:${hdfsPort}/',
+ 'fs.viewfs.mounttable.my-cluster.homedir' = '/ns1',
+ 'fs.defaultFS' = 'viewfs://my-cluster'
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """ use viewfs """
+
+ // The location of table is on viewfs.
+ qt_viewfs """ select * from test_viewfs order by id"""
+
+ // The location of partition table is on viewfs.
+ qt_viewfs_partition1 """ select * from test_viewfs_partition order by id"""
+ qt_viewfs_partition2 """ select * from test_viewfs_partition where part_col = 20230101 order by id"""
+ qt_viewfs_partition3 """ select * from test_viewfs_partition where part_col = 20230201 order by id"""
+
+ // The partitioned table contains a mix of hdfs and viewfs partition locations.
+ qt_viewfs_mixed_partition1 """ select * from test_viewfs_mixed_partition order by id"""
+ qt_viewfs_mixed_partition2 """ select * from test_viewfs_mixed_partition where part_col = 20230101 order by id"""
+ qt_viewfs_mixed_partition3 """ select * from test_viewfs_mixed_partition where part_col = 20230201 order by id"""
+ }
+}
diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
index ec3fb41d95..853b5d2f4d 100644
--- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
+++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy
@@ -53,5 +53,14 @@ suite("test_tvf_p2", "p2,external,tvf,external_remote,external_remote_tvf") {
"uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/row_cross_pages.parquet",
"format" = "parquet",
"fs.defaultFS" = "hdfs://${nameNodeHost}:${hdfsPort}")"""
+
+ // viewfs
+ qt_viewfs """select count(id), count(m1), count(m2)
+ from hdfs(
+ "uri" = "viewfs://my-cluster/ns1/catalog/tvf/parquet/row_cross_pages.parquet",
+ "format" = "parquet",
+ "fs.defaultFS" = "viewfs://my-cluster",
+ "fs.viewfs.mounttable.my-cluster.link./ns1" = "hdfs://${nameNodeHost}:${hdfsPort}/",
+ "fs.viewfs.mounttable.my-cluster.homedir" = "/ns1")"""
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]