This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 1489e3cfbf [Fix](file system) Make the constructor of `XxxFileSystem` a private method (#15889)
1489e3cfbf is described below

commit 1489e3cfbf4ad4abc8ca76fa15aeeabe881cb786
Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com>
AuthorDate: Fri Jan 13 15:32:16 2023 +0800

    [Fix](file system) Make the constructor of `XxxFileSystem` a private method (#15889)

    Since FileSystem inherits std::enable_shared_from_this, it is dangerous to create a raw
    pointer to a FileSystem. To avoid this, the constructor of each XxxFileSystem is made
    private and the static method create(...) is used to obtain a new FileSystem object.
---
 be/src/io/file_factory.cpp                         |  8 ++--
 be/src/io/fs/broker_file_system.cpp                |  7 ++++
 be/src/io/fs/broker_file_system.h                  |  8 +++-
 be/src/io/fs/file_system.h                         |  6 +--
 be/src/io/fs/hdfs_file_system.cpp                  |  5 +++
 be/src/io/fs/hdfs_file_system.h                    |  6 ++-
 be/src/io/fs/local_file_system.cpp                 |  7 +++-
 be/src/io/fs/local_file_system.h                   |  5 ++-
 be/src/io/fs/path.h                                |  6 ---
 be/src/io/fs/s3_file_system.cpp                    |  5 +++
 be/src/io/fs/s3_file_system.h                      |  5 ++-
 be/src/olap/data_dir.cpp                           |  2 +-
 be/src/olap/storage_policy_mgr.cpp                 |  2 +-
 be/src/service/brpc_conflict.h                     | 48 ----------------------
 be/test/io/cache/remote_file_cache_test.cpp        |  2 +-
 be/test/olap/remote_rowset_gc_test.cpp             |  2 +-
 be/test/olap/rowset/beta_rowset_test.cpp           |  2 +-
 be/test/olap/tablet_cooldown_test.cpp              |  2 +-
 be/test/vec/exec/parquet/parquet_reader_test.cpp   |  2 +-
 be/test/vec/exec/parquet/parquet_thrift_test.cpp   |  8 ++--
 .../sql-manual/sql-functions/table-functions/s3.md | 43 ++++++++++++++++++-
 .../sql-manual/sql-functions/table-functions/s3.md | 45 +++++++++++++++++++-
 22 files changed, 145 insertions(+), 81 deletions(-)

diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp
index 4fbc6dc9f6..62696552a5 100644
--- a/be/src/io/file_factory.cpp
+++ b/be/src/io/file_factory.cpp
@@ -210,7 +210,7 @@ Status FileFactory::create_hdfs_reader(const THdfsParams& hdfs_params, const std
                                        io::FileReaderSPtr* reader,
                                        const io::FileReaderOptions& reader_options,
                                        IOContext* io_ctx) {
-    hdfs_file_system->reset(new io::HdfsFileSystem(hdfs_params, ""));
+    *hdfs_file_system = io::HdfsFileSystem::create(hdfs_params, "");
     RETURN_IF_ERROR((std::static_pointer_cast<io::HdfsFileSystem>(*hdfs_file_system))->connect());
     RETURN_IF_ERROR((*hdfs_file_system)->open_file(path, reader_options, reader, io_ctx));
     return Status::OK();
@@ -235,7 +235,7 @@ Status FileFactory::create_s3_reader(const std::map<std::string, std::string>& p
     }
     S3Conf s3_conf;
     RETURN_IF_ERROR(ClientFactory::convert_properties_to_s3_conf(prop, s3_uri, &s3_conf));
-    s3_file_system->reset(new io::S3FileSystem(s3_conf, ""));
+    *s3_file_system = io::S3FileSystem::create(s3_conf, "");
     RETURN_IF_ERROR((std::static_pointer_cast<io::S3FileSystem>(*s3_file_system))->connect());
     RETURN_IF_ERROR((*s3_file_system)->open_file(s3_uri.get_key(), reader_options, reader, io_ctx));
     return Status::OK();
@@ -248,8 +248,8 @@ Status FileFactory::create_broker_reader(const TNetworkAddress& broker_addr,
                                          io::FileReaderSPtr* reader,
                                          const io::FileReaderOptions& reader_options,
                                          IOContext* io_ctx) {
-    broker_file_system->reset(
-            new io::BrokerFileSystem(broker_addr, prop, file_description.file_size));
+    *broker_file_system =
+            io::BrokerFileSystem::create(broker_addr, prop, file_description.file_size);
     RETURN_IF_ERROR(
             (std::static_pointer_cast<io::BrokerFileSystem>(*broker_file_system))->connect());
     RETURN_IF_ERROR((*broker_file_system)
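The motivation in one self-contained sketch (illustrative class, not Doris code): when a type derives from std::enable_shared_from_this, calling shared_from_this() is only valid on an object that is already owned by a std::shared_ptr; on a raw, manually created object it throws std::bad_weak_ptr (and was undefined behaviour before C++17).

```cpp
#include <memory>

// Illustrative stand-in for FileSystem; not Doris code.
struct Fs : std::enable_shared_from_this<Fs> {
    std::shared_ptr<Fs> self() { return shared_from_this(); }
};

int main() {
    auto owned = std::make_shared<Fs>();
    auto ok = owned->self();        // fine: *owned is already managed by a shared_ptr

    Fs* raw = new Fs();             // the "dangerous" case this commit rules out
    try {
        auto bad = raw->self();     // throws std::bad_weak_ptr (since C++17)
    } catch (const std::bad_weak_ptr&) {
        // no shared_ptr owns *raw, so shared_from_this() cannot work
    }
    delete raw;
    return 0;
}
```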
diff --git a/be/src/io/fs/broker_file_system.cpp b/be/src/io/fs/broker_file_system.cpp
index 4407f3d686..875eb0829b 100644
--- a/be/src/io/fs/broker_file_system.cpp
+++ b/be/src/io/fs/broker_file_system.cpp
@@ -56,6 +56,13 @@ inline const std::string& client_id(const TNetworkAddress& addr) {
 }
 #endif
 
+std::shared_ptr<BrokerFileSystem> BrokerFileSystem::create(
+        const TNetworkAddress& broker_addr, const std::map<std::string, std::string>& broker_prop,
+        size_t file_size) {
+    return std::shared_ptr<BrokerFileSystem>(
+            new BrokerFileSystem(broker_addr, broker_prop, file_size));
+}
+
 BrokerFileSystem::BrokerFileSystem(const TNetworkAddress& broker_addr,
                                    const std::map<std::string, std::string>& broker_prop,
                                    size_t file_size)
diff --git a/be/src/io/fs/broker_file_system.h b/be/src/io/fs/broker_file_system.h
index ec091ec577..bf55d49a53 100644
--- a/be/src/io/fs/broker_file_system.h
+++ b/be/src/io/fs/broker_file_system.h
@@ -24,8 +24,9 @@ namespace doris {
 namespace io {
 
 class BrokerFileSystem final : public RemoteFileSystem {
 public:
-    BrokerFileSystem(const TNetworkAddress& broker_addr,
-                     const std::map<std::string, std::string>& broker_prop, size_t file_size);
+    static std::shared_ptr<BrokerFileSystem> create(
+            const TNetworkAddress& broker_addr,
+            const std::map<std::string, std::string>& broker_prop, size_t file_size);
 
     ~BrokerFileSystem() override = default;
 
@@ -66,6 +67,9 @@ public:
     Status get_client(std::shared_ptr<BrokerServiceConnection>* client) const;
 
 private:
+    BrokerFileSystem(const TNetworkAddress& broker_addr,
+                     const std::map<std::string, std::string>& broker_prop, size_t file_size);
+
     const TNetworkAddress& _broker_addr;
     const std::map<std::string, std::string>& _broker_prop;
     size_t _file_size;
diff --git a/be/src/io/fs/file_system.h b/be/src/io/fs/file_system.h
index 4598ff32d2..f1fd331038 100644
--- a/be/src/io/fs/file_system.h
+++ b/be/src/io/fs/file_system.h
@@ -44,9 +44,6 @@ enum class FileSystemType : uint8_t {
 
 class FileSystem : public std::enable_shared_from_this<FileSystem> {
 public:
-    FileSystem(Path&& root_path, ResourceId&& resource_id, FileSystemType type)
-            : _root_path(std::move(root_path)), _resource_id(std::move(resource_id)), _type(type) {}
-
     virtual ~FileSystem() = default;
 
     DISALLOW_COPY_AND_ASSIGN(FileSystem);
@@ -81,6 +78,9 @@ public:
     const FileSystemType type() const { return _type; }
 
 protected:
+    FileSystem(Path&& root_path, ResourceId&& resource_id, FileSystemType type)
+            : _root_path(std::move(root_path)), _resource_id(std::move(resource_id)), _type(type) {}
+
     Path _root_path;
     ResourceId _resource_id;
     FileSystemType _type;
diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp
index 0cbfc32f2a..d623a8b245 100644
--- a/be/src/io/fs/hdfs_file_system.cpp
+++ b/be/src/io/fs/hdfs_file_system.cpp
@@ -61,6 +61,11 @@ private:
     void _clean_oldest();
 };
 
+std::shared_ptr<HdfsFileSystem> HdfsFileSystem::create(const THdfsParams& hdfs_params,
+                                                       const std::string& path) {
+    return std::shared_ptr<HdfsFileSystem>(new HdfsFileSystem(hdfs_params, path));
+}
+
 HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, const std::string& path)
         : RemoteFileSystem(path, "", FileSystemType::HDFS),
           _hdfs_params(hdfs_params),
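Reduced to a standalone sketch, this is the shape every XxxFileSystem now follows (illustrative name and parameters; the real classes take Doris-specific arguments and derive from FileSystem/RemoteFileSystem): the constructor is private, so create() is the only way to obtain an instance, and every instance is therefore owned by a std::shared_ptr from the moment it exists. Note that std::make_shared cannot be used inside create() because it has no access to the private constructor, which is why the patch uses std::shared_ptr(new ...).

```cpp
#include <memory>
#include <string>
#include <utility>

// Illustrative sketch of the pattern, not an actual Doris class.
class MyFileSystem : public std::enable_shared_from_this<MyFileSystem> {
public:
    static std::shared_ptr<MyFileSystem> create(std::string root_path) {
        // std::make_shared<MyFileSystem>(...) would not compile here:
        // it cannot call the private constructor.
        return std::shared_ptr<MyFileSystem>(new MyFileSystem(std::move(root_path)));
    }

    // Safe by construction: every instance is shared_ptr-owned.
    std::shared_ptr<MyFileSystem> getptr() { return shared_from_this(); }

private:
    explicit MyFileSystem(std::string root_path) : _root_path(std::move(root_path)) {}

    std::string _root_path;
};

// Usage:
//   auto fs = MyFileSystem::create("/data");   // OK
//   MyFileSystem on_stack("/data");            // error: constructor is private
```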
diff --git a/be/src/io/fs/hdfs_file_system.h b/be/src/io/fs/hdfs_file_system.h
index 49f0cb6a25..9e5edf6752 100644
--- a/be/src/io/fs/hdfs_file_system.h
+++ b/be/src/io/fs/hdfs_file_system.h
@@ -81,7 +81,9 @@ private:
 
 class HdfsFileSystem final : public RemoteFileSystem {
 public:
-    HdfsFileSystem(const THdfsParams& hdfs_params, const std::string& path);
+    static std::shared_ptr<HdfsFileSystem> create(const THdfsParams& hdfs_params,
+                                                  const std::string& path);
+
     ~HdfsFileSystem() override;
 
     Status create_file(const Path& path, FileWriterPtr* writer) override;
@@ -119,6 +121,8 @@ public:
     HdfsFileSystemHandle* get_handle();
 
 private:
+    HdfsFileSystem(const THdfsParams& hdfs_params, const std::string& path);
+
     Path _covert_path(const Path& path) const;
     const THdfsParams& _hdfs_params;
     std::string _namenode;
diff --git a/be/src/io/fs/local_file_system.cpp b/be/src/io/fs/local_file_system.cpp
index ea5efe17c4..9cbe0c0814 100644
--- a/be/src/io/fs/local_file_system.cpp
+++ b/be/src/io/fs/local_file_system.cpp
@@ -24,6 +24,11 @@ namespace doris {
 
 namespace io {
 
+std::shared_ptr<LocalFileSystem> LocalFileSystem::create(Path path, ResourceId resource_id) {
+    return std::shared_ptr<LocalFileSystem>(
+            new LocalFileSystem(std::move(path), std::move(resource_id)));
+}
+
 LocalFileSystem::LocalFileSystem(Path root_path, ResourceId resource_id)
         : FileSystem(std::move(root_path), std::move(resource_id), FileSystemType::LOCAL) {}
 
@@ -144,7 +149,7 @@ Status LocalFileSystem::list(const Path& path, std::vector<Path>* files) {
     return Status::OK();
 }
 
-static FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+static FileSystemSPtr local_fs = io::LocalFileSystem::create("");
 
 const FileSystemSPtr& global_local_filesystem() {
     return local_fs;
diff --git a/be/src/io/fs/local_file_system.h b/be/src/io/fs/local_file_system.h
index d4b8e2e044..bc9faa1d98 100644
--- a/be/src/io/fs/local_file_system.h
+++ b/be/src/io/fs/local_file_system.h
@@ -25,7 +25,8 @@ namespace io {
 
 class LocalFileSystem final : public FileSystem {
 public:
-    LocalFileSystem(Path root_path, ResourceId resource_id = ResourceId());
+    static std::shared_ptr<LocalFileSystem> create(Path path, ResourceId resource_id = "");
+
     ~LocalFileSystem() override;
 
     Status create_file(const Path& path, FileWriterPtr* writer) override;
@@ -52,6 +53,8 @@ public:
     Status list(const Path& path, std::vector<Path>* files) override;
 
 private:
+    LocalFileSystem(Path root_path, ResourceId resource_id = ResourceId());
+
     Path absolute_path(const Path& path) const;
 };
diff --git a/be/src/io/fs/path.h b/be/src/io/fs/path.h
index 695d51063a..9832ea6322 100644
--- a/be/src/io/fs/path.h
+++ b/be/src/io/fs/path.h
@@ -28,11 +28,5 @@ inline Path operator/(Path&& lhs, const Path& rhs) {
     return std::move(lhs /= rhs);
 }
 
-struct PathHasher {
-    std::size_t operator()(const doris::io::Path& k) const {
-        return std::hash<std::string>()(k.filename().native());
-    }
-};
-
 } // namespace io
 } // namespace doris
diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp
index 5796e68f4e..7f872586c6 100644
--- a/be/src/io/fs/s3_file_system.cpp
+++ b/be/src/io/fs/s3_file_system.cpp
@@ -49,6 +49,11 @@ namespace io {
 }
 #endif
 
+std::shared_ptr<S3FileSystem> S3FileSystem::create(S3Conf s3_conf, ResourceId resource_id) {
+    return std::shared_ptr<S3FileSystem>(
+            new S3FileSystem(std::move(s3_conf), std::move(resource_id)));
+}
+
 S3FileSystem::S3FileSystem(S3Conf s3_conf, ResourceId resource_id)
         : RemoteFileSystem(
                   fmt::format("{}/{}/{}", s3_conf.endpoint, s3_conf.bucket, s3_conf.prefix),
diff --git a/be/src/io/fs/s3_file_system.h b/be/src/io/fs/s3_file_system.h
index 015b75908c..93472bebd2 100644
--- a/be/src/io/fs/s3_file_system.h
+++ b/be/src/io/fs/s3_file_system.h
@@ -35,7 +35,8 @@ namespace io {
 // This class is thread-safe.(Except `set_xxx` method)
 class S3FileSystem final : public RemoteFileSystem {
 public:
-    S3FileSystem(S3Conf s3_conf, ResourceId resource_id);
+    static std::shared_ptr<S3FileSystem> create(S3Conf s3_conf, ResourceId resource_id);
+
     ~S3FileSystem() override;
 
     Status create_file(const Path& path, FileWriterPtr* writer) override;
@@ -78,6 +79,8 @@ public:
     std::string get_key(const Path& path) const;
 
 private:
+    S3FileSystem(S3Conf s3_conf, ResourceId resource_id);
+
     S3Conf _s3_conf;
 
     // FIXME(cyx): We can use std::atomic<std::shared_ptr> since c++20.
diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp
index 0b70977721..e29fca2276 100644
--- a/be/src/olap/data_dir.cpp
+++ b/be/src/olap/data_dir.cpp
@@ -73,7 +73,7 @@ DataDir::DataDir(const std::string& path, int64_t capacity_bytes,
                  TStorageMedium::type storage_medium, TabletManager* tablet_manager,
                  TxnManager* txn_manager)
         : _path(path),
-          _fs(std::make_shared<io::LocalFileSystem>(path)),
+          _fs(io::LocalFileSystem::create(path)),
           _capacity_bytes(capacity_bytes),
           _available_bytes(0),
          _disk_capacity_bytes(0),
diff --git a/be/src/olap/storage_policy_mgr.cpp b/be/src/olap/storage_policy_mgr.cpp
index ce9241c7f8..58da3ff03e 100644
--- a/be/src/olap/storage_policy_mgr.cpp
+++ b/be/src/olap/storage_policy_mgr.cpp
@@ -67,7 +67,7 @@ void StoragePolicyMgr::periodic_put(const std::string& name, const StoragePolicy
         s3_conf.connect_timeout_ms = policy->s3_conn_timeout_ms;
         s3_conf.bucket = policy->bucket;
         s3_conf.prefix = policy->root_path;
-        s3_fs = std::make_shared<io::S3FileSystem>(std::move(s3_conf), name);
+        s3_fs = io::S3FileSystem::create(std::move(s3_conf), name);
         io::FileSystemMap::instance()->insert(name, s3_fs);
         _policy_map.emplace(name, policy);
     } else if (it->second->md5_sum != policy->md5_sum) {
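With that in place, the remote call sites touched by this patch all follow the same three steps: create the filesystem, connect, then register or hand it out. A condensed sketch of that shape, with placeholder configuration values (S3Conf, ResourceId, connect() and FileSystemMap are the types and calls visible in the surrounding hunks; this is not a verbatim excerpt):

```cpp
// Sketch of the updated call-site pattern; the values are placeholders.
Status register_s3_fs(S3Conf s3_conf, const io::ResourceId& resource_id) {
    auto s3_fs = io::S3FileSystem::create(std::move(s3_conf), resource_id);
    RETURN_IF_ERROR(s3_fs->connect()); // connecting is still the caller's responsibility
    io::FileSystemMap::instance()->insert(resource_id, s3_fs);
    return Status::OK();
}
```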
diff --git a/be/src/service/brpc_conflict.h b/be/src/service/brpc_conflict.h
deleted file mode 100644
index 35ef1b815c..0000000000
--- a/be/src/service/brpc_conflict.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-// This file is used to fixed macro conflict between butil and gutil
-// and this file must put the first include in source file
-
-#include "gutil/macros.h"
-// Macros in the guti/macros.h, use butil's define
-#ifdef DISALLOW_IMPLICIT_CONSTRUCTORS
-#undef DISALLOW_IMPLICIT_CONSTRUCTORS
-#endif
-
-#ifdef arraysize
-#undef arraysize
-#endif
-
-#ifdef ARRAY_SIZE
-#undef ARRAY_SIZE
-#endif
-
-#undef OVERRIDE
-#undef FINAL
-
-// use be/src/gutil/integral_types.h override butil/basictypes.h
-#include "gutil/integral_types.h"
-#ifdef BASE_INTEGRAL_TYPES_H_
-#define BUTIL_BASICTYPES_H_
-#endif
-
-#ifdef DEBUG_MODE
-#undef DEBUG_MODE
-#endif
diff --git a/be/test/io/cache/remote_file_cache_test.cpp b/be/test/io/cache/remote_file_cache_test.cpp
index 596ff88725..176e306a8a 100644
--- a/be/test/io/cache/remote_file_cache_test.cpp
+++ b/be/test/io/cache/remote_file_cache_test.cpp
@@ -163,7 +163,7 @@ protected:
         // just use to create s3 filesystem, otherwise won't use cache
         S3Conf s3_conf;
         std::shared_ptr<io::S3FileSystem> fs =
-                std::make_shared<io::S3FileSystem>(std::move(s3_conf), resource_id);
+                io::S3FileSystem::create(std::move(s3_conf), resource_id);
         rowset.rowset_meta()->set_resource_id(resource_id);
         rowset.rowset_meta()->set_num_segments(1);
         rowset.rowset_meta()->set_fs(fs);
diff --git a/be/test/olap/remote_rowset_gc_test.cpp b/be/test/olap/remote_rowset_gc_test.cpp
index f24a1a1086..e4d6eea11d 100644
--- a/be/test/olap/remote_rowset_gc_test.cpp
+++ b/be/test/olap/remote_rowset_gc_test.cpp
@@ -52,7 +52,7 @@ public:
         s3_conf.region = config::test_s3_region;
         s3_conf.bucket = config::test_s3_bucket;
         s3_conf.prefix = "remote_rowset_gc_test";
-        auto s3_fs = std::make_shared<io::S3FileSystem>(std::move(s3_conf), kResourceId);
+        auto s3_fs = io::S3FileSystem::create(std::move(s3_conf), kResourceId);
         ASSERT_TRUE(s3_fs->connect().ok());
         io::FileSystemMap::instance()->insert(kResourceId, s3_fs);
diff --git a/be/test/olap/rowset/beta_rowset_test.cpp b/be/test/olap/rowset/beta_rowset_test.cpp
index e696995d25..8766faab6c 100644
--- a/be/test/olap/rowset/beta_rowset_test.cpp
+++ b/be/test/olap/rowset/beta_rowset_test.cpp
@@ -232,7 +232,7 @@ TEST_F(BetaRowsetTest, ReadTest) {
     s3_conf.prefix = "prefix";
     io::ResourceId resource_id = "test_resourse_id";
     std::shared_ptr<io::S3FileSystem> fs =
-            std::make_shared<io::S3FileSystem>(std::move(s3_conf), resource_id);
+            io::S3FileSystem::create(std::move(s3_conf), resource_id);
     Aws::SDKOptions aws_options = Aws::SDKOptions {};
     Aws::InitAPI(aws_options);
     // failed to head object
diff --git a/be/test/olap/tablet_cooldown_test.cpp b/be/test/olap/tablet_cooldown_test.cpp
index 15521a6f12..d243f32a4e 100644
--- a/be/test/olap/tablet_cooldown_test.cpp
+++ b/be/test/olap/tablet_cooldown_test.cpp
@@ -51,7 +51,7 @@ public:
         s3_conf.region = config::test_s3_region;
         s3_conf.bucket = config::test_s3_bucket;
         s3_conf.prefix = "tablet_cooldown_test";
-        auto s3_fs = std::make_shared<io::S3FileSystem>(std::move(s3_conf), kResourceId);
+        auto s3_fs = io::S3FileSystem::create(std::move(s3_conf), kResourceId);
         ASSERT_TRUE(s3_fs->connect().ok());
         io::FileSystemMap::instance()->insert(kResourceId, s3_fs);
diff --git a/be/test/vec/exec/parquet/parquet_reader_test.cpp b/be/test/vec/exec/parquet/parquet_reader_test.cpp
index 3046e48dc5..6379513e3b 100644
--- a/be/test/vec/exec/parquet/parquet_reader_test.cpp
+++ b/be/test/vec/exec/parquet/parquet_reader_test.cpp
@@ -89,7 +89,7 @@ TEST_F(ParquetReaderTest, normal) {
     DescriptorTbl::create(&obj_pool, t_desc_table, &desc_tbl);
     auto slot_descs = desc_tbl->get_tuple_descriptor(0)->slots();
-    io::FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
     io::FileReaderSPtr reader;
     local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", &reader,
                         nullptr);
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
index 9bc9eb97c0..f41b2da5d9 100644
--- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp
+++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
@@ -47,7 +47,7 @@ public:
 };
 
 TEST_F(ParquetThriftReaderTest, normal) {
-    io::FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
     io::FileReaderSPtr reader;
     auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/localfile.parquet",
                                   &reader, nullptr);
@@ -79,7 +79,7 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) {
     //   `friend` map<string,string>,
     //   `mark` struct<math:int,english:int>)
-    io::FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
     io::FileReaderSPtr reader;
     auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/hive-complex.parquet",
                                   &reader, nullptr);
@@ -283,7 +283,7 @@ static void read_parquet_data_and_check(const std::string& parquet_file,
      * `list_string` array<string>) // 14
      */
-    io::FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
     io::FileReaderSPtr reader;
     auto st = local_fs->open_file(parquet_file, &reader, nullptr);
     EXPECT_TRUE(st.ok());
@@ -405,7 +405,7 @@ TEST_F(ParquetThriftReaderTest, group_reader) {
         lazy_read_ctx.all_read_columns.emplace_back(slot->col_name());
         read_columns.emplace_back(ParquetReadColumn(7, slot->col_name()));
     }
-    io::FileSystemSPtr local_fs = std::make_shared<io::LocalFileSystem>("");
+    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
     io::FileReaderSPtr file_reader;
     auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
                                   &file_reader, nullptr);
diff --git a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md
index 452fa13e90..46c917a069 100644
--- a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md
+++ b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md
@@ -57,6 +57,12 @@ Related parameters for accessing S3:
 - `secret_key`: (required)
 - `use_path_style`: (optional) default `false` . The S3 SDK uses the virtual-hosted style by default. However, some object storage systems may not be enabled or support virtual-hosted style access. At this time, we can add the `use_path_style` parameter to force the use of path style access method.
 
+> Note: URI currently supports three SCHEMA: http://, https:// and s3://.
+> 1. If you use http:// or https://, you will decide whether to use the 'path style' to access s3 based on the 'use_path_style' parameter
+> 2. If you use s3://, you will use the "virtual-hosted style' to access the s3, 'use_path_style' parameter is invalid.
+>
+> For detailed use cases, you can refer to Best Practice at the bottom.
+
 
 file format parameter:
 - `format`: (required) Currently support `csv/csv_with_names/csv_with_names_and_types/json/parquet/orc`
 
@@ -99,8 +105,43 @@ MySQL [(none)]> Desc function s3("uri" = "http://127.0.0.1:9312/test2/student1.c
     s3, table-valued-function, tvf
 
 ### Best Practice
+Since the S3 table-valued-function does not know the table schema in advance, it will read the file first to parse out the table schema.
+
+**Usage of different uri schemas**
+Example of http:// 、https://
+
+```sql
+// Note how to write your bucket of URI and set the 'use_path_style' parameter, as well as http://.
+// Because of "use_path_style"="true", s3 will be accessed in 'path style'.
+select * from s3(
+    "URI" = "https://endpoint/bucket/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv",
+    "use_path_style"="true");
+
+// Note how to write your bucket of URI and set the 'use_path_style' parameter, as well as http://.
+// Because of "use_path_style"="false", s3 will be accessed in 'virtual-hosted style'.
+select * from s3(
+    "URI" = "https://bucket.endpoint/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv",
+    "use_path_style"="false");
+```
+
+Example of s3://:
+
+```sql
+// Note how to write your bucket of URI, no need to set 'use_path_style'.
+// s3 will be accessed in 'virtual-hosted style'.
+select * from s3(
+    "URI" = "s3://bucket.endpoint/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv");
+```
-Since the S3 table-valued-function does not know the table schema in advance, it will read the file first to parse out the table schema. Specifically, for different file formats:
 
 **csv foramt**
 
 `csv` format: Read the file on S3 and process it as a csv file, read the first line in the file to parse out the table schema. The number of columns in the first line of the file `n` will be used as the number of columns in the table schema, and the column names of the table schema will be automatically named `c1, c2, ..., cn`, and the column type is set to `String` , for example:
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md
index a27b8c56df..a77b6007ff 100644
--- a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md
@@ -58,7 +58,13 @@ S3 tvf中的每一个参数都是一个 `"key"="value"` 对。
 - `uri`: (必填) 访问S3的uri,S3表函数会根据 `use_path_style` 参数来决定是否使用 path style 访问方式,默认为 virtual-hosted style 方式
 - `access_key`: (必填)
 - `secret_key`: (必填)
-- `use_path_style`:(选填) 默认为`false` 。S3 SDK 默认使用 virtual-hosted style 方式。但某些对象存储系统可能没开启或没支持virtual-hosted style 方式的访问,此时我们可以添加 use_path_style 参数来强制使用 path style 方式。
+- `use_path_style`:(选填) 默认为`false` 。S3 SDK 默认使用 virtual-hosted style 方式。但某些对象存储系统可能没开启或没支持virtual-hosted style 方式的访问,此时我们可以添加 use_path_style 参数来强制使用 path style 方式。比如 `minio`默认情况下只允许`path style`访问方式,所以在访问minio时要加上`use_path_style=true`。
+
+> 注意:uri目前支持三种schema:http://, https:// 和 s3://
+> 1. 如果使用http://或https://, 则会根据 'use_path_style' 参数来决定是否使用'path style'方式访问s3
+> 2. 如果使用s3://, 则都使用 'virtual-hosted style' 方式访问s3, 'use_path_style'参数无效。
+>
+> 详细使用案例可以参考最下方 Best Practice。
 
 文件格式参数:
 - `format`:(必填) 目前支持 `csv/csv_with_names/csv_with_names_and_types/json/parquet/orc`
 
@@ -102,9 +108,44 @@ MySQL [(none)]> Desc function s3("uri" = "http://127.0.0.1:9312/test2/student1.c
 
 ### Best Practice
-由于S3 table-valued-function事先并不知道table schema,所以会先读一遍文件来解析出table schema,具体到不同的文件格式来说:
+**不同url schema的写法**
+http:// 、https:// 使用示例:
+```sql
+// 注意URI bucket写法以及use_path_style参数设置,http同理。
+// 由于设置了"use_path_style"="true", 所以将采用path style方式访问s3。
+select * from s3(
+    "URI" = "https://endpoint/bucket/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv",
+    "use_path_style"="true");
+
+// 注意URI bucket写法以及use_path_style参数设置,http同理。
+// 由于设置了"use_path_style"="false", 所以将采用virtual-hosted style方式访问s3。
+select * from s3(
+    "URI" = "https://bucket.endpoint/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv",
+    "use_path_style"="false");
+```
+
+s3:// 使用示例:
+
+```sql
+// 注意URI bucket写法, 无需设置use_path_style参数。
+// 将采用virtual-hosted style方式访问s3。
+select * from s3(
+    "URI" = "s3://bucket.endpoint/file/student.csv",
+    "ACCESS_KEY"= "ak",
+    "SECRET_KEY" = "sk",
+    "FORMAT" = "csv");
+```
+
 **csv foramt**
+由于S3 table-valued-function事先并不知道table schema,所以会先读一遍文件来解析出table schema。
+
 `csv` 格式: S3 table-valued-function 读取S3上的文件并当作csv文件来处理,读取文件中的第一行用于解析table schema。文件第一行的列个数`n`将作为table schema的列个数,table schema的列名则自动取名为`c1, c2, ..., cn` ,列类型都设置为 `String`, 举例:
 
 student1.csv文件内容为:

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org