This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 449c595f9d [opt](FileReader) InMemoryReader is only used in s3 (#23486) 449c595f9d is described below commit 449c595f9d862e5d2e2fcbb79f0d06122fbf2b76 Author: Ashin Gau <ashin...@users.noreply.github.com> AuthorDate: Wed Aug 30 20:43:39 2023 +0800 [opt](FileReader) InMemoryReader is only used in s3 (#23486) If the file size is < 8MB, the file is read entirely into memory; this idea comes from https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md#s3inmemoryinputstream. However, in some cases we only read one or two columns of a file, and the bytes actually required are only 1% of the file, resulting in a multi-fold increase in the amount of data read. Therefore, `InMemoryReader` is now only used for object storage, and the threshold is reduced. --- be/src/common/config.cpp | 3 +++ be/src/common/config.h | 3 +++ be/src/io/fs/buffered_reader.cpp | 8 ++++++-- be/src/io/fs/buffered_reader.h | 4 +--- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 2c6dc99876..3ada8cb82f 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -850,6 +850,9 @@ DEFINE_Validator(jsonb_type_length_soft_limit_bytes, // is greater than object_pool_buffer_size, release the object in the unused_object_pool. 
DEFINE_Int32(object_pool_buffer_size, "100"); +// Threshold of reading a small file into memory +DEFINE_mInt32(in_memory_file_size, "1048576"); // 1MB + // ParquetReaderWrap prefetch buffer size DEFINE_Int32(parquet_reader_max_buffer_size, "50"); // Max size of parquet page header in bytes diff --git a/be/src/common/config.h b/be/src/common/config.h index e66c618369..beca3957ef 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -897,6 +897,9 @@ DECLARE_mInt32(jsonb_type_length_soft_limit_bytes); // is greater than object_pool_buffer_size, release the object in the unused_object_pool. DECLARE_Int32(object_pool_buffer_size); +// Threshold fo reading a small file into memory +DECLARE_mInt32(in_memory_file_size); + // ParquetReaderWrap prefetch buffer size DECLARE_Int32(parquet_reader_max_buffer_size); // Max size of parquet page header in bytes diff --git a/be/src/io/fs/buffered_reader.cpp b/be/src/io/fs/buffered_reader.cpp index 726f5331c9..00f88c7515 100644 --- a/be/src/io/fs/buffered_reader.cpp +++ b/be/src/io/fs/buffered_reader.cpp @@ -778,8 +778,12 @@ Status DelegateReader::create_file_reader(RuntimeProfile* profile, io::FileReaderSPtr reader; RETURN_IF_ERROR(FileFactory::create_file_reader(system_properties, file_description, reader_options, file_system, &reader, profile)); - if (reader->size() < IN_MEMORY_FILE_SIZE) { - *file_reader = std::make_shared<InMemoryFileReader>(reader); + if (reader->size() < config::in_memory_file_size) { + if (typeid_cast<io::S3FileReader*>(reader.get())) { + *file_reader = std::make_shared<InMemoryFileReader>(reader); + } else { + *file_reader = std::move(reader); + } } else if (access_mode == AccessMode::SEQUENTIAL) { bool is_thread_safe = false; if (typeid_cast<io::S3FileReader*>(reader.get())) { diff --git a/be/src/io/fs/buffered_reader.h b/be/src/io/fs/buffered_reader.h index 34e1ff34fe..25a6811330 100644 --- a/be/src/io/fs/buffered_reader.h +++ b/be/src/io/fs/buffered_reader.h @@ -238,7 +238,7 @@ private: /** 
* Create a file reader suitable for accessing scenarios: - * 1. When file size < 8MB, create InMemoryFileReader file reader + * 1. When file size < config::in_memory_file_size, create InMemoryFileReader file reader * 2. When reading sequential file(csv/json), create PrefetchBufferedReader * 3. When reading random access file(parquet/orc), create normal file reader */ @@ -246,8 +246,6 @@ class DelegateReader { public: enum AccessMode { SEQUENTIAL, RANDOM }; - static constexpr size_t IN_MEMORY_FILE_SIZE = 8 * 1024 * 1024; - static Status create_file_reader( RuntimeProfile* profile, const FileSystemProperties& system_properties, const FileDescription& file_description, const io::FileReaderOptions& reader_options, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org