This is an automated email from the ASF dual-hosted git repository. caiconghui pushed a commit to branch orc-2.1 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
commit 6ad6875ad4eb32a6b4aebb214454e8ed900de712
Author: caiconghui1 <caicongh...@jd.com>
AuthorDate: Tue Oct 29 21:01:23 2024 +0800

    [opt] support orc merge multi stripe index io
---
 c++/include/orc/OrcFile.hh |  2 +-
 c++/src/Reader.cc          | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index c52b66b7210..d840e67cf28 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -64,7 +64,7 @@ namespace orc {
     virtual const std::string& getName() const = 0;
 
     virtual void beforeReadStripe(std::unique_ptr<StripeInformation> currentStripeInformation,
-                                  std::vector<bool> selectedColumns);
+                                  std::vector<bool>& selectedColumns);
   };
 
   /**
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index b8e1a914215..88f1a50305d 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -540,19 +540,36 @@ namespace orc {
     // reset all previous row indexes
     rowIndexes.clear();
     bloomFilterIndex.clear();
-
-    // obtain row indexes for selected columns
+    static const uint64_t MAX_READ_STRIPE_INDEX_SIZE_FOR_BUFFER = 1024 * 1024;
+    const char* stripeIndexBuffer = nullptr;
     uint64_t offset = currentStripeInfo.offset();
+    uint64_t startOffset = offset;
+    uint64_t stripeIndexSize = currentStripeInfo.indexlength();
+    std::unique_ptr<SeekableFileInputStream> stripeIndexInputStream = nullptr;
+    if (stripeIndexSize < MAX_READ_STRIPE_INDEX_SIZE_FOR_BUFFER) {
+      stripeIndexInputStream = std::unique_ptr<SeekableFileInputStream>(new SeekableFileInputStream(
+          contents->stream.get(), startOffset, stripeIndexSize, *contents->pool, stripeIndexSize));
+      int size = 0;
+      const void* buffer = nullptr;
+      if (!stripeIndexInputStream->Next(&buffer, &size) || static_cast<uint64_t>(size) != stripeIndexSize) {
+        throw ParseError("Failed to read the stripe index");
+      }
+      stripeIndexBuffer = static_cast<const char*>(buffer);
+    }
+    // obtain row indexes for selected columns
     for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
       const proto::Stream& pbStream = currentStripeFooter.streams(i);
       uint64_t colId = pbStream.column();
       if (selectedColumns[colId] && pbStream.has_kind() &&
           (pbStream.kind() == proto::Stream_Kind_ROW_INDEX ||
            pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) {
+        std::unique_ptr<SeekableInputStream> inputStream = stripeIndexBuffer ? std::unique_ptr<SeekableInputStream>(
+            new SeekableArrayInputStream(stripeIndexBuffer + offset - startOffset, pbStream.length()))
+            : std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(contents->stream.get(), offset,
+                pbStream.length(), *contents->pool));
         std::unique_ptr<SeekableInputStream> inStream = createDecompressor(
             getCompression(),
-            std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
-                contents->stream.get(), offset, pbStream.length(), *contents->pool)),
+            std::move(inputStream),
             getCompressionSize(), *contents->pool, contents->readerMetrics);
 
         if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
@@ -1761,6 +1778,6 @@ namespace orc {
   };
 
   void InputStream::beforeReadStripe(std::unique_ptr<StripeInformation> currentStripeInformation,
-                                     std::vector<bool> selectedColumns) {}
+                                     std::vector<bool>& selectedColumns) {}
 
 } // namespace orc

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org