This is an automated email from the ASF dual-hosted git repository.

caiconghui pushed a commit to branch orc-2.1
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git

commit 6ad6875ad4eb32a6b4aebb214454e8ed900de712
Author: caiconghui1 <caicongh...@jd.com>
AuthorDate: Tue Oct 29 21:01:23 2024 +0800

     [opt] support orc merge multi stripe index io
---
 c++/include/orc/OrcFile.hh |  2 +-
 c++/src/Reader.cc          | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index c52b66b7210..d840e67cf28 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -64,7 +64,7 @@ namespace orc {
     virtual const std::string& getName() const = 0;
 
     virtual void beforeReadStripe(std::unique_ptr<StripeInformation> currentStripeInformation,
-                                  std::vector<bool> selectedColumns);
+                                  std::vector<bool>& selectedColumns);
   };
 
   /**
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index b8e1a914215..88f1a50305d 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -540,19 +540,36 @@ namespace orc {
     // reset all previous row indexes
     rowIndexes.clear();
     bloomFilterIndex.clear();
-
-    // obtain row indexes for selected columns
+    static const uint64_t MAX_READ_STRIPE_INDEX_SIZE_FOR_BUFFER = 1024 * 1024;
+    const char* stripeIndexBuffer = nullptr;
     uint64_t offset = currentStripeInfo.offset();
+    uint64_t startOffset = offset;
+    uint64_t stripeIndexSize = currentStripeInfo.indexlength();
+    std::unique_ptr<SeekableFileInputStream> stripeIndexInputStream = nullptr;
+    if (stripeIndexSize < MAX_READ_STRIPE_INDEX_SIZE_FOR_BUFFER) {
+      stripeIndexInputStream = std::unique_ptr<SeekableFileInputStream>(new SeekableFileInputStream(
+        contents->stream.get(), startOffset, stripeIndexSize, *contents->pool, stripeIndexSize));
+      int size = 0;
+      const void* buffer = nullptr;
+      if (!stripeIndexInputStream->Next(&buffer, &size) || static_cast<uint64_t>(size) != stripeIndexSize) {
+        throw ParseError("Failed to read the stripe index");
+      }
+      stripeIndexBuffer = static_cast<const char*>(buffer);
+    }
+    // obtain row indexes for selected columns
     for (int i = 0; i < currentStripeFooter.streams_size(); ++i) {
       const proto::Stream& pbStream = currentStripeFooter.streams(i);
       uint64_t colId = pbStream.column();
       if (selectedColumns[colId] && pbStream.has_kind() &&
           (pbStream.kind() == proto::Stream_Kind_ROW_INDEX ||
            pbStream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8)) {
+        std::unique_ptr<SeekableInputStream> inputStream = stripeIndexBuffer ? std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(stripeIndexBuffer + offset - startOffset, pbStream.length()))
+          : std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(contents->stream.get(), offset,
+            pbStream.length(), *contents->pool));
         std::unique_ptr<SeekableInputStream> inStream = createDecompressor(
             getCompression(),
-            std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
-                contents->stream.get(), offset, pbStream.length(), *contents->pool)),
+            std::move(inputStream),
             getCompressionSize(), *contents->pool, contents->readerMetrics);
 
         if (pbStream.kind() == proto::Stream_Kind_ROW_INDEX) {
@@ -1761,6 +1778,6 @@ namespace orc {
   };
 
   void InputStream::beforeReadStripe(std::unique_ptr<StripeInformation> currentStripeInformation,
-                                     std::vector<bool> selectedColumns) {}
+                                     std::vector<bool>& selectedColumns) {}
 
 }  // namespace orc


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to