This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch cq_test
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git

commit c5c6731dc246a4dc579ced4fe89efde3952696f6
Author: kakachen <[email protected]>
AuthorDate: Thu Oct 16 11:33:07 2025 +0800

    Init commit.
---
 c++/include/orc/Reader.hh | 22 +++++++++++++++++++
 c++/src/ColumnReader.cc   |  8 +++++--
 c++/src/Options.hh        | 15 +++++++++++++
 c++/src/Reader.cc         | 55 +++++++++++++++++++++++++++++++++++++++++------
 c++/src/Reader.hh         |  2 +-
 c++/src/TypeImpl.cc       |  1 +
 6 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 96a431faae8..1fa3b9bf733 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -206,6 +206,18 @@ namespace orc {
      */
     RowReaderOptions& filter(const std::list<std::string>& filterColNames);
 
+    /**
+     * Selects which type ids to filter. The root type is always 0 and the
+     * rest of the types are labeled in a preorder traversal of the tree.
+     * The parent types are automatically selected, but the children are not.
+     *
+     * This option clears any previous setting of the filter columns or
+     * types.
+     * @param types a list of the type ids to filter
+     * @return this
+     */
+    RowReaderOptions& filterTypes(const std::list<uint64_t>& types);
+
     /**
      * A map type of <typeId, ReadIntent>.
      */
@@ -308,6 +320,16 @@ namespace orc {
      */
     const std::list<std::string>& getFilterColNames() const;
 
+    /**
+     * Were the filter type ids set?
+     */
+    bool getFilterTypeIdsSet() const;
+
+    /**
+     * Get the list of filter type ids.
+     */
+    const std::list<uint64_t>& getFilterTypeIds() const;
+
     /**
      * Get the start of the range for the data being processed.
      * @return if not set, return 0
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 4dde99917c0..bc1179ff13f 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -1158,9 +1158,13 @@ namespace orc {
   }
 
   uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) {
-    if (readPhase.contains(this->type.getReaderCategory())) {
-      numValues = ColumnReader::skip(numValues, readPhase);
+    if (!readPhase.contains(this->type.getReaderCategory())) {
+      return 0;
     }
+    // if (readPhase.contains(this->type.getReaderCategory())) {
+    //   numValues = ColumnReader::skip(numValues, readPhase);
+    // }
+    numValues = ColumnReader::skip(numValues, readPhase);
     for (auto& ptr : children) {
       if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) {
         ptr->skip(numValues, readPhase);
diff --git a/c++/src/Options.hh b/c++/src/Options.hh
index 40a583e9397..5cd2b49ffac 100644
--- a/c++/src/Options.hh
+++ b/c++/src/Options.hh
@@ -238,6 +238,13 @@ namespace orc {
     return *this;
   }
 
+  RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>& types) {
+    privateBits->filter = ColumnFilter_TYPE_IDS;
+    privateBits->filterColumnIndexes.assign(types.begin(), types.end());
+    privateBits->filterColumnNames.clear();
+    return *this;
+  }
+
   RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) {
     privateBits->dataStart = offset;
     privateBits->dataLength = length;
@@ -268,6 +275,14 @@ namespace orc {
     return privateBits->filterColumnNames;
   }
 
+  bool RowReaderOptions::getFilterTypeIdsSet() const {
+    return privateBits->filter == ColumnFilter_TYPE_IDS;
+  }
+
+  const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const {
+    return privateBits->filterColumnIndexes;
+  }
+
   uint64_t RowReaderOptions::getOffset() const {
     return privateBits->dataStart;
   }
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 82cd3e0f0d7..673b668b1df 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -337,9 +337,6 @@ namespace orc {
         while (current != nullptr) {
           if (current->getSubtypeCount() == 0) {
             current->setReaderCategory(ReaderCategory::FILTER_CHILD);
-          } else if (current->getKind() == TypeKind::LIST
-                     || current->getKind() == TypeKind::MAP) {
-            current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
           } else {
             current->setReaderCategory(ReaderCategory::FILTER_PARENT);
           }
@@ -358,11 +355,57 @@ namespace orc {
             if (child->getSubtypeCount() == 0) {
               // Leaf node (no children)
               child->setReaderCategory(ReaderCategory::FILTER_CHILD);
-            } else if (child->getKind() == TypeKind::LIST
-                       || child->getKind() == TypeKind::MAP) {
-              child->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT);
+            } else {
+              // Non-leaf node (has children)
+              child->setReaderCategory(ReaderCategory::FILTER_PARENT);
               // Recursively process its children
               processChildren(child);
+            }
+          }
+        };
+        processChildren(type);
+      }
+
+      startReadPhase = ReadPhase::LEADERS;
+      readerContext = std::unique_ptr<ReaderContext>(new ReaderContext());
+      readerContext->setFilterCallback(std::move(filterColIds), filter);
+    } else if (opts.getFilterTypeIdsSet()) {
+      // Handle filter by type IDs
+      const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds();
+
+      for (const auto& typeId : filterTypeIds) {
+        if (typeId >= idTypeMap.size()) {
+          std::stringstream buffer;
+          buffer << "Invalid type id for filter " << typeId << " out of " << idTypeMap.size();
+          throw ParseError(buffer.str());
+        }
+
+        Type* type = idTypeMap[typeId];
+
+        // Process current node and all its parent nodes
+        // Set FILTER_CHILD for leaf nodes and FILTER_PARENT for non-leaf nodes
+        Type* current = type;
+        while (current != nullptr) {
+          if (current->getSubtypeCount() == 0) {
+            current->setReaderCategory(ReaderCategory::FILTER_CHILD);
+          } else {
+            current->setReaderCategory(ReaderCategory::FILTER_PARENT);
+          }
+          filterColIds.emplace(current->getColumnId());
+          current = current->getParent();
+        }
+
+        // Process all child nodes of the current node
+        // For child nodes: set FILTER_CHILD if it's a leaf, FILTER_PARENT if it has children
+        std::function<void(Type*)> processChildren = [&processChildren](Type* node) {
+          if (node == nullptr) return;
+
+          // Iterate through all child nodes
+          for (int i = 0; i < node->getSubtypeCount(); ++i) {
+            Type* child = node->getSubtype(i);
+            if (child->getSubtypeCount() == 0) {
+              // Leaf node (no children)
+              child->setReaderCategory(ReaderCategory::FILTER_CHILD);
             } else {
               // Non-leaf node (has children)
               child->setReaderCategory(ReaderCategory::FILTER_PARENT);
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 1fd429be86a..05990851a74 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -212,7 +212,7 @@ namespace orc {
     ReadPhase startReadPhase;
     bool needsFollowColumnsRead;
 
-    std::map<uint64_t, const Type*> idTypeMap;
+    std::map<uint64_t, Type*> idTypeMap;
     std::map<std::string, Type*> nameTypeMap;
     std::vector<std::string> columns;
 
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index a85e32bcd1f..a418fa94f24 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -591,6 +591,7 @@ namespace orc {
         throw NotImplementedYet("Unknown type kind");
     }
     result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
+    result->setReaderCategory(fileType->getReaderCategory());
     for (auto& key : fileType->getAttributeKeys()) {
       const auto& value = fileType->getAttributeValue(key);
       result->setAttribute(key, value);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to