This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch cq_nested_column_prune_external_table in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
commit 8c0930fd239b2434b53bb1979075ce637e75a467 Author: kakachen <[email protected]> AuthorDate: Tue Oct 21 21:14:22 2025 +0800 Init commit. --- c++/include/orc/Reader.hh | 22 +++++++++++++- c++/src/ColumnReader.cc | 8 +++-- c++/src/Options.hh | 15 +++++++++ c++/src/Reader.cc | 77 +++++++++++++++++++++++++++++++++++++++++++++-- c++/src/Reader.hh | 2 +- c++/src/TypeImpl.cc | 1 + 6 files changed, 119 insertions(+), 6 deletions(-) diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh index 96a431faae8..1ab60b1d2c5 100644 --- a/c++/include/orc/Reader.hh +++ b/c++/include/orc/Reader.hh @@ -178,7 +178,6 @@ namespace orc { /** * Selects which type ids to read. The root type is always 0 and the * rest of the types are labeled in a preorder traversal of the tree. - * The parent types are automatically selected, but the children are not. * * This option clears any previous setting of the selected columns or * types. @@ -206,6 +205,17 @@ namespace orc { */ RowReaderOptions& filter(const std::list<std::string>& filterColNames); + /** + * Selects which type ids to filter. The root type is always 0 and the + * rest of the types are labeled in a preorder traversal of the tree. + * + * This option clears any previous setting of the filter columns or + * types. + * @param types a list of the type ids to filter + * @return this + */ + RowReaderOptions& filterTypes(const std::list<uint64_t>& types); + /** * A map type of <typeId, ReadIntent>. */ @@ -308,6 +318,16 @@ namespace orc { */ const std::list<std::string>& getFilterColNames() const; + /** + * Were the filter type ids set? + */ + bool getFilterTypeIdsSet() const; + + /** + * Get the list of filter type ids. + */ + const std::list<uint64_t>& getFilterTypeIds() const; + /** * Get the start of the range for the data being processed. * @return if not set, return 0 diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index 4dde99917c0..bc1179ff13f 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -1158,9 +1158,13 @@ namespace orc { } uint64_t StructColumnReader::skip(uint64_t numValues, const ReadPhase& readPhase) { - if (readPhase.contains(this->type.getReaderCategory())) { - numValues = ColumnReader::skip(numValues, readPhase); + if (!readPhase.contains(this->type.getReaderCategory())) { + return 0; } + // if (readPhase.contains(this->type.getReaderCategory())) { + // numValues = ColumnReader::skip(numValues, readPhase); + // } + numValues = ColumnReader::skip(numValues, readPhase); for (auto& ptr : children) { if (shouldProcessChild(ptr->getType().getReaderCategory(), readPhase)) { ptr->skip(numValues, readPhase); diff --git a/c++/src/Options.hh b/c++/src/Options.hh index 40a583e9397..5cd2b49ffac 100644 --- a/c++/src/Options.hh +++ b/c++/src/Options.hh @@ -238,6 +238,13 @@ namespace orc { return *this; } + RowReaderOptions& RowReaderOptions::filterTypes(const std::list<uint64_t>& types) { + privateBits->filter = ColumnFilter_TYPE_IDS; + privateBits->filterColumnIndexes.assign(types.begin(), types.end()); + privateBits->filterColumnNames.clear(); + return *this; + } + RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { privateBits->dataStart = offset; privateBits->dataLength = length; @@ -268,6 +275,14 @@ namespace orc { return privateBits->filterColumnNames; } + bool RowReaderOptions::getFilterTypeIdsSet() const { + return privateBits->filter == ColumnFilter_TYPE_IDS; + } + + const std::list<uint64_t>& RowReaderOptions::getFilterTypeIds() const { + return privateBits->filterColumnIndexes; + } + uint64_t RowReaderOptions::getOffset() const { return privateBits->dataStart; } diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 82cd3e0f0d7..d5f83fc3590 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -176,22 +176,36 @@ namespace orc { field != options.getInclude().end(); ++field) { updateSelectedByFieldId(selectedColumns, *field); } + selectParents(selectedColumns, *contents->schema.get()); } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { for (std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); field != options.getIncludeNames().end(); ++field) { updateSelectedByName(selectedColumns, *field); } + selectParents(selectedColumns, *contents->schema.get()); } else if (options.getTypeIdsSet()) { const RowReaderOptions::IdReadIntentMap idReadIntentMap = options.getIdReadIntentMap(); for (std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); typeId != options.getInclude().end(); ++typeId) { - updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap); + if (!idReadIntentMap.empty()) { + updateSelectedByTypeId(selectedColumns, *typeId, idReadIntentMap); + selectParents(selectedColumns, *contents->schema.get()); + } else { + if (*typeId < selectedColumns.size()) { + // Only select the specified type ID, do not automatically select children or parents + selectedColumns[*typeId] = true; + } else { + std::stringstream buffer; + buffer << "Invalid type id selected " << *typeId << " out of " << selectedColumns.size(); + throw ParseError(buffer.str()); + } + } } } else { // default is to select all columns std::fill(selectedColumns.begin(), selectedColumns.end(), true); + selectParents(selectedColumns, *contents->schema.get()); } - selectParents(selectedColumns, *contents->schema.get()); selectedColumns[0] = true; // column 0 is selected by default } @@ -374,6 +388,65 @@ namespace orc { processChildren(type); } + startReadPhase = ReadPhase::LEADERS; + readerContext = std::unique_ptr<ReaderContext>(new ReaderContext()); + readerContext->setFilterCallback(std::move(filterColIds), filter); + } else if (opts.getFilterTypeIdsSet()) { + // Handle filter by type IDs + const std::list<uint64_t>& filterTypeIds = opts.getFilterTypeIds(); + + for (const auto& typeId : filterTypeIds) { + if (typeId >= idTypeMap.size()) { + std::stringstream buffer; + buffer << "Invalid type id for filter " << typeId << " out of " << idTypeMap.size(); + throw ParseError(buffer.str()); + } + + Type* type = idTypeMap[typeId]; + + // Process current node and all its parent nodes + // Set FILTER_CHILD for leaf nodes and FILTER_PARENT for non-leaf nodes + Type* current = type; + while (current != nullptr) { + if (current->getSubtypeCount() == 0) { + current->setReaderCategory(ReaderCategory::FILTER_CHILD); + } else if (current->getKind() == TypeKind::LIST + || current->getKind() == TypeKind::MAP) { + current->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT); + } else { + current->setReaderCategory(ReaderCategory::FILTER_PARENT); + } + filterColIds.emplace(current->getColumnId()); + current = current->getParent(); + } + + // Process all child nodes of the current node + // For child nodes: set FILTER_CHILD if it's a leaf, FILTER_PARENT if it has children + std::function<void(Type*)> processChildren = [&processChildren](Type* node) { + if (node == nullptr) return; + + // Iterate through all child nodes + for (int i = 0; i < node->getSubtypeCount(); ++i) { + Type* child = node->getSubtype(i); + if (child->getSubtypeCount() == 0) { + // Leaf node (no children) + child->setReaderCategory(ReaderCategory::FILTER_CHILD); + } else if (child->getKind() == TypeKind::LIST + || child->getKind() == TypeKind::MAP) { + child->setReaderCategory(ReaderCategory::FILTER_COMPOUND_ELEMENT); + // Recursively process its children + processChildren(child); + } else { + // Non-leaf node (has children) + child->setReaderCategory(ReaderCategory::FILTER_PARENT); + // Recursively process its children + processChildren(child); + } + } + }; + processChildren(type); + } + startReadPhase = ReadPhase::LEADERS; readerContext = std::unique_ptr<ReaderContext>(new ReaderContext()); readerContext->setFilterCallback(std::move(filterColIds), filter); diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh index 1fd429be86a..05990851a74 100644 --- a/c++/src/Reader.hh +++ b/c++/src/Reader.hh @@ -212,7 +212,7 @@ namespace orc { ReadPhase startReadPhase; bool needsFollowColumnsRead; - std::map<uint64_t, const Type*> idTypeMap; + std::map<uint64_t, Type*> idTypeMap; std::map<std::string, Type*> nameTypeMap; std::vector<std::string> columns; diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index a85e32bcd1f..a418fa94f24 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -591,6 +591,7 @@ namespace orc { throw NotImplementedYet("Unknown type kind"); } result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); + result->setReaderCategory(fileType->getReaderCategory()); for (auto& key : fileType->getAttributeKeys()) { const auto& value = fileType->getAttributeValue(key); result->setAttribute(key, value); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
