This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new a44a2745634 [Fix](parquet-reader) Fix and optimize parquet min-max filtering. (#39375) a44a2745634 is described below commit a44a2745634a089581c99d394f1e333e3ebcb400 Author: Qi Chen <kaka11.c...@gmail.com> AuthorDate: Thu Aug 15 14:12:54 2024 +0800 [Fix](parquet-reader) Fix and optimize parquet min-max filtering. (#39375) Backport #38277. --- be/src/vec/exec/format/parquet/parquet_common.cpp | 340 +++++++++++++++++++++ be/src/vec/exec/format/parquet/parquet_common.h | 131 +++++++- be/src/vec/exec/format/parquet/parquet_pred_cmp.h | 142 +++++++-- .../exec/format/parquet/vparquet_page_index.cpp | 4 +- be/src/vec/exec/format/parquet/vparquet_reader.cpp | 103 ++++++- be/src/vec/exec/format/parquet/vparquet_reader.h | 3 + .../parquet/parquet_corrupt_statistics_test.cpp | 134 ++++++++ .../vec/exec/parquet/parquet_statistics_test.cpp | 155 ++++++++++ be/test/vec/exec/parquet/parquet_version_test.cpp | 221 ++++++++++++++ 9 files changed, 1207 insertions(+), 26 deletions(-) diff --git a/be/src/vec/exec/format/parquet/parquet_common.cpp b/be/src/vec/exec/format/parquet/parquet_common.cpp index 33e9f11242b..59e12fcc71a 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.cpp +++ b/be/src/vec/exec/format/parquet/parquet_common.cpp @@ -162,4 +162,344 @@ bool ColumnSelectVector::can_filter_all(size_t remaining_num_values) { void ColumnSelectVector::skip(size_t num_values) { _filter_map_index += num_values; } + +ParsedVersion::ParsedVersion(std::string application, std::optional<std::string> version, + std::optional<std::string> app_build_hash) + : _application(std::move(application)), + _version(std::move(version)), + _app_build_hash(std::move(app_build_hash)) {} + +bool ParsedVersion::operator==(const ParsedVersion& other) const { + return _application == other._application && _version == other._version && + _app_build_hash == other._app_build_hash; +} + +bool ParsedVersion::operator!=(const 
ParsedVersion& other) const { + return !(*this == other); +} + +size_t ParsedVersion::hash() const { + std::hash<std::string> hasher; + return hasher(_application) ^ (_version ? hasher(*_version) : 0) ^ + (_app_build_hash ? hasher(*_app_build_hash) : 0); +} + +std::string ParsedVersion::to_string() const { + return "ParsedVersion(application=" + _application + + ", semver=" + (_version ? *_version : "null") + + ", app_build_hash=" + (_app_build_hash ? *_app_build_hash : "null") + ")"; +} + +Status VersionParser::parse(const std::string& created_by, + std::unique_ptr<ParsedVersion>* parsed_version) { + static const std::string FORMAT = + "(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?"; + static const std::regex PATTERN(FORMAT); + + std::smatch matcher; + if (!std::regex_match(created_by, matcher, PATTERN)) { + return Status::InternalError(fmt::format("Could not parse created_by: {}, using format: {}", + created_by, FORMAT)); + } + + std::string application = matcher[1].str(); + if (application.empty()) { + return Status::InternalError("application cannot be null or empty"); + } + std::optional<std::string> semver = + matcher[2].str().empty() ? std::nullopt : std::optional<std::string>(matcher[2].str()); + std::optional<std::string> app_build_hash = + matcher[3].str().empty() ? 
std::nullopt : std::optional<std::string>(matcher[3].str()); + *parsed_version = std::make_unique<ParsedVersion>(application, semver, app_build_hash); + return Status::OK(); +} + +SemanticVersion::SemanticVersion(int major, int minor, int patch) + : _major(major), + _minor(minor), + _patch(patch), + _prerelease(false), + _unknown(std::nullopt), + _pre(std::nullopt), + _build_info(std::nullopt) {} + +#ifdef BE_TEST +SemanticVersion::SemanticVersion(int major, int minor, int patch, bool has_unknown) + : _major(major), + _minor(minor), + _patch(patch), + _prerelease(has_unknown), + _unknown(std::nullopt), + _pre(std::nullopt), + _build_info(std::nullopt) {} +#endif + +SemanticVersion::SemanticVersion(int major, int minor, int patch, + std::optional<std::string> unknown, std::optional<std::string> pre, + std::optional<std::string> build_info) + : _major(major), + _minor(minor), + _patch(patch), + _prerelease(unknown.has_value() && !unknown.value().empty()), + _unknown(std::move(unknown)), + _pre(pre.has_value() ? std::optional<Prerelease>(Prerelease(std::move(pre.value()))) + : std::nullopt), + _build_info(std::move(build_info)) {} + +Status SemanticVersion::parse(const std::string& version, + std::unique_ptr<SemanticVersion>* semantic_version) { + static const std::regex pattern(R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)"); + std::smatch match; + + if (!std::regex_match(version, match, pattern)) { + return Status::InternalError(version + " does not match format"); + } + + int major = std::stoi(match[1].str()); + int minor = std::stoi(match[2].str()); + int patch = std::stoi(match[3].str()); + std::optional<std::string> unknown = + match[4].str().empty() ? std::nullopt : std::optional<std::string>(match[4].str()); + std::optional<std::string> prerelease = + match[5].str().empty() ? std::nullopt : std::optional<std::string>(match[5].str()); + std::optional<std::string> build_info = + match[6].str().empty() ? 
std::nullopt : std::optional<std::string>(match[6].str()); + if (major < 0 || minor < 0 || patch < 0) { + return Status::InternalError("major({}), minor({}), and patch({}) must all be >= 0", major, + minor, patch); + } + *semantic_version = + std::make_unique<SemanticVersion>(major, minor, patch, unknown, prerelease, build_info); + return Status::OK(); +} + +int SemanticVersion::compare_to(const SemanticVersion& other) const { + if (int cmp = _compare_integers(_major, other._major); cmp != 0) { + return cmp; + } + if (int cmp = _compare_integers(_minor, other._minor); cmp != 0) { + return cmp; + } + if (int cmp = _compare_integers(_patch, other._patch); cmp != 0) { + return cmp; + } + if (int cmp = _compare_booleans(other._prerelease, _prerelease); cmp != 0) { + return cmp; + } + if (_pre.has_value()) { + if (other._pre.has_value()) { + return _pre.value().compare_to(other._pre.value()); + } else { + return -1; + } + } else if (other._pre.has_value()) { + return 1; + } + return 0; +} + +bool SemanticVersion::operator==(const SemanticVersion& other) const { + return compare_to(other) == 0; +} + +bool SemanticVersion::operator!=(const SemanticVersion& other) const { + return !(*this == other); +} + +std::string SemanticVersion::to_string() const { + std::string result = + std::to_string(_major) + "." + std::to_string(_minor) + "." 
+ std::to_string(_patch); + if (_prerelease && _unknown) result += _unknown.value(); + if (_pre) result += _pre.value().to_string(); + if (_build_info) result += _build_info.value(); + return result; +} + +SemanticVersion::NumberOrString::NumberOrString(const std::string& value_string) + : _original(value_string) { + const static std::regex NUMERIC("\\d+"); + _is_numeric = std::regex_match(_original, NUMERIC); + _number = -1; + if (_is_numeric) { + _number = std::stoi(_original); + } +} + +SemanticVersion::NumberOrString::NumberOrString(const NumberOrString& other) + : _original(other._original), _is_numeric(other._is_numeric), _number(other._number) {} + +int SemanticVersion::NumberOrString::compare_to(const SemanticVersion::NumberOrString& that) const { + if (this->_is_numeric != that._is_numeric) { + return this->_is_numeric ? -1 : 1; + } + + if (_is_numeric) { + return this->_number - that._number; + } + + return this->_original.compare(that._original); +} + +std::string SemanticVersion::NumberOrString::to_string() const { + return _original; +} + +bool SemanticVersion::NumberOrString::operator<(const SemanticVersion::NumberOrString& that) const { + return compare_to(that) < 0; +} + +bool SemanticVersion::NumberOrString::operator==( + const SemanticVersion::NumberOrString& that) const { + return compare_to(that) == 0; +} + +bool SemanticVersion::NumberOrString::operator!=( + const SemanticVersion::NumberOrString& that) const { + return !(*this == that); +} + +bool SemanticVersion::NumberOrString::operator>(const SemanticVersion::NumberOrString& that) const { + return compare_to(that) > 0; +} + +bool SemanticVersion::NumberOrString::operator<=( + const SemanticVersion::NumberOrString& that) const { + return !(*this > that); +} + +bool SemanticVersion::NumberOrString::operator>=( + const SemanticVersion::NumberOrString& that) const { + return !(*this < that); +} + +int SemanticVersion::_compare_integers(int x, int y) { + return (x < y) ? -1 : ((x == y) ? 
0 : 1); +} + +int SemanticVersion::_compare_booleans(bool x, bool y) { + return (x == y) ? 0 : (x ? 1 : -1); +} + +std::vector<std::string> SemanticVersion::Prerelease::_split(const std::string& s, + const std::regex& delimiter) { + std::sregex_token_iterator iter(s.begin(), s.end(), delimiter, -1); + std::sregex_token_iterator end; + std::vector<std::string> tokens(iter, end); + return tokens; +} + +SemanticVersion::Prerelease::Prerelease(std::string original) : _original(std::move(original)) { + static const std::regex DOT("\\."); + auto parts = _split(_original, DOT); + for (const auto& part : parts) { + NumberOrString number_or_string(part); + _identifiers.emplace_back(number_or_string); + } +} + +int SemanticVersion::Prerelease::compare_to(const Prerelease& that) const { + int size = std::min(this->_identifiers.size(), that._identifiers.size()); + for (int i = 0; i < size; ++i) { + int cmp = this->_identifiers[i].compare_to(that._identifiers[i]); + if (cmp != 0) { + return cmp; + } + } + return static_cast<int>(this->_identifiers.size()) - static_cast<int>(that._identifiers.size()); +} + +std::string SemanticVersion::Prerelease::to_string() const { + return _original; +} + +bool SemanticVersion::Prerelease::operator<(const Prerelease& that) const { + return compare_to(that) < 0; +} + +bool SemanticVersion::Prerelease::operator==(const Prerelease& that) const { + return compare_to(that) == 0; +} + +bool SemanticVersion::Prerelease::operator!=(const Prerelease& that) const { + return !(*this == that); +} + +bool SemanticVersion::Prerelease::operator>(const Prerelease& that) const { + return compare_to(that) > 0; +} + +bool SemanticVersion::Prerelease::operator<=(const Prerelease& that) const { + return !(*this > that); +} + +bool SemanticVersion::Prerelease::operator>=(const Prerelease& that) const { + return !(*this < that); +} + +const SemanticVersion CorruptStatistics::PARQUET_251_FIXED_VERSION(1, 8, 0); +const SemanticVersion 
CorruptStatistics::CDH_5_PARQUET_251_FIXED_START(1, 5, 0, std::nullopt, + "cdh5.5.0", std::nullopt); +const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_END(1, 5, 0); + +bool CorruptStatistics::should_ignore_statistics(const std::string& created_by, + tparquet::Type::type physical_type) { + if (physical_type != tparquet::Type::BYTE_ARRAY && + physical_type != tparquet::Type::FIXED_LEN_BYTE_ARRAY) { + // The bug only applies to binary columns + return false; + } + + if (created_by.empty()) { + // created_by is not populated + VLOG_DEBUG + << "Ignoring statistics because created_by is null or empty! See PARQUET-251 and " + "PARQUET-297"; + return true; + } + + Status status; + std::unique_ptr<ParsedVersion> parsed_version; + status = VersionParser::parse(created_by, &parsed_version); + if (!status.ok()) { + VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " + "PARQUET-251)." + " CreatedBy: " + << created_by << ", msg: " << status.msg(); + return true; + } + + if (parsed_version->application() != "parquet-mr") { + // Assume other applications don't have this bug + return false; + } + + if ((!parsed_version->version().has_value()) || parsed_version->version().value().empty()) { + VLOG_DEBUG << "Ignoring statistics because created_by did not contain a semver (see " + "PARQUET-251): " + << created_by; + return true; + } + + std::unique_ptr<SemanticVersion> semantic_version; + status = SemanticVersion::parse(parsed_version->version().value(), &semantic_version); + if (!status.ok()) { + VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " + "PARQUET-251)." 
+ " CreatedBy: " + << created_by << ", msg: " << status.msg(); + return true; + } + if (semantic_version->compare_to(PARQUET_251_FIXED_VERSION) < 0 && + !(semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_START) >= 0 && + semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_END) < 0)) { + VLOG_DEBUG + << "Ignoring statistics because this file was created prior to the fixed version, " + "see PARQUET-251"; + return true; + } + + // This file was created after the fix + return false; +} + } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 2cf745882ee..da374d5fe79 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -17,10 +17,12 @@ #pragma once +#include <gen_cpp/parquet_types.h> #include <stddef.h> #include <cstdint> #include <ostream> +#include <regex> #include <string> #include <vector> @@ -156,4 +158,131 @@ private: size_t _num_filtered; size_t _read_index; }; -} // namespace doris::vectorized \ No newline at end of file + +enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER }; + +enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN }; + +class ParsedVersion { +public: + ParsedVersion(std::string application, std::optional<std::string> version, + std::optional<std::string> app_build_hash); + + const std::string& application() const { return _application; } + + const std::optional<std::string>& version() const { return _version; } + + const std::optional<std::string>& app_build_hash() const { return _app_build_hash; } + + bool operator==(const ParsedVersion& other) const; + + bool operator!=(const ParsedVersion& other) const; + + size_t hash() const; + + std::string to_string() const; + +private: + std::string _application; + std::optional<std::string> _version; + std::optional<std::string> _app_build_hash; +}; + +class VersionParser { +public: + static Status parse(const std::string& created_by, + 
std::unique_ptr<ParsedVersion>* parsed_version); +}; + +class SemanticVersion { +public: + SemanticVersion(int major, int minor, int patch); + +#ifdef BE_TEST + SemanticVersion(int major, int minor, int patch, bool has_unknown); +#endif + + SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown, + std::optional<std::string> pre, std::optional<std::string> build_info); + + static Status parse(const std::string& version, + std::unique_ptr<SemanticVersion>* semantic_version); + + int compare_to(const SemanticVersion& other) const; + + bool operator==(const SemanticVersion& other) const; + + bool operator!=(const SemanticVersion& other) const; + + std::string to_string() const; + +private: + class NumberOrString { + public: + explicit NumberOrString(const std::string& value_string); + + NumberOrString(const NumberOrString& other); + + int compare_to(const NumberOrString& that) const; + std::string to_string() const; + + bool operator<(const NumberOrString& that) const; + bool operator==(const NumberOrString& that) const; + bool operator!=(const NumberOrString& that) const; + bool operator>(const NumberOrString& that) const; + bool operator<=(const NumberOrString& that) const; + bool operator>=(const NumberOrString& that) const; + + private: + std::string _original; + bool _is_numeric; + int _number; + }; + + class Prerelease { + public: + explicit Prerelease(std::string original); + + int compare_to(const Prerelease& that) const; + std::string to_string() const; + + bool operator<(const Prerelease& that) const; + bool operator==(const Prerelease& that) const; + bool operator!=(const Prerelease& that) const; + bool operator>(const Prerelease& that) const; + bool operator<=(const Prerelease& that) const; + bool operator>=(const Prerelease& that) const; + + const std::string& original() const { return _original; } + + private: + static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter); + + std::string 
_original; + std::vector<NumberOrString> _identifiers; + }; + + static int _compare_integers(int x, int y); + static int _compare_booleans(bool x, bool y); + + int _major; + int _minor; + int _patch; + bool _prerelease; + std::optional<std::string> _unknown; + std::optional<Prerelease> _pre; + std::optional<std::string> _build_info; +}; + +class CorruptStatistics { +public: + static bool should_ignore_statistics(const std::string& created_by, + tparquet::Type::type physical_type); + +private: + static const SemanticVersion PARQUET_251_FIXED_VERSION; + static const SemanticVersion CDH_5_PARQUET_251_FIXED_START; + static const SemanticVersion CDH_5_PARQUET_251_FIXED_END; +}; + +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h index 916f3f64ee6..316cbc5d716 100644 --- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h +++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h @@ -17,6 +17,7 @@ #pragma once +#include <cmath> #include <cstring> #include <vector> @@ -38,9 +39,7 @@ class ParquetPredicate { M(TYPE_TINYINT, tparquet::Type::INT32) \ M(TYPE_SMALLINT, tparquet::Type::INT32) \ M(TYPE_INT, tparquet::Type::INT32) \ - M(TYPE_BIGINT, tparquet::Type::INT64) \ - M(TYPE_FLOAT, tparquet::Type::FLOAT) \ - M(TYPE_DOUBLE, tparquet::Type::DOUBLE) + M(TYPE_BIGINT, tparquet::Type::INT64) private: struct ScanPredicate { @@ -132,6 +131,8 @@ private: CppType min_value; CppType max_value; + std::unique_ptr<std::string> encoded_min_copy; + std::unique_ptr<std::string> encoded_max_copy; tparquet::Type::type physical_type = col_schema->physical_type; switch (col_val_range.type()) { #define DISPATCH(REINTERPRET_TYPE, PARQUET_TYPE) \ @@ -142,24 +143,69 @@ private: break; FOR_REINTERPRET_TYPES(DISPATCH) #undef DISPATCH + case TYPE_FLOAT: + if constexpr (std::is_same_v<CppType, float>) { + if (col_schema->physical_type != tparquet::Type::FLOAT) { + return false; + } + min_value = 
*reinterpret_cast<const CppType*>(encoded_min.data()); + max_value = *reinterpret_cast<const CppType*>(encoded_max.data()); + if (std::isnan(min_value) || std::isnan(max_value)) { + return false; + } + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (std::signbit(min_value) == 0 && min_value == 0.0F) { + min_value = -0.0F; + } + if (std::signbit(max_value) != 0 && max_value == -0.0F) { + max_value = 0.0F; + } + break; + } else { + return false; + } + case TYPE_DOUBLE: + if constexpr (std::is_same_v<CppType, double>) { // CppType is double for TYPE_DOUBLE ranges; a float check would leave this branch dead and disable DOUBLE min-max filtering + if (col_schema->physical_type != tparquet::Type::DOUBLE) { + return false; + } + min_value = *reinterpret_cast<const CppType*>(encoded_min.data()); + max_value = *reinterpret_cast<const CppType*>(encoded_max.data()); + if (std::isnan(min_value) || std::isnan(max_value)) { + return false; + } + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (std::signbit(min_value) == 0 && min_value == 0.0) { + min_value = -0.0; + } + if (std::signbit(max_value) != 0 && max_value == -0.0) { + max_value = 0.0; + } + break; + } else { + return false; + } case TYPE_VARCHAR: [[fallthrough]]; case TYPE_CHAR: [[fallthrough]]; case TYPE_STRING: - // TODO: In parquet, min and max statistics may not be able to handle UTF8 correctly. - // Current processing method is using min_value and max_value statistics introduced by PARQUET-1025 if they are used. - // If not, current processing method is temporarily ignored. A better way is try to read min and max statistics - // if it contains only ASCII characters. 
- if (!use_min_max_value) { - return false; - } if constexpr (std::is_same_v<CppType, StringRef>) { - min_value = StringRef(encoded_min); - max_value = StringRef(encoded_max); + if (!use_min_max_value) { + encoded_min_copy = std::make_unique<std::string>(encoded_min); + encoded_max_copy = std::make_unique<std::string>(encoded_max); + if (!_try_read_old_utf8_stats(*encoded_min_copy, *encoded_max_copy)) { + return false; + } + min_value = StringRef(*encoded_min_copy); + max_value = StringRef(*encoded_max_copy); + } else { + min_value = StringRef(encoded_min); + max_value = StringRef(encoded_max); + } } else { return false; - }; + } break; case TYPE_DECIMALV2: if constexpr (std::is_same_v<CppType, DecimalV2Value>) { @@ -397,9 +443,64 @@ private: return predicates; } + static inline bool _is_ascii(uint8_t byte) { return byte < 128; } + + static int _common_prefix(const std::string& encoding_min, const std::string& encoding_max) { + int min_length = std::min(encoding_min.size(), encoding_max.size()); + int common_length = 0; + while (common_length < min_length && + encoding_min[common_length] == encoding_max[common_length]) { + common_length++; + } + return common_length; + } + + static bool _try_read_old_utf8_stats(std::string& encoding_min, std::string& encoding_max) { + if (encoding_min == encoding_max) { + // If min = max, then there is a single value only + // No need to modify, just use min + encoding_max = encoding_min; + return true; + } else { + int common_prefix_length = _common_prefix(encoding_min, encoding_max); + + // For min we can retain all-ASCII, because this produces a strictly lower value. + int min_good_length = common_prefix_length; + while (min_good_length < encoding_min.size() && + _is_ascii(static_cast<uint8_t>(encoding_min[min_good_length]))) { + min_good_length++; + } + + // For max we can be sure only of the part matching the min. 
When they differ, we can consider only one next, and only if both are ASCII + int max_good_length = common_prefix_length; + if (max_good_length < encoding_max.size() && max_good_length < encoding_min.size() && + _is_ascii(static_cast<uint8_t>(encoding_min[max_good_length])) && + _is_ascii(static_cast<uint8_t>(encoding_max[max_good_length]))) { + max_good_length++; + } + // Incrementing 127 would overflow. Incrementing within non-ASCII can have side-effects. + while (max_good_length > 0 && + (static_cast<uint8_t>(encoding_max[max_good_length - 1]) == 127 || + !_is_ascii(static_cast<uint8_t>(encoding_max[max_good_length - 1])))) { + max_good_length--; + } + if (max_good_length == 0) { + // We can return just min bound, but code downstream likely expects both are present or both are absent. + return false; + } + + encoding_min.resize(min_good_length); + encoding_max.resize(max_good_length); + if (max_good_length > 0) { + encoding_max[max_good_length - 1]++; + } + return true; + } + } + public: static bool filter_by_stats(const ColumnValueRangeType& col_val_range, - const FieldSchema* col_schema, bool is_set_min_max, + const FieldSchema* col_schema, bool ignore_min_max_stats, const std::string& encoded_min, const std::string& encoded_max, bool is_all_null, const cctz::time_zone& ctz, bool use_min_max_value = false) { @@ -416,11 +517,14 @@ public: return; } } - for (auto& filter : filters) { - need_filter |= _filter_by_min_max(range, filter, col_schema, encoded_min, - encoded_max, ctz, use_min_max_value); - if (need_filter) { - break; + if (!ignore_min_max_stats) { + for (auto& filter : filters) { + need_filter |= + _filter_by_min_max(range, filter, col_schema, encoded_min, + encoded_max, ctz, use_min_max_value); + if (need_filter) { + break; + } } } }, diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp index 35cf076318e..53fb1579c8e 100644 --- 
a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp @@ -68,7 +68,7 @@ Status PageIndex::collect_skipped_page_range(tparquet::ColumnIndex* column_index const int num_of_pages = column_index->null_pages.size(); for (int page_id = 0; page_id < num_of_pages; page_id++) { bool is_all_null = column_index->null_pages[page_id]; - if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, !is_all_null, + if (ParquetPredicate::filter_by_stats(col_val_range, col_schema, false, encoded_min_vals[page_id], encoded_max_vals[page_id], is_all_null, ctz)) { skipped_ranges.emplace_back(page_id); @@ -125,4 +125,4 @@ Status PageIndex::parse_offset_index(const tparquet::ColumnChunk& chunk, const u return Status::OK(); } -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 57396c349dd..84c572a3a2f 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -938,15 +938,53 @@ Status ParquetReader::_process_column_stat_filter(const std::vector<tparquet::Co continue; } const FieldSchema* col_schema = schema_desc.get_column(col_name); + bool ignore_min_max_stats = false; // Min-max of statistic is plain-encoded value - if (statistic.__isset.min_value) { + if (statistic.__isset.min_value && statistic.__isset.max_value) { + ColumnOrderName column_order = + col_schema->physical_type == tparquet::Type::INT96 || + col_schema->parquet_schema.logicalType.__isset.UNKNOWN + ? 
ColumnOrderName::UNDEFINED + : ColumnOrderName::TYPE_DEFINED_ORDER; + if ((statistic.min_value != statistic.max_value) && + (column_order != ColumnOrderName::TYPE_DEFINED_ORDER)) { + ignore_min_max_stats = true; + } *filter_group = ParquetPredicate::filter_by_stats( - slot_iter->second, col_schema, is_set_min_max, statistic.min_value, + slot_iter->second, col_schema, ignore_min_max_stats, statistic.min_value, statistic.max_value, is_all_null, *_ctz, true); } else { + if (statistic.__isset.min && statistic.__isset.max) { + bool max_equals_min = statistic.min == statistic.max; + + SortOrder sort_order = _determine_sort_order(col_schema->parquet_schema); + bool sort_orders_match = SortOrder::SIGNED == sort_order; + if (!sort_orders_match && !max_equals_min) { + ignore_min_max_stats = true; + } + bool should_ignore_corrupted_stats = false; + if (_ignored_stats.count(col_schema->physical_type) == 0) { + if (CorruptStatistics::should_ignore_statistics(_t_metadata->created_by, + col_schema->physical_type)) { + _ignored_stats[col_schema->physical_type] = true; + should_ignore_corrupted_stats = true; + } else { + _ignored_stats[col_schema->physical_type] = false; + } + } else if (_ignored_stats[col_schema->physical_type]) { + should_ignore_corrupted_stats = true; + } + if (should_ignore_corrupted_stats) { + ignore_min_max_stats = true; + } else if (!sort_orders_match && !max_equals_min) { + ignore_min_max_stats = true; + } + } else { + ignore_min_max_stats = true; + } *filter_group = ParquetPredicate::filter_by_stats( - slot_iter->second, col_schema, is_set_min_max, statistic.min, statistic.max, - is_all_null, *_ctz, false); + slot_iter->second, col_schema, ignore_min_max_stats, statistic.min, + statistic.max, is_all_null, *_ctz, false); } if (*filter_group) { break; @@ -1021,4 +1059,61 @@ void ParquetReader::_collect_profile_before_close() { _collect_profile(); } +SortOrder ParquetReader::_determine_sort_order(const tparquet::SchemaElement& parquet_schema) { + 
tparquet::Type::type physical_type = parquet_schema.type; + const tparquet::LogicalType& logical_type = parquet_schema.logicalType; + + // Assume string type is SortOrder::SIGNED, use ParquetPredicate::_try_read_old_utf8_stats() to handle it. + if (logical_type.__isset.STRING && (physical_type == tparquet::Type::BYTE_ARRAY || + physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY)) { + return SortOrder::SIGNED; + } + + if (logical_type.__isset.INTEGER) { + if (logical_type.INTEGER.isSigned) { + return SortOrder::SIGNED; + } else { + return SortOrder::UNSIGNED; + } + } else if (logical_type.__isset.DATE) { + return SortOrder::SIGNED; + } else if (logical_type.__isset.ENUM) { + return SortOrder::UNSIGNED; + } else if (logical_type.__isset.BSON) { + return SortOrder::UNSIGNED; + } else if (logical_type.__isset.JSON) { + return SortOrder::UNSIGNED; + } else if (logical_type.__isset.STRING) { + return SortOrder::UNSIGNED; + } else if (logical_type.__isset.DECIMAL) { + return SortOrder::UNKNOWN; + } else if (logical_type.__isset.MAP) { + return SortOrder::UNKNOWN; + } else if (logical_type.__isset.LIST) { + return SortOrder::UNKNOWN; + } else if (logical_type.__isset.TIME) { + return SortOrder::SIGNED; + } else if (logical_type.__isset.TIMESTAMP) { + return SortOrder::SIGNED; + } else if (logical_type.__isset.UNKNOWN) { + return SortOrder::UNKNOWN; + } else { + switch (physical_type) { + case tparquet::Type::BOOLEAN: + case tparquet::Type::INT32: + case tparquet::Type::INT64: + case tparquet::Type::FLOAT: + case tparquet::Type::DOUBLE: + return SortOrder::SIGNED; + case tparquet::Type::BYTE_ARRAY: + case tparquet::Type::FIXED_LEN_BYTE_ARRAY: + return SortOrder::UNSIGNED; + case tparquet::Type::INT96: + return SortOrder::UNKNOWN; + default: + return SortOrder::UNKNOWN; + } + } +} + } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h index 3cc262e14e6..9691e596b78 100644 --- 
a/be/src/vec/exec/format/parquet/vparquet_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_reader.h @@ -220,6 +220,8 @@ private: const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size); void _collect_profile(); + static SortOrder _determine_sort_order(const tparquet::SchemaElement& parquet_schema); + private: RuntimeProfile* _profile = nullptr; const TFileScanRangeParams& _scan_params; @@ -284,5 +286,6 @@ private: const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr; const std::unordered_map<int, VExprContextSPtrs>* _slot_id_to_filter_conjuncts = nullptr; bool _hive_use_column_names = false; + std::unordered_map<tparquet::Type::type, bool> _ignored_stats; }; } // namespace doris::vectorized diff --git a/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp new file mode 100644 index 00000000000..bad95614f00 --- /dev/null +++ b/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <regex> + +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris { +namespace vectorized { +class ParquetCorruptStatisticsTest : public testing::Test { +public: + ParquetCorruptStatisticsTest() = default; +}; + +TEST_F(ParquetCorruptStatisticsTest, test_only_applies_to_binary) { + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build abcd)", + tparquet::Type::FIXED_LEN_BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.0 (build abcd)", tparquet::Type::DOUBLE)); +} + +TEST_F(ParquetCorruptStatisticsTest, test_corrupt_statistics) { + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.4.2 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.100 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.7.999 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.22rc99 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.1-SNAPSHOT (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.6.0t-01-abcdefg (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("unparseable string", + 
tparquet::Type::BYTE_ARRAY)); + + // missing semver + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version (build abcd)", + tparquet::Type::BYTE_ARRAY)); + + // missing build hash + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build )", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version (build)", + tparquet::Type::BYTE_ARRAY)); + + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version 1.6.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("imapla version 1.10.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.8.0 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.8.1 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.8.1rc3 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.8.1rc3-SNAPSHOT (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.9.0 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 2.0.0 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.9.0t-01-abcdefg (build abcd)", tparquet::Type::BYTE_ARRAY)); + + // missing semver + 
EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version (build abcd)", + tparquet::Type::BYTE_ARRAY)); + + // missing build hash + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 1.6.0 (build )", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version 1.6.0 (build)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics("impala version (build)", + tparquet::Type::BYTE_ARRAY)); +} + +TEST_F(ParquetCorruptStatisticsTest, test_distribution_corrupt_statistics) { + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.5.0-cdh5.4.999 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.5.0-cdh5.5.0-SNAPSHOT (build " + "956ed6c14c611b4c4eaaa1d6e5b9a9c6d4dfa336)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.5.0-cdh5.5.0 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.5.0-cdh5.5.1 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_FALSE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.5.0-cdh5.6.0 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics( + "parquet-mr version 1.4.10 (build abcd)", tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.5.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.5.1 (build abcd)", + tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.6.0 (build abcd)", + 
tparquet::Type::BYTE_ARRAY)); + EXPECT_TRUE(CorruptStatistics::should_ignore_statistics("parquet-mr version 1.7.0 (build abcd)", + tparquet::Type::BYTE_ARRAY)); +} + +} // namespace vectorized +} // namespace doris diff --git a/be/test/vec/exec/parquet/parquet_statistics_test.cpp b/be/test/vec/exec/parquet/parquet_statistics_test.cpp new file mode 100644 index 00000000000..cd8d3068fe1 --- /dev/null +++ b/be/test/vec/exec/parquet/parquet_statistics_test.cpp @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <regex> + +#include "vec/exec/format/parquet/parquet_pred_cmp.h" + +namespace doris { +namespace vectorized { +class ParquetStatisticsTest : public testing::Test { +public: + ParquetStatisticsTest() = default; +}; + +TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) { + // [, bcé]: min is empty, max starts with ASCII + { + std::string encoding_min(""); + std::string encoding_max("bcé"); + EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + } + + // // [, ébc]: min is empty, max starts with non-ASCII + { + std::string encoding_min(""); + std::string encoding_max("ébc"); + EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + } + + // [aa, bé]: no common prefix, first different are both ASCII, min is all ASCII + { + std::string encoding_min("aa"); + std::string encoding_max("bé"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "aa"); + EXPECT_EQ(encoding_max, "c"); + } + + // [abcd, abcdN]: common prefix, not only ASCII, one prefix of the other, last common ASCII + { + std::string encoding_min("abcd"); + std::string encoding_max("abcdN"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "abcd"); + EXPECT_EQ(encoding_max, "abce"); + } + + // [abcé, abcéN]: common prefix, not only ASCII, one prefix of the other, last common non ASCII + { + std::string encoding_min("abcé"); + std::string encoding_max("abcéN"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "abcé"); + EXPECT_EQ(encoding_max, "abd"); + } + + // [abcéM, abcéN]: common prefix, not only ASCII, first different are both ASCII + { + std::string encoding_min("abcéM"); + std::string encoding_max("abcéN"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, 
encoding_max)); + ; + EXPECT_EQ(encoding_min, "abcéM"); + EXPECT_EQ(encoding_max, "abcéO"); + } + + // [abcéMab, abcéNxy]: common prefix, not only ASCII, first different are both ASCII, more characters afterwards + { + std::string encoding_min("abcéMab"); + std::string encoding_max("abcéNxy"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "abcéMab"); + EXPECT_EQ(encoding_max, "abcéO"); + } + + // [abcéM, abcé\u00f7]: common prefix, not only ASCII, first different are both ASCII, but need to be chopped off (127) + { + std::string encoding_min("abcéM"); + std::string encoding_max("abcé\u00f7"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + EXPECT_EQ(encoding_min, "abcéM"); + EXPECT_EQ(encoding_max, "abd"); + } + + // [abc\u007fé, bcd\u007fé]: no common prefix, first different are both ASCII + { + std::string encoding_min("abc\u007fé"); + std::string encoding_max("bcd\u007fé"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "abc\u007f"); + EXPECT_EQ(encoding_max, "c"); + } + + // [é, a]: no common prefix, first different are not both ASCII + { + std::string encoding_min("é"); + std::string encoding_max("a"); + EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + } + + // [é, ê]: no common prefix, first different are both not ASCII + { + std::string encoding_min("é"); + std::string encoding_max("ê"); + EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + } + + // [aé, aé]: min = max (common prefix, first different are both not ASCII) + { + std::string encoding_min("aé"); + std::string encoding_max("aé"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "aé"); + EXPECT_EQ(encoding_max, "aé"); + } + + // [aé, bé]: no common prefix, first different 
are both ASCII + { + std::string encoding_min("aé"); + std::string encoding_max("bé"); + EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max)); + ; + EXPECT_EQ(encoding_min, "a"); + EXPECT_EQ(encoding_max, "c"); + } +} + +} // namespace vectorized +} // namespace doris diff --git a/be/test/vec/exec/parquet/parquet_version_test.cpp b/be/test/vec/exec/parquet/parquet_version_test.cpp new file mode 100644 index 00000000000..10d17e27790 --- /dev/null +++ b/be/test/vec/exec/parquet/parquet_version_test.cpp @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <regex> + +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris { +namespace vectorized { +class ParquetVersionTest : public testing::Test { +public: + ParquetVersionTest() = default; +}; + +TEST_F(ParquetVersionTest, test_version_parser) { + std::unique_ptr<ParsedVersion> parsed_version; + + Status status = VersionParser::parse("parquet-mr version 1.6.0 (build abcd)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", "abcd"), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)", + &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.22rc99-SNAPSHOT", "abcd"), *parsed_version); + + status = VersionParser::parse("unparseable string", &parsed_version); + EXPECT_FALSE(status.ok()); + + // missing semver + status = VersionParser::parse("parquet-mr version (build abcd)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"), *parsed_version); + + status = VersionParser::parse("parquet-mr version (build abcd)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, "abcd"), *parsed_version); + + // missing build hash + status = VersionParser::parse("parquet-mr version 1.6.0 (build )", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.6.0 (build)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version (build)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version (build )", &parsed_version); + 
EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); + + // Missing entire build section + status = VersionParser::parse("parquet-mr version 1.6.0", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.8.0rc4", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.8.0rc4-SNAPSHOT", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); + + // Various spaces + status = VersionParser::parse("parquet-mr version 1.6.0", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.8.0rc4", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4", std::nullopt), *parsed_version); + + status = + VersionParser::parse("parquet-mr version 1.8.0rc4-SNAPSHOT ", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.8.0rc4-SNAPSHOT", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version 1.6.0 ( build )", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr 
version 1.6.0 ( build)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", "1.6.0", std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version ( build)", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); + + status = VersionParser::parse("parquet-mr version (build )", &parsed_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(ParsedVersion("parquet-mr", std::nullopt, std::nullopt), *parsed_version); +} + +void assertLessThan(const std::string& a, const std::string& b) { + std::unique_ptr<SemanticVersion> version_a; + Status status = SemanticVersion::parse(a, &version_a); + EXPECT_TRUE(status.ok()); + std::unique_ptr<SemanticVersion> version_b; + status = SemanticVersion::parse(b, &version_b); + EXPECT_TRUE(status.ok()); + EXPECT_LT(version_a->compare_to(*version_b), 0) << a << " should be < " << b; + EXPECT_GT(version_b->compare_to(*version_a), 0) << b << " should be > " << a; +} + +void assertEqualTo(const std::string& a, const std::string& b) { + std::unique_ptr<SemanticVersion> version_a; + Status status = SemanticVersion::parse(a, &version_a); + EXPECT_TRUE(status.ok()); + std::unique_ptr<SemanticVersion> version_b; + status = SemanticVersion::parse(b, &version_b); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(version_a->compare_to(*version_b), 0) << a << " should equal " << b; +} + +TEST_F(ParquetVersionTest, test_compare) { + EXPECT_EQ(SemanticVersion(1, 8, 1).compare_to(SemanticVersion(1, 8, 1)), 0); + EXPECT_LT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 1)), 0); + EXPECT_GT(SemanticVersion(1, 8, 2).compare_to(SemanticVersion(1, 8, 1)), 0); + + EXPECT_EQ(SemanticVersion(1, 8, 1).compare_to(SemanticVersion(1, 8, 1)), 0); + EXPECT_LT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 1)), 0); + EXPECT_GT(SemanticVersion(1, 8, 2).compare_to(SemanticVersion(1, 8, 1)), 0); + + 
EXPECT_LT(SemanticVersion(1, 7, 0).compare_to(SemanticVersion(1, 8, 0)), 0); + EXPECT_GT(SemanticVersion(1, 9, 0).compare_to(SemanticVersion(1, 8, 0)), 0); + + EXPECT_LT(SemanticVersion(0, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 0); + EXPECT_GT(SemanticVersion(2, 0, 0).compare_to(SemanticVersion(1, 0, 0)), 0); + + EXPECT_LT(SemanticVersion(1, 8, 100).compare_to(SemanticVersion(1, 9, 0)), 0); + + EXPECT_GT(SemanticVersion(1, 8, 0).compare_to(SemanticVersion(1, 8, 0, true)), 0); + EXPECT_EQ(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 0, true)), 0); + EXPECT_LT(SemanticVersion(1, 8, 0, true).compare_to(SemanticVersion(1, 8, 0)), 0); +} + +TEST_F(ParquetVersionTest, test_semver_prerelease_examples) { + std::vector<std::string> examples = {"1.0.0-alpha", "1.0.0-alpha.1", "1.0.0-alpha.beta", + "1.0.0-beta", "1.0.0-beta.2", "1.0.0-beta.11", + "1.0.0-rc.1", "1.0.0"}; + for (size_t i = 0; i < examples.size() - 1; ++i) { + assertLessThan(examples[i], examples[i + 1]); + assertEqualTo(examples[i], examples[i]); + } + assertEqualTo(examples.back(), examples.back()); +} + +TEST_F(ParquetVersionTest, test_semver_build_info_examples) { + assertEqualTo("1.0.0-alpha+001", "1.0.0-alpha+001"); + assertEqualTo("1.0.0-alpha", "1.0.0-alpha+001"); + assertEqualTo("1.0.0+20130313144700", "1.0.0+20130313144700"); + assertEqualTo("1.0.0", "1.0.0+20130313144700"); + assertEqualTo("1.0.0-beta+exp.sha.5114f85", "1.0.0-beta+exp.sha.5114f85"); + assertEqualTo("1.0.0-beta", "1.0.0-beta+exp.sha.5114f85"); +} + +TEST_F(ParquetVersionTest, test_unknown_comparisons) { + assertLessThan("1.0.0rc0-alpha+001", "1.0.0-alpha"); +} + +TEST_F(ParquetVersionTest, test_distribution_versions) { + assertEqualTo("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0"); + assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1"); + assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.1-SNAPSHOT"); + assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.6.0"); + assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh6.0.0"); + 
assertLessThan("1.5.0-cdh5.5.0", "1.5.0"); + assertLessThan("1.5.0-cdh5.5.0", "1.5.0-cdh5.5.0-SNAPSHOT"); +} + +TEST_F(ParquetVersionTest, test_parse) { + std::unique_ptr<SemanticVersion> semantic_version; + Status status = SemanticVersion::parse("1.8.0", &semantic_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0)); + status = SemanticVersion::parse("1.8.0rc3", &semantic_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, true)); + status = SemanticVersion::parse("1.8.0rc3-SNAPSHOT", &semantic_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, "rc3", "SNAPSHOT", std::nullopt)); + status = SemanticVersion::parse("1.8.0-SNAPSHOT", &semantic_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(*semantic_version, SemanticVersion(1, 8, 0, std::nullopt, "SNAPSHOT", std::nullopt)); + status = SemanticVersion::parse("1.5.0-cdh5.5.0", &semantic_version); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(*semantic_version, SemanticVersion(1, 5, 0, std::nullopt, "cdh5.5.0", std::nullopt)); +} + +} // namespace vectorized +} // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org