wgtmac commented on code in PR #180:
URL: https://github.com/apache/iceberg-cpp/pull/180#discussion_r2312255611
##########
src/iceberg/util/string_util.h:
##########
@@ -46,4 +46,15 @@ class ICEBERG_EXPORT StringUtils {
}
};
+/// \brief Transparent hash function that supports std::string_view as lookup
key
+///
+/// Enables std::unordered_map to directly accept std::string_view lookup keys
+/// without creating temporary std::string objects, using C++20's transparent
lookup.
+struct ICEBERG_EXPORT string_hash {
+ using hash_type = std::hash<std::string_view>;
+ using is_transparent = void;
+
+ std::size_t operator()(std::string_view str) const { return
hash_type{}(str); }
Review Comment:
```suggestion
struct ICEBERG_EXPORT StringHash {
using hash_type = std::hash<std::string_view>;
using is_transparent = void;
std::size_t operator()(const char* str) const { return
hash_type{}(str); }
std::size_t operator()(std::string_view str) const { return
hash_type{}(str); }
std::size_t operator()(const std::string& str) const { return
hash_type{}(str); }
};
```
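Suggested for naming consistency (`StringHash` is PascalCase, like
`StringUtils`) and completeness (adds overloads for `const char*` and
`const std::string&` alongside `std::string_view`).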
##########
src/iceberg/schema.cc:
##########
@@ -19,13 +19,57 @@
#include "iceberg/schema.h"
+#include <algorithm>
#include <format>
+#include <functional>
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
+#include "iceberg/util/macros.h"
+#include "iceberg/util/visit_type.h"
namespace iceberg {
+class IdToFieldVisitor {
+ public:
+ explicit IdToFieldVisitor(
+ std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
+ id_to_field);
+ Status Visit(const Type& type);
+ Status VisitNestedType(const Type& type);
+
+ private:
+ std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field_;
+};
+
+class NameToIdVisitor {
Review Comment:
If you read carefully, you will notice that `IdToFieldVisitor` and
`NameToIdVisitor` use different styles of `Visit` functions. Should we
make them consistent? For example, by changing `IdToFieldVisitor` as below:
```cpp
class IdToFieldVisitor {
public:
Status Visit(const PrimitiveType& type);
Status Visit(const NestedType& type);
};
```
##########
src/iceberg/schema.cc:
##########
@@ -44,4 +88,181 @@ bool Schema::Equals(const Schema& other) const {
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
}
+Result<std::optional<std::reference_wrapper<const SchemaField>>>
Schema::FindFieldByName(
+ std::string_view name, bool case_sensitive) const {
+ if (case_sensitive) {
+ ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
+ auto it = name_to_id_.find(name);
+ if (it == name_to_id_.end()) return std::nullopt;
+ return FindFieldById(it->second);
+ }
+ ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
+ auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name));
+ if (it == lowercase_name_to_id_.end()) return std::nullopt;
+ return FindFieldById(it->second);
+}
+
+Status Schema::InitIdToFieldMap() const {
+ if (!id_to_field_.empty()) {
+ return {};
+ }
+ IdToFieldVisitor visitor(id_to_field_);
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
+ return {};
+}
+
+Status Schema::InitNameToIdMap() const {
+ if (!name_to_id_.empty()) {
+ return {};
+ }
+ NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true);
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+ visitor.Finish();
+ return {};
+}
+
+Status Schema::InitLowerCaseNameToIdMap() const {
+ if (!lowercase_name_to_id_.empty()) {
+ return {};
+ }
+ NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false);
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+ visitor.Finish();
+ return {};
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>>
Schema::FindFieldById(
+ int32_t field_id) const {
+ ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
+ auto it = id_to_field_.find(field_id);
+ if (it == id_to_field_.end()) {
+ return std::nullopt;
+ }
+ return it->second;
+}
+
+IdToFieldVisitor::IdToFieldVisitor(
+ std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field)
+ : id_to_field_(id_to_field) {}
+
+Status IdToFieldVisitor::Visit(const Type& type) {
+ if (type.is_nested()) {
+ ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type));
+ }
+ return {};
+}
+
+Status IdToFieldVisitor::VisitNestedType(const Type& type) {
+ const auto& nested = iceberg::internal::checked_cast<const
NestedType&>(type);
+ const auto& fields = nested.fields();
+ for (const auto& field : fields) {
+ auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field));
+ if (!it.second) {
+ return InvalidSchema("Duplicate field id found: {}", field.field_id());
+ }
+ ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
+ }
+ return {};
+}
+
+NameToIdVisitor::NameToIdVisitor(
+ std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>&
name_to_id,
+ bool case_sensitive, std::function<std::string(std::string_view)>
quoting_func)
+ : name_to_id_(name_to_id),
+ case_sensitive_(case_sensitive),
+ quoting_func_(std::move(quoting_func)) {}
+
+Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
+ const std::string& short_path) {
+ const auto& field = type.fields()[0];
+ std::string new_path = BuildPath(path, field.name(), case_sensitive_);
+ std::string new_short_path;
+ if (field.type()->type_id() == TypeId::kStruct) {
+ new_short_path = short_path;
+ } else {
+ new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+ }
+ auto it = name_to_id_.try_emplace(new_path, field.field_id());
+ if (!it.second) {
+ return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
+ it.first->first, it.first->second, field.field_id());
+ }
+ short_name_to_id_.try_emplace(new_short_path, field.field_id());
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*field.type(), this, new_path, new_short_path));
+ return {};
+}
+
+Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
+ const std::string& short_path) {
+ std::string new_path, new_short_path;
+ const auto& fields = type.fields();
+ for (const auto& field : fields) {
+ new_path = BuildPath(path, field.name(), case_sensitive_);
+ if (field.name() == MapType::kValueName &&
+ field.type()->type_id() == TypeId::kStruct) {
+ new_short_path = short_path;
+ } else {
+ new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+ }
+ auto it = name_to_id_.try_emplace(new_path, field.field_id());
+ if (!it.second) {
+ return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id:
{}",
+ it.first->first, it.first->second,
field.field_id());
+ }
+ short_name_to_id_.try_emplace(new_short_path, field.field_id());
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*field.type(), this, new_path, new_short_path));
+ }
+ return {};
+}
+
+Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
+ const std::string& short_path) {
+ const auto& fields = type.fields();
+ std::string new_path, new_short_path;
+ for (const auto& field : fields) {
+ new_path = BuildPath(path, field.name(), case_sensitive_);
+ new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+ auto it = name_to_id_.try_emplace(new_path, field.field_id());
+ if (!it.second) {
+ return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id:
{}",
+ it.first->first, it.first->second,
field.field_id());
+ }
+ short_name_to_id_.try_emplace(new_short_path, field.field_id());
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*field.type(), this, new_path, new_short_path));
+ }
+ return {};
+}
+
+Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string&
path,
+ const std::string& short_path) {
+ return {};
+}
+
+std::string NameToIdVisitor::BuildPath(std::string_view prefix,
+ std::string_view field_name, bool
case_sensitive) {
+ std::string quoted_name;
+ if (!quoting_func_) {
+ quoted_name = std::string(field_name);
+ } else {
+ quoted_name = quoting_func_(field_name);
+ }
+ if (case_sensitive) {
+ return prefix.empty() ? quoted_name : std::string(prefix) + "." +
quoted_name;
+ }
+ return prefix.empty() ? StringUtils::ToLower(quoted_name)
+ : std::string(prefix) + "." +
StringUtils::ToLower(quoted_name);
+ ;
Review Comment:
```suggestion
```
Remove the stray `;` left after the return statement.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]