Re: [PR] feat(schema): add insensitive way to find schemafield [iceberg-cpp]

via GitHub Thu, 21 Aug 2025 09:56:49 -0700


wgtmac commented on code in PR #180:
URL: https://github.com/apache/iceberg-cpp/pull/180#discussion_r2291329948



##########
src/iceberg/schema.h:
##########
@@ -31,6 +32,8 @@
 #include "iceberg/iceberg_export.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/type.h"
+#include "iceberg/util/macros.h"
+#include "iceberg/util/visit_type.h"

Review Comment:
   ```suggestion
   ```
   
   Only add them in the source file.



##########
src/iceberg/schema.h:
##########
@@ -23,6 +23,7 @@
 /// Schemas for Iceberg tables.  This header contains the definition of Schema
 /// and any utility functions.  See iceberg/type.h and iceberg/field.h as well.
 
+#include <algorithm>

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/type.cc:
##########
@@ -66,8 +70,9 @@ std::optional<std::reference_wrapper<const SchemaField>> 
StructType::GetFieldByI
   }
   return fields_[index];
 }
+// todo

Review Comment:
   ```suggestion
   // TODO: respect case_sensitive
   ```



##########
src/iceberg/schema.h:
##########
@@ -54,13 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  ///\brief Get thd SchemaField By Name

Review Comment:
   ```suggestion
     /// \brief Find SchemaField by the field name.
   ```



##########
src/iceberg/schema.cc:
##########
@@ -44,4 +80,172 @@ bool Schema::Equals(const Schema& other) const {
   return schema_id_ == other.schema_id_ && fields_ == other.fields_;
 }
 
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name, bool case_sensitive) const {
+  if (case_sensitive) {
+    ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
+    auto it = name_to_index_.find(std::string(name));
+    if (it == name_to_index_.end()) return std::nullopt;
+    return full_schemafield_[it->second];
+  }
+  ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
+  std::string lower_name(name);
+  std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
+  auto it = lowercase_name_to_index_.find(lower_name);
+  if (it == lowercase_name_to_index_.end()) return std::nullopt;
+  return full_schemafield_[it->second];
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name) const {
+  return FindFieldByName(name, /*case_sensitive*/ true);
+}
+
+Result<Status> Schema::InitIdToIndexMap() const {
+  if (!id_to_index_.empty()) {
+    return {};
+  }
+  bool has_init = !full_schemafield_.empty();

Review Comment:
   Can we just pass the `full_schemafield_` reference to the visitor so we 
don't need to set the flag in the beginning and perform the move in the end?



##########
src/iceberg/schema.cc:
##########
@@ -23,9 +23,45 @@
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
-

Review Comment:
   Please revert this line



##########
src/iceberg/type.h:
##########
@@ -114,7 +117,9 @@ class ICEBERG_EXPORT StructType : public NestedType {
   std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex(
       int32_t index) const override;
   std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
-      std::string_view name) const override;
+      std::string_view name, bool case_sensitive) const override;
+  std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName(
+      std::string_view name) const;

Review Comment:
   You may simply add `using NestedType::GetFieldByName;` to make it visible to 
the subclass.



##########
src/iceberg/schema.h:
##########
@@ -54,13 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  ///\brief Get thd SchemaField By Name
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive) const;
+
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name) const;
+
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+  /// Mapping from field id to index of `full_schemafield_`.
+  mutable std::unordered_map<int, size_t> id_to_index_;
+  /// Mapping from field name to index of `full_schemafield_`.
+  mutable std::unordered_map<std::string, size_t> name_to_index_;
+  /// Mapping from field lowercase_name(suppoert case_insensitive query) to 
index of
+  /// `full_schemafield_`.
+  mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
+  mutable std::vector<std::reference_wrapper<const SchemaField>> 
full_schemafield_;

Review Comment:
   Just double-checked the Java impl, it has several collections as below and 
the `idToField` is the ground as it is used by `aliasToId`, `nameToId`, 
`lowerCaseNameToId` and `identifierFieldIdSet`. So please change it to 
`std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>` so we 
don't have to refactor it again if we want to support alias and identifier 
fields.
   
   ```
     private transient BiMap<String, Integer> aliasToId = null;
     private transient Map<Integer, NestedField> idToField = null;
     private transient Map<String, Integer> nameToId = null;
     private transient Map<String, Integer> lowerCaseNameToId = null;
     private transient Map<Integer, Accessor<StructLike>> idToAccessor = null;
     private transient Map<Integer, String> idToName = null;
     private transient Set<Integer> identifierFieldIdSet = null;
     private final transient Map<Integer, Integer> idsToReassigned;
     private final transient Map<Integer, Integer> idsToOriginal;
   ```



##########
src/iceberg/type.cc:
##########
@@ -66,8 +70,9 @@ std::optional<std::reference_wrapper<const SchemaField>> 
StructType::GetFieldByI
   }
   return fields_[index];
 }
+// todo

Review Comment:
   Usually a PR should contain only a single improvement. Changes in 
`type.h/.cc` are incomplete and unrelated, it would be better to remove them 
for now and add them back in a separate PR. 



##########
src/iceberg/schema.h:
##########
@@ -54,13 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  ///\brief Get thd SchemaField By Name
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive) const;

Review Comment:
   ```suggestion
     FindFieldByName(std::string_view name, bool case_sensitive = true) const;
   ```



##########
src/iceberg/schema.h:
##########
@@ -54,13 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  ///\brief Get thd SchemaField By Name
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive) const;
+
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name) const;

Review Comment:
   ```suggestion
   ```
   
   It is not virtual function so safe to add default parameter.



##########
src/iceberg/schema.cc:
##########
@@ -23,9 +23,45 @@
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
-
 namespace iceberg {
+class IdVisitor {
+ public:
+  explicit IdVisitor(bool has_init_ = false);
+  Status Visit(const Type& type);
+
+  bool has_init;
+  int index = 0;
+  std::unordered_map<int, size_t> id_to_index;
+  std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
+};
+
+std::string GetPath(const std::string& last_path, const std::string& 
field_name,
+                    bool case_sensitive) {
+  if (case_sensitive) {
+    return last_path.empty() ? field_name : last_path + "." + field_name;
+  }
+  std::string lower_name(field_name);
+  std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
+  return last_path.empty() ? lower_name : last_path + "." + lower_name;
+}
+class NameVisitor {

Review Comment:
   nit: rename it to NameToIdVisitor



##########
src/iceberg/schema.h:
##########
@@ -54,13 +57,41 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  ///\brief Get thd SchemaField By Name
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive) const;
+
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name) const;
+
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+  /// Mapping from field id to index of `full_schemafield_`.
+  mutable std::unordered_map<int, size_t> id_to_index_;
+  /// Mapping from field name to index of `full_schemafield_`.
+  mutable std::unordered_map<std::string, size_t> name_to_index_;
+  /// Mapping from field lowercase_name(suppoert case_insensitive query) to 
index of
+  /// `full_schemafield_`.
+  mutable std::unordered_map<std::string, size_t> lowercase_name_to_index_;
+  mutable std::vector<std::reference_wrapper<const SchemaField>> 
full_schemafield_;

Review Comment:
   Can we merge `std::unordered_map<int, size_t> id_to_index_` and 
`std::vector<std::reference_wrapper<const SchemaField>> full_schemafield_` into 
`std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>` which 
has less memory footprint?



##########
src/iceberg/schema.cc:
##########
@@ -44,4 +80,172 @@ bool Schema::Equals(const Schema& other) const {
   return schema_id_ == other.schema_id_ && fields_ == other.fields_;
 }
 
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name, bool case_sensitive) const {
+  if (case_sensitive) {
+    ICEBERG_RETURN_UNEXPECTED(InitNameToIndexMap());
+    auto it = name_to_index_.find(std::string(name));
+    if (it == name_to_index_.end()) return std::nullopt;
+    return full_schemafield_[it->second];
+  }
+  ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIndexMap());
+  std::string lower_name(name);
+  std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
+  auto it = lowercase_name_to_index_.find(lower_name);
+  if (it == lowercase_name_to_index_.end()) return std::nullopt;
+  return full_schemafield_[it->second];
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name) const {
+  return FindFieldByName(name, /*case_sensitive*/ true);
+}
+
+Result<Status> Schema::InitIdToIndexMap() const {
+  if (!id_to_index_.empty()) {
+    return {};
+  }
+  bool has_init = !full_schemafield_.empty();
+  IdVisitor visitor(has_init);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
+  id_to_index_ = std::move(visitor.id_to_index);
+  if (!has_init) {
+    full_schemafield_ = std::move(visitor.full_schemafield);
+  }
+  return {};
+}
+
+Result<Status> Schema::InitNameToIndexMap() const {
+  if (!name_to_index_.empty()) {
+    return {};
+  }
+  bool has_init = !full_schemafield_.empty();
+  std::string path, short_path;
+  NameVisitor visitor(true, has_init);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, 
short_path));
+  name_to_index_ = std::move(visitor.name_to_index);
+  if (!has_init) {
+    full_schemafield_ = std::move(visitor.full_schemafield);
+  }
+  return {};
+}
+
+Result<Status> Schema::InitLowerCaseNameToIndexMap() const {
+  if (!lowercase_name_to_index_.empty()) {
+    return {};
+  }
+  bool has_init = !full_schemafield_.empty();
+  std::string path, short_path;
+  NameVisitor visitor(false, has_init);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, path, 
short_path));
+  lowercase_name_to_index_ = std::move(visitor.name_to_index);
+  if (!has_init) {
+    full_schemafield_ = std::move(visitor.full_schemafield);
+  }
+  return {};
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldById(
+    int32_t field_id) const {
+  ICEBERG_RETURN_UNEXPECTED(InitIdToIndexMap());
+  auto it = id_to_index_.find(field_id);
+  if (it == id_to_index_.end()) {
+    return std::nullopt;
+  }
+  return full_schemafield_[it->second];
+}
+
+IdVisitor::IdVisitor(bool has_init_) : has_init(has_init_) {}
+
+Status IdVisitor::Visit(const Type& type) {
+  const auto& nested = iceberg::internal::checked_cast<const 
NestedType&>(type);

Review Comment:
   It looks weird. If calling IdVisitor on a primitive type, this will crash.



##########
src/iceberg/schema.cc:
##########
@@ -23,9 +23,45 @@
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
-
 namespace iceberg {
+class IdVisitor {
+ public:
+  explicit IdVisitor(bool has_init_ = false);
+  Status Visit(const Type& type);
+
+  bool has_init;
+  int index = 0;
+  std::unordered_map<int, size_t> id_to_index;
+  std::vector<std::reference_wrapper<const SchemaField>> full_schemafield;
+};
+
+std::string GetPath(const std::string& last_path, const std::string& 
field_name,

Review Comment:
   Shouldn't it be a static member function of `NameVisitor` since it is 
exclusively used by it?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat(schema): add insensitive way to find schemafield [iceberg-cpp]

Reply via email to