wgtmac commented on code in PR #180:
URL: https://github.com/apache/iceberg-cpp/pull/180#discussion_r2312786050


##########
src/iceberg/schema.cc:
##########
@@ -19,13 +19,57 @@
 
 #include "iceberg/schema.h"
 
+#include <algorithm>
 #include <format>
+#include <functional>
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/macros.h"
+#include "iceberg/util/visit_type.h"
 
 namespace iceberg {
 
+class IdToFieldVisitor {
+ public:
+  explicit IdToFieldVisitor(
+      std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
+          id_to_field);
+  Status Visit(const PrimitiveType& type);
+  Status Visit(const NestedType& type);
+
+ private:
+  std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& 
id_to_field_;
+};
+
+class NameToIdVisitor {
+ public:
+  explicit NameToIdVisitor(
+      std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& 
name_to_id,

Review Comment:
   ```suggestion
         std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id,
   ```



##########
src/iceberg/util/string_util.h:
##########
@@ -46,4 +46,17 @@ class ICEBERG_EXPORT StringUtils {
   }
 };
 
+/// \brief Transparent hash function that supports std::string_view as lookup 
key
+///
+/// Enables std::unordered_map to directly accept std::string_view lookup keys
+/// without creating temporary std::string objects, using C++20's transparent 
lookup.
+struct ICEBERG_EXPORT string_hash {

Review Comment:
   ```suggestion
   struct ICEBERG_EXPORT StringHash {
   ```
   
   This is not yet resolved. It can be one of the following names:
   - `iceberg::StringHash`: a simple struct in this library.
   - `iceberg::internal::string_hash`: mimic the std-equivalent with a internal 
prefix.



##########
src/iceberg/schema.h:
##########
@@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to 
support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+ private:
+  /// Mapping from field id to field.
+  mutable std::unordered_map<int32_t, std::reference_wrapper<const 
SchemaField>>
+      id_to_field_;
+  /// Mapping from field name to field id.
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>
+      name_to_id_;
+  /// Mapping from lowercased field name to field id
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>

Review Comment:
   ```suggestion
     mutable std::unordered_map<std::string, int32_t, StringHash, 
std::equal_to<>>
   ```



##########
src/iceberg/schema.h:
##########
@@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to 
support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+ private:
+  /// Mapping from field id to field.
+  mutable std::unordered_map<int32_t, std::reference_wrapper<const 
SchemaField>>
+      id_to_field_;
+  /// Mapping from field name to field id.
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>
+      name_to_id_;
+  /// Mapping from lowercased field name to field id
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>
+      lowercase_name_to_id_;
+
  private:
   /// \brief Compare two schemas for equality.
   [[nodiscard]] bool Equals(const Schema& other) const;
 
   const std::optional<int32_t> schema_id_;
+
+  // TODO(nullccxsy): Address potential concurrency issues in lazy 
initialization (e.g.,
+  // use std::call_once)
+  Status InitIdToFieldMap() const;
+  Status InitNameToIdMap() const;
+  Status InitLowerCaseNameToIdMap() const;

Review Comment:
   ```suggestion
   ```
   
   Let's move them above.



##########
src/iceberg/util/macros.h:
##########
@@ -19,10 +19,13 @@
 
 #pragma once
 
-#define ICEBERG_RETURN_UNEXPECTED(result)          \
-  if (!result) [[unlikely]] {                      \
-    return std::unexpected<Error>(result.error()); \
-  }
+#define ICEBERG_RETURN_UNEXPECTED(result)                         \
+  do {                                                            \
+    auto&& iceberg_temp_result = (result);                        \
+    if (!iceberg_temp_result) [[unlikely]] {                      \
+      return std::unexpected<Error>(iceberg_temp_result.error()); \

Review Comment:
   ```suggestion
       auto&& result_name = (result);                        \
       if (!result_name) [[unlikely]] {                      \
         return std::unexpected<Error>(result_name.error()); \
   ```
   
   Keep consistency with lines below.



##########
src/iceberg/schema.cc:
##########
@@ -44,4 +88,175 @@ bool Schema::Equals(const Schema& other) const {
   return schema_id_ == other.schema_id_ && fields_ == other.fields_;
 }
 
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name, bool case_sensitive) const {
+  if (case_sensitive) {
+    ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
+    auto it = name_to_id_.find(name);
+    if (it == name_to_id_.end()) return std::nullopt;
+    return FindFieldById(it->second);
+  }
+  ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
+  auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name));
+  if (it == lowercase_name_to_id_.end()) return std::nullopt;
+  return FindFieldById(it->second);
+}
+
+Status Schema::InitIdToFieldMap() const {
+  if (!id_to_field_.empty()) {
+    return {};
+  }
+  IdToFieldVisitor visitor(id_to_field_);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
+  return {};
+}
+
+Status Schema::InitNameToIdMap() const {
+  if (!name_to_id_.empty()) {
+    return {};
+  }
+  NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  return {};
+}
+
+Status Schema::InitLowerCaseNameToIdMap() const {
+  if (!lowercase_name_to_id_.empty()) {
+    return {};
+  }
+  NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  return {};
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldById(
+    int32_t field_id) const {
+  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
+  auto it = id_to_field_.find(field_id);
+  if (it == id_to_field_.end()) {
+    return std::nullopt;
+  }
+  return it->second;
+}
+
+IdToFieldVisitor::IdToFieldVisitor(
+    std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& 
id_to_field)
+    : id_to_field_(id_to_field) {}
+
+Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; }
+
+Status IdToFieldVisitor::Visit(const NestedType& type) {
+  const auto& nested = iceberg::internal::checked_cast<const 
NestedType&>(type);

Review Comment:
   ```suggestion
     const auto& nested = internal::checked_cast<const NestedType&>(type);
   ```



##########
src/iceberg/schema.cc:
##########
@@ -19,13 +19,57 @@
 
 #include "iceberg/schema.h"
 
+#include <algorithm>
 #include <format>
+#include <functional>
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/macros.h"
+#include "iceberg/util/visit_type.h"
 
 namespace iceberg {
 
+class IdToFieldVisitor {
+ public:
+  explicit IdToFieldVisitor(
+      std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
+          id_to_field);
+  Status Visit(const PrimitiveType& type);
+  Status Visit(const NestedType& type);
+
+ private:
+  std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& 
id_to_field_;
+};
+
+class NameToIdVisitor {
+ public:
+  explicit NameToIdVisitor(
+      std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& 
name_to_id,
+      bool case_sensitive = true,
+      std::function<std::string(std::string_view)> quoting_func = {});
+  Status Visit(const ListType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const MapType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const StructType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const PrimitiveType& type, const std::string& path,
+               const std::string& short_path);
+  void Finish();
+
+ private:
+  std::string BuildPath(std::string_view prefix, std::string_view field_name,
+                        bool case_sensitive);
+
+ private:
+  bool case_sensitive_;
+  std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& 
name_to_id_;
+  std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>

Review Comment:
   ```suggestion
     std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id_;
     std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
   ```



##########
src/iceberg/schema.cc:
##########
@@ -44,4 +88,175 @@ bool Schema::Equals(const Schema& other) const {
   return schema_id_ == other.schema_id_ && fields_ == other.fields_;
 }
 
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
+    std::string_view name, bool case_sensitive) const {
+  if (case_sensitive) {
+    ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
+    auto it = name_to_id_.find(name);
+    if (it == name_to_id_.end()) return std::nullopt;
+    return FindFieldById(it->second);
+  }
+  ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
+  auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name));
+  if (it == lowercase_name_to_id_.end()) return std::nullopt;
+  return FindFieldById(it->second);
+}
+
+Status Schema::InitIdToFieldMap() const {
+  if (!id_to_field_.empty()) {
+    return {};
+  }
+  IdToFieldVisitor visitor(id_to_field_);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
+  return {};
+}
+
+Status Schema::InitNameToIdMap() const {
+  if (!name_to_id_.empty()) {
+    return {};
+  }
+  NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  return {};
+}
+
+Status Schema::InitLowerCaseNameToIdMap() const {
+  if (!lowercase_name_to_id_.empty()) {
+    return {};
+  }
+  NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  return {};
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldById(
+    int32_t field_id) const {
+  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
+  auto it = id_to_field_.find(field_id);
+  if (it == id_to_field_.end()) {
+    return std::nullopt;
+  }
+  return it->second;
+}
+
+IdToFieldVisitor::IdToFieldVisitor(
+    std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& 
id_to_field)
+    : id_to_field_(id_to_field) {}
+
+Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; }
+
+Status IdToFieldVisitor::Visit(const NestedType& type) {
+  const auto& nested = iceberg::internal::checked_cast<const 
NestedType&>(type);
+  const auto& fields = nested.fields();
+  for (const auto& field : fields) {
+    auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field));
+    if (!it.second) {
+      return InvalidSchema("Duplicate field id found: {}", field.field_id());
+    }
+    ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
+  }
+  return {};
+}
+
+NameToIdVisitor::NameToIdVisitor(
+    std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& 
name_to_id,

Review Comment:
   ```suggestion
       std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id,
   ```



##########
src/iceberg/schema.h:
##########
@@ -29,8 +29,10 @@
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/type.h"
+#include "iceberg/util/string_util.h"

Review Comment:
   ```suggestion
   ```
   
   This is only required by schema.cc. Please use forward declaration as much 
as possible.



##########
src/iceberg/schema.h:
##########
@@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to 
support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+ private:
+  /// Mapping from field id to field.

Review Comment:
   ```suggestion
     Status InitIdToFieldMap() const;
     Status InitNameToIdMap() const;
     Status InitLowerCaseNameToIdMap() const;
   
     /// Mapping from field id to field.
   ```



##########
src/iceberg/schema.h:
##########
@@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to 
support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+ private:
+  /// Mapping from field id to field.
+  mutable std::unordered_map<int32_t, std::reference_wrapper<const 
SchemaField>>
+      id_to_field_;
+  /// Mapping from field name to field id.
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>
+      name_to_id_;
+  /// Mapping from lowercased field name to field id
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>
+      lowercase_name_to_id_;
+
  private:

Review Comment:
   ```suggestion
   ```



##########
src/iceberg/schema.h:
##########
@@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not 
conflict with
+  /// a canonical name. For example, a list, 'l', of structs with field 'x' 
will produce
+  /// short name 'l.x' in addition to canonical name 'l.element.x'. a map 'm', 
if its
+  /// value include a structs with field 'x' wil produce short name 'm.x' in 
addition to
+  /// canonical name 'm.value.x'
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to 
support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
+ private:
+  /// Mapping from field id to field.
+  mutable std::unordered_map<int32_t, std::reference_wrapper<const 
SchemaField>>
+      id_to_field_;
+  /// Mapping from field name to field id.
+  mutable std::unordered_map<std::string, int32_t, string_hash, 
std::equal_to<>>

Review Comment:
   ```suggestion
     mutable std::unordered_map<std::string, int32_t, StringHash, 
std::equal_to<>>
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to