wgtmac commented on code in PR #467:
URL: https://github.com/apache/iceberg-cpp/pull/467#discussion_r2659289142


##########
src/iceberg/location_provider.h:
##########
@@ -48,9 +50,57 @@ class ICEBERG_EXPORT LocationProvider {
   ///
   /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an 
ArrowArray

Review Comment:
   Let's update the comment.



##########
src/iceberg/location_provider.cc:
##########
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0f;
+constexpr uint8_t kRestDirMask = 0xff;
+constexpr int32_t kHashBits = 20;
+constexpr int32_t kEntropyDirLength = 4;
+constexpr int32_t kEntropyDirDepth = 3;
+
+std::string DataLocation(const TableProperties& properties,
+                         const std::string& table_location) {
+  auto data_location = properties.Get(TableProperties::kWriteDataLocation);
+  if (data_location.empty()) {
+    data_location = std::format("{}/data", table_location);
+  }
+  return data_location;
+}
+
+std::string PathContext(std::string_view table_location) {
+  std::string path = LocationUtil::StripTrailingSlash(table_location);
+
+  size_t last_slash = path.find_last_of('/');
+  if (last_slash != std::string::npos && last_slash < path.length() - 1) {
+    std::string_view data_path(path.data(), path.size() - last_slash - 1);
+    std::string_view parent_path(path.data(), last_slash);
+    size_t parent_last_slash = parent_path.find_last_of('/');
+
+    if (parent_last_slash != std::string::npos) {
+      std::string_view parent_name = parent_path.substr(parent_last_slash + 1);
+      return std::format("{}/{}", parent_name, data_path);
+    } else {
+      return std::format("{}/{}", parent_path, data_path);
+    }
+  }
+
+  return std::string(table_location);
+}
+
+/// \brief Divides hash into directories for optimized orphan removal 
operation using
+/// kEntropyDirDepth and kEntropyDirLength.
+///
+/// If the low `kHashBits = 20` of `hash` is '10011001100110011001', then 
return
+/// '1001/1001/1001/10011001' with depth 3 and length 4.
+///
+/// \param hash The hash value to be divided.
+/// \return The path according to the `hash` value.
+std::string DirsFromHash(int32_t hash) {
+  std::string hash_with_dirs;
+
+  for (int i = 0; i < kEntropyDirDepth * kEntropyDirLength; i += 
kEntropyDirLength) {
+    if (i > 0) {
+      hash_with_dirs += "/";
+    }
+    uint8_t dir_bits = kEntropyDirMask & (hash >> (kHashBits - i - 
kEntropyDirLength));
+    hash_with_dirs += std::format("{:04b}", dir_bits);
+  }
+
+  hash_with_dirs += "/";
+  uint8_t rest_bits = kRestDirMask & hash;
+  hash_with_dirs += std::format("{:08b}", rest_bits);
+
+  return hash_with_dirs;
+}
+
+std::string ComputeHash(std::string_view file_name) {
+  int32_t hash_value = 0;
+  MurmurHash3_x86_32(file_name.data(), file_name.size(), 0, &hash_value);
+  return DirsFromHash(hash_value);
+}
+
+}  // namespace
+
+Result<std::unique_ptr<LocationProvider>> LocationProviderFactory::For(
+    const std::string& input_location, const TableProperties& properties) {
+  std::string location = LocationUtil::StripTrailingSlash(input_location);
+
+  // Note: Not support dynamic constructor according to 
kWriteLocationProviderImpl

Review Comment:
   I think we need to allow users to register their own LocationProvider 
implementations, perhaps to a LocationProviderRegistry and we try to find it if 
`write.location-provider.impl` has been configured.



##########
src/iceberg/location_provider.h:
##########
@@ -48,9 +50,57 @@ class ICEBERG_EXPORT LocationProvider {
   ///
   /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an 
ArrowArray
   /// with single row in StructLike.
-  virtual std::string NewDataLocation(const PartitionSpec& spec,
-                                      const StructLike& partition_data,
-                                      const std::string& filename) = 0;
+  virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                              const PartitionValues& 
partition_data,
+                                              const std::string& filename) = 0;
+};
+
+class ICEBERG_EXPORT LocationProviderFactory {
+ public:
+  virtual ~LocationProviderFactory() = default;
+
+  /// \brief Create a LocationProvider for the given table location and 
properties.
+  ///
+  /// \param input_location the table location
+  /// \param properties the table properties
+  /// \return a LocationProvider instance
+  static Result<std::unique_ptr<LocationProvider>> For(const std::string& 
input_location,
+                                                       const TableProperties& 
properties);
+};
+
+/// \brief DefaultLocationProvider privides default location provider for 
local file
+/// system.
+class ICEBERG_EXPORT DefaultLocationProvider : public LocationProvider {
+ public:
+  DefaultLocationProvider(const std::string& table_location,

Review Comment:
   Should we use `std::string_view` for table_location?



##########
src/iceberg/location_provider.h:
##########
@@ -48,9 +50,57 @@ class ICEBERG_EXPORT LocationProvider {
   ///
   /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an 
ArrowArray
   /// with single row in StructLike.
-  virtual std::string NewDataLocation(const PartitionSpec& spec,
-                                      const StructLike& partition_data,
-                                      const std::string& filename) = 0;
+  virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                              const PartitionValues& 
partition_data,
+                                              const std::string& filename) = 0;
+};
+
+class ICEBERG_EXPORT LocationProviderFactory {
+ public:
+  virtual ~LocationProviderFactory() = default;
+
+  /// \brief Create a LocationProvider for the given table location and 
properties.
+  ///
+  /// \param input_location the table location
+  /// \param properties the table properties
+  /// \return a LocationProvider instance
+  static Result<std::unique_ptr<LocationProvider>> For(const std::string& 
input_location,

Review Comment:
   Use `Make` to be consistent with other APIs.



##########
src/iceberg/util/location_util.h:
##########
@@ -27,16 +27,15 @@ namespace iceberg {
 
 class ICEBERG_EXPORT LocationUtil {
  public:
-  static std::string StripTrailingSlash(const std::string& path) {
+  static std::string StripTrailingSlash(std::string_view path) {
     if (path.empty()) {
       return "";
     }
 
-    std::string_view result = path;
-    while (result.ends_with("/") && !result.ends_with("://")) {
-      result.remove_suffix(1);
+    while (path.ends_with("/") && !path.ends_with("://")) {
+      path.remove_suffix(1);
     }
-    return std::string(result);
+    return std::string(path);

Review Comment:
   It seems that we can just return a `std::string_view` and let the caller to 
decide which type to use to be more flexible.



##########
src/iceberg/partition_spec.h:
##########
@@ -64,6 +64,9 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable 
{
   /// \brief Get the partition type binding to the input schema.
   Result<std::unique_ptr<StructType>> PartitionType(const Schema& schema) 
const;
 
+  /// \brief Get the partition path for the given partition data.
+  Result<std::string> PartitionPath(const PartitionValues& data) const;

Review Comment:
   Can we add some test cases for this? This function can also be in a separate 
PR if you see fit.



##########
src/iceberg/location_provider.cc:
##########
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0f;
+constexpr uint8_t kRestDirMask = 0xff;
+constexpr int32_t kHashBits = 20;
+constexpr int32_t kEntropyDirLength = 4;
+constexpr int32_t kEntropyDirDepth = 3;
+
+std::string DataLocation(const TableProperties& properties,
+                         const std::string& table_location) {
+  auto data_location = properties.Get(TableProperties::kWriteDataLocation);
+  if (data_location.empty()) {
+    data_location = std::format("{}/data", table_location);
+  }
+  return data_location;
+}
+
+std::string PathContext(std::string_view table_location) {
+  std::string path = LocationUtil::StripTrailingSlash(table_location);

Review Comment:
   If `StripTrailingSlash` returns a string_view, we can avoid creating a 
string object here.



##########
src/iceberg/location_provider.h:
##########
@@ -48,9 +50,57 @@ class ICEBERG_EXPORT LocationProvider {
   ///
   /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an 
ArrowArray
   /// with single row in StructLike.
-  virtual std::string NewDataLocation(const PartitionSpec& spec,
-                                      const StructLike& partition_data,
-                                      const std::string& filename) = 0;
+  virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                              const PartitionValues& 
partition_data,
+                                              const std::string& filename) = 0;
+};
+
+class ICEBERG_EXPORT LocationProviderFactory {
+ public:
+  virtual ~LocationProviderFactory() = default;
+
+  /// \brief Create a LocationProvider for the given table location and 
properties.
+  ///
+  /// \param input_location the table location
+  /// \param properties the table properties
+  /// \return a LocationProvider instance
+  static Result<std::unique_ptr<LocationProvider>> For(const std::string& 
input_location,
+                                                       const TableProperties& 
properties);
+};
+
+/// \brief DefaultLocationProvider privides default location provider for 
local file
+/// system.
+class ICEBERG_EXPORT DefaultLocationProvider : public LocationProvider {
+ public:
+  DefaultLocationProvider(const std::string& table_location,
+                          const TableProperties& properties);
+
+  std::string NewDataLocation(const std::string& filename) override;
+
+  Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                      const PartitionValues& partition_data,
+                                      const std::string& filename) override;

Review Comment:
   Same question for filename to use string_view in both `NewDataLocation` 
functions. We can change the base class function signature if appropriate.



##########
src/iceberg/location_provider.cc:
##########
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0f;
+constexpr uint8_t kRestDirMask = 0xff;
+constexpr int32_t kHashBits = 20;
+constexpr int32_t kEntropyDirLength = 4;
+constexpr int32_t kEntropyDirDepth = 3;
+
+std::string DataLocation(const TableProperties& properties,
+                         const std::string& table_location) {
+  auto data_location = properties.Get(TableProperties::kWriteDataLocation);
+  if (data_location.empty()) {
+    data_location = std::format("{}/data", table_location);
+  }
+  return data_location;
+}
+
+std::string PathContext(std::string_view table_location) {
+  std::string path = LocationUtil::StripTrailingSlash(table_location);
+
+  size_t last_slash = path.find_last_of('/');
+  if (last_slash != std::string::npos && last_slash < path.length() - 1) {
+    std::string_view data_path(path.data(), path.size() - last_slash - 1);
+    std::string_view parent_path(path.data(), last_slash);
+    size_t parent_last_slash = parent_path.find_last_of('/');
+
+    if (parent_last_slash != std::string::npos) {
+      std::string_view parent_name = parent_path.substr(parent_last_slash + 1);
+      return std::format("{}/{}", parent_name, data_path);
+    } else {
+      return std::format("{}/{}", parent_path, data_path);
+    }
+  }
+
+  return std::string(table_location);
+}
+
+/// \brief Divides hash into directories for optimized orphan removal 
operation using
+/// kEntropyDirDepth and kEntropyDirLength.
+///
+/// If the low `kHashBits = 20` of `hash` is '10011001100110011001', then 
return
+/// '1001/1001/1001/10011001' with depth 3 and length 4.
+///
+/// \param hash The hash value to be divided.
+/// \return The path according to the `hash` value.
+std::string DirsFromHash(int32_t hash) {
+  std::string hash_with_dirs;
+
+  for (int i = 0; i < kEntropyDirDepth * kEntropyDirLength; i += 
kEntropyDirLength) {

Review Comment:
   ```suggestion
     for (int32_t i = 0; i < kEntropyDirDepth * kEntropyDirLength; i += 
kEntropyDirLength) {
   ```



##########
src/iceberg/expression/literal.cc:
##########
@@ -488,6 +488,7 @@ std::string Literal::ToString() const {
           .value_or("invalid literal of type decimal");
     }
     case TypeId::kString: {
+      // TODO(zhuo.wang): escape string?

Review Comment:
   IMO `ToString` is for human readable representation. So it is no need to 
escape it here?



##########
src/iceberg/location_provider.cc:
##########
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/location_provider.h"
+
+#include "iceberg/partition_spec.h"
+#include "iceberg/table_properties.h"
+#include "iceberg/util/location_util.h"
+#include "iceberg/util/murmurhash3_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+constexpr uint8_t kEntropyDirMask = 0x0f;
+constexpr uint8_t kRestDirMask = 0xff;
+constexpr int32_t kHashBits = 20;
+constexpr int32_t kEntropyDirLength = 4;
+constexpr int32_t kEntropyDirDepth = 3;
+
+std::string DataLocation(const TableProperties& properties,
+                         const std::string& table_location) {
+  auto data_location = properties.Get(TableProperties::kWriteDataLocation);
+  if (data_location.empty()) {
+    data_location = std::format("{}/data", table_location);
+  }
+  return data_location;
+}
+
+std::string PathContext(std::string_view table_location) {
+  std::string path = LocationUtil::StripTrailingSlash(table_location);
+
+  size_t last_slash = path.find_last_of('/');
+  if (last_slash != std::string::npos && last_slash < path.length() - 1) {
+    std::string_view data_path(path.data(), path.size() - last_slash - 1);

Review Comment:
   ```suggestion
       std::string_view data_path = std::string_view(path).substr(last_slash + 
1);
   ```



##########
src/iceberg/location_provider.h:
##########
@@ -48,9 +50,57 @@ class ICEBERG_EXPORT LocationProvider {
   ///
   /// TODO(wgtmac): StructLike is not well thought yet, we may wrap an 
ArrowArray
   /// with single row in StructLike.
-  virtual std::string NewDataLocation(const PartitionSpec& spec,
-                                      const StructLike& partition_data,
-                                      const std::string& filename) = 0;
+  virtual Result<std::string> NewDataLocation(const PartitionSpec& spec,
+                                              const PartitionValues& 
partition_data,
+                                              const std::string& filename) = 0;
+};
+
+class ICEBERG_EXPORT LocationProviderFactory {
+ public:
+  virtual ~LocationProviderFactory() = default;
+
+  /// \brief Create a LocationProvider for the given table location and 
properties.
+  ///
+  /// \param input_location the table location
+  /// \param properties the table properties
+  /// \return a LocationProvider instance
+  static Result<std::unique_ptr<LocationProvider>> For(const std::string& 
input_location,
+                                                       const TableProperties& 
properties);
+};
+
+/// \brief DefaultLocationProvider privides default location provider for 
local file
+/// system.
+class ICEBERG_EXPORT DefaultLocationProvider : public LocationProvider {

Review Comment:
   It seems that we can move these impl classes to location_provider.cc without 
declaring them here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to