This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d437efea333 [chore](thirdparty)(paimon-cpp) reuse Doris Arrow stack 
and isolate external headers (#60946)
d437efea333 is described below

commit d437efea3331da63de72ba5939d9283f1bad7385
Author: Chenjunwei <[email protected]>
AuthorDate: Mon Mar 2 20:20:43 2026 +0800

    [chore](thirdparty)(paimon-cpp) reuse Doris Arrow stack and isolate 
external headers (#60946)
    
    ## Summary
    Split the thirdparty-only changes from #60883 into an independent PR, so
    the `thirdparty` changes can be merged first.
    
    ## Included Files
    - `thirdparty/build-thirdparty.sh`
    - `thirdparty/download-thirdparty.sh`
    - `thirdparty/paimon-cpp-cache.cmake`
    - `thirdparty/patches/apache-arrow-17.0.0-paimon.patch`
    - `thirdparty/patches/paimon-cpp-buildutils-static-deps.patch`
    
    ## Why Split
    - Keep this PR focused on `thirdparty` integration only.
    - Reduce rebase/conflict risk for the original feature branch.
    
    ## Follow-up
    1. Merge this PR first.
    2. Rebase the original feature branch on latest `master`.
    3. Keep non-thirdparty logic in the original PR.
---
 thirdparty/build-thirdparty.sh                     |  38 ++--
 thirdparty/download-thirdparty.sh                  |  10 +
 thirdparty/paimon-cpp-cache.cmake                  |  47 +++--
 .../patches/apache-arrow-17.0.0-paimon.patch       | 224 +++++++++++++++++++++
 .../paimon-cpp-buildutils-static-deps.patch        | 160 ++++++++++++++-
 5 files changed, 446 insertions(+), 33 deletions(-)

diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index fc413fbb896..d858596d1bd 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -1090,6 +1090,10 @@ build_arrow() {
         -DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON 
-DARROW_USE_GLOG=ON \
         -DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON 
-DARROW_JSON=ON \
         -DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \
+        -DARROW_COMPUTE=ON \
+        -DARROW_FILESYSTEM=ON \
+        -DARROW_DATASET=ON \
+        -DARROW_ACERO=ON \
         -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DCMAKE_INSTALL_LIBDIR=lib64 \
         -DARROW_BOOST_USE_SHARED=OFF \
@@ -1137,6 +1141,8 @@ build_arrow() {
     cp -rf ./brotli_ep/src/brotli_ep-install/lib/libbrotlicommon-static.a 
"${TP_INSTALL_DIR}/lib64/libbrotlicommon.a"
     strip_lib libarrow.a
     strip_lib libparquet.a
+    strip_lib libarrow_dataset.a
+    strip_lib libarrow_acero.a
 }
 
 # abseil
@@ -2028,20 +2034,26 @@ build_paimon_cpp() {
     # These libraries are built but not installed by default
     echo "Installing paimon-cpp internal dependencies..."
 
-    # Install paimon-cpp Arrow deps used by paimon parquet static libs.
-    # Keep them in an isolated directory to avoid clashing with Doris Arrow.
+    # Arrow deps: When PAIMON_USE_EXTERNAL_ARROW=ON (Plan B), paimon-cpp
+    # reuses Doris's Arrow and does NOT build arrow_ep, so the paimon_deps
+    # directory is not needed.  When building its own Arrow (legacy), copy
+    # arrow artefacts into an isolated directory to avoid clashing with Doris.
     local paimon_deps_dir="${TP_INSTALL_DIR}/paimon-cpp/lib64/paimon_deps"
-    mkdir -p "${paimon_deps_dir}"
-    for paimon_arrow_dep in \
-        libarrow.a \
-        libarrow_filesystem.a \
-        libarrow_dataset.a \
-        libarrow_acero.a \
-        libparquet.a; do
-        if [ -f "arrow_ep-install/lib/${paimon_arrow_dep}" ]; then
-            cp -v "arrow_ep-install/lib/${paimon_arrow_dep}" 
"${paimon_deps_dir}/${paimon_arrow_dep}"
-        fi
-    done
+    if [ -d "arrow_ep-install/lib" ]; then
+        mkdir -p "${paimon_deps_dir}"
+        for paimon_arrow_dep in \
+            libarrow.a \
+            libarrow_filesystem.a \
+            libarrow_dataset.a \
+            libarrow_acero.a \
+            libparquet.a; do
+            if [ -f "arrow_ep-install/lib/${paimon_arrow_dep}" ]; then
+                cp -v "arrow_ep-install/lib/${paimon_arrow_dep}" 
"${paimon_deps_dir}/${paimon_arrow_dep}"
+            fi
+        done
+    else
+        echo "  arrow_ep-install not found (PAIMON_USE_EXTERNAL_ARROW=ON?) – 
skipping paimon_deps Arrow copy"
+    fi
 
     # Install roaring_bitmap, renamed to avoid conflict with Doris's 
croaringbitmap
     if [ -f "release/libroaring_bitmap.a" ]; then
diff --git a/thirdparty/download-thirdparty.sh 
b/thirdparty/download-thirdparty.sh
index 447b8852618..d913c389ee0 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -425,6 +425,16 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
         fi
         cd -
     fi
+    if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-17.0.0" ]]; then
+        cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
+        if [[ ! -f "${PATCHED_MARK}" ]]; then
+            # Paimon-cpp parquet patches: row-group-aware batch reader, 
max_row_group_size,
+            # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty 
fix.
+            patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+            touch "${PATCHED_MARK}"
+        fi
+        cd -
+    fi
     echo "Finished patching ${ARROW_SOURCE}"
 fi
 
diff --git a/thirdparty/paimon-cpp-cache.cmake 
b/thirdparty/paimon-cpp-cache.cmake
index 40623dd10f0..dbebd94a0cc 100644
--- a/thirdparty/paimon-cpp-cache.cmake
+++ b/thirdparty/paimon-cpp-cache.cmake
@@ -57,18 +57,29 @@ set(LZ4_LIBRARY "${DORIS_LIB_DIR}/liblz4.a" CACHE FILEPATH 
"LZ4 library")
 set(LZ4_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "LZ4 include directory")
 
 # ============================================================================
-# glog - Reuse from Doris (version 0.6.0)
-# Note: Paimon-cpp uses 0.7.1, but 0.6.0 is compatible
+# glog - NOT reused from Doris
+# paimon-cpp's build_glog() unconditionally calls externalproject_add() to
+# build glog 0.7.1.  Any GLOG_ROOT/GLOG_LIBRARY/GLOG_INCLUDE_DIR set here
+# would be overwritten by that macro, so we skip them entirely.
 # ============================================================================
-set(GLOG_ROOT "${DORIS_THIRDPARTY_DIR}" CACHE PATH "glog root directory")
-set(GLOG_LIBRARY "${DORIS_LIB_DIR}/libglog.a" CACHE FILEPATH "glog library")
-set(GLOG_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "glog include 
directory")
 
 # ============================================================================
-# Arrow, Protobuf, Thrift - NOT reusing from Doris
-# paimon-cpp will build its own versions with symbol visibility=hidden
-# to prevent conflicts with Doris's versions
+# Arrow - Reuse from Doris (Doris Arrow now includes 
COMPUTE/DATASET/ACERO/FILESYSTEM)
+# Doris's Arrow 17.0.0 is built with the full module set that paimon-cpp
+# needs, so we skip paimon-cpp's internal externalproject_add(arrow_ep ...).
 # ============================================================================
+set(PAIMON_USE_EXTERNAL_ARROW ON CACHE BOOL "Use pre-built Arrow from Doris 
instead of building from source")
+
+set(DORIS_LIB64_DIR "${DORIS_THIRDPARTY_DIR}/lib64" CACHE PATH "Doris lib64 
directory")
+
+set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "${DORIS_INCLUDE_DIR}" CACHE PATH "Arrow 
include directory")
+set(PAIMON_EXTERNAL_ARROW_LIB "${DORIS_LIB64_DIR}/libarrow.a" CACHE FILEPATH 
"Arrow core library")
+set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "${DORIS_LIB64_DIR}/libarrow_dataset.a" 
CACHE FILEPATH "Arrow Dataset library")
+set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "${DORIS_LIB64_DIR}/libarrow_acero.a" 
CACHE FILEPATH "Arrow Acero library")
+set(PAIMON_EXTERNAL_PARQUET_LIB "${DORIS_LIB64_DIR}/libparquet.a" CACHE 
FILEPATH "Parquet library")
+set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB 
"${DORIS_LIB64_DIR}/libarrow_bundled_dependencies.a" CACHE FILEPATH "Arrow 
bundled dependencies library")
+
+# Protobuf, Thrift - still built separately by paimon-cpp
 
 # ============================================================================
 # Snappy - Reuse from Doris
@@ -103,17 +114,23 @@ endif()
 if(NOT EXISTS "${SNAPPY_LIBRARY}")
     message(FATAL_ERROR "Snappy library not found: ${SNAPPY_LIBRARY}")
 endif()
-if(NOT EXISTS "${GLOG_LIBRARY}")
-    message(FATAL_ERROR "glog library not found: ${GLOG_LIBRARY}")
-endif()
 
 message(STATUS "========================================")
 message(STATUS "Paimon-cpp Library Reuse Configuration")
 message(STATUS "========================================")
 message(STATUS "Reusing from Doris:")
-message(STATUS "  ✓ ZLIB, ZSTD, LZ4, Snappy, glog")
+message(STATUS "  ✓ ZLIB, ZSTD, LZ4, Snappy")
+if(PAIMON_USE_EXTERNAL_ARROW)
+    message(STATUS "  ✓ Arrow, Parquet, Arrow Dataset, Arrow Acero (Plan B)")
+else()
+    message(STATUS "  ✗ Arrow (building separately, symbol visibility=hidden)")
+endif()
 message(STATUS "")
-message(STATUS "Building separately (symbol visibility=hidden):")
-message(STATUS "  - Arrow, Protobuf, Thrift, ORC")
-message(STATUS "  - RapidJSON, TBB")
+message(STATUS "Building separately:")
+if(NOT PAIMON_USE_EXTERNAL_ARROW)
+    message(STATUS "  - Arrow, Protobuf, Thrift, ORC")
+else()
+    message(STATUS "  - Protobuf, Thrift, ORC")
+endif()
+message(STATUS "  - glog, RapidJSON, TBB")
 message(STATUS "========================================")
diff --git a/thirdparty/patches/apache-arrow-17.0.0-paimon.patch 
b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
new file mode 100644
index 00000000000..4e53117b79b
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-paimon.patch
@@ -0,0 +1,224 @@
+diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
+index ec3890a41f..943f69bb6c 100644
+--- a/cpp/src/parquet/arrow/schema.cc
++++ b/cpp/src/parquet/arrow/schema.cc
+@@ -178,7 +178,7 @@ static Status GetTimestampMetadata(const 
::arrow::TimestampType& type,
+
+   // The user is explicitly asking for Impala int96 encoding, there is no
+   // logical type.
+-  if (arrow_properties.support_deprecated_int96_timestamps()) {
++  if (arrow_properties.support_deprecated_int96_timestamps() && target_unit 
== ::arrow::TimeUnit::NANO) {
+     *physical_type = ParquetType::INT96;
+     return Status::OK();
+   }
+
+diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
+index 285e2a5973..aa6f92f077 100644
+--- a/cpp/src/parquet/arrow/reader.cc
++++ b/cpp/src/parquet/arrow/reader.cc
+@@ -1013,25 +1013,32 @@ Status FileReaderImpl::GetRecordBatchReader(const 
std::vector<int>& row_groups,
+     return Status::OK();
+   }
+
+-  int64_t num_rows = 0;
++  std::vector<int64_t> num_rows;
+   for (int row_group : row_groups) {
+-    num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
++    
num_rows.push_back(parquet_reader()->metadata()->RowGroup(row_group)->num_rows());
+   }
+
+   using ::arrow::RecordBatchIterator;
++  int row_group_idx = 0;
+
+   // NB: This lambda will be invoked outside the scope of this call to
+   // `GetRecordBatchReader()`, so it must capture `readers` and 
`batch_schema` by value.
+   // `this` is a non-owning pointer so we are relying on the parent 
FileReader outliving
+   // this RecordBatchReader.
+   ::arrow::Iterator<RecordBatchIterator> batches = 
::arrow::MakeFunctionIterator(
+-      [readers, batch_schema, num_rows,
++      [readers, batch_schema, num_rows, row_group_idx,
+        this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+         ::arrow::ChunkedArrayVector columns(readers.size());
+
+-        // don't reserve more rows than necessary
+-        int64_t batch_size = std::min(properties().batch_size(), num_rows);
+-        num_rows -= batch_size;
++        int64_t batch_size = 0;
++        if (!num_rows.empty()) {
++          // don't reserve more rows than necessary
++          batch_size = std::min(properties().batch_size(), 
num_rows[row_group_idx]);
++          num_rows[row_group_idx] -= batch_size;
++          if (num_rows[row_group_idx] == 0 && (num_rows.size() - 1) != 
row_group_idx) {
++            row_group_idx++;
++          }
++        }
+
+         RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+             reader_properties_.use_threads(), 
static_cast<int>(readers.size()),
+diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
+index 4fd7ef1b47..87326a54f1 100644
+--- a/cpp/src/parquet/arrow/writer.cc
++++ b/cpp/src/parquet/arrow/writer.cc
+@@ -314,6 +314,14 @@ class FileWriterImpl : public FileWriter {
+     return Status::OK();
+   }
+
++  int64_t GetBufferedSize() override {
++    if (row_group_writer_ == nullptr) {
++      return 0;
++    }
++    return row_group_writer_->total_compressed_bytes() +
++      row_group_writer_->total_compressed_bytes_written();
++  }
++
+   Status Close() override {
+     if (!closed_) {
+       // Make idempotent
+@@ -418,10 +426,13 @@ class FileWriterImpl : public FileWriter {
+
+     // Max number of rows allowed in a row group.
+     const int64_t max_row_group_length = 
this->properties().max_row_group_length();
++    const int64_t max_row_group_size = 
this->properties().max_row_group_size();
+
+     // Initialize a new buffered row group writer if necessary.
+     if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
+-        row_group_writer_->num_rows() >= max_row_group_length) {
++        row_group_writer_->num_rows() >= max_row_group_length ||
++        (row_group_writer_->total_compressed_bytes_written() +
++         row_group_writer_->total_compressed_bytes() >= max_row_group_size)) {
+       RETURN_NOT_OK(NewBufferedRowGroup());
+     }
+
+diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h
+index 4a1a033a7b..0f13d05e44 100644
+--- a/cpp/src/parquet/arrow/writer.h
++++ b/cpp/src/parquet/arrow/writer.h
+@@ -138,6 +138,9 @@ class PARQUET_EXPORT FileWriter {
+   /// option in this case.
+   virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) 
= 0;
+
++  /// \brief Return the buffered size in bytes.
++  virtual int64_t GetBufferedSize() = 0;
++
+   /// \brief Write the footer and close the file.
+   virtual ::arrow::Status Close() = 0;
+   virtual ~FileWriter();
+diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
+index 4d3acb491e..3906ff3c59 100644
+--- a/cpp/src/parquet/properties.h
++++ b/cpp/src/parquet/properties.h
+@@ -139,6 +139,7 @@ static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+ static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = 
kDefaultDataPageSize;
+ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+ static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
++static constexpr int64_t DEFAULT_MAX_ROW_GROUP_SIZE = 128 * 1024 * 1024;
+ static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+ static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+ static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
+@@ -232,6 +233,7 @@ class PARQUET_EXPORT WriterProperties {
+           dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
++          max_row_group_size_(DEFAULT_MAX_ROW_GROUP_SIZE),
+           pagesize_(kDefaultDataPageSize),
+           version_(ParquetVersion::PARQUET_2_6),
+           data_page_version_(ParquetDataPageVersion::V1),
+@@ -244,6 +246,7 @@ class PARQUET_EXPORT WriterProperties {
+           dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
+           write_batch_size_(properties.write_batch_size()),
+           max_row_group_length_(properties.max_row_group_length()),
++          max_row_group_size_(properties.max_row_group_size()),
+           pagesize_(properties.data_pagesize()),
+           version_(properties.version()),
+           data_page_version_(properties.data_page_version()),
+@@ -321,6 +324,13 @@ class PARQUET_EXPORT WriterProperties {
+       return this;
+     }
+
++    /// Specify the max bytes size to put in a single row group.
++    /// Default 128 M.
++    Builder* max_row_group_size(int64_t max_row_group_size) {
++      max_row_group_size_ = max_row_group_size;
++      return this;
++    }
++
+     /// Specify the data page size.
+     /// Default 1MB.
+     Builder* data_pagesize(int64_t pg_size) {
+@@ -664,7 +674,7 @@ class PARQUET_EXPORT WriterProperties {
+
+       return std::shared_ptr<WriterProperties>(new WriterProperties(
+           pool_, dictionary_pagesize_limit_, write_batch_size_, 
max_row_group_length_,
+-          pagesize_, version_, created_by_, page_checksum_enabled_,
++          max_row_group_size_, pagesize_, version_, created_by_, 
page_checksum_enabled_,
+           std::move(file_encryption_properties_), default_column_properties_,
+           column_properties, data_page_version_, store_decimal_as_integer_,
+           std::move(sorting_columns_)));
+@@ -675,6 +685,7 @@ class PARQUET_EXPORT WriterProperties {
+     int64_t dictionary_pagesize_limit_;
+     int64_t write_batch_size_;
+     int64_t max_row_group_length_;
++    int64_t max_row_group_size_;
+     int64_t pagesize_;
+     ParquetVersion::type version_;
+     ParquetDataPageVersion data_page_version_;
+@@ -705,6 +716,8 @@ class PARQUET_EXPORT WriterProperties {
+
+   inline int64_t max_row_group_length() const { return max_row_group_length_; 
}
+
++  inline int64_t max_row_group_size() const { return max_row_group_size_; }
++
+   inline int64_t data_pagesize() const { return pagesize_; }
+
+   inline ParquetDataPageVersion data_page_version() const {
+@@ -810,7 +823,7 @@ class PARQUET_EXPORT WriterProperties {
+  private:
+   explicit WriterProperties(
+       MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t 
write_batch_size,
+-      int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type 
version,
++      int64_t max_row_group_length, int64_t max_row_group_size, int64_t 
pagesize, ParquetVersion::type version,
+       const std::string& created_by, bool page_write_checksum_enabled,
+       std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+       const ColumnProperties& default_column_properties,
+@@ -821,6 +834,7 @@ class PARQUET_EXPORT WriterProperties {
+         dictionary_pagesize_limit_(dictionary_pagesize_limit),
+         write_batch_size_(write_batch_size),
+         max_row_group_length_(max_row_group_length),
++        max_row_group_size_(max_row_group_size),
+         pagesize_(pagesize),
+         parquet_data_page_version_(data_page_version),
+         parquet_version_(version),
+@@ -836,6 +850,7 @@ class PARQUET_EXPORT WriterProperties {
+   int64_t dictionary_pagesize_limit_;
+   int64_t write_batch_size_;
+   int64_t max_row_group_length_;
++  int64_t max_row_group_size_;
+   int64_t pagesize_;
+   ParquetDataPageVersion parquet_data_page_version_;
+   ParquetVersion::type parquet_version_;
+diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+index 9df922afa2..5c8b3d4d07 100644
+--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
+                     REQUIRED_VERSION
+                     0.11.0)
+ 
+-  string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
++  if(NOT Thrift_VERSION)
++    if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)
++      set(Thrift_VERSION "${thrift_PC_VERSION}")
++    elseif(DEFINED ThriftAlt_VERSION AND ThriftAlt_VERSION)
++      set(Thrift_VERSION "${ThriftAlt_VERSION}")
++    elseif(DEFINED THRIFT_VERSION AND THRIFT_VERSION)
++      set(Thrift_VERSION "${THRIFT_VERSION}")
++    endif()
++  endif()
++  if(NOT Thrift_VERSION)
++    message(FATAL_ERROR "Thrift_VERSION is empty after resolving Thrift 
dependency")
++  endif()
++
++  string(REPLACE "." ";" Thrift_VERSION_LIST "${Thrift_VERSION}")
+   list(GET Thrift_VERSION_LIST 0 Thrift_VERSION_MAJOR)
+   list(GET Thrift_VERSION_LIST 1 Thrift_VERSION_MINOR)
+   list(GET Thrift_VERSION_LIST 2 Thrift_VERSION_PATCH)
diff --git a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch 
b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
index 7de7d2875ca..31af1db7f0f 100644
--- a/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
+++ b/thirdparty/patches/paimon-cpp-buildutils-static-deps.patch
@@ -41,7 +41,7 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyT
 @@ -923,6 +920,13 @@ macro(build_orc)
          -DBUILD_TOOLS=OFF
          -DBUILD_CPP_ENABLE_METRICS=ON)
- 
+
 +    if(ORC_RPATH)
 +        list(APPEND ORC_CMAKE_ARGS
 +             "-DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath,${ORC_RPATH}"
@@ -82,7 +82,7 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyT
 +    set(THIRDPARTY_ZLIB_STATIC_LIB
 +        
"${THIRDPARTY_ZLIB_ROOT}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}z${CMAKE_STATIC_LIBRARY_SUFFIX}"
 +    )
- 
+
      # Strip lto flags (which may be added by dh_auto_configure)
      # See https://github.com/protocolbuffers/protobuf/issues/7092
 @@ -778,6 +781,10 @@ macro(build_protobuf)
@@ -97,6 +97,156 @@ diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyT
          -Dprotobuf_DEBUG_POSTFIX=)
      set(PROTOBUF_CONFIGURE SOURCE_SUBDIR "cmake" CMAKE_ARGS 
${PROTOBUF_CMAKE_ARGS})
 
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -34,6 +34,16 @@ set(EP_COMMON_TOOLCHAIN 
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+                         "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+
++option(PAIMON_USE_EXTERNAL_ARROW "Reuse external Arrow/Parquet instead of 
building arrow_ep" OFF)
++set(PAIMON_EXTERNAL_ARROW_INCLUDE_DIR "" CACHE PATH
++    "Include directory for external Arrow/Parquet headers")
++set(PAIMON_EXTERNAL_ARROW_LIB "" CACHE FILEPATH "Path to external libarrow.a")
++set(PAIMON_EXTERNAL_ARROW_DATASET_LIB "" CACHE FILEPATH "Path to external 
libarrow_dataset.a")
++set(PAIMON_EXTERNAL_ARROW_ACERO_LIB "" CACHE FILEPATH "Path to external 
libarrow_acero.a")
++set(PAIMON_EXTERNAL_PARQUET_LIB "" CACHE FILEPATH "Path to external 
libparquet.a")
++set(PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB "" CACHE FILEPATH
++    "Path to external libarrow_bundled_dependencies.a")
++
+ macro(set_urls URLS)
+     set(${URLS} ${ARGN})
+ endmacro()
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -961,5 +961,95 @@ macro(build_orc)
+ endmacro()
+
+ macro(build_arrow)
+-    message(STATUS "Building Arrow from source")
++    if(PAIMON_USE_EXTERNAL_ARROW)
++        set(ARROW_INCLUDE_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/doris_external_arrow_include")
++        file(MAKE_DIRECTORY "${ARROW_INCLUDE_DIR}")
++        if(NOT EXISTS "${ARROW_INCLUDE_DIR}/arrow")
++            execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                            "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/arrow"
++                            "${ARROW_INCLUDE_DIR}/arrow")
++        endif()
++        if(EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++           AND NOT EXISTS "${ARROW_INCLUDE_DIR}/parquet")
++            execute_process(COMMAND "${CMAKE_COMMAND}" -E create_symlink
++                            "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}/parquet"
++                            "${ARROW_INCLUDE_DIR}/parquet")
++        endif()
++
++        if(NOT PAIMON_EXTERNAL_ARROW_INCLUDE_DIR)
++            message(FATAL_ERROR
++                    "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR must be set when 
PAIMON_USE_EXTERNAL_ARROW=ON"
++            )
++        endif()
++        if(NOT EXISTS "${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}")
++            message(FATAL_ERROR
++                    "PAIMON_EXTERNAL_ARROW_INCLUDE_DIR not found: 
${PAIMON_EXTERNAL_ARROW_INCLUDE_DIR}"
++            )
++        endif()
++
++        foreach(_paimon_external_lib
++                IN ITEMS PAIMON_EXTERNAL_ARROW_LIB
++                         PAIMON_EXTERNAL_ARROW_DATASET_LIB
++                         PAIMON_EXTERNAL_ARROW_ACERO_LIB
++                         PAIMON_EXTERNAL_PARQUET_LIB
++                         PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB)
++            if(NOT ${_paimon_external_lib})
++                message(FATAL_ERROR
++                        "${_paimon_external_lib} must be set when 
PAIMON_USE_EXTERNAL_ARROW=ON")
++            endif()
++            if(NOT EXISTS "${${_paimon_external_lib}}")
++                message(FATAL_ERROR
++                        "${_paimon_external_lib} not found: 
${${_paimon_external_lib}}")
++            endif()
++        endforeach()
++
++        add_library(arrow STATIC IMPORTED)
++        set_target_properties(arrow
++                              PROPERTIES IMPORTED_LOCATION 
"${PAIMON_EXTERNAL_ARROW_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_dataset STATIC IMPORTED)
++        set_target_properties(arrow_dataset
++                              PROPERTIES IMPORTED_LOCATION
++                                         
"${PAIMON_EXTERNAL_ARROW_DATASET_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_acero STATIC IMPORTED)
++        set_target_properties(arrow_acero
++                              PROPERTIES IMPORTED_LOCATION
++                                         "${PAIMON_EXTERNAL_ARROW_ACERO_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(parquet STATIC IMPORTED)
++        set_target_properties(parquet
++                              PROPERTIES IMPORTED_LOCATION 
"${PAIMON_EXTERNAL_PARQUET_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        add_library(arrow_bundled_dependencies STATIC IMPORTED)
++        set_target_properties(arrow_bundled_dependencies
++                              PROPERTIES IMPORTED_LOCATION
++                                         
"${PAIMON_EXTERNAL_ARROW_BUNDLED_DEPS_LIB}"
++                                         INTERFACE_INCLUDE_DIRECTORIES
++                                         "${ARROW_INCLUDE_DIR}")
++
++        target_link_libraries(arrow_acero INTERFACE arrow)
++
++        target_link_libraries(arrow_dataset INTERFACE arrow_acero)
++
++        target_link_libraries(arrow
++                              INTERFACE zstd
++                                        snappy
++                                        lz4
++                                        zlib
++                                        arrow_bundled_dependencies)
++
++        target_link_libraries(parquet
++                              INTERFACE zstd snappy lz4 zlib 
arrow_bundled_dependencies
++                                        arrow_dataset)
++    else()
++        message(STATUS "Building Arrow from source")
+
+     get_target_property(ARROW_SNAPPY_INCLUDE_DIR snappy 
INTERFACE_INCLUDE_DIRECTORIES)
+     get_filename_component(ARROW_SNAPPY_ROOT "${ARROW_SNAPPY_INCLUDE_DIR}" 
DIRECTORY)
+
+diff --git a/cmake_modules/ThirdpartyToolchain.cmake 
b/cmake_modules/ThirdpartyToolchain.cmake
+--- a/cmake_modules/ThirdpartyToolchain.cmake
++++ b/cmake_modules/ThirdpartyToolchain.cmake
+@@ -1121,6 +1121,7 @@ macro(build_arrow)
+                                     zlib
+                                     arrow_bundled_dependencies
+                                     arrow_dataset)
++    endif()
+ 
+ endmacro(build_arrow)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -326,10 +326,10 @@ if(PAIMON_ENABLE_LUMINA)
+     include_directories("${CMAKE_SOURCE_DIR}/third_party/lumina/include")
+ endif()
+ 
++include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
+ include_directories(SYSTEM ${TBB_INCLUDE_DIR})
+ 
+-include_directories(SYSTEM ${GLOG_INCLUDE_DIR})
+ add_compile_definitions("GLOG_USE_GLOG_EXPORT")
+ 
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 diff --git a/src/paimon/common/logging/logging.cpp 
b/src/paimon/common/logging/logging.cpp
 --- a/src/paimon/common/logging/logging.cpp
 +++ b/src/paimon/common/logging/logging.cpp
@@ -116,7 +266,7 @@ diff --git a/src/paimon/common/memory/memory_pool.cpp 
b/src/paimon/common/memory
 @@ -55,7 +55,7 @@ void* MemoryPoolImpl::Malloc(uint64_t size, uint64_t 
alignment) {
      return memptr;
  }
- 
+
 -void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size, 
size_t alignment) {
 +void* MemoryPoolImpl::Realloc(void* p, size_t old_size, size_t new_size, 
uint64_t alignment) {
      if (alignment == 0) {
@@ -144,7 +294,7 @@ diff --git a/src/paimon/format/blob/blob_format_writer.cpp 
b/src/paimon/format/b
 +        read_len = static_cast<uint32_t>(
 +            std::min<uint64_t>(file_length - total_read_length, 
tmp_buffer_->size()));
      }
- 
+
      // write bin length
 
 --- a/cmake_modules/arrow.diff
@@ -160,7 +310,7 @@ diff --git a/src/paimon/format/blob/blob_format_writer.cpp 
b/src/paimon/format/b
 +@@ -1789,7 +1789,20 @@ if(ARROW_WITH_THRIFT)
 +                     REQUIRED_VERSION
 +                     0.11.0)
-+ 
++
 +-  string(REPLACE "." ";" Thrift_VERSION_LIST ${Thrift_VERSION})
 ++  if(NOT Thrift_VERSION)
 ++    if(DEFINED thrift_PC_VERSION AND thrift_PC_VERSION)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to