corleyma commented on issue #1259: URL: https://github.com/apache/iceberg-python/issues/1259#issuecomment-2448393527
> It looks like we're spending a lot of time on generating the sequence. I think it would be good to add this to Arrow itself: https://github.com/apache/arrow/issues/44583 I think we can move this forward since it doesn't happen very often that a file is affected by multiple positional deletes. Once it has been added to Arrow, we can also add it to PyIceberg. If we wanted, I suspect we could use the pyarrow cython/C++ API to allocate and create the range array much faster. Something like: ```cython # range_impl.pyx # distutils: language=c++ from pyarrow.lib cimport * import pyarrow as pa import pyarrow.compute as pc from libcpp.memory cimport shared_ptr, make_shared from libcpp.vector cimport vector from libc.stdint cimport int64_t, uint8_t cdef extern from "arrow/array.h" namespace "arrow": cdef cppclass CInt64Array "arrow::Int64Array": CInt64Array(const shared_ptr[CDataType]& type, int64_t length, const shared_ptr[CBuffer]& data) def create_arrow_range(int64_t start, int64_t end): if start >= end: raise ValueError("start must be less than end") cdef: vector[int64_t] values int64_t i shared_ptr[CArray] out shared_ptr[CBuffer] buffer shared_ptr[CDataType] dtype # Pre-allocate the vector size values.reserve(end - start) # Fill vector with range values for i in range(start, end): values.push_back(i) # Create buffer from vector data buffer = make_shared[CBuffer]( <uint8_t*>values.data(), values.size() * sizeof(int64_t) ) # Get int64 datatype dtype = pyarrow_unwrap_data_type(pa.int64()) # Create Int64Array out = make_shared[CInt64Array](dtype, end - start, buffer) # Wrap and return as Python object return pyarrow_wrap_array(out) ``` ```python # setup.py import os import pyarrow as pa from Cython.Build import cythonize from setuptools import Extension, setup # Ensure you have the Arrow C++ shared libraries available try: pa.create_library_symlinks() except: pass ext_modules = cythonize( "range_impl.pyx", compiler_directives={ "language_level": "3", "embedsignature": True, 
}, ) for ext in ext_modules: # We only need numpy headers at build time ext.include_dirs.append(pa.get_include()) ext.libraries.extend(pa.get_libraries()) ext.library_dirs.extend(pa.get_library_dirs()) # Enable C++17 if os.name == "posix": ext.extra_compile_args.append("-std=c++17") setup( name="arrow_range", ext_modules=ext_modules, setup_requires=["pyarrow"], # Only needed during build install_requires=["pyarrow"], # Runtime dependency ) ``` and using this to benchmark: ```python # benchmark.py import time import pyarrow as pa from range_impl import create_arrow_range def test_python_range(start, end): return pa.array(range(start, end)) def test_cython_range(start, end): return create_arrow_range(start, end) def benchmark(start, end, n_runs=5): times_python = [] times_cython = [] print(f"Testing range from {start} to {end} ({end-start} elements)") # Warm up _ = test_python_range(0, 1000) _ = test_cython_range(0, 1000) for i in range(n_runs): # Test Python range t0 = time.perf_counter() arr_py = test_python_range(start, end) t1 = time.perf_counter() times_python.append(t1 - t0) # Test Cython range t0 = time.perf_counter() arr_cy = test_cython_range(start, end) t1 = time.perf_counter() times_cython.append(t1 - t0) # Verify results match assert arr_py.equals(arr_cy), "Arrays don't match!" avg_python = sum(times_python) / len(times_python) avg_cython = sum(times_cython) / len(times_cython) print(f"Python average: {avg_python:.4f}s") print(f"Cython average: {avg_cython:.4f}s") print(f"Speedup: {avg_python/avg_cython:.2f}x") if __name__ == "__main__": benchmark(0, 10_000_000) ``` Gives me this on my m1: ```sh python setup.py build_ext --inplace python benchmark.py running build_ext Testing range from 0 to 10000000 (10000000 elements) Python average: 0.9227s Cython average: 0.0125s Speedup: 73.64x ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org