corleyma commented on issue #1259:
URL: https://github.com/apache/iceberg-python/issues/1259#issuecomment-2448393527

   > It looks like we're spending a lot of time on generating the sequence. I 
think it would be good to add this to Arrow itself: 
https://github.com/apache/arrow/issues/44583 I think we can move this forward 
since it doesn't happen very often that a file is affected by multiple
positional deletes. Once it has been added to Arrow, we can also add it to 
PyIceberg
   
   If we wanted, I suspect we could use the PyArrow Cython/C++ API to allocate the buffer and build the range array much faster.
   
   Something like:
   ```cython
   # range_impl.pyx
   # distutils: language=c++
   
   from pyarrow.lib cimport *
   import pyarrow as pa
   from libcpp.memory cimport shared_ptr, make_shared, static_pointer_cast
   from libc.stdint cimport int64_t
   
   cdef extern from "arrow/array.h" namespace "arrow":
       cdef cppclass CInt64Array "arrow::Int64Array":
           CInt64Array(const shared_ptr[CDataType]& type,
                       int64_t length,
                       const shared_ptr[CBuffer]& data)
   
   def create_arrow_range(int64_t start, int64_t end):
       if start >= end:
           raise ValueError("start must be less than end")
   
       cdef:
           int64_t length = end - start
           int64_t i
           int64_t* data
           shared_ptr[CArray] out
           shared_ptr[CBuffer] buffer
           shared_ptr[CDataType] dtype
   
       # Allocate an Arrow-owned buffer so the returned array keeps its
       # backing memory alive (wrapping a stack-local std::vector would
       # leave the array pointing at freed memory once we return)
       py_buffer = pa.allocate_buffer(length * sizeof(int64_t))
       buffer = pyarrow_unwrap_buffer(py_buffer)
   
       # Fill the buffer with the range values directly, no intermediate copy
       data = <int64_t*> buffer.get().mutable_data()
       for i in range(length):
           data[i] = start + i
   
       # Get the int64 datatype
       dtype = pyarrow_unwrap_data_type(pa.int64())
   
       # Create the Int64Array over the buffer and upcast to Array
       out = static_pointer_cast[CArray, CInt64Array](
           make_shared[CInt64Array](dtype, length, buffer))
   
       # Wrap and return as a Python pyarrow.Array
       return pyarrow_wrap_array(out)
   ```
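   
   A mostly-Python alternative, if reaching into `arrow/array.h` feels heavy: fill a buffer (in Cython, or with numpy) and let `pa.Array.from_buffers` wrap it zero-copy. A minimal sketch, not benchmarked; `range_from_buffers` is just an illustrative name:
   
   ```python
   import numpy as np
   import pyarrow as pa
   
   def range_from_buffers(start: int, end: int) -> pa.Int64Array:
       length = end - start
       # np.arange fills the values in C; py_buffer wraps the ndarray
       # zero-copy and keeps it alive via the buffer protocol
       buf = pa.py_buffer(np.arange(start, end, dtype=np.int64))
       # validity buffer is None since there are no nulls
       return pa.Array.from_buffers(pa.int64(), length, [None, buf])
   ```
   
   The Cython module above builds with a setup.py along these lines: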
   
   ```python
   # setup.py
   import os
   
   import pyarrow as pa
   from Cython.Build import cythonize
   from setuptools import Extension, setup
   
   # Ensure the Arrow C++ shared libraries can be found at link time
   try:
       pa.create_library_symlinks()
   except Exception:
       pass
   
   ext_modules = cythonize(
       "range_impl.pyx",
       compiler_directives={
           "language_level": "3",
           "embedsignature": True,
       },
   )
   
   for ext in ext_modules:
       # We only need the Arrow headers and libraries at build time
       ext.include_dirs.append(pa.get_include())
       ext.libraries.extend(pa.get_libraries())
       ext.library_dirs.extend(pa.get_library_dirs())
   
       # Enable C++17
       if os.name == "posix":
           ext.extra_compile_args.append("-std=c++17")
   
   setup(
       name="arrow_range",
       ext_modules=ext_modules,
       setup_requires=["pyarrow"],  # Only needed during build
       install_requires=["pyarrow"],  # Runtime dependency
   )
   ```
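   
   After building in place, a quick sanity check (a hypothetical `smoke_test.py`) confirms the module produces a proper pyarrow array:
   
   ```python
   # smoke_test.py -- run after `python setup.py build_ext --inplace`
   import pyarrow as pa
   
   from range_impl import create_arrow_range
   
   arr = create_arrow_range(0, 5)
   assert isinstance(arr, pa.Int64Array)
   assert arr.to_pylist() == [0, 1, 2, 3, 4]
   ```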
   
   and using this to benchmark:
   ```python
   # benchmark.py
   import time
   
   import pyarrow as pa
   
   from range_impl import create_arrow_range
   
   
   def test_python_range(start, end):
       return pa.array(range(start, end))
   
   
   def test_cython_range(start, end):
       return create_arrow_range(start, end)
   
   
   def benchmark(start, end, n_runs=5):
       times_python = []
       times_cython = []
   
       print(f"Testing range from {start} to {end} ({end-start} elements)")
   
       # Warm up
       _ = test_python_range(0, 1000)
       _ = test_cython_range(0, 1000)
   
       for _ in range(n_runs):
           # Test Python range
           t0 = time.perf_counter()
           arr_py = test_python_range(start, end)
           t1 = time.perf_counter()
           times_python.append(t1 - t0)
   
           # Test Cython range
           t0 = time.perf_counter()
           arr_cy = test_cython_range(start, end)
           t1 = time.perf_counter()
           times_cython.append(t1 - t0)
   
           # Verify results match
           assert arr_py.equals(arr_cy), "Arrays don't match!"
   
       avg_python = sum(times_python) / len(times_python)
       avg_cython = sum(times_cython) / len(times_cython)
   
       print(f"Python average: {avg_python:.4f}s")
       print(f"Cython average: {avg_cython:.4f}s")
       print(f"Speedup: {avg_python/avg_cython:.2f}x")
   
   
   if __name__ == "__main__":
       benchmark(0, 10_000_000)
   ```
   
   Running this on my M1 gives:
   ```sh
   python setup.py build_ext --inplace
   python benchmark.py
   running build_ext
   Testing range from 0 to 10000000 (10000000 elements)
   Python average: 0.9227s
   Cython average: 0.0125s
   Speedup: 73.64x
   ```
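   
   Most of that ~70x is just avoiding the per-element Python/C boundary crossing that `pa.array(range(start, end))` incurs. I'd expect the numpy-based `from_buffers` sketch above to land in the same ballpark as the Cython version, so it's probably worth benchmarking that first before taking on a compiled extension.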

