Implement SSE2 lrint() and lrintf() on amd64.

Index: Makefile
===================================================================
RCS file: /cvs/ports/audio/libsamplerate/Makefile,v
retrieving revision 1.27
diff -u -p -u -p -r1.27 Makefile
--- Makefile    5 Sep 2023 16:13:38 -0000       1.27
+++ Makefile    27 Apr 2024 00:26:05 -0000
@@ -2,7 +2,7 @@ COMMENT=        audio sample rate conversion li
 
 VER=           0.2.2
 DISTNAME=      libsamplerate-${VER}
-REVISION=      0
+REVISION=      1
 CATEGORIES=    audio
 EXTRACT_SUFX=  .tar.xz
 
@@ -18,7 +18,9 @@ SITES=        https://github.com/libsndfile/lib
 
 WANTLIB=       m
 
-CONFIGURE_STYLE=gnu
+AUTOCONF_VERSION=      2.71
+AUTOMAKE_VERSION=      1.16
+CONFIGURE_STYLE=autoreconf
 CONFIGURE_ARGS=        --disable-cpu-clip \
                --disable-fftw \
                --disable-sndfile
Index: patches/patch-configure_ac
===================================================================
RCS file: patches/patch-configure_ac
diff -N patches/patch-configure_ac
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-configure_ac  27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,43 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: configure.ac
+--- configure.ac.orig
++++ configure.ac
+@@ -89,7 +89,7 @@ m4_define([abi_version_patch], [lt_revision])
+ 
+ dnl ====================================================================================
+ 
+-AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h])
++AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h immintrin.h])
+ 
+ dnl ====================================================================================
+ dnl  Couple of initializations here. Fill in real values later.
+@@ -105,6 +105,9 @@ AC_ARG_ENABLE([werror],
+ AC_ARG_ENABLE([cpu-clip],
+       [AS_HELP_STRING([--disable-cpu-clip], [disable tricky cpu specific clipper])])
+ 
++AC_ARG_ENABLE([sse2-lrint],
++      [AS_HELP_STRING([--enable-sse2-lrint], [implement lrintf using SSE2 on x86 CPUs if possible])])
++
+ AC_ARG_ENABLE([sndfile],
+       [AS_HELP_STRING([--disable-sndfile], [disable support for sndfile (default=autodetect)])], [], [enable_sndfile=auto])
+ 
+@@ -178,6 +181,13 @@ AS_IF([test "x$enable_cpu_clip" != "xno"], [
+ 
+ AC_DEFINE_UNQUOTED([CPU_CLIPS_POSITIVE], [${ac_cv_c_clip_positive}], [Host processor clips on positive float to int conversion.])
+ AC_DEFINE_UNQUOTED([CPU_CLIPS_NEGATIVE], [${ac_cv_c_clip_negative}], [Host processor clips on negative float to int conversion.])
++
++dnl ====================================================================================
++dnl  Determine if the user enabled lrint implementations using SSE2.
++
++AS_IF([test "x$enable_sse2_lrint" = "xyes"], [
++              CFLAGS="$CFLAGS -DENABLE_SSE2_LRINT"
++      ])
+ 
+ dnl ====================================================================================
+ dnl  Check for libsndfile which is required for the test and example programs.
Index: patches/patch-examples_audio_out_c
===================================================================
RCS file: patches/patch-examples_audio_out_c
diff -N patches/patch-examples_audio_out_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-examples_audio_out_c  27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,19 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: examples/audio_out.c
+--- examples/audio_out.c.orig
++++ examples/audio_out.c
+@@ -960,7 +960,7 @@ solaris_play (get_audio_callback_t callback, AUDIO_OUT
+ 
+       while ((read_frames = callback (callback_data, float_buffer, BUFFER_LEN / solaris_out->channels)))
+       {       for (k = 0 ; k < read_frames * solaris_out->channels ; k++)
+-                      buffer [k] = lrint (32767.0 * float_buffer [k]) ;
++                      buffer [k] = psf_lrint (32767.0 * float_buffer [k]) ;
+               write (solaris_out->fd, buffer, read_frames * solaris_out->channels * sizeof (short)) ;
+               } ;
+ 
Index: patches/patch-src_common_h
===================================================================
RCS file: patches/patch-src_common_h
diff -N patches/patch-src_common_h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_common_h  27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,98 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: src/common.h
+--- src/common.h.orig
++++ src/common.h
+@@ -14,6 +14,36 @@
+ #include <stdbool.h>
+ #endif
+ 
++#if defined(__x86_64__) || defined(_M_X64)
++#   define HAVE_SSE2_INTRINSICS
++#elif defined(ENABLE_SSE2_LRINT) && (defined(_M_IX86) || defined(__i386__))
++#   if defined(_MSC_VER)
++#       define HAVE_SSE2_INTRINSICS
++#   elif defined(__clang__)
++#       ifdef __SSE2__
++#           define HAVE_SSE2_INTRINSICS
++#       elif (__has_attribute(target))
++#           define HAVE_SSE2_INTRINSICS
++#           define USE_TARGET_ATTRIBUTE
++#       endif
++#   elif defined(__GNUC__)
++#       ifdef __SSE2__
++#           define HAVE_SSE2_INTRINSICS
++#       elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))
++#           define HAVE_SSE2_INTRINSICS
++#           define USE_TARGET_ATTRIBUTE
++#       endif
++#   endif
++#endif
++
++#ifdef HAVE_SSE2_INTRINSICS
++#ifdef HAVE_IMMINTRIN_H
++#include <immintrin.h>
++#else
++#include <emmintrin.h>
++#endif
++#endif /* HAVE_SSE2_INTRINSICS */
++
+ #include <math.h>
+ 
+ #ifdef HAVE_VISIBILITY
+@@ -163,6 +193,41 @@ const char* zoh_get_description (int src_enum) ;
+ SRC_STATE *zoh_state_new (int channels, SRC_ERROR *error) ;
+ 
+ /*----------------------------------------------------------
++** SIMD optimized math functions.
++*/
++
++#ifdef HAVE_SSE2_INTRINSICS
++static inline int
++#ifdef USE_TARGET_ATTRIBUTE
++__attribute__((target("sse2")))
++#endif
++psf_lrintf (float x)
++{
++      return _mm_cvtss_si32 (_mm_load_ss (&x)) ;
++}
++static inline int
++#ifdef USE_TARGET_ATTRIBUTE
++__attribute__((target("sse2")))
++#endif
++psf_lrint (double x)
++{
++      return _mm_cvtsd_si32 (_mm_load_sd (&x)) ;
++}
++
++#else
++
++static inline int psf_lrintf (float x)
++{
++      return lrintf (x) ;
++} /* psf_lrintf */
++
++static inline int psf_lrint (double x)
++{
++      return lrint (x) ;
++} /* psf_lrint */
++#endif
++
++/*----------------------------------------------------------
+ **    Common static inline functions.
+ */
+ 
+@@ -170,7 +235,7 @@ static inline double
+ fmod_one (double x)
+ {     double res ;
+ 
+-      res = x - lrint (x) ;
++      res = x - psf_lrint (x) ;
+       if (res < 0.0)
+               return res + 1.0 ;
+ 
Index: patches/patch-src_samplerate_c
===================================================================
RCS file: patches/patch-src_samplerate_c
diff -N patches/patch-src_samplerate_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_samplerate_c      27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,28 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: src/samplerate.c
+--- src/samplerate.c.orig
++++ src/samplerate.c
+@@ -445,7 +445,7 @@ src_float_to_short_array (const float *in, short *out,
+               else if (scaled_value <= -32768.f)
+                       out [i] = -32768 ;
+               else
+-                      out [i] = (short) (lrintf (scaled_value)) ;
++                      out [i] = (short) (psf_lrintf (scaled_value)) ;
+       }
+ } /* src_float_to_short_array */
+ 
+@@ -477,7 +477,7 @@ src_float_to_int_array (const float *in, int *out, int
+                       continue ;
+                       } ;
+ #endif
+-              out [i] = (int) lrint (scaled_value) ;
++              out [i] = (int) psf_lrint (scaled_value) ;
+               } ;
+ 
+ } /* src_float_to_int_array */
Index: patches/patch-src_src_linear_c
===================================================================
RCS file: patches/patch-src_src_linear_c
diff -N patches/patch-src_src_linear_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_src_linear_c      27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,28 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: src/src_linear.c
+--- src/src_linear.c.orig
++++ src/src_linear.c
+@@ -102,7 +102,7 @@ linear_vari_process (SRC_STATE *state, SRC_DATA *data)
+               } ;
+ 
+       rem = fmod_one (input_index) ;
+-      priv->in_used += state->channels * lrint (input_index - rem) ;
++      priv->in_used += state->channels * psf_lrint (input_index - rem) ;
+       input_index = rem ;
+ 
+       /* Main processing loop. */
+@@ -128,7 +128,7 @@ linear_vari_process (SRC_STATE *state, SRC_DATA *data)
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              priv->in_used += state->channels * lrint (input_index - rem) ;
++              priv->in_used += state->channels * psf_lrint (input_index - rem) ;
+               input_index = rem ;
+               } ;
+ 
Index: patches/patch-src_src_sinc_c
===================================================================
RCS file: patches/patch-src_src_sinc_c
diff -N patches/patch-src_src_sinc_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_src_sinc_c        27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,148 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: src/src_sinc.c
+--- src/src_sinc.c.orig
++++ src/src_sinc.c
+@@ -132,7 +132,7 @@ static SRC_STATE_VT sinc_mono_state_vt =
+ 
+ static inline increment_t
+ double_to_fp (double x)
+-{     return (increment_t) (lrint ((x) * FP_ONE)) ;
++{     return (increment_t) (psf_lrint ((x) * FP_ONE)) ;
+ } /* double_to_fp */
+ 
+ static inline increment_t
+@@ -240,7 +240,7 @@ sinc_filter_new (int converter_type, int channels)
+ #endif
+               }
+ 
+-              priv->b_len = 3 * (int) lrint ((priv->coeff_half_len + 2.0) / priv->index_inc * SRC_MAX_RATIO + 1) ;
++              priv->b_len = 3 * (int) psf_lrint ((priv->coeff_half_len + 2.0) / priv->index_inc * SRC_MAX_RATIO + 1) ;
+               priv->b_len = MAX (priv->b_len, 4096) ;
+               priv->b_len *= channels ;
+               priv->b_len += 1 ; // There is a <= check against samples_in_hand requiring a buffer bigger than the calculation above
+@@ -458,12 +458,12 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da
+               count /= MIN (state->last_ratio, data->src_ratio) ;
+ 
+       /* Maximum coefficientson either side of center point. */
+-      half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ;
++      half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ;
+ 
+       input_index = state->last_position ;
+ 
+       rem = fmod_one (input_index) ;
+-      filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++      filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+       input_index = rem ;
+ 
+       terminate = 1.0 / src_ratio + 1e-20 ;
+@@ -505,7 +505,7 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++              filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+               input_index = rem ;
+               } ;
+ 
+@@ -614,12 +614,12 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA *
+               count /= MIN (state->last_ratio, data->src_ratio) ;
+ 
+       /* Maximum coefficientson either side of center point. */
+-      half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ;
++      half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ;
+ 
+       input_index = state->last_position ;
+ 
+       rem = fmod_one (input_index) ;
+-      filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++      filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+       input_index = rem ;
+ 
+       terminate = 1.0 / src_ratio + 1e-20 ;
+@@ -660,7 +660,7 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA *
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++              filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+               input_index = rem ;
+               } ;
+ 
+@@ -770,12 +770,12 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da
+               count /= MIN (state->last_ratio, data->src_ratio) ;
+ 
+       /* Maximum coefficientson either side of center point. */
+-      half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ;
++      half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ;
+ 
+       input_index = state->last_position ;
+ 
+       rem = fmod_one (input_index) ;
+-      filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++      filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+       input_index = rem ;
+ 
+       terminate = 1.0 / src_ratio + 1e-20 ;
+@@ -816,7 +816,7 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++              filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+               input_index = rem ;
+               } ;
+ 
+@@ -925,12 +925,12 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat
+               count /= MIN (state->last_ratio, data->src_ratio) ;
+ 
+       /* Maximum coefficientson either side of center point. */
+-      half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ;
++      half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ;
+ 
+       input_index = state->last_position ;
+ 
+       rem = fmod_one (input_index) ;
+-      filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++      filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+       input_index = rem ;
+ 
+       terminate = 1.0 / src_ratio + 1e-20 ;
+@@ -971,7 +971,7 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++              filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+               input_index = rem ;
+               } ;
+ 
+@@ -1090,12 +1090,12 @@ sinc_multichan_vari_process (SRC_STATE *state, SRC_DAT
+               count /= MIN (state->last_ratio, data->src_ratio) ;
+ 
+       /* Maximum coefficientson either side of center point. */
+-      half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ;
++      half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ;
+ 
+       input_index = state->last_position ;
+ 
+       rem = fmod_one (input_index) ;
+-      filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++      filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+       input_index = rem ;
+ 
+       terminate = 1.0 / src_ratio + 1e-20 ;
+@@ -1136,7 +1136,7 @@ sinc_multichan_vari_process (SRC_STATE *state, SRC_DAT
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ;
++              filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ;
+               input_index = rem ;
+               } ;
+ 
Index: patches/patch-src_src_zoh_c
===================================================================
RCS file: patches/patch-src_src_zoh_c
diff -N patches/patch-src_src_zoh_c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ patches/patch-src_src_zoh_c 27 Apr 2024 00:26:05 -0000
@@ -0,0 +1,28 @@
+- Implement SSE2 lrint() and lrintf()
+  7a81766b14fa03e97822cf1e0b1651648df13116
+- use sse2 intrinsics for lrint/lrintf only on windows x64
+  c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9
+- sse2 lrint/lrintf updates
+  c164eaa25ffdeedc7d25e731172cc45a25f483d4
+
+Index: src/src_zoh.c
+--- src/src_zoh.c.orig
++++ src/src_zoh.c
+@@ -99,7 +99,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data)
+               } ;
+ 
+       rem = fmod_one (input_index) ;
+-      priv->in_used += state->channels * lrint (input_index - rem) ;
++      priv->in_used += state->channels * psf_lrint (input_index - rem) ;
+       input_index = rem ;
+ 
+       /* Main processing loop. */
+@@ -117,7 +117,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data)
+               input_index += 1.0 / src_ratio ;
+               rem = fmod_one (input_index) ;
+ 
+-              priv->in_used += state->channels * lrint (input_index - rem) ;
++              priv->in_used += state->channels * psf_lrint (input_index - rem) ;
+               input_index = rem ;
+               } ;
+ 

Reply via email to