Implement lrint() and lrintf() with SSE2 intrinsics on amd64.
Index: Makefile =================================================================== RCS file: /cvs/ports/audio/libsamplerate/Makefile,v retrieving revision 1.27 diff -u -p -u -p -r1.27 Makefile --- Makefile 5 Sep 2023 16:13:38 -0000 1.27 +++ Makefile 27 Apr 2024 00:26:05 -0000 @@ -2,7 +2,7 @@ COMMENT= audio sample rate conversion li VER= 0.2.2 DISTNAME= libsamplerate-${VER} -REVISION= 0 +REVISION= 1 CATEGORIES= audio EXTRACT_SUFX= .tar.xz @@ -18,7 +18,9 @@ SITES= https://github.com/libsndfile/lib WANTLIB= m -CONFIGURE_STYLE=gnu +AUTOCONF_VERSION= 2.71 +AUTOMAKE_VERSION= 1.16 +CONFIGURE_STYLE=autoreconf CONFIGURE_ARGS= --disable-cpu-clip \ --disable-fftw \ --disable-sndfile Index: patches/patch-configure_ac =================================================================== RCS file: patches/patch-configure_ac diff -N patches/patch-configure_ac --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-configure_ac 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,43 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: configure.ac +--- configure.ac.orig ++++ configure.ac +@@ -89,7 +89,7 @@ m4_define([abi_version_patch], [lt_revision]) + + dnl ==================================================================================== + +-AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h]) ++AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h immintrin.h]) + + dnl ==================================================================================== + dnl Couple of initializations here. Fill in real values later. 
+@@ -105,6 +105,9 @@ AC_ARG_ENABLE([werror], + AC_ARG_ENABLE([cpu-clip], + [AS_HELP_STRING([--disable-cpu-clip], [disable tricky cpu specific clipper])]) + ++AC_ARG_ENABLE([sse2-lrint], ++ [AS_HELP_STRING([--enable-sse2-lrint], [implement lrintf using SSE2 on x86 CPUs if possible])]) ++ + AC_ARG_ENABLE([sndfile], + [AS_HELP_STRING([--disable-sndfile], [disable support for sndfile (default=autodetect)])], [], [enable_sndfile=auto]) + +@@ -178,6 +181,13 @@ AS_IF([test "x$enable_cpu_clip" != "xno"], [ + + AC_DEFINE_UNQUOTED([CPU_CLIPS_POSITIVE], [${ac_cv_c_clip_positive}], [Host processor clips on positive float to int conversion.]) + AC_DEFINE_UNQUOTED([CPU_CLIPS_NEGATIVE], [${ac_cv_c_clip_negative}], [Host processor clips on negative float to int conversion.]) ++ ++dnl ==================================================================================== ++dnl Determine if the user enabled lrint implementations using SSE2. ++ ++AS_IF([test "x$enable_sse2_lrint" = "xyes"], [ ++ CFLAGS="$CFLAGS -DENABLE_SSE2_LRINT" ++ ]) + + dnl ==================================================================================== + dnl Check for libsndfile which is required for the test and example programs. 
Index: patches/patch-examples_audio_out_c =================================================================== RCS file: patches/patch-examples_audio_out_c diff -N patches/patch-examples_audio_out_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-examples_audio_out_c 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,19 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: examples/audio_out.c +--- examples/audio_out.c.orig ++++ examples/audio_out.c +@@ -960,7 +960,7 @@ solaris_play (get_audio_callback_t callback, AUDIO_OUT + + while ((read_frames = callback (callback_data, float_buffer, BUFFER_LEN / solaris_out->channels))) + { for (k = 0 ; k < read_frames * solaris_out->channels ; k++) +- buffer [k] = lrint (32767.0 * float_buffer [k]) ; ++ buffer [k] = psf_lrint (32767.0 * float_buffer [k]) ; + write (solaris_out->fd, buffer, read_frames * solaris_out->channels * sizeof (short)) ; + } ; + Index: patches/patch-src_common_h =================================================================== RCS file: patches/patch-src_common_h diff -N patches/patch-src_common_h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_common_h 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,98 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: src/common.h +--- src/common.h.orig ++++ src/common.h +@@ -14,6 +14,36 @@ + #include <stdbool.h> + #endif + ++#if defined(__x86_64__) || defined(_M_X64) ++# define HAVE_SSE2_INTRINSICS ++#elif defined(ENABLE_SSE2_LRINT) && (defined(_M_IX86) || defined(__i386__)) ++# if defined(_MSC_VER) ++# define HAVE_SSE2_INTRINSICS ++# 
elif defined(__clang__) ++# ifdef __SSE2__ ++# define HAVE_SSE2_INTRINSICS ++# elif (__has_attribute(target)) ++# define HAVE_SSE2_INTRINSICS ++# define USE_TARGET_ATTRIBUTE ++# endif ++# elif defined(__GNUC__) ++# ifdef __SSE2__ ++# define HAVE_SSE2_INTRINSICS ++# elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) ++# define HAVE_SSE2_INTRINSICS ++# define USE_TARGET_ATTRIBUTE ++# endif ++# endif ++#endif ++ ++#ifdef HAVE_SSE2_INTRINSICS ++#ifdef HAVE_IMMINTRIN_H ++#include <immintrin.h> ++#else ++#include <emmintrin.h> ++#endif ++#endif /* HAVE_SSE2_INTRINSICS */ ++ + #include <math.h> + + #ifdef HAVE_VISIBILITY +@@ -163,6 +193,41 @@ const char* zoh_get_description (int src_enum) ; + SRC_STATE *zoh_state_new (int channels, SRC_ERROR *error) ; + + /*---------------------------------------------------------- ++** SIMD optimized math functions. ++*/ ++ ++#ifdef HAVE_SSE2_INTRINSICS ++static inline int ++#ifdef USE_TARGET_ATTRIBUTE ++__attribute__((target("sse2"))) ++#endif ++psf_lrintf (float x) ++{ ++ return _mm_cvtss_si32 (_mm_load_ss (&x)) ; ++} ++static inline int ++#ifdef USE_TARGET_ATTRIBUTE ++__attribute__((target("sse2"))) ++#endif ++psf_lrint (double x) ++{ ++ return _mm_cvtsd_si32 (_mm_load_sd (&x)) ; ++} ++ ++#else ++ ++static inline int psf_lrintf (float x) ++{ ++ return lrintf (x) ; ++} /* psf_lrintf */ ++ ++static inline int psf_lrint (double x) ++{ ++ return lrint (x) ; ++} /* psf_lrint */ ++#endif ++ ++/*---------------------------------------------------------- + ** Common static inline functions. 
+ */ + +@@ -170,7 +235,7 @@ static inline double + fmod_one (double x) + { double res ; + +- res = x - lrint (x) ; ++ res = x - psf_lrint (x) ; + if (res < 0.0) + return res + 1.0 ; + Index: patches/patch-src_samplerate_c =================================================================== RCS file: patches/patch-src_samplerate_c diff -N patches/patch-src_samplerate_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_samplerate_c 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,28 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: src/samplerate.c +--- src/samplerate.c.orig ++++ src/samplerate.c +@@ -445,7 +445,7 @@ src_float_to_short_array (const float *in, short *out, + else if (scaled_value <= -32768.f) + out [i] = -32768 ; + else +- out [i] = (short) (lrintf (scaled_value)) ; ++ out [i] = (short) (psf_lrintf (scaled_value)) ; + } + } /* src_float_to_short_array */ + +@@ -477,7 +477,7 @@ src_float_to_int_array (const float *in, int *out, int + continue ; + } ; + #endif +- out [i] = (int) lrint (scaled_value) ; ++ out [i] = (int) psf_lrint (scaled_value) ; + } ; + + } /* src_float_to_int_array */ Index: patches/patch-src_src_linear_c =================================================================== RCS file: patches/patch-src_src_linear_c diff -N patches/patch-src_src_linear_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_src_linear_c 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,28 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: src/src_linear.c +--- src/src_linear.c.orig ++++ src/src_linear.c +@@ -102,7 +102,7 @@ 
linear_vari_process (SRC_STATE *state, SRC_DATA *data) + } ; + + rem = fmod_one (input_index) ; +- priv->in_used += state->channels * lrint (input_index - rem) ; ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; + input_index = rem ; + + /* Main processing loop. */ +@@ -128,7 +128,7 @@ linear_vari_process (SRC_STATE *state, SRC_DATA *data) + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- priv->in_used += state->channels * lrint (input_index - rem) ; ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; + input_index = rem ; + } ; + Index: patches/patch-src_src_sinc_c =================================================================== RCS file: patches/patch-src_src_sinc_c diff -N patches/patch-src_src_sinc_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_src_sinc_c 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,148 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: src/src_sinc.c +--- src/src_sinc.c.orig ++++ src/src_sinc.c +@@ -132,7 +132,7 @@ static SRC_STATE_VT sinc_mono_state_vt = + + static inline increment_t + double_to_fp (double x) +-{ return (increment_t) (lrint ((x) * FP_ONE)) ; ++{ return (increment_t) (psf_lrint ((x) * FP_ONE)) ; + } /* double_to_fp */ + + static inline increment_t +@@ -240,7 +240,7 @@ sinc_filter_new (int converter_type, int channels) + #endif + } + +- priv->b_len = 3 * (int) lrint ((priv->coeff_half_len + 2.0) / priv->index_inc * SRC_MAX_RATIO + 1) ; ++ priv->b_len = 3 * (int) psf_lrint ((priv->coeff_half_len + 2.0) / priv->index_inc * SRC_MAX_RATIO + 1) ; + priv->b_len = MAX (priv->b_len, 4096) ; + priv->b_len *= channels ; + priv->b_len += 1 ; // There is a <= check against samples_in_hand requiring a buffer bigger than the calculation above +@@ -458,12 
+458,12 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da + count /= MIN (state->last_ratio, data->src_ratio) ; + + /* Maximum coefficientson either side of center point. */ +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; + + input_index = state->last_position ; + + rem = fmod_one (input_index) ; +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + + terminate = 1.0 / src_ratio + 1e-20 ; +@@ -505,7 +505,7 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + } ; + +@@ -614,12 +614,12 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA * + count /= MIN (state->last_ratio, data->src_ratio) ; + + /* Maximum coefficientson either side of center point. 
*/ +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; + + input_index = state->last_position ; + + rem = fmod_one (input_index) ; +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + + terminate = 1.0 / src_ratio + 1e-20 ; +@@ -660,7 +660,7 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA * + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + } ; + +@@ -770,12 +770,12 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da + count /= MIN (state->last_ratio, data->src_ratio) ; + + /* Maximum coefficientson either side of center point. 
*/ +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; + + input_index = state->last_position ; + + rem = fmod_one (input_index) ; +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + + terminate = 1.0 / src_ratio + 1e-20 ; +@@ -816,7 +816,7 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + } ; + +@@ -925,12 +925,12 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat + count /= MIN (state->last_ratio, data->src_ratio) ; + + /* Maximum coefficientson either side of center point. 
*/ +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; + + input_index = state->last_position ; + + rem = fmod_one (input_index) ; +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + + terminate = 1.0 / src_ratio + 1e-20 ; +@@ -971,7 +971,7 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + } ; + +@@ -1090,12 +1090,12 @@ sinc_multichan_vari_process (SRC_STATE *state, SRC_DAT + count /= MIN (state->last_ratio, data->src_ratio) ; + + /* Maximum coefficientson either side of center point. 
*/ +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; + + input_index = state->last_position ; + + rem = fmod_one (input_index) ; +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + + terminate = 1.0 / src_ratio + 1e-20 ; +@@ -1136,7 +1136,7 @@ sinc_multichan_vari_process (SRC_STATE *state, SRC_DAT + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- filter->b_current = (filter->b_current + state->channels * lrint (input_index - rem)) % filter->b_len ; ++ filter->b_current = (filter->b_current + state->channels * psf_lrint (input_index - rem)) % filter->b_len ; + input_index = rem ; + } ; + Index: patches/patch-src_src_zoh_c =================================================================== RCS file: patches/patch-src_src_zoh_c diff -N patches/patch-src_src_zoh_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_src_zoh_c 27 Apr 2024 00:26:05 -0000 @@ -0,0 +1,28 @@ +- Implement SSE2 lrint() and lrintf() + 7a81766b14fa03e97822cf1e0b1651648df13116 +- use sse2 intrinsics for lrint/lrintf only on windows x64 + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 +- sse2 lrint/lrintf updates + c164eaa25ffdeedc7d25e731172cc45a25f483d4 + +Index: src/src_zoh.c +--- src/src_zoh.c.orig ++++ src/src_zoh.c +@@ -99,7 +99,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data) + } ; + + rem = fmod_one (input_index) ; +- priv->in_used += state->channels * lrint (input_index - rem) ; ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; + input_index = rem ; + + /* Main processing loop. 
*/ +@@ -117,7 +117,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data) + input_index += 1.0 / src_ratio ; + rem = fmod_one (input_index) ; + +- priv->in_used += state->channels * lrint (input_index - rem) ; ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; + input_index = rem ; + } ; +