ping.

On 02/19/2019 03:03 PM, Paul Clarke wrote:
> Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
> (big-endian).
> 
> _mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
> vector doubleword type to vector word type leaves the results in even
> lanes in big endian mode.
> 
> Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
> (32-bit big-endian).
> 
> Interpreting the result of the mfvsrd instruction with an incorrect type
> leads to incorrect results.  Also, the mfvsrd instruction only works as
> expected in 64-bit mode, or for 32-bit quantities in 32-bit mode.  A more
> general, if slower, solution is needed for 32-bit mode.
> 
> 2019-02-19  Paul A. Clarke  <p...@us.ibm.com>
> 
> [gcc]
> 
>       * config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
>       (_mm_cvtpd_ps): Likewise.
>       (_mm_cvttpd_epi32): Likewise.
> 
>       PR89338
>       * config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix type mismatch.
>       (_mm_cvtss_si64): Fix type mismatch and 32-bit.
> 
>       PR89339
>       * config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.
> 
> ---
> v2: more elegant solution for the 32-bit mode fix in _mm_movemask_pi8,
>     as suggested by Segher.
> 
> Index: gcc/config/rs6000/emmintrin.h
> ===================================================================
> diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h
> --- a/trunk/gcc/config/rs6000/emmintrin.h     (revision 268997)
> +++ b/trunk/gcc/config/rs6000/emmintrin.h     (working copy)
> @@ -887,7 +887,11 @@ _mm_cvtpd_epi32 (__m128d __A)
>        : );
>  
>  #ifdef _ARCH_PWR8
> +#ifdef __LITTLE_ENDIAN__
>    temp = vec_mergeo (temp, temp);
> +#else
> +  temp = vec_mergee (temp, temp);
> +#endif
>    result = (__v4si) vec_vpkudum ((__vector long long) temp,
>                                (__vector long long) vzero);
>  #else
> @@ -922,7 +926,11 @@ _mm_cvtpd_ps (__m128d __A)
>        : );
>  
>  #ifdef _ARCH_PWR8
> +#ifdef __LITTLE_ENDIAN__
>    temp = vec_mergeo (temp, temp);
> +#else
> +  temp = vec_mergee (temp, temp);
> +#endif
>    result = (__v4sf) vec_vpkudum ((__vector long long) temp,
>                                (__vector long long) vzero);
>  #else
> @@ -951,7 +959,11 @@ _mm_cvttpd_epi32 (__m128d __A)
>        : );
>  
>  #ifdef _ARCH_PWR8
> +#ifdef __LITTLE_ENDIAN__
>    temp = vec_mergeo (temp, temp);
> +#else
> +  temp = vec_mergee (temp, temp);
> +#endif
>    result = (__v4si) vec_vpkudum ((__vector long long) temp,
>                                (__vector long long) vzero);
>  #else
> Index: gcc/config/rs6000/xmmintrin.h
> ===================================================================
> diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h
> --- a/trunk/gcc/config/rs6000/xmmintrin.h     (revision 268997)
> +++ b/trunk/gcc/config/rs6000/xmmintrin.h     (working copy)
> @@ -905,7 +905,7 @@ _mm_cvtss_f32 (__m128 __A)
>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_cvtss_si32 (__m128 __A)
>  {
> -  __m64 res = 0;
> +  int res;
>  #ifdef _ARCH_PWR8
>    double dtmp;
>    __asm__(
> @@ -938,8 +938,8 @@ _mm_cvt_ss2si (__m128 __A)
>  extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_cvtss_si64 (__m128 __A)
>  {
> -  __m64 res = 0;
> -#ifdef _ARCH_PWR8
> +  long long res;
> +#if defined (_ARCH_PWR8) && defined (__powerpc64__)
>    double dtmp;
>    __asm__(
>  #ifdef __LITTLE_ENDIAN__
> @@ -1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B)
>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_movemask_pi8 (__m64 __A)
>  {
> +#ifdef __powerpc64__
>    unsigned long long p =
>  #ifdef __LITTLE_ENDIAN__
>                           0x0008101820283038UL; // permute control for sign bits
> @@ -1584,6 +1585,12 @@ _mm_movemask_pi8 (__m64 __A)
>                           0x3830282018100800UL; // permute control for sign bits
>  #endif
>    return __builtin_bpermd (p, __A);
> +#else
> +  unsigned int mask = 0x20283038UL;
> +  unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
> +  unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
> +  return (r2 << 4) | r1;
> +#endif
>  }
>  
>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> 

Reply via email to