I've only had a brief look at this, so I'll just point out some stylistic issues I noticed; I'd like another set of eyes on this patch and the next one.
On 29 May 2012 05:13, Matt Turner <matts...@gmail.com> wrote:
> From: Xinyu Qi <x...@marvell.com>
>
> gcc/
>       * config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt intrinsics.
>       Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
>       Use C name-mangling for intrinsics.
>       (__v8qi): Redefine.
>       (_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
>       (_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): Likewise.
>       (_m_from_int): Likewise.
>       (_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
>       (_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
>       (_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
>       (_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
>       (_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
>       (_mm_tbcst_pi32): Likewise.
>       (_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 intrinsic.
>       (_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
>       (_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
>       (_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, _mm_maddx_pu16): Likewise.
>       (_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
>       (_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
>       (_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
>       (_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
>       (_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): Likewise.
>       (_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): Likewise.
>       (_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
>       (_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
>       (_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
>       (_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): Likewise.
>       (_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): Likewise.
>       (_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): Likewise.
>       (_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): Likewise.
>       (_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
>       (_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
>       (_m_to_int): New define.
> ---
>  gcc/config/arm/mmintrin.h | 649 ++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 614 insertions(+), 35 deletions(-)
>
> diff --git a/gcc/config/arm/mmintrin.h b/gcc/config/arm/mmintrin.h
> index 2cc500d..0fe551d 100644
> --- a/gcc/config/arm/mmintrin.h
> +++ b/gcc/config/arm/mmintrin.h
> @@ -24,16 +24,30 @@
>  #ifndef _MMINTRIN_H_INCLUDED
>  #define _MMINTRIN_H_INCLUDED
>
> +#ifndef __IWMMXT__
> +#error You must enable WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) to use iWMMXt/iWMMXt2 intrinsics
> +#else
> +
> +#ifndef __IWMMXT2__
> +#warning You only enable iWMMXt intrinsics. Extended iWMMXt2 intrinsics available only if WMMX2 instructions enabled (e.g. -march=iwmmxt2)
> +#endif
> +

Extra newline.

> +
> +#if defined __cplusplus
> +extern "C" { /* Begin "C" */
> +/* Intrinsics use C name-mangling.  */
> +#endif /* __cplusplus */
> +
>  /* The data type intended for user use.  */
>  typedef unsigned long long __m64, __int64;
>
>  /* Internal data types for implementing the intrinsics.  */
>  typedef int __v2si __attribute__ ((vector_size (8)));
>  typedef short __v4hi __attribute__ ((vector_size (8)));
> -typedef char __v8qi __attribute__ ((vector_size (8)));
> +typedef signed char __v8qi __attribute__ ((vector_size (8)));
>
>  /* "Convert" __m64 and __int64 into each other.  */
> -static __inline __m64
> +static __inline __m64
>  _mm_cvtsi64_m64 (__int64 __i)
>  {
>    return __i;
> @@ -54,7 +68,7 @@ _mm_cvtsi64_si32 (__int64 __i)
>  static __inline __int64
>  _mm_cvtsi32_si64 (int __i)
>  {
> -  return __i;
> +  return (__i & 0xffffffff);
>  }
>
>  /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
> @@ -603,7 +617,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
>  static __inline __m64
>  _mm_andnot_si64 (__m64 __m1, __m64 __m2)
>  {
> -  return __builtin_arm_wandn (__m1, __m2);
> +  return __builtin_arm_wandn (__m2, __m1);
>  }
>
>  /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
> @@ -935,7 +949,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
>  static __inline __m64
>  _mm_sad_pu8 (__m64 __A, __m64 __B)
>  {
> -  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
> +  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
> +}
> +
> +static __inline __m64
> +_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
> +{
> +  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
>  }
>
>  /* Compute the sum of the absolute differences of the unsigned 16-bit
> @@ -944,9 +964,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
>  static __inline __m64
>  _mm_sad_pu16 (__m64 __A, __m64 __B)
>  {
> -  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
> +  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
>  }
>
> +static __inline __m64
> +_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
> +{
> +  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
> +}
> +
> +
>  /* Compute the sum of the absolute differences of the unsigned 8-bit
>     values in A and B.  Return the value in the lower 16-bit word; the
>     upper words are cleared.  */
> @@ -965,11 +992,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
>    return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
>  }
>
> -static __inline __m64
> -_mm_align_si64 (__m64 __A, __m64 __B, int __C)
> -{
> -  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
> -}
> +#define _mm_align_si64(__A,__B, N) \
> +  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
>
>  /* Creates a 64-bit zero.  */
>  static __inline __m64
> @@ -987,42 +1011,76 @@ _mm_setwcx (const int __value, const int __regno)
>  {
>    switch (__regno)
>      {
> -    case 0: __builtin_arm_setwcx (__value, 0); break;
> -    case 1: __builtin_arm_setwcx (__value, 1); break;
> -    case 2: __builtin_arm_setwcx (__value, 2); break;
> -    case 3: __builtin_arm_setwcx (__value, 3); break;
> -    case 8: __builtin_arm_setwcx (__value, 8); break;
> -    case 9: __builtin_arm_setwcx (__value, 9); break;
> -    case 10: __builtin_arm_setwcx (__value, 10); break;
> -    case 11: __builtin_arm_setwcx (__value, 11); break;
> -    default: break;
> +    case 0:
> +      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
> +      break;
> +    case 1:
> +      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
> +      break;
> +    case 2:
> +      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
> +      break;
> +    case 3:
> +      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
> +      break;
> +    case 8:
> +      __builtin_arm_setwcgr0 (__value);
> +      break;
> +    case 9:
> +      __builtin_arm_setwcgr1 (__value);
> +      break;
> +    case 10:
> +      __builtin_arm_setwcgr2 (__value);
> +      break;
> +    case 11:
> +      __builtin_arm_setwcgr3 (__value);
> +      break;
> +    default:
> +      break;
>      }
>  }
>
>  static __inline int
>  _mm_getwcx (const int __regno)
>  {
> +  int __value;
>    switch (__regno)
>      {
> -    case 0: return __builtin_arm_getwcx (0);
> -    case 1: return __builtin_arm_getwcx (1);
> -    case 2: return __builtin_arm_getwcx (2);
> -    case 3: return __builtin_arm_getwcx (3);
> -    case 8: return __builtin_arm_getwcx (8);
> -    case 9: return __builtin_arm_getwcx (9);
> -    case 10: return __builtin_arm_getwcx (10);
> -    case 11: return __builtin_arm_getwcx (11);
> -    default: return 0;
> +    case 0:
> +      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
> +      break;
> +    case 1:
> +      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
> +      break;
> +    case 2:
> +      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
> +      break;
> +    case 3:
> +      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
> +      break;
> +    case 8:
> +      return __builtin_arm_getwcgr0 ();
> +    case 9:
> +      return __builtin_arm_getwcgr1 ();
> +    case 10:
> +      return __builtin_arm_getwcgr2 ();
> +    case 11:
> +      return __builtin_arm_getwcgr3 ();
> +    default:
> +      break;
>      }
> +  return __value;
>  }
>
>  /* Creates a vector of two 32-bit values; I0 is least significant.  */
>  static __inline __m64
>  _mm_set_pi32 (int __i1, int __i0)
>  {
> -  union {
> +  union
> +  {
>      __m64 __q;
> -    struct {
> +    struct
> +    {
>        unsigned int __i0;
>        unsigned int __i1;
>      } __s;
> @@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
>    unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
>    unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
>    return _mm_set_pi32 (__i1, __i0);
> -
> +

Extra newline again here.

>  }
>
>  /* Creates a vector of eight 8-bit values; B0 is least significant.  */
> @@ -1108,11 +1166,526 @@ _mm_set1_pi8 (char __b)
>    return _mm_set1_pi32 (__i);
>  }
>
> -/* Convert an integer to a __m64 object.  */
> +#ifdef __IWMMXT2__
> +static __inline __m64
> +_mm_abs_pi8 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
> +}
> +
> +static __inline __m64
> +_mm_abs_pi16 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
> +

And here.

> +}
> +
> +static __inline __m64
> +_mm_abs_pi32 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
> +

and here.
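Going back up to the _mm_setwcx / _mm_getwcx rewrite: if I read the
switch statements correctly, regnos 0-3 map to the control registers
(wcid, wcon, wcssf, wcasf) via tmcr/tmrc, and regnos 8-11 map to
wCGR0-wCGR3 via the new builtins. A quick, untested sketch of what a
caller would look like, just to check my understanding (the register
numbers are taken from the switch cases above):

  #include <mmintrin.h>

  int
  test_wcgr (void)
  {
    /* regno 8 should go through __builtin_arm_setwcgr0 and
       __builtin_arm_getwcgr0, i.e. wCGR0.  */
    _mm_setwcx (42, 8);
    return _mm_getwcx (8);  /* Expect 42.  */
  }

Incidentally, unless I'm misreading it, the new _mm_getwcx returns an
uninitialized __value for an out-of-range regno, where the old code
returned 0. That may be intentional, but it's worth a second look.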
<large part snipped.>

> +
> +#define _mm_qmiabb_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiabbn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiabt_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiabtn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc=acc;\
> +   __m64 _m1=m1;\
> +   __m64 _m2=m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiatb_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiatbn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiatt_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_qmiattn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiabb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiabbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiabt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiabtn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiatb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiatbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiatt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiattn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawbb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawbbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawbt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawbtn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawtb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawtbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawtt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })
> +
> +#define _mm_wmiawttn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +  })

I assume someone knows why these are macros and not inline functions
like the others?

> +
> +/* The third arguments should be an immediate.  */

s/arguments/argument

> +#define _mm_merge_si64(a, b, n) \
> +  ({\
> +   __m64 result;\
> +   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
> +   result;\
> +  })
> +#endif /* __IWMMXT2__ */
> +
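To make my question above concrete: unless the wmia* builtins place an
immediate-operand requirement on one of their arguments (I haven't
checked the builtin definitions), each of these macros looks like it
could be written as a plain inline function in the same style as the
rest of the header, e.g. (hypothetical, untested):

  /* Hypothetical inline-function equivalent of the _mm_wmiabb_si64
     macro above; only valid if __builtin_arm_wmiabb accepts ordinary
     (non-immediate) operands.  */
  static __inline __m64
  _mm_wmiabb_si64 (__m64 __acc, __m64 __m1, __m64 __m2)
  {
    return (__m64) __builtin_arm_wmiabb (__acc, (__v4hi)__m1, (__v4hi)__m2);
  }

_mm_merge_si64 is a different case: per the comment its third argument
must be an immediate, so a macro is the natural way to keep N a
literal there.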