Hi all, Following the implementation of the Crypto intrinsics I posted earlier this week, this patch implements the vceq_p64 and vtst_p64 intrinsics that operate on the new poly64_t type. They do not have a regular form and thus cannot be autogenerated from our beloved ML scripts; they are therefore synthesised as a vceq_u32 or vtst_u32 operation, followed by a pairwise reduce with min or max respectively.
These intrinsics are only available when the crypto intrinsics are available (i.e. -mfpu=crypto-neon-fp-armv8 and -mfloat-abi=(hard|softfp)).
I've added two runtime tests to make sure they generate correct results. Ok for trunk? Thanks, Kyrill 2013-12-06 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * config/arm/neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64. * config/arm/arm_neon.h: Regenerate. * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64. * doc/arm-neon-intrinsics.texi: Regenerate. 2013-12-06 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * gcc.target/arm/neon-vceq_p64.c: New test. * gcc.target/arm/neon-vtst_p64.c: Likewise.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 59ef22c..cc3f56c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -13278,6 +13278,26 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val) #endif } +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_p64 (poly64x1_t a, poly64x1_t b) +{ + uint32x2_t t_a = vreinterpret_u32_p64 (a); + uint32x2_t t_b = vreinterpret_u32_p64 (b); + uint32x2_t c = vceq_u32 (t_a, t_b); + uint32x2_t m = vpmin_u32 (c, c); + return vreinterpret_u64_u32 (m); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_p64 (poly64x1_t a, poly64x1_t b) +{ + uint32x2_t t_a = vreinterpret_u32_p64 (a); + uint32x2_t t_b = vreinterpret_u32_p64 (b); + uint32x2_t c = vtst_u32 (t_a, t_b); + uint32x2_t m = vpmax_u32 (c, c); + return vreinterpret_u64_u32 (m); +} + __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) { diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml index 41ae059..8945da7 100644 --- a/gcc/config/arm/neon-docgen.ml +++ b/gcc/config/arm/neon-docgen.ml @@ -340,6 +340,14 @@ let crypto_doc = @end itemize @itemize @bullet +@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet +@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet @item uint32_t vsha1h_u32 (uint32_t) @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}} @end itemize diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml index 968c171..69618d0 100644 --- a/gcc/config/arm/neon.ml +++ b/gcc/config/arm/neon.ml @@ -2208,6 +2208,26 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val) #endif } +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_p64 (poly64x1_t a, poly64x1_t b) +{ + uint32x2_t t_a = vreinterpret_u32_p64 (a); + uint32x2_t t_b = vreinterpret_u32_p64 (b); 
+ uint32x2_t c = vceq_u32 (t_a, t_b); + uint32x2_t m = vpmin_u32 (c, c); + return vreinterpret_u64_u32 (m); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_p64 (poly64x1_t a, poly64x1_t b) +{ + uint32x2_t t_a = vreinterpret_u32_p64 (a); + uint32x2_t t_b = vreinterpret_u32_p64 (b); + uint32x2_t c = vtst_u32 (t_a, t_b); + uint32x2_t m = vpmax_u32 (c, c); + return vreinterpret_u64_u32 (m); +} + __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) { diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi index 610892d..b146868 100644 --- a/gcc/doc/arm-neon-intrinsics.texi +++ b/gcc/doc/arm-neon-intrinsics.texi @@ -11939,6 +11939,14 @@ @end itemize @itemize @bullet +@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet +@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet @item uint32_t vsha1h_u32 (uint32_t) @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}} @end itemize diff --git a/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c new file mode 100644 index 0000000..21a6a78 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-require-effective-target arm_neon_hw } */ +/* { dg-add-options arm_crypto } */ + +#include "arm_neon.h" +#include <stdio.h> + +extern void abort (void); + +int +main (void) +{ + uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff, + ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 }; + int i, j; + + for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i) + { + for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j) + { + uint64_t a1 = args[i]; + uint64_t a2 = args[j]; + uint64_t res = vceq_p64 (vreinterpret_p64_u64 (a1), + vreinterpret_p64_u64 (a2)); + uint64_t exp = (a1 == a2) ? 
~0x0 : 0x0; + + if (res != exp) + { + fprintf (stderr, "vceq_p64 (a1= %lx, a2= %lx)" + " returned %lx, expected %lx\n", + a1, a2, res, exp); + abort (); + } + } + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c new file mode 100644 index 0000000..3a0b117 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-require-effective-target arm_neon_hw } */ +/* { dg-add-options arm_crypto } */ + +#include "arm_neon.h" +#include <stdio.h> + +extern void abort (void); + +int +main (void) +{ + uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff, + ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 }; + int i, j; + + for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i) + { + for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j) + { + uint64_t a1 = args[i]; + uint64_t a2 = args[j]; + uint64_t res = vtst_p64 (vreinterpret_p64_u64 (a1), + vreinterpret_p64_u64 (a2)); + uint64_t exp = (a1 & a2) ? ~0x0 : 0x0; + + if (res != exp) + { + fprintf (stderr, "vtst_p64 (a1= %lx, a2= %lx)" + " returned %lx, expected %lx\n", + a1, a2, res, exp); + abort (); + } + } + } + return 0; +}