[PATCH] aarch64: Use RTL builtins for integer mla intrinsics

2021-01-22 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites integer mla Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better
scheduling and optimization.
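
For example, a minimal use of one of the rewritten intrinsics (a usage
sketch, not part of the patch):

  #include <arm_neon.h>

  /* acc + a * b on each byte lane; with the RTL builtin this is expected
     to stay a single MLA that the scheduler can move freely.  */
  int8x8_t
  scale_accumulate (int8x8_t acc, int8x8_t a, int8x8_t b)
  {
    return vmla_s8 (acc, a, b);
  }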

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

If ok, please commit to master (I don't have commit rights.)

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-14  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add mla builtin
generator macro.
* config/aarch64/arm_neon.h (vmla_s8): Use RTL builtin rather
than asm.
(vmla_s16): Likewise.
(vmla_s32): Likewise.
(vmla_u8): Likewise.
(vmla_u16): Likewise.
(vmla_u32): Likewise.
(vmlaq_s8): Likewise.
(vmlaq_s16): Likewise.
(vmlaq_s32): Likewise.
(vmlaq_u8): Likewise.
(vmlaq_u16): Likewise.
(vmlaq_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 73a24d59745ab03fbed213b01eb3134d053295e1..d156f50e5df5568e563f9b175b84062b6575e7e5 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -178,6 +178,9 @@
   /* Implemented by aarch64_xtn.  */
   BUILTIN_VQN (UNOP, xtn, 0, NONE)
 
+  /* Implemented by aarch64_mla.  */
+  BUILTIN_VDQ_BHSI (TERNOP, mla, 0, NONE)
+
   /* Implemented by aarch64_mlsl.  */
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlsl, 0, NONE)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index f7efee61de4c5268acf446555af4a93fece6b169..da696d9fee2ffbabc9d89f2e9299fbde086cfee1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7294,72 +7294,48 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav8qi(__a, __b, __c);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav4hi(__a, __b, __c);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mlav2si(__a, __b, __c);
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint8x8_t __result;
-  __asm__ ("mla %0.8b, %2.8b, %3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint8x8_t) __builtin_aarch64_mlav8qi((int8x8_t) __a,
+   (int8x8_t) __b,
+   (int8x8_t) __c);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h, %2.4h, %3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint16x4_t) __builtin_aarch64_mlav4hi((int16x4_t) __a,
+(int16x4_t) __b,
+(int16x4_t) __c);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s, %2.2s, %3.2s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint32x2_t) __builtin_aarch64_mlav2si((int32x2_t) __a,
+(int32x2_t) __b,
+(int32x2_t) __c);
 }
 
 #define vmlal_high_lane_s16(a, b, c, d) \
@@ -7835,72 +7811,48 @@ __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int8x16_t __result;
-  __asm__ ("mla %0.16b, %2

Re: [PATCH] aarch64: Use RTL builtins for integer mla intrinsics

2021-01-22 Thread Jonathan Wright via Gcc-patches
> GNU style (followed in the header file) is to insert a space between
> the function name and the arguments.  Same for the other functions.

Ah, yes - will change.

> Since other patches like this are on their way, would you mind
> going through the process on https://gcc.gnu.org/gitwrite.html
> to get commit access?  (I'll sponsor.)

Request submitted.

Thanks,
Jonathan

From: Richard Sandiford 
Sent: 22 January 2021 14:56
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Kyrylo Tkachov 

Subject: Re: [PATCH] aarch64: Use RTL builtins for integer mla intrinsics

Thanks for doing this.  The patch looks good with one very minor nit fixed:

Jonathan Wright  writes:
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 
> f7efee61de4c5268acf446555af4a93fece6b169..da696d9fee2ffbabc9d89f2e9299fbde086cfee1
>  100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -7294,72 +7294,48 @@ __extension__ extern __inline int8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
>  {
> -  int8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav8qi(__a, __b, __c);

GNU style (followed in the header file) is to insert a space between
the function name and the arguments.  Same for the other functions.

Since other patches like this are on their way, would you mind
going through the process on https://gcc.gnu.org/gitwrite.html
to get commit access?  (I'll sponsor.)

Once you've got access, the patch is OK to commit with the change above.

A nice follow-on would be to lower the mla intrinsics to IFN_FMA.
See aarch64_general_gimple_fold_builtin, which does something similar
for IFN_REDUC_PLUS etc.
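
A rough sketch of how that lowering might look, reusing the .def macro as
a case label the way the IFN_REDUC_PLUS handling does (the details here
are assumptions, not something from this thread):

  /* Hypothetical follow-on: fold the integer mla builtins to the
     integer-capable IFN_FMA.  mla (acc, x, y) is acc + x * y while
     fma (x, y, acc) is x * y + acc, hence the argument order.  */
  BUILTIN_VDQ_BHSI (TERNOP, mla, 0, NONE)
    new_stmt = gimple_build_call_internal (IFN_FMA, 3,
                                           args[1], args[2], args[0]);
    gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
    break;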

Thanks,
Richard

>  }
>
>  __extension__ extern __inline int16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
>  {
> -  int16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav4hi(__a, __b, __c);
>  }
>
>  __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
>  {
> -  int32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav2si(__a, __b, __c);
>  }
>
>  __extension__ extern __inline uint8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
>  {
> -  uint8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return (uint8x8_t) __builtin_aarch64_mlav8qi((int8x8_t) __a,
> +   (int8x8_t) __b,
> +   (int8x8_t) __c);
>  }
>
>  __extension__ extern __inline uint16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
>  {
> -  uint16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return (uint16x4_t) __builtin_aarch64_mlav4hi((int16x4_t) __a,
> +(int16x4_t) __b,
> +(int16x4_t) __c);
>  }
>
>  __extension__ extern __inline uint32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
>  {
> -  uint32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return (uint32x2_t) __builtin_aarch64_mlav2si((int32x2_t) __a,
> +(int32x2_t) __b,
> +(int32x2_t) __c);
>  }
>
>  #define vmlal_high_lane_s16(a, b, c, d) \
> @@ -7835,72 +7811,48 @@ __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
>  {
> -  int8x16_t __result;
> -  __asm__ ("mla %0.16b, %2.16b, %3.16b"
> -   : "=w"(__result)
> -   : "0"(

[COMMITTED] MAINTAINERS: Add myself for write after approval

2021-01-22 Thread Jonathan Wright via Gcc-patches
ChangeLog:

2021-01-22  Jonathan Wright  

* MAINTAINERS (Write After Approval): Add myself.
From 32a93eac7adbb34bb50ed07a9841c870b7ebcb7a Mon Sep 17 00:00:00 2001
From: Jonathan Wright 
Date: Fri, 22 Jan 2021 19:09:11 +
Subject: [PATCH] MAINTAINERS: Add myself for write after approval

ChangeLog:

2021-01-22  Jonathan Wright  

	* MAINTAINERS (Write After Approval): Add myself.
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1be2692ddc5..f72b649b6ce 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -658,6 +658,7 @@ Kevin Williams	
 Przemyslaw Wirkus
 Carlo Wood	
 Jackson Woodruff
+Jonathan Wright	
 Mingjie Xing	
 Chenghua Xu	
 Canqun Yang	
-- 
2.25.1



[PATCH] aarch64: Use RTL builtins for integer mla_n intrinsics

2021-01-26 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites integer mla_n Neon intrinsics to use RTL 
builtins rather than inline assembly code, allowing for better scheduling 
and optimization.
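
For reference, a minimal use of one of the by-scalar forms covered here
(illustration, not from the patch):

  #include <arm_neon.h>

  /* Each lane of v is multiplied by the scalar s and accumulated into
     acc, i.e. the mla ...[0] by-element form.  */
  int16x4_t
  scale_by_scalar (int16x4_t acc, int16x4_t v, int16_t s)
  {
    return vmla_n_s16 (acc, v, s);
  }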

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-15  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add mla_n builtin
generator macro.
* config/aarch64/aarch64-simd.md (*aarch64_mla_elt_merge):
Rename to...
(aarch64_mla_n): This.
* config/aarch64/arm_neon.h (vmla_n_s16): Use RTL builtin
instead of asm.
(vmla_n_s32): Likewise.
(vmla_n_u16): Likewise.
(vmla_n_u32): Likewise.
(vmlaq_n_s16): Likewise.
(vmlaq_n_s32): Likewise.
(vmlaq_n_u16): Likewise.
(vmlaq_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a233156010481f8e0869a0e6ab3315107696eade..ef83d5eee55a3d3e952a5078abaf0c03b4c3b01c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -180,6 +180,8 @@
 
   /* Implemented by aarch64_mla.  */
   BUILTIN_VDQ_BHSI (TERNOP, mla, 0, NONE)
+  /* Implemented by aarch64_mla_n.  */
+  BUILTIN_VDQHS (TERNOP, mla_n, 0, NONE)
 
   /* Implemented by aarch64_mlsl.  */
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index be2a5a865172bdd7848be4082abb0fdfb0b35937..693a61871051cb5030811e772b21bd0429c0fddb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1384,15 +1384,16 @@
   [(set_attr "type" "neon_mla__scalar")]
 )
 
-(define_insn "*aarch64_mla_elt_merge"
-  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+(define_insn "aarch64_mla_n"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
 	(plus:VDQHS
-	  (mult:VDQHS (vec_duplicate:VDQHS
-		  (match_operand: 1 "register_operand" ""))
-		(match_operand:VDQHS 2 "register_operand" "w"))
-	  (match_operand:VDQHS 3 "register_operand" "0")))]
+	  (mult:VDQHS
+	(vec_duplicate:VDQHS
+	  (match_operand: 3 "register_operand" ""))
+	(match_operand:VDQHS 2 "register_operand" "w"))
+	  (match_operand:VDQHS 1 "register_operand" "0")))]
  "TARGET_SIMD"
- "mla\t%0., %2., %1.[0]"
+ "mla\t%0., %2., %3.[0]"
   [(set_attr "type" "neon_mla__scalar")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 15fb34527c6823a3dcb0be29695c78af770fbdae..ef865625a3da545470549745afea03878a0bdbbc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7246,48 +7246,32 @@ __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mla_nv4hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mla_nv2si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mla %0.4h,%2.4h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint16x4_t) __builtin_aarch64_mla_nv4hi ((int16x4_t) __a,
+   (int16x4_t) __b,
+   (int16_t) __c);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mla %0.2s,%2.2s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint32x2_t) __builtin_aarch64_mla_nv2si ((int32x2_t) __a,
+   (int32x2_t) __b,
+   (int32_t) __c);
 }
 
 __extension__ extern __inline int8x8_t
@@ -7763,48 +7747,32 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mla %0.8h,%2.8

Re: [PATCH] aarch64: Use GCC vector extensions for integer mls intrinsics

2021-01-27 Thread Jonathan Wright via Gcc-patches
I have re-written this to use RTL builtins - regression tested and bootstrapped 
on aarch64-none-linux-gnu with no issues:

aarch64: Use RTL builtins for integer mls intrinsics

Rewrite integer mls Neon intrinsics to use RTL builtins rather than
inline assembly code, allowing for better scheduling and
optimization.

gcc/ChangeLog:

2021-01-11  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add mls builtin
generator macro.
* config/aarch64/arm_neon.h (vmls_s8): Use RTL builtin rather
than asm.
(vmls_s16): Likewise.
(vmls_s32): Likewise.
(vmls_u8): Likewise.
(vmls_u16): Likewise.
(vmls_u32): Likewise.
(vmlsq_s8): Likewise.
(vmlsq_s16): Likewise.
(vmlsq_s32): Likewise.
(vmlsq_u8): Likewise.
(vmlsq_u16): Likewise.
(vmlsq_u32): Likewise.


From: Richard Sandiford 
Sent: 19 January 2021 17:43
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Richard Earnshaw 
; Kyrylo Tkachov 
Subject: Re: [PATCH] aarch64: Use GCC vector extensions for integer mls 
intrinsics

Jonathan Wright  writes:
> Hi,
>
> As subject, this patch rewrites integer mls Neon intrinsics to use
> a - b * c rather than inline assembly code, allowing for better
> scheduling and optimization.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> If ok, please commit to master (I don't have commit rights.)

Thanks for doing this.  The patch looks good from a functional
point of view.  I guess my only concern is that things like:

a = vmla_u8 (vmulq_u8 (b, c), d, e);

would become:

a = b * c + d * e;

and I don't think anything guarantees that the user's original
choice of instruction selection will be preserved.  We might end
up with the equivalent of:

a = vmla_u8 (vmulq_u8 (d, e), b, c);

giving different latencies.
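
A self-contained version of that example, using the 64-bit vmul_u8 so the
operand types match (illustration only):

  #include <arm_neon.h>

  /* After lowering to plain C arithmetic this is just b*c + d*e, so the
     compiler is free to fuse either multiply into the accumulate.  */
  uint8x8_t
  two_products (uint8x8_t b, uint8x8_t c, uint8x8_t d, uint8x8_t e)
  {
    return vmla_u8 (vmul_u8 (b, c), d, e);
  }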

If we added built-in functions instead, we could lower them to
IFN_FMA and IFN_FNMA, which support integers as well as floats,
and which stand a better chance of preserving the original grouping.

There again, the unfused floating-point MLAs already decompose
into separate multiplies and adds (although they can't of course
use IFN_FMA).

Any thoughts on doing it that way instead?

I'm not saying the patch shouldn't go in though, just thought it
was worth asking.

Thanks,
Richard

>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-01-14  Jonathan Wright  
>
> * config/aarch64/arm_neon.h (vmls_s8): Use C rather than asm.
> (vmls_s16): Likewise.
> (vmls_s32): Likewise.
> (vmls_u8): Likewise.
> (vmls_u16): Likewise.
> (vmls_u32): Likewise.
> (vmlsq_s8): Likewise.
> (vmlsq_s16): Likewise.
> (vmlsq_s32): Likewise.
> (vmlsq_u8): Likewise.
> (vmlsq_u16): Likewise.
> (vmlsq_u32): Likewise.
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 
> 608e582d25820062a409310e7f3fc872660f8041..ad04eab1e753aa86f20a8f6cc2717368b1840ef7
>  100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -7968,72 +7968,45 @@ __extension__ extern __inline int8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
>  {
> -  int8x8_t __result;
> -  __asm__ ("mls %0.8b,%2.8b,%3.8b"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  uint8x8_t __result = (uint8x8_t) __a - (uint8x8_t) __b * (uint8x8_t) __c;
> +  return (int8x8_t) __result;
>  }
>
>  __extension__ extern __inline int16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
>  {
> -  int16x4_t __result;
> -  __asm__ ("mls %0.4h,%2.4h,%3.4h"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  uint16x4_t __result = (uint16x4_t) __a - (uint16x4_t) __b * (uint16x4_t) 
> __c;
> +  return (int16x4_t) __result;
>  }
>
>  __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
>  {
> -  int32x2_t __result;
> -  __asm__ ("mls %0.2s,%2.2s,%3.2s"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  uint32x2_t __result = (uint32x2_t) __a - (uint32x2_t) __b * (uint32x2_t) 
> __c;
> +  return (int32x2_t) __result;
>  }
>
>  __extension__ extern __inline uint8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
>  {
> -  uint8x8_t __result;
> -  __asm__ ("mls %0.8b,%2.8b,%3.8b"
> -   : "=w"(__result)
> -   : "0"(__a), "w"(__b), "w"(__c)
> -   : /* No clobbers */);
> -  return __result;
> +  return __a - __b * 

aarch64: Use RTL builtins for integer mls_n intrinsics

2021-01-27 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites integer mls_n Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
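
A small usage sketch for the by-scalar multiply-subtract form (not part
of the patch):

  #include <arm_neon.h>

  /* acc minus v scaled by s in each lane; the mls ...[0] by-element
     form.  */
  int16x4_t
  subtract_scaled (int16x4_t acc, int16x4_t v, int16_t s)
  {
    return vmls_n_s16 (acc, v, s);
  }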

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-15  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add mls_n builtin
generator macro.
* config/aarch64/aarch64-simd.md (*aarch64_mls_elt_merge):
Rename to...
(aarch64_mls_n): This.
* config/aarch64/arm_neon.h (vmls_n_s16): Use RTL builtin
instead of asm.
(vmls_n_s32): Likewise.
(vmls_n_u16): Likewise.
(vmls_n_u32): Likewise.
(vmlsq_n_s16): Likewise.
(vmlsq_n_s32): Likewise.
(vmlsq_n_u16): Likewise.
(vmlsq_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 93a087987bb7f039b2f85a6e1d2e05eb95fa0058..32aee6024a89e6ca1f423717463fe67d011afd8b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -185,6 +185,8 @@
 
   /* Implemented by aarch64_mls.  */
   BUILTIN_VDQ_BHSI (TERNOP, mls, 0, NONE)
+  /* Implemented by aarch64_mls_n.  */
+  BUILTIN_VDQHS (TERNOP, mls_n, 0, NONE)
 
   /* Implemented by aarch64_mlsl.  */
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 693a61871051cb5030811e772b21bd0429c0fddb..544bac7dc9b62a9d5387465ec26d0e3204be6601 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1443,15 +1443,16 @@
   [(set_attr "type" "neon_mla__scalar")]
 )
 
-(define_insn "*aarch64_mls_elt_merge"
+(define_insn "aarch64_mls_n"
   [(set (match_operand:VDQHS 0 "register_operand" "=w")
 	(minus:VDQHS
 	  (match_operand:VDQHS 1 "register_operand" "0")
-	  (mult:VDQHS (vec_duplicate:VDQHS
-		  (match_operand: 2 "register_operand" ""))
-		(match_operand:VDQHS 3 "register_operand" "w"]
+	  (mult:VDQHS
+	(vec_duplicate:VDQHS
+	  (match_operand: 3 "register_operand" ""))
+	(match_operand:VDQHS 2 "register_operand" "w"]
   "TARGET_SIMD"
-  "mls\t%0., %3., %2.[0]"
+  "mls\t%0., %2., %3.[0]"
   [(set_attr "type" "neon_mla__scalar")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 45b3c125babae2e3d32d6cd3b36ce09c502c04d8..d891067f021a0bcc24af79dfbe2d9dd5889b23bc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7840,48 +7840,32 @@ __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
 {
-  int16x4_t __result;
-  __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mls_nv4hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
 {
-  int32x2_t __result;
-  __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_mls_nv2si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
 {
-  uint16x4_t __result;
-  __asm__ ("mls %0.4h, %2.4h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint16x4_t) __builtin_aarch64_mls_nv4hi ((int16x4_t) __a,
+   (int16x4_t) __b,
+   (int16_t) __c);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
 {
-  uint32x2_t __result;
-  __asm__ ("mls %0.2s, %2.2s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return (uint32x2_t) __builtin_aarch64_mls_nv2si ((int32x2_t) __a,
+   (int32x2_t) __b,
+   (int32_t) __c);
 }
 
 __extension__ extern __inline int8x8_t
@@ -8353,48 +8337,32 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("mls %0.8h, %2.8h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-  

[PATCH] aarch64: Use GCC vector extensions for FP ml[as]_n intrinsics

2021-01-27 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites floating-point mla_n/mls_n intrinsics to use
a + b * c / a - b * c rather than inline assembly code, allowing for better
scheduling and optimization.
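
For reference, what one of these intrinsics computes once written as plain
C arithmetic (illustration, not from the patch):

  #include <arm_neon.h>

  /* acc + v * s per lane; the separate fmul and fadd are now visible to
     the optimizers instead of being hidden in one asm block.  */
  float32x2_t
  fp_scale_acc (float32x2_t acc, float32x2_t v, float32_t s)
  {
    return vmla_n_f32 (acc, v, s);
  }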

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-18  Jonathan Wright  

* config/aarch64/arm_neon.h (vmla_n_f32): Use C rather than asm.
(vmlaq_n_f32): Likewise.
(vmls_n_f32): Likewise.
(vmlsq_n_f32): Likewise.
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d891067f021a0bcc24af79dfbe2d9dd5889b23bc..d1ab3b7d54cd5b965f91e685139677864fcfe3e1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7233,13 +7233,7 @@ __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
 {
-  float32x2_t __result;
-  float32x2_t __t1;
-  __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s"
-   : "=w"(__result), "=w"(__t1)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x4_t
@@ -7734,13 +7728,7 @@ __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
-  float32x4_t __result;
-  float32x4_t __t1;
-  __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s"
-   : "=w"(__result), "=w"(__t1)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a + __b * __c;
 }
 
 __extension__ extern __inline int16x8_t
@@ -7827,13 +7815,7 @@ __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
 {
-  float32x2_t __result;
-  float32x2_t __t1;
-  __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s"
-   : "=w"(__result), "=w"(__t1)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a - __b * __c;
 }
 
 __extension__ extern __inline int16x4_t
@@ -8324,13 +8306,7 @@ __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
 {
-  float32x4_t __result;
-  float32x4_t __t1;
-  __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s"
-   : "=w"(__result), "=w"(__t1)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __a - __b * __c;
 }
 
 __extension__ extern __inline int16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mlal intrinsics

2021-01-27 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlal Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better
scheduling and optimization.
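
A short usage sketch of the widening multiply-accumulate form
(illustration, not part of the patch):

  #include <arm_neon.h>

  /* acc + widen (a) * widen (b) per lane, i.e. a single SMLAL.  */
  int16x8_t
  widening_acc (int16x8_t acc, int8x8_t a, int8x8_t b)
  {
    return vmlal_s8 (acc, a, b);
  }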

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-26  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlal
builtin generator macros.
* config/aarch64/aarch64-simd.md (*aarch64_mlal):
Rename to...
(aarch64_mlal): This.
* config/aarch64/arm_neon.h (vmlal_s8): Use RTL builtin
instead of inline asm.
(vmlal_s16): Likewise.
(vmlal_s32): Likewise.
(vmlal_u8): Likewise.
(vmlal_u16): Likewise.
(vmlal_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 32aee6024a89e6ca1f423717463fe67d011afd8b..a71ae4d724136c8b626d397bf6187e8b595a2b8a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -192,6 +192,10 @@
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlsl, 0, NONE)
 
+  /* Implemented by aarch64_mlal.  */
+  BUILTIN_VD_BHSI (TERNOP, smlal, 0, NONE)
+  BUILTIN_VD_BHSI (TERNOPU, umlal, 0, NONE)
+
   /* Implemented by aarch64_mlsl_hi.  */
   BUILTIN_VQW (TERNOP, smlsl_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlsl_hi, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 544bac7dc9b62a9d5387465ec26d0e3204be6601..db56b61baf2093c88d8757b25580b3032f00a355 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1825,17 +1825,17 @@
 }
 )
 
-(define_insn "*aarch64_mlal"
+(define_insn "aarch64_mlal"
   [(set (match_operand: 0 "register_operand" "=w")
 (plus:
   (mult:
 (ANY_EXTEND:
-  (match_operand:VD_BHSI 1 "register_operand" "w"))
+  (match_operand:VD_BHSI 2 "register_operand" "w"))
 (ANY_EXTEND:
-  (match_operand:VD_BHSI 2 "register_operand" "w")))
-  (match_operand: 3 "register_operand" "0")))]
+  (match_operand:VD_BHSI 3 "register_operand" "w")))
+  (match_operand: 1 "register_operand" "0")))]
   "TARGET_SIMD"
-  "mlal\t%0., %1., %2."
+  "mlal\t%0., %2., %3."
   [(set_attr "type" "neon_mla__long")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d1ab3b7d54cd5b965f91e685139677864fcfe3e1..674ccc63b69ca1945dc684d2b06c1e31f52bfdb3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7656,72 +7656,42 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("smlal %0.8h,%2.8b,%3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlalv8qi (__a, __b, __c);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlal %0.4s,%2.4h,%3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlalv4hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlal %0.2d,%2.2s,%3.2s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlalv2si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
 {
-  uint16x8_t __result;
-  __asm__ ("umlal %0.8h,%2.8b,%3.8b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlalv8qi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlal %0.4s,%2.4h,%3.4h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlalv4hi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
 {
-  uint64x2_t __result;

aarch64: Use RTL builtins for [su]mlal_n intrinsics

2021-01-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlal_n Neon intrinsics to use RTL builtins
rather than inline assembly code, allowing for better scheduling and
optimization.
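
For reference, the by-scalar widening accumulate form (usage sketch, not
from the patch):

  #include <arm_neon.h>

  /* acc + widen (v) * s per lane, i.e. smlal ...[0].  */
  int32x4_t
  widening_scale_acc (int32x4_t acc, int16x4_t v, int16_t s)
  {
    return vmlal_n_s16 (acc, v, s);
  }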

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-26  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlal_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_mlal_n):
Define.
* config/aarch64/arm_neon.h (vmlal_n_s16): Use RTL builtin
instead of inline asm.
(vmlal_n_s32): Likewise.
(vmlal_n_u16): Likewise.
(vmlal_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a71ae4d724136c8b626d397bf6187e8b595a2b8a..4f8e28dc3c8478ea50aad333b21bd83f4a4b750e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -196,6 +196,10 @@
   BUILTIN_VD_BHSI (TERNOP, smlal, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlal, 0, NONE)
 
+  /* Implemented by aarch64_mlal_n.  */
+  BUILTIN_VD_HSI (TERNOP, smlal_n, 0, NONE)
+  BUILTIN_VD_HSI (TERNOPU, umlal_n, 0, NONE)
+
   /* Implemented by aarch64_mlsl_hi.  */
   BUILTIN_VQW (TERNOP, smlsl_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlsl_hi, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index db56b61baf2093c88d8757b25580b3032f00a355..d78f26be19a16163eb1b8f661c6100ac290e6c6b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1839,6 +1839,21 @@
   [(set_attr "type" "neon_mla__long")]
 )
 
+(define_insn "aarch64_mlal_n"
+  [(set (match_operand: 0 "register_operand" "=w")
+(plus:
+  (mult:
+(ANY_EXTEND:
+  (vec_duplicate:VD_HSI
+	  (match_operand: 3 "register_operand" "")))
+(ANY_EXTEND:
+  (match_operand:VD_HSI 2 "register_operand" "w")))
+  (match_operand: 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "mlal\t%0., %2., %3.[0]"
+  [(set_attr "type" "neon_mla__long")]
+)
+
 (define_insn "aarch64_mlsl"
   [(set (match_operand: 0 "register_operand" "=w")
 (minus:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 674ccc63b69ca1945dc684d2b06c1e31f52bfdb3..004c73d9e0ec4c33e24968d17e4307f858b51263 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7608,48 +7608,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlal %0.4s,%2.4h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_nv4hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlal %0.2d,%2.2s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_nv2si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlal %0.4s,%2.4h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlal_nv4hi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
 {
-  uint64x2_t __result;
-  __asm__ ("umlal %0.2d,%2.2s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlal_nv2si_ (__a, __b, __c);
 }
 
 __extension__ extern __inline int16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mlsl_n intrinsics

2021-01-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlsl_n Neon intrinsics to use RTL builtins
rather than inline assembly code, allowing for better scheduling and
optimization.
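
For reference, the by-scalar widening subtract form (usage sketch, not
from the patch):

  #include <arm_neon.h>

  /* acc - widen (v) * s per lane, i.e. smlsl ...[0].  */
  int32x4_t
  widening_scale_sub (int32x4_t acc, int16x4_t v, int16_t s)
  {
    return vmlsl_n_s16 (acc, v, s);
  }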

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-27  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlsl_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_mlsl_n):
Define.
* config/aarch64/arm_neon.h (vmlsl_n_s16): Use RTL builtin
instead of inline asm.
(vmlsl_n_s32): Likewise.
(vmlsl_n_u16): Likewise.
(vmlsl_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4f8e28dc3c8478ea50aad333b21bd83f4a4b750e..2b582bee9133039b05b4fdbef92766a30caeab20 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -192,6 +192,10 @@
   BUILTIN_VD_BHSI (TERNOP, smlsl, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlsl, 0, NONE)
 
+  /* Implemented by aarch64_mlsl_n.  */
+  BUILTIN_VD_HSI (TERNOP, smlsl_n, 0, NONE)
+  BUILTIN_VD_HSI (TERNOPU, umlsl_n, 0, NONE)
+
   /* Implemented by aarch64_mlal.  */
   BUILTIN_VD_BHSI (TERNOP, smlal, 0, NONE)
   BUILTIN_VD_BHSI (TERNOPU, umlal, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d78f26be19a16163eb1b8f661c6100ac290e6c6b..f2539cf84e30032ed609c12de7530d3e9be77b60 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1868,6 +1868,21 @@
   [(set_attr "type" "neon_mla__long")]
 )
 
+(define_insn "aarch64_mlsl_n"
+  [(set (match_operand: 0 "register_operand" "=w")
+(minus:
+  (match_operand: 1 "register_operand" "0")
+  (mult:
+(ANY_EXTEND:
+  (vec_duplicate:VD_HSI
+	  (match_operand: 3 "register_operand" "")))
+(ANY_EXTEND:
+  (match_operand:VD_HSI 2 "register_operand" "w")]
+  "TARGET_SIMD"
+  "mlsl\t%0., %2., %3.[0]"
+  [(set_attr "type" "neon_mla__long")]
+)
+
 (define_insn "aarch64_simd_vec_mult_lo_"
  [(set (match_operand: 0 "register_operand" "=w")
(mult: (ANY_EXTEND: (vec_select:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 004c73d9e0ec4c33e24968d17e4307f858b51263..95c5e36530f1a3b72672f62737ced45704323fff 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8166,48 +8166,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlsl_nv4hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlsl_nv2si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlsl_nv4hi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
 {
-  uint64x2_t __result;
-  __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlsl_nv2si_ (__a, __b, __c);
 }
 
 __extension__ extern __inline int16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mlsl_lane[q] intrinsics

2021-01-29 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlsl_lane[q] Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
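
For reference, the by-lane widening subtract form (usage sketch, not from
the patch):

  #include <arm_neon.h>

  /* acc - widen (b) * v[1] per lane; the lane index must be a constant.  */
  int32x4_t
  widening_lane_sub (int32x4_t acc, int16x4_t b, int16x4_t v)
  {
    return vmlsl_lane_s16 (acc, b, v, 1);
  }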

Regression tested and bootstrapped on aarch64-none-linux-gnu and 
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

gcc/ChangeLog:

2021-01-28  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlsl_lane[q]
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_vec_mlsl_lane):
Define.
* config/aarch64/arm_neon.h (vmlsl_lane_s16): Use RTL builtin
instead of inline asm.
(vmlsl_lane_s32): Likewise.
(vmlsl_lane_u16): Likewise.
(vmlsl_lane_u32): Likewise.
(vmlsl_laneq_s16): Likewise.
(vmlsl_laneq_s32): Likewise.
(vmlsl_laneq_u16): Likewise.
(vmlsl_laneq_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index cb79c08ba66df817e289d891b206ea7f66c81527..4913231ea55260fea1c7511a28a436e1e1e2ab20 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -264,6 +264,11 @@
   BUILTIN_VD_HSI (TERNOPU_LANE, vec_umult_laneq_, 0, ALL)
   BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlal_laneq_, 0, ALL)
 
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlsl_lane_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOP_LANE, vec_smlsl_laneq_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlsl_lane_, 0, NONE)
+  BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlsl_laneq_, 0, NONE)
+
   BUILTIN_VSD_HSI (BINOP, sqdmull, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 919d0b03998d893232331d6f4da5c93ae6bf41b8..adeec028d49f06156a5e84ce4dd83dbd6f151474 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2082,6 +2082,26 @@
   [(set_attr "type" "neon_mla__scalar_long")]
 )
 
+(define_insn "aarch64_vec_mlsl_lane"
+  [(set (match_operand: 0 "register_operand" "=w")
+   (minus:
+ (match_operand: 1 "register_operand" "0")
+ (mult:
+   (ANY_EXTEND:
+	 (match_operand: 2 "register_operand" "w"))
+   (ANY_EXTEND:
+	 (vec_duplicate:
+	   (vec_select:
+	 (match_operand:VDQHS 3 "register_operand" "")
+	 (parallel [(match_operand:SI 4 "immediate_operand" "i")])))]
+  "TARGET_SIMD"
+  {
+operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4]));
+return "mlsl\\t%0., %2., %3.[%4]";
+  }
+  [(set_attr "type" "neon_mla__scalar_long")]
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index b56ab68aad57afb97447c9f5d24f392f6e2b618b..2a71ca9aa3c8c4095e99aa08c48e583f037a41ed 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8068,117 +8068,65 @@ vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c)
   return __builtin_aarch64_umlsl_hiv4si_ (__a, __b, __c);
 }
 
-#define vmlsl_lane_s16(a, b, c, d)  \
-  __extension__ \
-({  \
-   int16x4_t c_ = (c);  \
-   int16x4_t b_ = (b);  \
-   int32x4_t a_ = (a);  \
-   int32x4_t result;\
-   __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]"  \
-: "=w"(result)  \
-: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
-: /* No clobbers */);   \
-   result;  \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __v, const int __lane)
+{
+  return __builtin_aarch64_vec_smlsl_lane_v4hi (__a, __b, __v, __lane);
+}
 
-#define vmlsl_lane_s32(a, b, c, d)  \
-  __extension__ \
-({  \
-   int32x2_t c_ = (c);  \
-   int32x2_t b_ = (b);  \
-   int64x2_t a_ = (a);  \
-   int64x2_t result;\
-   __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]"   

[PATCH] aarch64: Use RTL builtins for [su]mull_n intrinsics

2021-01-29 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mull_n Neon intrinsics to use RTL builtins
rather than inline assembly code, allowing for better scheduling and
optimization.
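
For reference, the by-scalar widening multiply form (usage sketch, not
from the patch):

  #include <arm_neon.h>

  /* widen (v) * s per lane, i.e. smull ...[0].  */
  int32x4_t
  widening_mul (int16x4_t v, int16_t s)
  {
    return vmull_n_s16 (v, s);
  }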

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

gcc/ChangeLog:

2021-01-19  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mull_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_mull_n):
Define.
* config/aarch64/arm_neon.h (vmull_n_s16): Use RTL builtin
instead of inline asm.
(vmull_n_s32): Likewise.
(vmull_n_u16): Likewise.
(vmull_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4913231ea55260fea1c7511a28a436e1e1e2ab20..198aa7e85423c8f5fd7abbdbaae6ce1fc6d9c37f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -255,6 +255,9 @@
   BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10, NONE)
   BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10, NONE)
 
+  BUILTIN_VD_HSI (BINOP, smull_n, 0, NONE)
+  BUILTIN_VD_HSI (BINOPU, umull_n, 0, NONE)
+
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, ALL)
   BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, ALL)
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, ALL)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index adeec028d49f06156a5e84ce4dd83dbd6f151474..912b94bcfd731fdab9a813bf1a089d025fbd4a89 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2061,6 +2061,19 @@
   [(set_attr "type" "neon_mul__scalar_long")]
 )
 
+(define_insn "aarch64_mull_n"
+  [(set (match_operand: 0 "register_operand" "=w")
+(mult:
+  (ANY_EXTEND:
+(vec_duplicate:
+	  (match_operand: 2 "register_operand" "")))
+  (ANY_EXTEND:
+(match_operand:VD_HSI 1 "register_operand" "w"]
+  "TARGET_SIMD"
+  "mull\t%0., %1., %2.[0]"
+  [(set_attr "type" "neon_mul__scalar_long")]
+)
+
 ;; vmlal_lane_s16 intrinsics
 (define_insn "aarch64_vec_mlal_lane"
   [(set (match_operand: 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2a71ca9aa3c8c4095e99aa08c48e583f037a41ed..57959b6b0e22d44048e735e92ed7f578ec4153ea 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8799,48 +8799,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_s16 (int16x4_t __a, int16_t __b)
 {
-  int32x4_t __result;
-  __asm__ ("smull %0.4s,%1.4h,%2.h[0]"
-   : "=w"(__result)
-   : "w"(__a), "x"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smull_nv4hi (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_s32 (int32x2_t __a, int32_t __b)
 {
-  int64x2_t __result;
-  __asm__ ("smull %0.2d,%1.2s,%2.s[0]"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smull_nv2si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_u16 (uint16x4_t __a, uint16_t __b)
 {
-  uint32x4_t __result;
-  __asm__ ("umull %0.4s,%1.4h,%2.h[0]"
-   : "=w"(__result)
-   : "w"(__a), "x"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umull_nv4hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_n_u32 (uint32x2_t __a, uint32_t __b)
 {
-  uint64x2_t __result;
-  __asm__ ("umull %0.2d,%1.2s,%2.s[0]"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umull_nv2si_uuu (__a, __b);
 }
 
 __extension__ extern __inline poly16x8_t


[PATCH] testsuite: aarch64: Add tests for vmull_high intrinsics

2021-02-01 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for vmull_high_* Neon intrinsics. Since
these intrinsics are only supported for AArch64, these tests are
restricted to only run on AArch64 targets.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-01-29  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vmull_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_laneq.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c:
New test.


Re: [PATCH] testsuite: aarch64: Add tests for vmull_high intrinsics

2021-02-01 Thread Jonathan Wright via Gcc-patches
Woops, didn't attach the diff. Here we go.

Thanks,
Jonathan

From: Jonathan Wright
Sent: 01 February 2021 11:42
To: gcc-patches@gcc.gnu.org 
Cc: Kyrylo Tkachov 
Subject: [PATCH] testsuite: aarch64: Add tests for vmull_high intrinsics

Hi,

As subject, this patch adds tests for vmull_high_* Neon intrinsics. Since
these intrinsics are only supported for AArch64, these tests are
restricted to only run on AArch64 targets.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-01-29  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vmull_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_laneq.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
new file mode 100644
index ..36094fce24f364f6a314f66ae153a211b2a75dff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c
@@ -0,0 +1,78 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x40, 0x31,0x24, 0x19,
+	   0x10, 0x9, 0x4, 0x1 };
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x90, 0x79, 0x64, 0x51 };
+VECT_VAR_DECL(expected, int, 64, 2) [] = { 0xc4, 0xa9 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0xf040, 0xf231, 0xf424, 0xf619,
+	0xf810, 0xfa09, 0xfc04, 0xfe01 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0xffe80090, 0xffea0079,
+	0xffec0064, 0xffee0051 };
+VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0xffe400c4,
+	0xffe600a9 };
+VECT_VAR_DECL(expected, poly, 16, 8) [] = { 0x5540, 0x5541, 0x5544, 0x5545,
+	0x5550, 0x5551, 0x5554, 0x };
+
+#define TEST_MSG "VMULL_HIGH"
+void exec_vmull_high (void)
+{
+  /* Basic test: y = vmull_high(x, x), then store the result.  */
+#define TEST_VMULL_HIGH(T1, T2, W1, W2, N1, N2) \
+  VECT_VAR(vector_res, T1, W2, N1) =	 \
+vmull_high_##T2##W1(VECT_VAR(vector, T1, W1, N2),			 \
+			VECT_VAR(vector, T1, W1, N2));			 \
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), \
+		 VECT_VAR(vector_res, T1, W2, N1))
+
+  DECL_VARIABLE(vector, int, 8, 16);
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, uint, 8, 16);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector, poly, 8, 16);
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+  DECL_VARIABLE(vector_res, poly, 16, 8);
+
+  clean_results ();
+
+  VLOAD(vector, buffer, q, int, s, 8, 16);
+  VLOAD(vector, buffer, q, int, s, 16, 8);
+  VLOAD(vector, buffer, q, int, s, 32, 4);
+  VLOAD(vector, buffer, q, uint, u, 8, 16);
+  VLOAD(vector, buffer, q, uint, u, 16, 8);
+  VLOAD(vector, buffer, q, uint, u, 32, 4);
+  VLOAD(vector, buffer, q, poly, p, 8, 16);
+
+  TEST_VMULL_HIGH(int, s, 8, 16, 8, 16);
+  TEST_VMULL_HIGH(int, s, 16, 32, 4, 8);
+  TEST_VMULL_HIGH(int, s, 32, 64, 2, 4);
+  TEST_VMULL_HIGH(uint, u, 8, 16, 8, 16);
+  TEST_VMULL_HIGH(uint, u, 16, 32, 4, 8);
+  TEST_VMULL_HIGH(uint, u, 32, 64, 2, 4);
+  TEST_VMULL_HIGH(poly, p, 8, 16, 8, 16);
+
+  CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
+  CHECK_POLY(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+}
+
+int main (void)
+{
+  exec_vmull_high ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
new file mode 100644
index ..30bc954cd18f9f9f72f985aba8745fc1808dbbf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c
@@ -0,0 +1,69 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 };
+
+#define TEST_MSG "VMULL_HIGH_LANE"
+void exec_vmull_high_lane (void)
+{

[PATCH] testsuite: aarch64: Add tests for vmlXl_high intrinsics

2021-02-01 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for vmlal_high_* and vmlsl_high_*
Neon intrinsics. Since these intrinsics are only supported for AArch64,
these tests are restricted to only run on AArch64 targets.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-01-31  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vmlXl_high.inc:
New test template.
* gcc.target/aarch64/advsimd-intrinsics/vmlXl_high_lane.inc:
New test template.
* gcc.target/aarch64/advsimd-intrinsics/vmlXl_high_laneq.inc:
New test template.
* gcc.target/aarch64/advsimd-intrinsics/vmlXl_high_n.inc:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlal_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlal_high_lane.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlal_high_laneq.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlal_high_n.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlsl_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlsl_high_lane.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlsl_high_laneq.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vmlsl_high_n.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmlXl_high.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmlXl_high.inc
new file mode 100644
index ..7c9ee26b142669c48d27aca6bd11988e948cf52d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmlXl_high.inc
@@ -0,0 +1,89 @@
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+  /* vector_res = OP(vector, vector3, vector4),
+ then store the result.  */
+#define TEST_VMLXL_HIGH1(INSN, T1, T2, W1, W2, N1, N2)			   \
+  VECT_VAR(vector_res, T1, W1, N1) =	   \
+INSN##_##T2##W2(VECT_VAR(vector, T1, W1, N1),			   \
+VECT_VAR(vector3, T1, W2, N2),			   \
+VECT_VAR(vector4, T1, W2, N2));			   \
+  vst1q_##T2##W1(VECT_VAR(result, T1, W1, N1), VECT_VAR(vector_res, T1, W1, N1))
+
+#define TEST_VMLXL_HIGH(INSN, T1, T2, W1, W2, N1, N2)			   \
+  TEST_VMLXL_HIGH1(INSN, T1, T2, W1, W2, N1, N2)
+
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector3, int, 8, 16);
+  DECL_VARIABLE(vector4, int, 8, 16);
+  DECL_VARIABLE(vector_res, int, 16, 8);
+
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector3, int, 16, 8);
+  DECL_VARIABLE(vector4, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+
+  DECL_VARIABLE(vector, int, 64, 2);
+  DECL_VARIABLE(vector3, int, 32, 4);
+  DECL_VARIABLE(vector4, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector3, uint, 8, 16);
+  DECL_VARIABLE(vector4, uint, 8, 16);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector3, uint, 16, 8);
+  DECL_VARIABLE(vector4, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+
+  DECL_VARIABLE(vector, uint, 64, 2);
+  DECL_VARIABLE(vector3, uint, 32, 4);
+  DECL_VARIABLE(vector4, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+
+  clean_results ();
+
+  VLOAD(vector, buffer, q, int, s, 16, 8);
+  VLOAD(vector, buffer, q, int, s, 32, 4);
+  VLOAD(vector, buffer, q, int, s, 64, 2);
+  VLOAD(vector, buffer, q, uint, u, 16, 8);
+  VLOAD(vector, buffer, q, uint, u, 32, 4);
+  VLOAD(vector, buffer, q, uint, u, 64, 2);
+
+  VDUP(vector3, q, int, s, 8, 16, 0x55);
+  VDUP(vector4, q, int, s, 8, 16, 0xBB);
+  VDUP(vector3, q, int, s, 16, 8, 0x55);
+  VDUP(vector4, q, int, s, 16, 8, 0xBB);
+  VDUP(vector3, q, int, s, 32, 4, 0x55);
+  VDUP(vector4, q, int, s, 32, 4, 0xBB);
+  VDUP(vector3, q, uint, u, 8, 16, 0x55);
+  VDUP(vector4, q, uint, u, 8, 16, 0xBB);
+  VDUP(vector3, q, uint, u, 16, 8, 0x55);
+  VDUP(vector4, q, uint, u, 16, 8, 0xBB);
+  VDUP(vector3, q, uint, u, 32, 4, 0x55);
+  VDUP(vector4, q, uint, u, 32, 4, 0xBB);
+
+  TEST_VMLXL_HIGH(INSN_NAME, int, s, 16, 8, 8, 16);
+  TEST_VMLXL_HIGH(INSN_NAME, int, s, 32, 16, 4, 8);
+  TEST_VMLXL_HIGH(INSN_NAME, int, s, 64, 32, 2, 4);
+  TEST_VMLXL_HIGH(INSN_NAME, uint, u, 16, 8, 8, 16);
+  TEST_VMLXL_HIGH(INSN_NAME, uint, u, 32, 16, 4, 8);
+  TEST_VMLXL_HIGH(INSN_NAME, uint, u, 64, 32, 2, 4);
+
+  CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmlXl_high_lane.inc b/gcc/testsuite/gcc.ta

[PATCH] aarch64: Use RTL builtins for [su]mlal_high intrinsics

2021-02-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlal_high Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
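
As a rough illustration of the benefit (a hypothetical example, not taken from the patch): once the operation is expressed as an RTL builtin instead of an asm block, both accumulations in a sequence like the one below are visible to the RTL passes and can be scheduled and register-allocated freely.

#include <arm_neon.h>

/* Two chained widening multiply-accumulates; with the builtin-based
   implementation the compiler sees both operations rather than two
   opaque asm statements.  */
uint16x8_t
double_mlal_high (uint16x8_t acc, uint8x16_t b, uint8x16_t c,
		  uint8x16_t d, uint8x16_t e)
{
  acc = vmlal_high_u8 (acc, b, c);
  acc = vmlal_high_u8 (acc, d, e);
  return acc;
}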

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-27  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add RTL builtin
generator macros.
* config/aarch64/aarch64-simd.md (*aarch64_mlal_hi):
Rename to...
(aarch64_mlal_hi_insn): This.
(aarch64_mlal_hi): Define.
* config/aarch64/arm_neon.h (vmlal_high_s8): Use RTL builtin
instead of inline asm.
(vmlal_high_s16): Likewise.
(vmlal_high_s32): Likewise.
(vmlal_high_u8): Likewise.
(vmlal_high_u16): Likewise.
(vmlal_high_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b82b6431d6f2a8d7d21023da589f3eecec7f0d65..2d91a0768d66fb8570ce518c06faae28c0ffcf27 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -230,6 +230,10 @@
   BUILTIN_VQW (TERNOP, smlsl_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlsl_hi, 0, NONE)
 
+  /* Implemented by aarch64_mlal_hi.  */
+  BUILTIN_VQW (TERNOP, smlal_hi, 0, NONE)
+  BUILTIN_VQW (TERNOPU, umlal_hi, 0, NONE)
+
   BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE)
   /* Implemented by aarch64_qmovn.  */
   BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d1858663a4e78c0861d902b37e93c0b00d75e661..ff5037fb44ebb4d1d37ab838de6391e105e90bbf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1869,7 +1869,7 @@
   [(set_attr "type" "neon_mla__long")]
 )
 
-(define_insn "*aarch64_mlal_hi"
+(define_insn "aarch64_mlal_hi_insn"
   [(set (match_operand: 0 "register_operand" "=w")
 (plus:
   (mult:
@@ -1885,6 +1885,20 @@
   [(set_attr "type" "neon_mla__long")]
 )
 
+(define_expand "aarch64_mlal_hi"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQW 2 "register_operand"))
+   (match_operand:VQW 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlal_hi_insn (operands[0], operands[1],
+		 operands[2], p, operands[3]));
+  DONE;
+}
+)
+
 (define_insn "*aarch64_mlsl_lo"
   [(set (match_operand: 0 "register_operand" "=w")
 (minus:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ad0dfef80f39c1baf1e8c7c1bb95f325eff6ac7a..53aae934c37eadc179bb1d4e7fe033d06364628a 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7346,72 +7346,42 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c)
 {
-  int16x8_t __result;
-  __asm__ ("smlal2 %0.8h,%2.16b,%3.16b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_hiv16qi (__a, __b, __c);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlal2 %0.4s,%2.8h,%3.8h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_hiv8hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlal2 %0.2d,%2.4s,%3.4s"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_hiv4si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c)
 {
-  uint16x8_t __result;
-  __asm__ ("umlal2 %0.8h,%2.16b,%3.16b"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlal_hiv16qi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlal2 %0.4s,%2.8h,%3.8h"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uml

[PATCH] aarch64: Use RTL builtins for [su]mlal_high_n intrinsics

2021-02-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlal_high_n Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling and
optimization.
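
For reference, the _n forms broadcast the scalar operand to every lane before the widening multiply-accumulate; a hypothetical, illustrative sketch:

#include <arm_neon.h>

/* vmlal_high_n_s16 (a, b, c) behaves like
   vmlal_high_s16 (a, b, vdupq_n_s16 (c)).  */
int32x4_t
mlal_high_n_example (int32x4_t a, int16x8_t b, int16_t c)
{
  return vmlal_high_n_s16 (a, b, c);
}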

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-27  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlal_hi_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_mlal_hi_n_insn):
Define.
(aarch64_mlal_hi_n): Define.
* config/aarch64/arm_neon.h (vmlal_high_n_s16): Use RTL builtin
instead of inline asm.
(vmlal_high_n_s32): Likewise.
(vmlal_high_n_u16): Likewise.
(vmlal_high_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2d91a0768d66fb8570ce518c06faae28c0ffcf27..c102289c26123ae913df87d327237647d2621655 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -234,6 +234,10 @@
   BUILTIN_VQW (TERNOP, smlal_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlal_hi, 0, NONE)
 
+  /* Implemented by aarch64_mlal_hi_n.  */
+  BUILTIN_VQ_HSI (TERNOP, smlal_hi_n, 0, NONE)
+  BUILTIN_VQ_HSI (TERNOPU, umlal_hi_n, 0, NONE)
+
   BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE)
   /* Implemented by aarch64_qmovn.  */
   BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ff5037fb44ebb4d1d37ab838de6391e105e90bbf..a883f6ad4de8bb6d0c5f6478df5c516c159df4bb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1899,6 +1899,35 @@
 }
 )
 
+(define_insn "aarch64_mlal_hi_n_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+(plus:
+  (mult:
+  (ANY_EXTEND: (vec_select:
+ (match_operand:VQ_HSI 2 "register_operand" "w")
+ (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+  (ANY_EXTEND: (vec_duplicate:
+	   (match_operand: 4 "register_operand" ""
+  (match_operand: 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "mlal2\t%0., %2., %4.[0]"
+  [(set_attr "type" "neon_mla__long")]
+)
+
+(define_expand "aarch64_mlal_hi_n"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlal_hi_n_insn (operands[0],
+ operands[1], operands[2], p, operands[3]));
+  DONE;
+}
+)
+
 (define_insn "*aarch64_mlsl_lo"
   [(set (match_operand: 0 "register_operand" "=w")
 (minus:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 53aae934c37eadc179bb1d4e7fe033d06364628a..ae8526d5972067c05265a1f0bcf9fde5e347fb3b 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7298,48 +7298,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_hi_nv8hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlal_hi_nv4si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlal_hi_nv8hi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlal_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c)
 {
-  uint64x2_t __result;
-  __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlal_hi_nv4si_ (__a, __b, __c);
 }
 
 __extension__ extern __inline int16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mlsl_high_n intrinsics

2021-02-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlsl_high_n Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling and
optimization.
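
For reference, a hypothetical, illustrative sketch of the subtract-variant semantics:

#include <arm_neon.h>

/* vmlsl_high_n_u32: widen the high half of b, multiply each element by
   the broadcast scalar c and subtract the products from a.  */
uint64x2_t
mlsl_high_n_example (uint64x2_t a, uint32x4_t b, uint32_t c)
{
  return vmlsl_high_n_u32 (a, b, c);
}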

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-01-27  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mlsl_hi_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_mlsl_hi_n_insn):
Define.
(aarch64_mlsl_hi_n): Define.
* config/aarch64/arm_neon.h (vmlsl_high_n_s16): Use RTL builtin
instead of inline asm.
(vmlsl_high_n_s32): Likewise.
(vmlsl_high_n_u16): Likewise.
(vmlsl_high_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index c102289c26123ae913df87d327237647d2621655..336f9f9a56b07668678e5b384a89f518433da58b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -230,6 +230,10 @@
   BUILTIN_VQW (TERNOP, smlsl_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlsl_hi, 0, NONE)
 
+  /* Implemented by aarch64_mlsl_hi_n.  */
+  BUILTIN_VQ_HSI (TERNOP, smlsl_hi_n, 0, NONE)
+  BUILTIN_VQ_HSI (TERNOPU, umlsl_hi_n, 0, NONE)
+
   /* Implemented by aarch64_mlal_hi.  */
   BUILTIN_VQW (TERNOP, smlal_hi, 0, NONE)
   BUILTIN_VQW (TERNOPU, umlal_hi, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a883f6ad4de8bb6d0c5f6478df5c516c159df4bb..1e9b4d933f3f9385d857b497e573de6aee25c57f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1974,6 +1974,35 @@
 }
 )
 
+(define_insn "aarch64_mlsl_hi_n_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+(minus:
+  (match_operand: 1 "register_operand" "0")
+  (mult:
+(ANY_EXTEND: (vec_select:
+  (match_operand:VQ_HSI 2 "register_operand" "w")
+  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+(ANY_EXTEND: (vec_duplicate:
+	(match_operand: 4 "register_operand" ""))]
+  "TARGET_SIMD"
+  "mlsl2\t%0., %2., %4.[0]"
+  [(set_attr "type" "neon_mla__long")]
+)
+
+(define_expand "aarch64_mlsl_hi_n"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlsl_hi_n_insn (operands[0],
+ operands[1], operands[2], p, operands[3]));
+  DONE;
+}
+)
+
 (define_insn "aarch64_mlal"
   [(set (match_operand: 0 "register_operand" "=w")
 (plus:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ae8526d5972067c05265a1f0bcf9fde5e347fb3b..7e2c2fc3827e773b960abc137b2cadea61a54577 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7792,48 +7792,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c)
 {
-  int32x4_t __result;
-  __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlsl_hi_nv8hi (__a, __b, __c);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c)
 {
-  int64x2_t __result;
-  __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smlsl_hi_nv4si (__a, __b, __c);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c)
 {
-  uint32x4_t __result;
-  __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "x"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlsl_hi_nv8hi_ (__a, __b, __c);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmlsl_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c)
 {
-  uint64x2_t __result;
-  __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]"
-   : "=w"(__result)
-   : "0"(__a), "w"(__b), "w"(__c)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umlsl_hi_nv4si_ (__a, __b, __c);
 }
 
 __extension__ extern __inline int16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mlal_high_lane[q] intrinsics

2021-02-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlal_high_lane[q] Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better scheduling
and optimization.
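
For reference, the lane forms index a 64-bit third operand while the laneq forms index a full 128-bit vector; a hypothetical, illustrative sketch:

#include <arm_neon.h>

/* Lane indices must be compile-time constants within range.  */
int32x4_t
mlal_high_lane_example (int32x4_t a, int16x8_t b, int16x4_t v)
{
  return vmlal_high_lane_s16 (a, b, v, 1);
}

int32x4_t
mlal_high_laneq_example (int32x4_t a, int16x8_t b, int16x8_t v)
{
  return vmlal_high_laneq_s16 (a, b, v, 5);
}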

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-02  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add
[su]mlal_hi_lane[q] builtin generator macros.
* config/aarch64/aarch64-simd.md
(aarch64_mlal_hi_lane_insn): Define.
(aarch64_mlal_hi_lane): Define.
(aarch64_mlal_hi_laneq_insn): Define.
(aarch64_mlal_hi_laneq): Define.
* config/aarch64/arm_neon.h (vmlal_high_lane_s16): Use RTL
builtin instead of inline asm.
(vmlal_high_lane_s32): Likewise.
(vmlal_high_lane_u16): Likewise.
(vmlal_high_lane_u32): Likewise.
(vmlal_high_laneq_s16): Likewise.
(vmlal_high_laneq_s32): Likewise.
(vmlal_high_laneq_u16): Likewise.
(vmlal_high_laneq_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 336f9f9a56b07668678e5b384a89f518433da58b..152c2e6d361bdab0275e3b38759723fd2a3ffee5 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -300,6 +300,11 @@
   BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlsl_lane_, 0, NONE)
   BUILTIN_VD_HSI (QUADOPU_LANE, vec_umlsl_laneq_, 0, NONE)
 
+  BUILTIN_VQ_HSI (QUADOP_LANE, smlal_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOP_LANE, smlal_hi_laneq, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOPU_LANE, umlal_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOPU_LANE, umlal_hi_laneq, 0, NONE)
+
   BUILTIN_VSD_HSI (BINOP, sqdmull, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1e9b4d933f3f9385d857b497e573de6aee25c57f..e741b656cb081e26b9e6e262ae50fab3716e1ed4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2180,6 +2180,76 @@
   [(set_attr "type" "neon_mla__scalar_long")]
 )
 
+(define_insn "aarch64_mlal_hi_lane_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(plus:
+	  (mult:
+	(ANY_EXTEND: (vec_select:
+	  (match_operand:VQ_HSI 2 "register_operand" "w")
+	  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+	(ANY_EXTEND: (vec_duplicate:
+	  (vec_select:
+		(match_operand: 4 "register_operand" "")
+		(parallel [(match_operand:SI 5 "immediate_operand" "i")])
+	  (match_operand: 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  {
+operands[5] = aarch64_endian_lane_rtx (mode, INTVAL (operands[5]));
+return "mlal2\\t%0., %2., %4.[%5]";
+  }
+  [(set_attr "type" "neon_mla__scalar_long")]
+)
+
+(define_expand "aarch64_mlal_hi_lane"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlal_hi_lane_insn (operands[0],
+	 operands[1], operands[2], p, operands[3], operands[4]));
+  DONE;
+}
+)
+
+(define_insn "aarch64_mlal_hi_laneq_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(plus:
+	  (mult:
+	(ANY_EXTEND: (vec_select:
+	  (match_operand:VQ_HSI 2 "register_operand" "w")
+	  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+	(ANY_EXTEND: (vec_duplicate:
+	  (vec_select:
+		(match_operand: 4 "register_operand" "")
+		(parallel [(match_operand:SI 5 "immediate_operand" "i")])
+	  (match_operand: 1 "register_operand" "0")))]
+  "TARGET_SIMD"
+  {
+operands[5] = aarch64_endian_lane_rtx (mode, INTVAL (operands[5]));
+return "mlal2\\t%0., %2., %4.[%5]";
+  }
+  [(set_attr "type" "neon_mla__scalar_long")]
+)
+
+(define_expand "aarch64_mlal_hi_laneq"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlal_hi_laneq_insn (operands[0],
+	 operands[1], operands[2], p, operands[3], operands[4]));
+  DONE;
+}
+)
+
 (define_insn "aarch64_vec_mlsl_lane"
   [(set (match_operand: 0 "register_operand" "=w")
(minus:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7e2c2fc3827e773b960abc137b2cadea61a54577..ee68240f0d019a4a3be89e1e923cb14ee8026468 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7182,117 +7182,69 @@ vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x

[PATCH] aarch64: Use RTL builtins for [su]mlsl_high_lane[q] intrinsics

2021-02-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mlsl_high_lane[q] Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better scheduling
and optimization.
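
For reference, a hypothetical, illustrative sketch of the subtract-variant lane semantics:

#include <arm_neon.h>

/* vmlsl_high_lane_u16: widen the high half of b, multiply by lane 2 of
   the 64-bit vector v and subtract the products from a.  */
uint32x4_t
mlsl_high_lane_example (uint32x4_t a, uint16x8_t b, uint16x4_t v)
{
  return vmlsl_high_lane_u16 (a, b, v, 2);
}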

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-02  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add
[su]mlsl_hi_lane[q] builtin macro generators.
* config/aarch64/aarch64-simd.md
(aarch64_mlsl_hi_lane_insn): Define.
(aarch64_mlsl_hi_lane): Define.
(aarch64_mlsl_hi_laneq_insn): Define.
(aarch64_mlsl_hi_laneq): Define.
* config/aarch64/arm_neon.h (vmlsl_high_lane_s16): Use RTL
builtin instead of inline asm.
(vmlsl_high_lane_s32): Likewise.
(vmlsl_high_lane_u16): Likewise.
(vmlsl_high_lane_u32): Likewise.
(vmlsl_high_laneq_s16): Likewise.
(vmlsl_high_laneq_s32): Likewise.
(vmlsl_high_laneq_u16): Likewise.
(vmlsl_high_laneq_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 152c2e6d361bdab0275e3b38759723fd2a3ffee5..76ab021725900b249ecabf3f8df2167169a263e9 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -305,6 +305,11 @@
   BUILTIN_VQ_HSI (QUADOPU_LANE, umlal_hi_lane, 0, NONE)
   BUILTIN_VQ_HSI (QUADOPU_LANE, umlal_hi_laneq, 0, NONE)
 
+  BUILTIN_VQ_HSI (QUADOP_LANE, smlsl_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOP_LANE, smlsl_hi_laneq, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOPU_LANE, umlsl_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (QUADOPU_LANE, umlsl_hi_laneq, 0, NONE)
+
   BUILTIN_VSD_HSI (BINOP, sqdmull, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0, NONE)
   BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e741b656cb081e26b9e6e262ae50fab3716e1ed4..2e347b92b79cb0c1dfef710034602d8f61f62173 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2270,6 +2270,78 @@
   [(set_attr "type" "neon_mla__scalar_long")]
 )
 
+(define_insn "aarch64_mlsl_hi_lane_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(minus:
+	  (match_operand: 1 "register_operand" "0")
+	  (mult:
+	(ANY_EXTEND: (vec_select:
+	  (match_operand:VQ_HSI 2 "register_operand" "w")
+	  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+	(ANY_EXTEND: (vec_duplicate:
+	  (vec_select:
+		(match_operand: 4 "register_operand" "")
+		(parallel [(match_operand:SI 5 "immediate_operand" "i")]
+	  )))]
+  "TARGET_SIMD"
+  {
+operands[5] = aarch64_endian_lane_rtx (mode, INTVAL (operands[5]));
+return "mlsl2\\t%0., %2., %4.[%5]";
+  }
+  [(set_attr "type" "neon_mla__scalar_long")]
+)
+
+(define_expand "aarch64_mlsl_hi_lane"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlsl_hi_lane_insn (operands[0],
+	 operands[1], operands[2], p, operands[3], operands[4]));
+  DONE;
+}
+)
+
+(define_insn "aarch64_mlsl_hi_laneq_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(minus:
+	  (match_operand: 1 "register_operand" "0")
+	  (mult:
+	(ANY_EXTEND: (vec_select:
+	  (match_operand:VQ_HSI 2 "register_operand" "w")
+	  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+	(ANY_EXTEND: (vec_duplicate:
+	  (vec_select:
+		(match_operand: 4 "register_operand" "")
+		(parallel [(match_operand:SI 5 "immediate_operand" "i")]
+	  )))]
+  "TARGET_SIMD"
+  {
+operands[5] = aarch64_endian_lane_rtx (mode, INTVAL (operands[5]));
+return "mlsl2\\t%0., %2., %4.[%5]";
+  }
+  [(set_attr "type" "neon_mla__scalar_long")]
+)
+
+(define_expand "aarch64_mlsl_hi_laneq"
+  [(match_operand: 0 "register_operand")
+   (match_operand: 1 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 2 "register_operand"))
+   (match_operand: 3 "register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mlsl_hi_laneq_insn (operands[0],
+	 operands[1], operands[2], p, operands[3], operands[4]));
+  DONE;
+}
+)
+
 ;; FP vector operations.
 ;; AArch64 AdvSIMD supports single-precision (32-bit) and 
 ;; double-precision (64-bit) floating-point data types and arithmetic as
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ee68240f0d019a4a3be89e1e923cb14ee8026468..7b99e16b53cae27f6d2e7a29985cb4963d74739e 100644
--- a/gcc/config/aarch64/arm_neon.h
++

[PATCH] aarch64: Use RTL builtins for [su]mull_high_n intrinsics

2021-02-04 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mull_high_n Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling and
optimization.
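
For reference, a hypothetical, illustrative sketch of the semantics:

#include <arm_neon.h>

/* vmull_high_n_s32: widen the high half (elements 2 and 3) of a and
   multiply each by the scalar b, giving a 2 x 64-bit result.  */
int64x2_t
mull_high_n_example (int32x4_t a, int32_t b)
{
  return vmull_high_n_s32 (a, b);
}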

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-03  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]mull_hi_n
builtin generator macros.
* config/aarch64/aarch64-simd.md
(aarch64_mull_hi_n_insn): Define.
(aarch64_mull_hi_n): Define.
* config/aarch64/arm_neon.h (vmull_high_n_s16): Use RTL builtin
instead of inline asm.
(vmull_high_n_s32): Likewise.
(vmull_high_n_u16): Likewise.
(vmull_high_n_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b787cb9788e758c1f103eab366b7aed4dc457830..4dac8e0d79bdf887e37e89f09c0bbbfd45aa 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -300,6 +300,9 @@
   BUILTIN_VD_HSI (BINOP, smull_n, 0, NONE)
   BUILTIN_VD_HSI (BINOPU, umull_n, 0, NONE)
 
+  BUILTIN_VQ_HSI (BINOP, smull_hi_n, 0, NONE)
+  BUILTIN_VQ_HSI (BINOPU, umull_hi_n, 0, NONE)
+
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, NONE)
   BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, NONE)
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 393bab1920100badef21479b2f25cb6e1880c927..1d1ba379c28c052f53fc6c45573f3319ee5784f0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2266,6 +2266,33 @@
   [(set_attr "type" "neon_mul__scalar_long")]
 )
 
+(define_insn "aarch64_mull_hi_n_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(mult:
+	  (ANY_EXTEND: (vec_select:
+	(match_operand:VQ_HSI 1 "register_operand" "w")
+	(match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
+	  (ANY_EXTEND:
+	(vec_duplicate:
+	  (match_operand: 2 "register_operand" "")]
+  "TARGET_SIMD"
+  "mull2\\t%0., %1., %2.[0]"
+  [(set_attr "type" "neon_mul__scalar_long")]
+)
+
+(define_expand "aarch64_mull_hi_n"
+  [(match_operand: 0 "register_operand")
+   (ANY_EXTEND: (match_operand:VQ_HSI 1 "register_operand"))
+   (match_operand: 2 "register_operand")]
+ "TARGET_SIMD"
+ {
+   rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+   emit_insn (gen_aarch64_mull_hi_n_insn (operands[0], operands[1],
+		operands[2], p));
+   DONE;
+ }
+)
+
 ;; vmlal_lane_s16 intrinsics
 (define_insn "aarch64_vec_mlal_lane"
   [(set (match_operand: 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d50bd65c497a02ea67c4aa02aff29f1ae7223b4e..fffd7b7bd2f7a3781f22cb6702b341d4318b1036 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8275,48 +8275,28 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_n_s16 (int16x8_t __a, int16_t __b)
 {
-  int32x4_t __result;
-  __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]"
-   : "=w"(__result)
-   : "w"(__a), "x"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smull_hi_nv8hi (__a, __b);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_n_s32 (int32x4_t __a, int32_t __b)
 {
-  int64x2_t __result;
-  __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_smull_hi_nv4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  uint32x4_t __result;
-  __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]"
-   : "=w"(__result)
-   : "w"(__a), "x"(__b)
-   : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_umull_hi_nv8hi_uuu (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmull_high_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  uint64x2_t __result;
-  __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]"
-   : "=w"(__result)
-   : "w"(__a), "w"(__b)
-   : /* No clobbers */);
-  return __result;
+ return __builtin_aarch64_umull_hi_nv4si_uuu (__a, __b);
 }
 
 __extension__ extern __inline poly16x8_t


[PATCH] aarch64: Use RTL builtins for [su]mull_high_lane[q] intrinsics

2021-02-04 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites [su]mull_high_lane[q] Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling and
optimization.
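
For reference, a hypothetical, illustrative sketch of the lane-indexed semantics:

#include <arm_neon.h>

/* vmull_high_laneq_u16: widen the high half of a and multiply each
   element by lane 6 of the 128-bit vector v.  */
uint32x4_t
mull_high_laneq_example (uint16x8_t a, uint16x8_t v)
{
  return vmull_high_laneq_u16 (a, v, 6);
}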

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-03  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add
[su]mull_hi_lane[q] builtin generator macros.
* config/aarch64/aarch64-simd.md
(aarch64_mull_hi_lane_insn): Define.
(aarch64_mull_hi_lane): Define.
(aarch64_mull_hi_laneq_insn): Define.
(aarch64_mull_hi_laneq): Define.
* config/aarch64/arm_neon.h (vmull_high_lane_s16): Use RTL
builtin instead of inline asm.
(vmull_high_lane_s32): Likewise.
(vmull_high_lane_u16): Likewise.
(vmull_high_lane_u32): Likewise.
(vmull_high_laneq_s16): Likewise.
(vmull_high_laneq_s32): Likewise.
(vmull_high_laneq_u16): Likewise.
(vmull_high_laneq_u32): Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4dac8e0d79bdf887e37e89f09c0bbbfd45aa..29a7bbc24a7370fc077ab6c66f3de551f6926b7e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -303,6 +303,11 @@
   BUILTIN_VQ_HSI (BINOP, smull_hi_n, 0, NONE)
   BUILTIN_VQ_HSI (BINOPU, umull_hi_n, 0, NONE)
 
+  BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (TERNOP_LANE, smull_hi_laneq, 0, NONE)
+  BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_lane, 0, NONE)
+  BUILTIN_VQ_HSI (TERNOPU_LANE, umull_hi_laneq, 0, NONE)
+
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_lane_, 0, NONE)
   BUILTIN_VD_HSI (QUADOP_LANE, vec_smlal_lane_, 0, NONE)
   BUILTIN_VD_HSI (TERNOP_LANE, vec_smult_laneq_, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 32ef074118843dd0f84c7fa07d4a8eb68f2b685e..20d976cea337f31c309ef72251ea46c53ecc7c25 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2253,6 +2253,70 @@
   [(set_attr "type" "neon_mul__scalar_long")]
 )
 
+(define_insn "aarch64_mull_hi_lane_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(mult:
+	  (ANY_EXTEND: (vec_select:
+	(match_operand:VQ_HSI 1 "register_operand" "w")
+	(match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" "")))
+	  (ANY_EXTEND: (vec_duplicate:
+	(vec_select:
+	  (match_operand: 3 "register_operand" "")
+	  (parallel [(match_operand:SI 4 "immediate_operand" "i")]))]
+  "TARGET_SIMD"
+  {
+operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4]));
+return "mull2\\t%0., %1., %3.[%4]";
+  }
+  [(set_attr "type" "neon_mul__scalar_long")]
+)
+
+(define_expand "aarch64_mull_hi_lane"
+  [(match_operand: 0 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 1 "register_operand"))
+   (match_operand: 2 "register_operand")
+   (match_operand:SI 3 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mull_hi_lane_insn (operands[0],
+	 operands[1], p, operands[2], operands[3]));
+  DONE;
+}
+)
+
+(define_insn "aarch64_mull_hi_laneq_insn"
+  [(set (match_operand: 0 "register_operand" "=w")
+	(mult:
+	  (ANY_EXTEND: (vec_select:
+	(match_operand:VQ_HSI 1 "register_operand" "w")
+	(match_operand:VQ_HSI 2 "vect_par_cnst_hi_half" "")))
+	  (ANY_EXTEND: (vec_duplicate:
+	(vec_select:
+	  (match_operand: 3 "register_operand" "")
+	  (parallel [(match_operand:SI 4 "immediate_operand" "i")]))]
+  "TARGET_SIMD"
+  {
+operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4]));
+return "mull2\\t%0., %1., %3.[%4]";
+  }
+  [(set_attr "type" "neon_mul__scalar_long")]
+)
+
+(define_expand "aarch64_mull_hi_laneq"
+  [(match_operand: 0 "register_operand")
+   (ANY_EXTEND:(match_operand:VQ_HSI 1 "register_operand"))
+   (match_operand: 2 "register_operand")
+   (match_operand:SI 3 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (mode, , true);
+  emit_insn (gen_aarch64_mull_hi_laneq_insn (operands[0],
+	 operands[1], p, operands[2], operands[3]));
+  DONE;
+}
+)
+
 (define_insn "aarch64_mull_n"
   [(set (match_operand: 0 "register_operand" "=w")
 (mult:
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index fffd7b7bd2f7a3781f22cb6702b341d4318b1036..2d776ef7ef4ed7fad166dd00c4b4eb8bcaf75fc8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8167,109 +8167,62 @@ vshrn_n_u64 (uint64x2_t __a, const int __b)
 {
   return (uint32x2_t)__builtin_aarch64_shrnv2di ((int64x2_t)__a, __b);
 }
-#define vmull_high_lane_s16(a, b, c)\
-  __extension__ \
-({   

[PATCH] testsuite: aarch64: Add tests for vpaddq intrinsics

2021-02-09 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for vpaddq_* Neon intrinsics. Since these
intrinsics are only supported for AArch64, these tests are restricted to
only run on AArch64 targets.

(There are currently no tests covering these intrinsics.)
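
For reference, a hypothetical, illustrative sketch of the pairwise-add semantics the test checks:

#include <arm_neon.h>

/* vpaddq_s32: the low two result lanes are pairwise sums of a
   (a0+a1, a2+a3), the high two are pairwise sums of b (b0+b1, b2+b3).  */
int32x4_t
paddq_example (int32x4_t a, int32x4_t b)
{
  return vpaddq_s32 (a, b);
}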

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-02-09  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vpXXXq.inc:
New test template.
* gcc.target/aarch64/advsimd-intrinsics/vpaddq.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXXq.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXXq.inc
new file mode 100644
index ..3c27d32992c0e3a1d69580d1699c28f01fbb76ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpXXXq.inc
@@ -0,0 +1,96 @@
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+  /* Basic test: y=OP(x), then store the result.  */
+#define TEST_VPXXXQ1(INSN, T1, T2, W, N)\
+  VECT_VAR(vector_res, T1, W, N) =	\
+INSN##_##T2##W(VECT_VAR(vector, T1, W, N),\
+		   VECT_VAR(vector, T1, W, N));\
+  vst1q##_##T2##W(VECT_VAR(result, T1, W, N),\
+		  VECT_VAR(vector_res, T1, W, N))
+
+#define TEST_VPXXXQ(INSN, T1, T2, W, N)	\
+  TEST_VPXXXQ1(INSN, T1, T2, W, N)	\
+
+  DECL_VARIABLE(vector, int, 8, 16);
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, int, 64, 2);
+  DECL_VARIABLE(vector, uint, 8, 16);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector, uint, 64, 2);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector, float, 16, 8);
+#endif
+  DECL_VARIABLE(vector, float, 32, 4);
+  DECL_VARIABLE(vector, float, 64, 2);
+
+  DECL_VARIABLE(vector_res, int, 8, 16);
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, int, 64, 2);
+  DECL_VARIABLE(vector_res, uint, 8, 16);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 64, 2);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  DECL_VARIABLE(vector_res, float, 16, 8);
+#endif
+  DECL_VARIABLE(vector_res, float, 32, 4);
+  DECL_VARIABLE(vector_res, float, 64, 2);
+
+  clean_results ();
+
+  /* Initialize input "vector" from "buffer".  */
+  VLOAD(vector, buffer, q, int, s, 8, 16);
+  VLOAD(vector, buffer, q, int, s, 16, 8);
+  VLOAD(vector, buffer, q, int, s, 32, 4);
+  VLOAD(vector, buffer, q, int, s, 64, 2);
+  VLOAD(vector, buffer, q, uint, u, 8, 16);
+  VLOAD(vector, buffer, q, uint, u, 16, 8);
+  VLOAD(vector, buffer, q, uint, u, 32, 4);
+  VLOAD(vector, buffer, q, uint, u, 64, 2);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  VLOAD(vector, buffer, q, float, f, 16, 8);
+#endif
+  VLOAD(vector, buffer, q, float, f, 32, 4);
+  VLOAD(vector, buffer, q, float, f, 64, 2);
+
+  /* Apply a binary operator named INSN_NAME.  */
+  TEST_VPXXXQ(INSN_NAME, int, s, 8, 16);
+  TEST_VPXXXQ(INSN_NAME, int, s, 16, 8);
+  TEST_VPXXXQ(INSN_NAME, int, s, 32, 4);
+  TEST_VPXXXQ(INSN_NAME, int, s, 64, 2);
+  TEST_VPXXXQ(INSN_NAME, uint, u, 8, 16);
+  TEST_VPXXXQ(INSN_NAME, uint, u, 16, 8);
+  TEST_VPXXXQ(INSN_NAME, uint, u, 32, 4);
+  TEST_VPXXXQ(INSN_NAME, uint, u, 64, 2);
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  TEST_VPXXXQ(INSN_NAME, float, f, 16, 8);
+#endif
+  TEST_VPXXXQ(INSN_NAME, float, f, 32, 4);
+  TEST_VPXXXQ(INSN_NAME, float, f, 64, 2);
+
+  CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, int, 64, 2, PRIx64, expected, "");
+  CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
+#endif
+  CHECK_FP(TEST_MSG, float, 32, 4, PRIx32, expected, "");
+  CHECK_FP(TEST_MSG, float, 64, 2, PRIx64, expected, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpaddq.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpaddq.c
new file mode 100644
index ..f15ada8aa52ae004389e014e4c45a5ebdddab291
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vpaddq.c
@@ -0,0 +1,40 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#define INSN_NAME vpaddq
+#define TEST_MSG "VPADDQ"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 8, 16) [] = { 0xe1, 0xe5, 0xe9, 0xed,
+	   0xf1, 0xf5, 0xf9, 0xfd,
+	   0x

[PATCH] testsuite: aarch64: Add tests for narrowing-arithmetic intrinsics

2021-03-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for v[r]addhn_high and v[r]subhn_high Neon
intrinsics. Since these intrinsics are only supported for AArch64, these tests
are restricted to only run on AArch64 targets.
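
For reference, a hypothetical, illustrative sketch of the high-narrowing semantics the templates check:

#include <arm_neon.h>

/* vaddhn_high_s32: keep the top 16 bits of each 32-bit sum a + b and
   pack the four narrowed values into the upper lanes of the result;
   r supplies the lower lanes.  */
int16x8_t
addhn_high_example (int16x4_t r, int32x4_t a, int32x4_t b)
{
  return vaddhn_high_s32 (r, a, b);
}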

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-03-02  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vXXXhn_high.inc:
New test template.
* gcc.target/aarch64/advsimd-intrinsics/vaddhn_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vraddhn_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vrsubhn_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vsubhn_high.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vXXXhn_high.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vXXXhn_high.inc
new file mode 100644
index ..e77e84520139069a90cb5d62046744eaf14ff195
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vXXXhn_high.inc
@@ -0,0 +1,65 @@
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1(NAME)
+
+void FNNAME (INSN_NAME) (void)
+{
+  /* Basic test: v128_r=vXXXhn_high(v64_r, v128_a, v128_b), store result.  */
+#define TEST_VXXXHN_HIGH1(INSN, T1, T2, W1, W2, N1, N2)\
+  VECT_VAR(v128_r, T1, W2, N2) = INSN##_##T2##W1(VECT_VAR(v64_r, T1, W2, N1),	\
+		 VECT_VAR(v128_a, T1, W1, N1),	\
+		 VECT_VAR(v128_b, T1, W1, N1));	\
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(v128_r, T1, W2, N2))
+
+#define TEST_VXXXHN_HIGH(INSN, T1, T2, W1, W2, N1, N2)\
+  TEST_VXXXHN_HIGH1(INSN, T1, T2, W1, W2, N1, N2)
+
+  DECL_VARIABLE_128BITS_VARIANTS(v128_r);
+  DECL_VARIABLE_64BITS_VARIANTS(v64_r);
+  DECL_VARIABLE_128BITS_VARIANTS(v128_a);
+  DECL_VARIABLE_128BITS_VARIANTS(v128_b);
+
+  clean_results ();
+
+  /* Fill v64_r with a value easy to recognise in the result vector. */
+  VDUP(v64_r, , int, s, 8, 8, 0x5);
+  VDUP(v64_r, , int, s, 16, 4, 0x5);
+  VDUP(v64_r, , int, s, 32, 2, 0x5);
+  VDUP(v64_r, , uint, u, 8, 8, 0x5);
+  VDUP(v64_r, , uint, u, 16, 4, 0x5);
+  VDUP(v64_r, , uint, u, 32, 2, 0x5);
+
+  /* Fill input v128_a and v128_b with arbitrary values. */
+  VDUP(v128_a, q, int, s, 16, 8, 50*(UINT8_MAX+1));
+  VDUP(v128_a, q, int, s, 32, 4, 50*(UINT16_MAX+1));
+  VDUP(v128_a, q, int, s, 64, 2, 24*((uint64_t)UINT32_MAX+1));
+  VDUP(v128_a, q, uint, u, 16, 8, 3*(UINT8_MAX+1));
+  VDUP(v128_a, q, uint, u, 32, 4, 55*(UINT16_MAX+1));
+  VDUP(v128_a, q, uint, u, 64, 2, 3*((uint64_t)UINT32_MAX+1));
+
+  VDUP(v128_b, q, int, s, 16, 8, (uint16_t)UINT8_MAX);
+  VDUP(v128_b, q, int, s, 32, 4, (uint32_t)UINT16_MAX);
+  VDUP(v128_b, q, int, s, 64, 2, (uint64_t)UINT32_MAX);
+  VDUP(v128_b, q, uint, u, 16, 8, (uint16_t)UINT8_MAX);
+  VDUP(v128_b, q, uint, u, 32, 4, (uint32_t)UINT16_MAX);
+  VDUP(v128_b, q, uint, u, 64, 2, (uint64_t)UINT32_MAX);
+
+  TEST_VXXXHN_HIGH(INSN_NAME, int, s, 16, 8, 8, 16);
+  TEST_VXXXHN_HIGH(INSN_NAME, int, s, 32, 16, 4, 8);
+  TEST_VXXXHN_HIGH(INSN_NAME, int, s, 64, 32, 2, 4);
+  TEST_VXXXHN_HIGH(INSN_NAME, uint, u, 16, 8, 8, 16);
+  TEST_VXXXHN_HIGH(INSN_NAME, uint, u, 32, 16, 4, 8);
+  TEST_VXXXHN_HIGH(INSN_NAME, uint, u, 64, 32, 2, 4);
+
+  CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+}
+
+int main (void)
+{
+  FNNAME (INSN_NAME) ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddhn_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddhn_high.c
new file mode 100644
index ..329dd494f8b2cd3b9c64187278b55107651ea05a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vaddhn_high.c
@@ -0,0 +1,32 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+#if defined(__cplusplus)
+#include 
+#else
+#include 
+#endif
+
+#define INSN_NAME vaddhn_high
+#define TEST_MSG "VADDHN_HIGH"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+   0x5, 0x5, 0x5, 0x5,
+	   0x32, 0x32, 0x32, 0x32,
+	   0x32, 0x32, 0x32, 0x32 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0x32, 0x32, 0x32, 0x32 };
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x5, 0x5, 0x18, 0x18 };
+VECT_VAR_DECL(expected, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x5, 0x5, 0x5, 0x5,
+	0x3, 0x3, 0x3, 0x3,
+	0x3, 0x3, 0x3, 0x3 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x37, 0x37, 0x37, 0x37 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x5, 0x5, 0x3, 0x3 };
+
+#include "vXXXhn_high.inc"
diff --git a/gc

[PATCH] testsuite: aarch64: Add tests for v[r]shrn_high intrinsics

2021-03-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for v[r]shrn_high Neon intrinsics. Since
these intrinsics are only supported for AArch64, these tests are restricted
to only run on AArch64 targets.
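
For reference, a hypothetical, illustrative sketch of the shift-narrow-high semantics:

#include <arm_neon.h>

/* vrshrn_high_n_s32: shift each 32-bit element of a right by 3 with
   rounding, narrow to 16 bits and pack the results above the existing
   low half r.  */
int16x8_t
rshrn_high_example (int16x4_t r, int32x4_t a)
{
  return vrshrn_high_n_s32 (r, a, 3);
}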

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-03-02  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vrshrn_high_n.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vshrn_high_n.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrshrn_high_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrshrn_high_n.c
new file mode 100644
index ..b570ddccde9c7cc9a22c0e23d2a852b78d9dc12a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vrshrn_high_n.c
@@ -0,0 +1,177 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results with input=0.  */
+VECT_VAR_DECL(expected_0, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	 0x5, 0x5, 0x5, 0x5,
+	 0x0, 0x0, 0x0, 0x0,
+	 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_0, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_0, int, 32, 4) [] = { 0x5, 0x5, 0x0, 0x0 };
+VECT_VAR_DECL(expected_0, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	  0x5, 0x5, 0x5, 0x5,
+	  0x0, 0x0, 0x0, 0x0,
+	  0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_0, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	  0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_0, uint, 32, 4) [] = { 0x5, 0x5, 0x0, 0x0 };
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0x5, 0x5, 0x5, 0x5,
+	   0xf8, 0xf9, 0xf9, 0xfa,
+	   0xfa, 0xfb, 0xfb, 0xfc };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0xfff8, 0xfff9, 0xfff9, 0xfffa };
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x5, 0x5, 0xfffc, 0xfffc };
+VECT_VAR_DECL(expected, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x5, 0x5, 0x5, 0x5,
+	0xfc, 0xfc, 0xfd, 0xfd,
+	0xfd, 0xfd, 0xfe, 0xfe };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	0xfffe, 0xfffe, 0xfffe, 0xfffe };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x5, 0x5, 0xfffe, 0xfffe };
+
+/* Expected results with large shift amount.  */
+VECT_VAR_DECL(expected_sh_large, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+		0x5, 0x5, 0x5, 0x5,
+		0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_sh_large, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+		0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_sh_large, int, 32, 4) [] = { 0x5, 0x5, 0x0, 0x0 };
+VECT_VAR_DECL(expected_sh_large, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+		 0x5, 0x5, 0x5, 0x5,
+		 0x0, 0x0, 0x0, 0x0,
+		 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_sh_large, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+		 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_sh_large, uint, 32, 4) [] = { 0x5, 0x5, 0x0, 0x0 };
+
+#define TEST_MSG "VRSHRN_HIGH_N"
+void exec_vrshrn_high_n (void)
+{
+  /* Basic test: y=vrshrn_high_n(r,x,v), then store the result.  */
+#define TEST_VRSHRN_HIGH_N(T1, T2, W1, W2, N1, N2, V)\
+  VECT_VAR(vector_res, T1, W2, N2) =		\
+vrshrn_high_n_##T2##W1(VECT_VAR(vector_res_lo, T1, W2, N1),			\
+			   VECT_VAR(vector, T1, W1, N1),			\
+			   V);			\
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, N2))
+
+  DECL_VARIABLE(vector_res_lo, int, 8, 8);
+  DECL_VARIABLE(vector_res_lo, int, 16, 4);
+  DECL_VARIABLE(vector_res_lo, int, 32, 2);
+  DECL_VARIABLE(vector_res_lo, uint, 8, 8);
+  DECL_VARIABLE(vector_res_lo, uint, 16, 4);
+  DECL_VARIABLE(vector_res_lo, uint, 32, 2);
+
+  DECL_VARIABLE(vector, int, 16, 8);
+  DECL_VARIABLE(vector, int, 32, 4);
+  DECL_VARIABLE(vector, int, 64, 2);
+  DECL_VARIABLE(vector, uint, 16, 8);
+  DECL_VARIABLE(vector, uint, 32, 4);
+  DECL_VARIABLE(vector, uint, 64, 2);
+
+  DECL_VARIABLE(vector_res, int, 8, 16);
+  DECL_VARIABLE(vector_res, int, 16, 8);
+  DECL_VARIABLE(vector_res, int, 32, 4);
+  DECL_VARIABLE(vector_res, uint, 8, 16);
+  DECL_VARIABLE(vector_res, uint, 16, 8);
+  DECL_VARIABLE(vector_res, uint, 32, 4);
+
+  clean_results ();
+
+  /* Fill vector_res_lo with a value easy to recognise in the result vector. */
+  VDUP(vector_res_lo, , int, s, 8, 8, 0x5);
+  VDUP(vector_res_lo, , int, s, 16, 4, 0x5);
+  VDUP(vector_res_lo, , int, s, 32, 2, 0x5);
+  VDUP(vector_res_lo, , uint, u, 8, 8, 0x5);
+  VDUP(vector_res_lo, , uint, u, 16, 4, 0x5);
+  VDUP(vector_res_lo, , uint, u, 32, 2, 0x5);
+
+  /* Fill input vector with 0, to check behavior on limits.  */
+  VDUP(vector, q, int, s, 16, 8, 0);
+  VDUP(vector, q, int, s, 32, 4, 0);
+  VDUP(vector, q, int, s, 64, 2, 0);
+  VDUP(vector, q, uint, u, 16, 8, 0);
+  VDUP(vector, q, uint, u, 32, 4, 0);
+  VDUP(vector, q, u

[PATCH] testsuite: aarch64: Add tests for v[q]mov[u]n_high intrinsics

2021-03-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for v[q]mov[u]n_high Neon intrinsics. Since
these intrinsics are only supported for AArch64, these tests are restricted
to only run on AArch64 targets.
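
For reference, a hypothetical, illustrative sketch of the saturating narrow-high semantics:

#include <arm_neon.h>

/* vqmovun_high_s16: saturate each signed 16-bit element of a to an
   unsigned 8-bit value and pack the results above the low half r.  */
uint8x16_t
qmovun_high_example (uint8x8_t r, int16x8_t a)
{
  return vqmovun_high_s16 (r, a);
}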

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-03-02  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vmovn_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vqmovn_high.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vqmovun_high.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn_high.c
new file mode 100644
index ..e05a40f9619d9e817267d1611257820f62c0ffaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmovn_high.c
@@ -0,0 +1,73 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0x5, 0x5, 0x5, 0x5,
+	   0xf0, 0xf1, 0xf2, 0xf3,
+   0xf4, 0xf5, 0xf6, 0xf7 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5, 
+	   0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x5, 0x5, 0xfff0, 0xfff1 };
+VECT_VAR_DECL(expected, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x5, 0x5, 0x5, 0x5,
+	0xf0, 0xf1, 0xf2, 0xf3,
+	0xf4, 0xf5, 0xf6, 0xf7 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x5, 0x5, 0xfff0, 0xfff1 };
+
+#define TEST_MSG "VMOVN_HIGH"
+void exec_vmovn_high (void)
+{
+  /* Basic test: vec128_r=vmovn_high(vec64_r, vec128_x), store the result.  */
+#define TEST_VMOVN_HIGH(T1, T2, W1, W2, N1, N2)	\
+  VECT_VAR(vec128_r, T1, W2, N2) =		\
+vmovn_high_##T2##W1(VECT_VAR(vec64_r, T1, W2, N1),\
+			VECT_VAR(vec128_x, T1, W1, N1));			\
+  vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vec128_r, T1, W2, N2))
+
+  DECL_VARIABLE_128BITS_VARIANTS(vec128_r);
+  DECL_VARIABLE_64BITS_VARIANTS(vec64_r);
+  DECL_VARIABLE_128BITS_VARIANTS(vec128_x);
+
+  clean_results ();
+
+  /* Fill vec64_r with a value easy to recognise in the result vector. */
+  VDUP(vec64_r, , int, s, 8, 8, 0x5);
+  VDUP(vec64_r, , int, s, 16, 4, 0x5);
+  VDUP(vec64_r, , int, s, 32, 2, 0x5);
+  VDUP(vec64_r, , uint, u, 8, 8, 0x5);
+  VDUP(vec64_r, , uint, u, 16, 4, 0x5);
+  VDUP(vec64_r, , uint, u, 32, 2, 0x5);
+
+  VLOAD(vec128_x, buffer, q, int, s, 16, 8);
+  VLOAD(vec128_x, buffer, q, int, s, 32, 4);
+  VLOAD(vec128_x, buffer, q, int, s, 64, 2);
+  VLOAD(vec128_x, buffer, q, uint, u, 16, 8);
+  VLOAD(vec128_x, buffer, q, uint, u, 32, 4);
+  VLOAD(vec128_x, buffer, q, uint, u, 64, 2);
+
+  TEST_VMOVN_HIGH(int, s, 16, 8, 8, 16);
+  TEST_VMOVN_HIGH(int, s, 32, 16, 4, 8);
+  TEST_VMOVN_HIGH(int, s, 64, 32, 2, 4);
+  TEST_VMOVN_HIGH(uint, u, 16, 8, 8, 16);
+  TEST_VMOVN_HIGH(uint, u, 32, 16, 4, 8);
+  TEST_VMOVN_HIGH(uint, u, 64, 32, 2, 4);
+
+  CHECK(TEST_MSG, int, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, int, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, int, 32, 4, PRIx32, expected, "");
+  CHECK(TEST_MSG, uint, 8, 16, PRIx8, expected, "");
+  CHECK(TEST_MSG, uint, 16, 8, PRIx16, expected, "");
+  CHECK(TEST_MSG, uint, 32, 4, PRIx32, expected, "");
+}
+
+int main (void)
+{
+  exec_vmovn_high ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqmovn_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqmovn_high.c
new file mode 100644
index ..cb4f5c83de889a420b1f4408d4f95575aa783ae5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqmovn_high.c
@@ -0,0 +1,121 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results.  */
+VECT_VAR_DECL(expected, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0x5, 0x5, 0x5, 0x5,
+	   0x12, 0x12, 0x12, 0x12,
+	   0x12, 0x12, 0x12, 0x12 };
+VECT_VAR_DECL(expected, int, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	   0x1278, 0x1278, 0x1278, 0x1278 };
+VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x5, 0x5, 0x12345678, 0x12345678 };
+VECT_VAR_DECL(expected, uint, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x5, 0x5, 0x5, 0x5,
+	0x82, 0x82, 0x82, 0x82,
+	0x82, 0x82, 0x82, 0x82 };
+VECT_VAR_DECL(expected, uint, 16, 8) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x8765, 0x8765, 0x8765, 0x8765 };
+VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x5, 0x5, 0x87654321, 0x87654321 };
+
+/* Expected results when saturation occurs.  */
+VECT_VAR_DECL(expected1, int, 8, 16) [] = { 0x5, 0x5, 0x5, 0x5,
+	0x5, 0x5, 0x5, 0x5,
+	0x7f, 0x7f, 0x7f, 0x7f,
+	0x7f, 0x7f, 0x7f, 0x7f };
+VECT_VAR_DECL(e

[PATCH] testsuite: aarch64: Add tests for vcvt FP intrinsics

2021-03-03 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch adds tests for vcvtx* and vcvt_fXX_fXX floating-point
Neon intrinsics. Since these intrinsics are only supported for AArch64, these
tests are restricted to only run on AArch64 targets.
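
For reference, a hypothetical, illustrative sketch of the conversions under test:

#include <arm_neon.h>

/* vcvt_high_f32_f64: narrow the two doubles in b to single precision
   and place them above the existing low half a.  The vcvtx_* forms use
   the round-to-odd variant of the same conversion.  */
float32x4_t
cvt_high_example (float32x2_t a, float64x2_t b)
{
  return vcvt_high_f32_f64 (a, b);
}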

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-02-18  Jonathan Wright  

* gcc.target/aarch64/advsimd-intrinsics/vcvt_fXX_fXX.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vcvtx.c:
New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_fXX_fXX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_fXX_fXX.c
new file mode 100644
index ..248f37a63a7defda9507172d7a00be0b1a230580
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvt_fXX_fXX.c
@@ -0,0 +1,100 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include 
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include 
+
+/* Expected results for vcvt_f64_f32.  */
+VECT_VAR_DECL (expected, hfloat, 64, 2) [] = { 0x4030,
+	   0x402e};
+/* Expected results for vcvt_f32_f64.  */
+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0x3fc0, 0x4020 };
+
+/* Expected results for vcvt_high_f64_f32.  */
+VECT_VAR_DECL (expected_high, hfloat, 64, 2) [] = { 0xc02c,
+		0xc02a };
+/* Expected results for vcvt_high_f32_f64.  */
+VECT_VAR_DECL (expected_high, hfloat, 32, 4) [] = { 0x4000, 0x4000,
+		0x3fc0, 0x4020 };
+
+void
+exec_vcvt (void)
+{
+  clean_results ();
+
+#define TEST_MSG "vcvt_f64_f32"
+  {
+VECT_VAR_DECL (buffer_src, float, 32, 2) [] = { 16.0, 15.0 };
+
+DECL_VARIABLE (vector_src, float, 32, 2);
+
+VLOAD (vector_src, buffer_src, , float, f, 32, 2);
+DECL_VARIABLE (vector_res, float, 64, 2) =
+	vcvt_f64_f32 (VECT_VAR (vector_src, float, 32, 2));
+vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	   VECT_VAR (vector_res, float, 64, 2));
+
+CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expected, "");
+  }
+#undef TEST_MSG
+
+  clean_results ();
+
+#define TEST_MSG "vcvt_f32_f64"
+  {
+VECT_VAR_DECL (buffer_src, float, 64, 2) [] = { 1.50025, 2.50025 };
+DECL_VARIABLE (vector_src, float, 64, 2);
+
+VLOAD (vector_src, buffer_src, q, float, f, 64, 2);
+DECL_VARIABLE (vector_res, float, 32, 2) =
+  vcvt_f32_f64 (VECT_VAR (vector_src, float, 64, 2));
+vst1_f32 (VECT_VAR (result, float, 32, 2),
+	  VECT_VAR (vector_res, float, 32, 2));
+
+CHECK_FP (TEST_MSG, float, 32, 2, PRIx32, expected, "");
+  }
+#undef TEST_MSG
+
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f64_f32"
+  {
+DECL_VARIABLE (vector_src, float, 32, 4);
+VLOAD (vector_src, buffer, q, float, f, 32, 4);
+DECL_VARIABLE (vector_res, float, 64, 2);
+VECT_VAR (vector_res, float, 64, 2) =
+  vcvt_high_f64_f32 (VECT_VAR (vector_src, float, 32, 4));
+vst1q_f64 (VECT_VAR (result, float, 64, 2),
+	   VECT_VAR (vector_res, float, 64, 2));
+CHECK_FP (TEST_MSG, float, 64, 2, PRIx64, expected_high, "");
+  }
+#undef TEST_MSG
+
+  clean_results ();
+
+#define TEST_MSG "vcvt_high_f32_f64"
+  {
+VECT_VAR_DECL (buffer_src, float, 64, 2) [] = { 1.50025, 2.50025 };
+DECL_VARIABLE (vector_low, float, 32, 2);
+VDUP (vector_low, , float, f, 32, 2, 2.0);
+
+DECL_VARIABLE (vector_src, float, 64, 2);
+VLOAD (vector_src, buffer_src, q, float, f, 64, 2);
+
+DECL_VARIABLE (vector_res, float, 32, 4) =
+  vcvt_high_f32_f64 (VECT_VAR (vector_low, float, 32, 2),
+			 VECT_VAR (vector_src, float, 64, 2));
+vst1q_f32 (VECT_VAR (result, float, 32, 4),
+	   VECT_VAR (vector_res, float, 32, 4));
+
+CHECK_FP (TEST_MSG, float, 32, 4, PRIx32, expected_high, "");
+  }
+}
+
+int
+main (void)
+{
+  exec_vcvt ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtx.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtx.c
new file mode 100644
index ..8687204ab97310c731144a33943833219a3c341d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcvtx.c
@@ -0,0 +1,83 @@
+/* { dg-skip-if "" { arm*-*-* } } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+#include 
+
+/* Expected results for vcvtx_f32_f64 and vcvtxd_f32_f64.  */
+VECT_VAR_DECL (expected, hfloat, 32, 2) [] = { 0x3fc1, 0x4021 };
+
+/* Expected results for vcvtx_high_f32_f64.  */
+VECT_VAR_DECL (expected_high, hfloat, 32, 4) [] = { 0x4000, 0x4000,
+		0x3fc1, 0x4021 };
+
+void
+exec_vcvtx (void)
+{
+  clean_results ();
+
+#define TEST_MSG "vcvtx_f32_f64"
+  {
+VECT_VAR_DECL (buffer_src, float, 64, 2) [] = { 1.50025, 2.50025 };
+DECL_VARIABLE (vector_src, float, 64, 2);
+
+VLOAD (vector_src, buffer_src, q, float, f, 64, 2);
+DECL_VARIABLE (vector_res, float, 32, 2) =
+  vcvtx_f32_f64 (VECT_VAR (vector_src, fl

[PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian

2021-08-04 Thread Jonathan Wright via Gcc-patches
Hi,

Recent refactoring of the arm_neon.h header enabled better code
generation for intrinsics that manipulate vector structures. New
tests were also added to verify the benefit of these changes. It now
transpires that the code generation improvements are observed only on
little-endian systems. This patch restricts the code generation tests
to little-endian targets (for now).

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-08-04  Jonathan Wright  

* gcc.target/aarch64/vector_structure_intrinsics.c: Restrict
tests to little-endian targets.



From: Christophe Lyon 
Sent: 03 August 2021 10:42
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Richard Sandiford 

Subject: Re: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in 
vqtbl[234] intrinsics 
 


On Fri, Jul 23, 2021 at 10:22 AM Jonathan Wright via Gcc-patches 
 wrote:
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.
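
To illustrate the shape of the change, here is a sketch of vqtbl2_s8
before and after (the opaque-mode builtin names are indicative
placeholders rather than a quote of the header):

  /* Before: the two table vectors are inserted into an opaque OImode
     value one at a time, which costs extra moves.  */
  __extension__ extern __inline int8x8_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx)
  {
    __builtin_aarch64_simd_oi __o;
    __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
    __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
    return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t) __idx);
  }

  /* After: the whole structure is copied in one go and the superfluous
     register moves disappear.  */
  __extension__ extern __inline int8x8_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx)
  {
    __builtin_aarch64_simd_oi __o;
    __builtin_memcpy (&__o, &__tab, sizeof (__tab));
    return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t) __idx);
  }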

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

        * config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
        instead of constructing __builtin_aarch64_simd_oi one vector
        at a time.
        (vqtbl2_u8): Likewise.
        (vqtbl2_p8): Likewise.
        (vqtbl2q_s8): Likewise.
        (vqtbl2q_u8): Likewise.
        (vqtbl2q_p8): Likewise.
        (vqtbl3_s8): Use __builtin_memcpy instead of constructing
        __builtin_aarch64_simd_ci one vector at a time.
        (vqtbl3_u8): Likewise.
        (vqtbl3_p8): Likewise.
        (vqtbl3q_s8): Likewise.
        (vqtbl3q_u8): Likewise.
        (vqtbl3q_p8): Likewise.
        (vqtbl4_s8): Use __builtin_memcpy instead of constructing
        __builtin_aarch64_simd_xi one vector at a time.
        (vqtbl4_u8): Likewise.
        (vqtbl4_p8): Likewise.
        (vqtbl4q_s8): Likewise.
        (vqtbl4q_u8): Likewise.
        (vqtbl4q_p8): Likewise.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vector_structure_intrinsics.c: New test.

Hi,

This new test fails on aarch64_be:
 FAIL: gcc.target/aarch64/vector_structure_intrinsics.c scan-assembler-not 
mov\\t

Can you check?

Thanks

Christophe


rb14749.patch
Description: rb14749.patch


[PATCH V2] aarch64: Don't include vec_select high-half in SIMD multiply cost

2021-08-04 Thread Jonathan Wright via Gcc-patches
Hi,

Changes suggested here and those discussed off-list have been
implemented in V2 of the patch.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64.c (aarch64_strip_extend_vec_half):
Define.
(aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
vec_select high-half from being added into Neon multiply
cost.
* rtlanal.c (vec_series_highpart_p): Define.
* rtlanal.h (vec_series_highpart_p): Declare.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vmul_high_cost.c: New test.
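
For reference, this is the kind of source (illustrative, in the spirit
of the new vmul_high_cost.c test) that combine can now turn into a
single umlal2 instead of a separate high-half extraction plus multiply:

  #include <arm_neon.h>

  /* Illustrative: the vget_high_u16 extractions and the widening
     multiply-accumulate should now fuse into one umlal2.  */
  uint32x4_t
  mlal_high (uint32x4_t acc, uint16x8_t a, uint16x8_t b)
  {
    return vmlal_u16 (acc, vget_high_u16 (a), vget_high_u16 (b));
  }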

From: Richard Sandiford 
Sent: 04 August 2021 10:05
To: Jonathan Wright via Gcc-patches 
Cc: Jonathan Wright 
Subject: Re: [PATCH] aarch64: Don't include vec_select high-half in SIMD 
multiply cost 
 
Jonathan Wright via Gcc-patches  writes:
> Hi,
>
> The Neon multiply/multiply-accumulate/multiply-subtract instructions
> can select the top or bottom half of the operand registers. This
> selection does not change the cost of the underlying instruction and
> this should be reflected by the RTL cost function.
>
> This patch adds RTL tree traversal in the Neon multiply cost function
> to match vec_select high-half of its operands. This traversal
> prevents the cost of the vec_select from being added into the cost of
> the multiply - meaning that these instructions can now be emitted in
> the combine pass as they are no longer deemed prohibitively
> expensive.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.

Like you say, the instructions can handle both the low and high halves.
Shouldn't we also check for the low part (as a SIGN/ZERO_EXTEND of
a subreg)?

> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-19  Jonathan Wright  
>
>    * config/aarch64/aarch64.c (aarch64_vec_select_high_operand_p):
>    Define.
>    (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
>    vec_select high-half from being added into Neon multiply
>    cost.
>    * rtlanal.c (vec_series_highpart_p): Define.
>    * rtlanal.h (vec_series_highpart_p): Declare.
>
> gcc/testsuite/ChangeLog:
>
>    * gcc.target/aarch64/vmul_high_cost.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 5809887997305317c5a81421089db431685e2927..a49672afe785e3517250d324468edacceab5c9d3
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -76,6 +76,7 @@
>  #include "function-abi.h"
>  #include "gimple-pretty-print.h"
>  #include "tree-ssa-loop-niter.h"
> +#include "rtlanal.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -11970,6 +11971,19 @@ aarch64_cheap_mult_shift_p (rtx x)
>    return false;
>  }
>  
> +/* Return true iff X is an operand of a select-high-half vector
> +   instruction.  */
> +
> +static bool
> +aarch64_vec_select_high_operand_p (rtx x)
> +{
> +  return ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
> +   && GET_CODE (XEXP (x, 0)) == VEC_SELECT
> +   && vec_series_highpart_p (GET_MODE (XEXP (x, 0)),
> + GET_MODE (XEXP (XEXP (x, 0), 0)),
> + XEXP (XEXP (x, 0), 1)));
> +}
> +
>  /* Helper function for rtx cost calculation.  Calculate the cost of
> a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
> Return the calculated cost of the expression, recursing manually in to
> @@ -11995,6 +12009,13 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, 
> int outer, bool speed)
>    unsigned int vec_flags = aarch64_classify_vector_mode (mode);
>    if (vec_flags & VEC_ADVSIMD)
>    {
> +   /* The select-operand-high-half versions of the instruction have the
> +  same cost as the three vector version - don't add the costs of the
> +  select into the costs of the multiply.  */
> +   if (aarch64_vec_select_high_operand_p (op0))
> + op0 = XEXP (XEXP (op0, 0), 0);
> +   if (aarch64_vec_select_high_operand_p (op1))
> + op1 = XEXP (XEXP (op1, 0), 0);

For consistency with aarch64_strip_duplicate_vec_elt, I think this
should be something like aarch64_strip_vec_extension, returning
the inner rtx on success and the original one on failure.

Thanks,
Richard

>  /* The by-element versions of the instruction have the same costs as
> the normal 3-vector version.  So don't add the costs of the
> duplicate or subsequent select i

[PATCH V2] aarch64: Don't include vec_select high-half in SIMD add cost

2021-08-04 Thread Jonathan Wright via Gcc-patches
Hi,

V2 of this patch uses the same approach as that just implemented
for the multiply high-half cost patch.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan 

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon add cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vaddX_high_cost.c: New test.

From: Jonathan Wright
Sent: 29 July 2021 10:22
To: gcc-patches@gcc.gnu.org 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD add cost 
 
Hi,

The Neon add-long/add-widen instructions can select the top or bottom
half of the operand registers. This selection does not change the
cost of the underlying instruction and this should be reflected by
the RTL cost function.

This patch adds RTL tree traversal in the Neon add cost function to
match vec_select high-half of its operands. This traversal prevents
the cost of the vec_select from being added into the cost of the
addition - meaning that these instructions can now be emitted in the
combine pass as they are no longer deemed prohibitively expensive.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

    * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
    of vec_select high-half from being added into Neon add cost.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/vaddX_high_cost.c: New test.

rb14710.patch
Description: rb14710.patch


[PATCH V2] aarch64: Don't include vec_select high-half in SIMD subtract cost

2021-08-05 Thread Jonathan Wright via Gcc-patches
Hi,

V2 of this change implements the same approach as for the multiply
and add-widen patches.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon subtract
cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vsubX_high_cost.c: New test.



From: Jonathan Wright
Sent: 29 July 2021 10:23
To: gcc-patches@gcc.gnu.org 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD subtract 
cost 
 
Hi,

The Neon subtract-long/subract-widen instructions can select the top
or bottom half of the operand registers. This selection does not
change the cost of the underlying instruction and this should be
reflected by the RTL cost function.

This patch adds RTL tree traversal in the Neon subtract cost function
to match vec_select high-half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the subtract - meaning that these instructions can now be emitted in
the combine pass as they are no longer deemed prohibitively
expensive.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

    * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
    of vec_select high-half from being added into Neon subtract
    cost.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/vsubX_high_cost.c: New test.

rb14711.patch
Description: rb14711.patch


[PATCH 1/4] aarch64: Use memcpy to copy structures in vst4[q]_lane intrinsics

2021-08-05 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst4[q]_lane Neon intrinsics in
arm_neon.h.
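
A sketch of what a vst4q_lane body becomes (the builtin and pointer-type
names here are indicative placeholders):

  /* Before: marshal the int32x4x4_t through a union to obtain an XImode
     value.  */
  union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __temp = { __b };
  __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __a,
                                  __temp.__o, __c);

  /* After: copy the structure directly into the opaque mode.  */
  __builtin_aarch64_simd_xi __o;
  __builtin_memcpy (&__o, &__b, sizeof (__b));
  __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __a,
                                  __o, __c);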

It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst4q_lane intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-29  Jonathan Wright  

* config/aarch64/arm_neon.h (__ST4_LANE_FUNC): Delete.
(__ST4Q_LANE_FUNC): Delete.
(vst4_lane_f16): Use __builtin_memcpy to copy vector
structure instead of constructing __builtin_aarch64_simd_xi
one vector at a time.
(vst4_lane_f32): Likewise.
(vst4_lane_f64): Likewise.
(vst4_lane_p8): Likewise.
(vst4_lane_p16): Likewise.
(vst4_lane_p64): Likewise.
(vst4_lane_s8): Likewise.
(vst4_lane_s16): Likewise.
(vst4_lane_s32): Likewise.
(vst4_lane_s64): Likewise.
(vst4_lane_u8): Likewise.
(vst4_lane_u16): Likewise.
(vst4_lane_u32): Likewise.
(vst4_lane_u64): Likewise.
(vst4_lane_bf16): Likewise.
(vst4q_lane_f16): Use __builtin_memcpy to copy vector
structure instead of using a union.
(vst4q_lane_f32): Likewise.
(vst4q_lane_f64): Likewise.
(vst4q_lane_p8): Likewise.
(vst4q_lane_p16): Likewise.
(vst4q_lane_p64): Likewise.
(vst4q_lane_s8): Likewise.
(vst4q_lane_s16): Likewise.
(vst4q_lane_s32): Likewise.
(vst4q_lane_s64): Likewise.
(vst4q_lane_u8): Likewise.
(vst4q_lane_u16): Likewise.
(vst4q_lane_u32): Likewise.
(vst4q_lane_u64): Likewise.
(vst4q_lane_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14728.patch
Description: rb14728.patch


[PATCH 2/4] aarch64: Use memcpy to copy structures in vst3[q]_lane intrinsics

2021-08-05 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst3[q]_lane Neon intrinsics in
arm_neon.h.

It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst3q_lane intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-30  Jonathan Wright  

* config/aarch64/arm_neon.h (__ST3_LANE_FUNC): Delete.
(__ST3Q_LANE_FUNC): Delete.
(vst3_lane_f16): Use __builtin_memcpy to copy vector
structure instead of constructing __builtin_aarch64_simd_ci
one vector at a time.
(vst3_lane_f32): Likewise.
(vst3_lane_f64): Likewise.
(vst3_lane_p8): Likewise.
(vst3_lane_p16): Likewise.
(vst3_lane_p64): Likewise.
(vst3_lane_s8): Likewise.
(vst3_lane_s16): Likewise.
(vst3_lane_s32): Likewise.
(vst3_lane_s64): Likewise.
(vst3_lane_u8): Likewise.
(vst3_lane_u16): Likewise.
(vst3_lane_u32): Likewise.
(vst3_lane_u64): Likewise.
(vst3_lane_bf16): Likewise.
(vst3q_lane_f16): Use __builtin_memcpy to copy vector
structure instead of using a union.
(vst3q_lane_f32): Likewise.
(vst3q_lane_f64): Likewise.
(vst3q_lane_p8): Likewise.
(vst3q_lane_p16): Likewise.
(vst3q_lane_p64): Likewise.
(vst3q_lane_s8): Likewise.
(vst3q_lane_s16): Likewise.
(vst3q_lane_s32): Likewise.
(vst3q_lane_s64): Likewise.
(vst3q_lane_u8): Likewise.
(vst3q_lane_u16): Likewise.
(vst3q_lane_u32): Likewise.
(vst3q_lane_u64): Likewise.
(vst3q_lane_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14729.patch
Description: rb14729.patch


[PATCH 3/4] aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics

2021-08-05 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst2[q]_lane Neon intrinsics in
arm_neon.h.

It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst2q_lane intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-30  Jonathan Wright  

* config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Delete.
(__ST2Q_LANE_FUNC): Delete.
(vst2_lane_f16): Use __builtin_memcpy to copy vector
structure instead of constructing __builtin_aarch64_simd_oi
one vector at a time.
(vst2_lane_f32): Likewise.
(vst2_lane_f64): Likewise.
(vst2_lane_p8): Likewise.
(vst2_lane_p16): Likewise.
(vst2_lane_p64): Likewise.
(vst2_lane_s8): Likewise.
(vst2_lane_s16): Likewise.
(vst2_lane_s32): Likewise.
(vst2_lane_s64): Likewise.
(vst2_lane_u8): Likewise.
(vst2_lane_u16): Likewise.
(vst2_lane_u32): Likewise.
(vst2_lane_u64): Likewise.
(vst2_lane_bf16): Likewise.
(vst2q_lane_f16): Use __builtin_memcpy to copy vector
structure instead of using a union.
(vst2q_lane_f32): Likewise.
(vst2q_lane_f64): Likewise.
(vst2q_lane_p8): Likewise.
(vst2q_lane_p16): Likewise.
(vst2q_lane_p64): Likewise.
(vst2q_lane_s8): Likewise.
(vst2q_lane_s16): Likewise.
(vst2q_lane_s32): Likewise.
(vst2q_lane_s64): Likewise.
(vst2q_lane_u8): Likewise.
(vst2q_lane_u16): Likewise.
(vst2q_lane_u32): Likewise.
(vst2q_lane_u64): Likewise.
(vst2q_lane_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14730.patch
Description: rb14730.patch


[PATCH 4/4] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics

2021-08-05 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst[234][q] and vst1[q]_x[234] bfloat
Neon intrinsics in arm_neon.h.

It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst[234]q or vst1q_x[234] bfloat
intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-30  Jonathan Wright  

* config/aarch64/arm_neon.h (vst1_bf16_x2): Use
__builtin_memcpy instead of constructing an additional
__builtin_aarch64_simd_oi one vector at a time.
(vst1q_bf16_x2): Likewise.
(vst1_bf16_x3): Use __builtin_memcpy instead of constructing
an additional __builtin_aarch64_simd_ci one vector at a time.
(vst1q_bf16_x3): Likewise.
(vst1_bf16_x4): Use __builtin_memcpy instead of a union.
(vst1q_bf16_x4): Likewise.
(vst2_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_oi one vector at a time.
(vst2q_bf16): Likewise.
(vst3_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_ci mode one vector at a
time.
(vst3q_bf16): Likewise.
(vst4_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_xi one vector at a time.
(vst4q_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14731.patch
Description: rb14731.patch


Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on big-endian

2021-08-09 Thread Jonathan Wright via Gcc-patches
Hi,

I've corrected the quoting and moved everything on to one line.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-08-04  Jonathan Wright  

* gcc.target/aarch64/vector_structure_intrinsics.c: Restrict
tests to little-endian targets.



From: Richard Sandiford 
Sent: 06 August 2021 13:24
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Christophe Lyon 

Subject: Re: [PATCH] testsuite: aarch64: Fix failing vector structure tests on 
big-endian 
 
Jonathan Wright  writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c 
> b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index 
> 60c53bc27f8378c78b119576ed19fde0e5743894..a8e31ab85d6fd2a045c8efaf2cbc42b5f40d2411
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -197,7 +197,8 @@ TEST_ST1x3 (vst1q, uint64x2x3_t, uint64_t*, u64, x3);
>  TEST_ST1x3 (vst1q, poly64x2x3_t, poly64_t*, p64, x3);
>  TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>  
> -/* { dg-final { scan-assembler-not "mov\\t" } } */
> +/* { dg-final { scan-assembler-not {"mov\\t"} {
> + target { aarch64_little_endian } } ) }  */

I think this needs to stay on one line.  We should also either keep the
original quoting on the regexp or use {mov\t}.  Having both forms
of quote would turn it into a test for the characters:

   "mov\t"

(including quotes and backslash).
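
A single-line directive with one consistent quoting style would look
something like this sketch:

  /* { dg-final { scan-assembler-not {mov\t} { target aarch64_little_endian } } } */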

Thanks,
Richard


>  
>  /* { dg-final { scan-assembler-times "tbl\\t" 18} }  */
>  /* { dg-final { scan-assembler-times "tbx\\t" 18} }  */


rb14749.patch
Description: rb14749.patch


[PATCH] testsuite: aarch64: Fix invalid SVE tests

2021-08-09 Thread Jonathan Wright via Gcc-patches
Hi,

Some scan-assembler tests for SVE code generation were erroneously
split over multiple lines - meaning they became invalid. This patch
gets the tests working again by putting each test on a single line.

The extract_[1234].c tests are corrected to expect that extracted
32-bit values are moved into 'w' registers rather than 'x' registers.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-08-06  Jonathan Wright  

* gcc.target/aarch64/sve/dup_lane_1.c: Don't split
scan-assembler tests over multiple lines. Expect 32-bit
result values in 'w' registers.
* gcc.target/aarch64/sve/extract_1.c: Likewise.
* gcc.target/aarch64/sve/extract_2.c: Likewise.
* gcc.target/aarch64/sve/extract_3.c: Likewise.
* gcc.target/aarch64/sve/extract_4.c: Likewise.


rb14768.patch
Description: rb14768.patch


[PATCH 1/3] aarch64: Remove macros for vld2[q]_lane Neon intrinsics

2021-08-16 Thread Jonathan Wright via Gcc-patches
Hi,

This patch removes macros for vld2[q]_lane Neon intrinsics. This is a
preparatory step before adding new modes for structures of Advanced
SIMD vectors.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-08-12  Jonathan Wright  

* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Delete.
(__LD2Q_LANE_FUNC): Likewise.
(vld2_lane_u8): Define without macro.
(vld2_lane_u16): Likewise.
(vld2_lane_u32): Likewise.
(vld2_lane_u64): Likewise.
(vld2_lane_s8): Likewise.
(vld2_lane_s16): Likewise.
(vld2_lane_s32): Likewise.
(vld2_lane_s64): Likewise.
(vld2_lane_f16): Likewise.
(vld2_lane_f32): Likewise.
(vld2_lane_f64): Likewise.
(vld2_lane_p8): Likewise.
(vld2_lane_p16): Likewise.
(vld2_lane_p64): Likewise.
(vld2q_lane_u8): Likewise.
(vld2q_lane_u16): Likewise.
(vld2q_lane_u32): Likewise.
(vld2q_lane_u64): Likewise.
(vld2q_lane_s8): Likewise.
(vld2q_lane_s16): Likewise.
(vld2q_lane_s32): Likewise.
(vld2q_lane_s64): Likewise.
(vld2q_lane_f16): Likewise.
(vld2q_lane_f32): Likewise.
(vld2q_lane_f64): Likewise.
(vld2q_lane_p8): Likewise.
(vld2q_lane_p16): Likewise.
(vld2q_lane_p64): Likewise.
(vld2_lane_bf16): Likewise.
(vld2q_lane_bf16): Likewise.


rb14791.patch
Description: rb14791.patch


[PATCH 2/3] aarch64: Remove macros for vld3[q]_lane Neon intrinsics

2021-08-16 Thread Jonathan Wright via Gcc-patches
Hi,

This patch removes macros for vld3[q]_lane Neon intrinsics. This is a
preparatory step before adding new modes for structures of Advanced
SIMD vectors.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-08-16  Jonathan Wright  

* config/aarch64/arm_neon.h (__LD3_LANE_FUNC): Delete.
(__LD3Q_LANE_FUNC): Delete.
(vld3_lane_u8): Define without macro.
(vld3_lane_u16): Likewise.
(vld3_lane_u32): Likewise.
(vld3_lane_u64): Likewise.
(vld3_lane_s8): Likewise.
(vld3_lane_s16): Likewise.
(vld3_lane_s32): Likewise.
(vld3_lane_s64): Likewise.
(vld3_lane_f16): Likewise.
(vld3_lane_f32): Likewise.
(vld3_lane_f64): Likewise.
(vld3_lane_p8): Likewise.
(vld3_lane_p16): Likewise.
(vld3_lane_p64): Likewise.
(vld3q_lane_u8): Likewise.
(vld3q_lane_u16): Likewise.
(vld3q_lane_u32): Likewise.
(vld3q_lane_u64): Likewise.
(vld3q_lane_s8): Likewise.
(vld3q_lane_s16): Likewise.
(vld3q_lane_s32): Likewise.
(vld3q_lane_s64): Likewise.
(vld3q_lane_f16): Likewise.
(vld3q_lane_f32): Likewise.
(vld3q_lane_f64): Likewise.
(vld3q_lane_p8): Likewise.
(vld3q_lane_p16): Likewise.
(vld3q_lane_p64): Likewise.
(vld3_lane_bf16): Likewise.
(vld3q_lane_bf16): Likewise.


rb14792.patch
Description: rb14792.patch


[PATCH 3/3] aarch64: Remove macros for vld4[q]_lane Neon intrinsics

2021-08-16 Thread Jonathan Wright via Gcc-patches
Hi,

This patch removes macros for vld4[q]_lane Neon intrinsics. This is a
preparatory step before adding new modes for structures of Advanced
SIMD vectors.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-08-16  Jonathan Wright  

* config/aarch64/arm_neon.h (__LD4_LANE_FUNC): Delete.
(__LD4Q_LANE_FUNC): Likewise.
(vld4_lane_u8): Define without macro.
(vld4_lane_u16): Likewise.
(vld4_lane_u32): Likewise.
(vld4_lane_u64): Likewise.
(vld4_lane_s8): Likewise.
(vld4_lane_s16): Likewise.
(vld4_lane_s32): Likewise.
(vld4_lane_s64): Likewise.
(vld4_lane_f16): Likewise.
(vld4_lane_f32): Likewise.
(vld4_lane_f64): Likewise.
(vld4_lane_p8): Likewise.
(vld4_lane_p16): Likewise.
(vld4_lane_p64): Likewise.
(vld4q_lane_u8): Likewise.
(vld4q_lane_u16): Likewise.
(vld4q_lane_u32): Likewise.
(vld4q_lane_u64): Likewise.
(vld4q_lane_s8): Likewise.
(vld4q_lane_s16): Likewise.
(vld4q_lane_s32): Likewise.
(vld4q_lane_s64): Likewise.
(vld4q_lane_f16): Likewise.
(vld4q_lane_f32): Likewise.
(vld4q_lane_f64): Likewise.
(vld4q_lane_p8): Likewise.
(vld4q_lane_p16): Likewise.
(vld4q_lane_p64): Likewise.
(vld4_lane_bf16): Likewise.
(vld4q_lane_bf16): Likewise.


rb14793.patch
Description: rb14793.patch


[PATCH] aarch64: Fix float <-> int errors in vld4[q]_lane intrinsics

2021-08-18 Thread Jonathan Wright via Gcc-patches
Hi,

A previous commit "aarch64: Remove macros for vld4[q]_lane Neon
intrinsics" introduced some float <-> int type conversion errors.
This patch fixes those errors.
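
Judging from the error messages quoted below, the fix amounts to calling
the float variants of the lane-load builtins and using float pointer
casts, roughly as in this sketch (builtin names indicative):

  /* vld4_lane_f32: use the v2sf (float) lane-load builtin so the pointer
     argument type matches (sketch).  */
  __o = __builtin_aarch64_ld4_lanev2sf (
          (__builtin_aarch64_simd_sf *) __ptr, __o, __c);

  /* vld4q_lane_f64: cast the pointer to the double element type expected
     by the v2df builtin (sketch).  */
  __o = __builtin_aarch64_ld4_lanev2df (
          (__builtin_aarch64_simd_df *) __ptr, __o, __c);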

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-08-18  Jonathan Wright  

* config/aarch64/arm_neon.h (vld4_lane_f32): Use float RTL
pattern.
(vld4q_lane_f64): Use float type cast.



From: Andreas Schwab 
Sent: 18 August 2021 13:09
To: Jonathan Wright via Gcc-patches 
Cc: Jonathan Wright ; Richard Sandiford 

Subject: Re: [PATCH 3/3] aarch64: Remove macros for vld4[q]_lane Neon 
intrinsics 
 
I think this patch breaks bootstrap.

In file included from ../../libcpp/lex.c:756:
/opt/gcc/gcc-20210818/Build/prev-gcc/include/arm_neon.h: In function 
'float32x2x4_t vld4_lane_f32(const float32_t*, float32x2x4_t, int)':
/opt/gcc/gcc-20210818/Build/prev-gcc/include/arm_neon.h:21081:11: error: cannot 
convert 'float*' to 'const int*'
21081 |   (__builtin_aarch64_simd_sf *) __ptr, __o, __c);
  |   ^~~
  |   |
  |   float*
: note:   initializing argument 1 of '__builtin_aarch64_simd_xi 
__builtin_aarch64_ld4_lanev2si(const int*, __builtin_aarch64_simd_xi, int)'
/opt/gcc/gcc-20210818/Build/prev-gcc/include/arm_neon.h: In function 
'float64x2x4_t vld4q_lane_f64(const float64_t*, float64x2x4_t, int)':
/opt/gcc/gcc-20210818/Build/prev-gcc/include/arm_neon.h:21384:9: error: cannot 
convert 'long int*' to 'const double*'
21384 | (__builtin_aarch64_simd_di *) __ptr, __o, __c);
  | ^~~
  | |
  | long int*
: note:   initializing argument 1 of '__builtin_aarch64_simd_xi 
__builtin_aarch64_ld4_lanev2df(const double*, __builtin_aarch64_simd_xi, int)'

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

rb14797.patch
Description: rb14797.patch


[PATCH] aarch64: Fix type qualifiers for qtbl1 and qtbx1 Neon builtins

2021-09-24 Thread Jonathan Wright via Gcc-patches
Hi,

This patch fixes type qualifiers for the qtbl1 and qtbx1 Neon builtins
and removes the casts from the Neon intrinsic function bodies that
use these builtins.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-09-23  Jonathan Wright  

* config/aarch64/aarch64-builtins.c (TYPES_BINOP_PPU): Define
new type qualifier enum.
(TYPES_TERNOP_SSSU): Likewise.
(TYPES_TERNOP_PPPU): Likewise.
* config/aarch64/aarch64-simd-builtins.def: Define PPU, SSU,
PPPU and SSSU builtin generator macros for qtbl1 and qtbx1
Neon builtins.
* config/aarch64/arm_neon.h (vqtbl1_p8): Use type-qualified
builtin and remove casts.
(vqtbl1_s8): Likewise.
(vqtbl1q_p8): Likewise.
(vqtbl1q_s8): Likewise.
(vqtbx1_s8): Likewise.
(vqtbx1_p8): Likewise.
(vqtbx1q_s8): Likewise.
(vqtbx1q_p8): Likewise.
(vtbl1_p8): Likewise.
(vtbl2_p8): Likewise.
(vtbx2_p8): Likewise.


rb14884.patch
Description: rb14884.patch


Re: [PATCH 4/6 V2] aarch64: Add machine modes for Neon vector-tuple types

2021-11-02 Thread Jonathan Wright via Gcc-patches
Hi,

Each of the comments on the previous version of the patch have been
addressed.

Ok for master?

Thanks,
Jonathan


From: Richard Sandiford 
Sent: 22 October 2021 16:13
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Kyrylo Tkachov 

Subject: Re: [PATCH 4/6] aarch64: Add machine modes for Neon vector-tuple types 
 
Thanks a lot for doing this.

Jonathan Wright  writes:
> @@ -763,9 +839,16 @@ aarch64_lookup_simd_builtin_type (machine_mode mode,
>  return aarch64_simd_builtin_std_type (mode, q);
>  
>    for (i = 0; i < nelts; i++)
> -    if (aarch64_simd_types[i].mode == mode
> - && aarch64_simd_types[i].q == q)
> -  return aarch64_simd_types[i].itype;
> +    {
> +  if (aarch64_simd_types[i].mode == mode
> +   && aarch64_simd_types[i].q == q)
> + return aarch64_simd_types[i].itype;
> +  else if (aarch64_simd_tuple_types[i][0] != NULL_TREE)

Very minor (sorry for not noticing earlier), but: the “else” is
redundant here.

> + for (int j = 0; j < 3; j++)
> +   if (TYPE_MODE (aarch64_simd_tuple_types[i][j]) == mode
> +   && aarch64_simd_types[i].q == q)
> + return aarch64_simd_tuple_types[i][j];
> +    }
>  
>    return NULL_TREE;
>  }
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 48eddf64e05afe3788abfa05141f6544a9323ea1..0aa185b67ff13d40c87db0449aec312929ff5387
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -6636,162 +6636,165 @@
>  
>  ;; Patterns for vector struct loads and stores.
>  
> -(define_insn "aarch64_simd_ld2"
> -  [(set (match_operand:OI 0 "register_operand" "=w")
> - (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
> - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
> -    UNSPEC_LD2))]
> +(define_insn "aarch64_simd_ld2"
> +  [(set (match_operand:VSTRUCT_2Q 0 "register_operand" "=w")
> + (unspec:VSTRUCT_2Q [
> +   (match_operand:VSTRUCT_2Q 1 "aarch64_simd_struct_operand" "Utv")]
> +   UNSPEC_LD2))]
>    "TARGET_SIMD"
>    "ld2\\t{%S0. - %T0.}, %1"
>    [(set_attr "type" "neon_load2_2reg")]
>  )
>  
> -(define_insn "aarch64_simd_ld2r"
> -  [(set (match_operand:OI 0 "register_operand" "=w")
> -   (unspec:OI [(match_operand:BLK 1 "aarch64_simd_struct_operand" "Utv")
> -   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
> -  UNSPEC_LD2_DUP))]
> +(define_insn "aarch64_simd_ld2r"
> +  [(set (match_operand:VSTRUCT_2QD 0 "register_operand" "=w")
> + (unspec:VSTRUCT_2QD [
> +   (match_operand:VSTRUCT_2QD 1 "aarch64_simd_struct_operand" "Utv")]
> +  UNSPEC_LD2_DUP))]

Sorry again for missing this, but the ld2rs, ld3rs and ld4rs should
keep their BLKmode arguments, since they only access 2, 3 or 4
scalar memory elements.

> @@ -7515,10 +7605,10 @@
>  )
>  
>  (define_insn_and_split "aarch64_combinev16qi"
> -  [(set (match_operand:OI 0 "register_operand" "=w")
> - (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
> - (match_operand:V16QI 2 "register_operand" "w")]
> -    UNSPEC_CONCAT))]
> +  [(set (match_operand:V2x16QI 0 "register_operand" "=w")
> + (unspec:V2x16QI [(match_operand:V16QI 1 "register_operand" "w")
> +  (match_operand:V16QI 2 "register_operand" "w")]
> + UNSPEC_CONCAT))]

Just realised that we can now make this a vec_concat, since the
modes are finally self-consistent.

No need to do that though, either way is fine.

Looks good otherwise.

Richard


[PATCH] aarch64: Use type-qualified builtins for unsigned MLA/MLS intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares type-qualified builtins and uses them for MLA/MLS
Neon intrinsics that operate on unsigned types. This eliminates lots of
casts in arm_neon.h.
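
As an example of the effect, vmls_u16 changes along these lines (the
exact suffix of the type-qualified builtin is an assumption):

  /* Before: route through the signed builtin with casts everywhere.  */
  return (uint16x4_t) __builtin_aarch64_mlsv4hi ((int16x4_t) __a,
                                                 (int16x4_t) __b,
                                                 (int16x4_t) __c);

  /* After: call an unsigned-qualified builtin directly (sketch).  */
  return __builtin_aarch64_mlsv4hi_uuuu (__a, __b, __c);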

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare type-
qualified builtin generators for unsigned MLA/MLS intrinsics.
* config/aarch64/arm_neon.h (vmla_n_u16): Use type-qualified
builtin.
(vmla_n_u32): Likewise.
(vmla_u8): Likewise.
(vmla_u16): Likewise.
(vmla_u32): Likewise.
(vmlaq_n_u16): Likewise.
(vmlaq_n_u32): Likewise.
(vmlaq_u8): Likewise.
(vmlaq_u16): Likewise.
(vmlaq_u32): Likewise.
(vmls_n_u16): Likewise.
(vmls_n_u32): Likewise.
(vmls_u8): Likewise.
(vmls_u16): Likewise.
(vmls_u32): Likewise.
(vmlsq_n_u16): Likewise.
(vmlsq_n_u32): Likewise.
(vmlsq_u8): Likewise.
(vmlsq_u16): Likewise.
(vmlsq_u32): Likewise.


rb15027.patch
Description: rb15027.patch


[PATCH] aarch64: Use type-qualified builtins for PMUL[L] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares poly type-qualified builtins and uses them for
PMUL[L] Neon intrinsics. This removes the need for casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use poly type
qualifier in builtin generator macros.
* config/aarch64/arm_neon.h (vmul_p8): Use type-qualified
builtin and remove casts.
(vmulq_p8): Likewise.
(vmull_high_p8): Likewise.
(vmull_p8): Likewise.


rb15030.patch
Description: rb15030.patch


[PATCH] aarch64: Use type-qualified builtins for XTN[2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them for
XTN[2] Neon intrinsics. This removes the need for casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare unsigned
type-qualified builtins for XTN[2].
* config/aarch64/arm_neon.h (vmovn_high_u16): Use type-
qualified builtin and remove casts.
(vmovn_high_u32): Likewise.
(vmovn_high_u64): Likewise.
(vmovn_u16): Likewise.
(vmovn_u32): Likewise.
(vmovn_u64): Likewise.


rb15031.patch
Description: rb15031.patch


[PATCH] aarch64: Use type-qualified builtins for [R]SHRN[2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them for
[R]SHRN[2] Neon intrinsics. This removes the need for casts in
arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare type-
qualified builtins for [R]SHRN[2].
* config/aarch64/arm_neon.h (vshrn_n_u16): Use type-qualified
builtin and remove casts.
(vshrn_n_u32): Likewise.
(vshrn_n_u64): Likewise.
(vrshrn_high_n_u16): Likewise.
(vrshrn_high_n_u32): Likewise.
(vrshrn_high_n_u64): Likewise.
(vrshrn_n_u16): Likewise.
(vrshrn_n_u32): Likewise.
(vrshrn_n_u64): Likewise.
(vshrn_high_n_u16): Likewise.
(vshrn_high_n_u32): Likewise.
(vshrn_high_n_u64): Likewise.


rb15032.patch
Description: rb15032.patch


[PATCH] aarch64: Use type-qualified builtins for UADD[LW][2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement widening-add Neon intrinsics. This removes the need for
many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use BINOPU type
qualifiers in generator macros for uadd[lw][2] builtins.
* config/aarch64/arm_neon.h (vaddl_s8): Remove unnecessary
cast.
(vaddl_s16): Likewise.
(vaddl_s32): Likewise.
(vaddl_u8): Use type-qualified builtin and remove casts.
(vaddl_u16): Likewise.
(vaddl_u32): Likewise.
(vaddl_high_s8): Remove unnecessary cast.
(vaddl_high_s16): Likewise.
(vaddl_high_s32): Likewise.
(vaddl_high_u8): Use type-qualified builtin and remove casts.
(vaddl_high_u16): Likewise.
(vaddl_high_u32): Likewise.
(vaddw_s8): Remove unnecessary cast.
(vaddw_s16): Likewise.
(vaddw_s32): Likewise.
(vaddw_u8): Use type-qualified builtin and remove casts.
(vaddw_u16): Likewise.
(vaddw_u32): Likewise.
(vaddw_high_s8): Remove unnecessary cast.
(vaddw_high_s16): Likewise.
(vaddw_high_s32): Likewise.
(vaddw_high_u8): Use type-qualified builtin and remove casts.
(vaddw_high_u16): Likewise.
(vaddw_high_u32): Likewise.


rb15033.patch
Description: rb15033.patch


[PATCH] aarch64: Use type-qualified builtins for USUB[LW][2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement widening-subtract Neon intrinsics. This removes the need
for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use BINOPU type
qualifiers in generator macros for usub[lw][2] builtins.
* config/aarch64/arm_neon.h (vsubl_s8): Remove unnecessary
cast.
(vsubl_s16): Likewise.
(vsubl_s32): Likewise.
(vsubl_u8): Use type-qualified builtin and remove casts.
(vsubl_u16): Likewise.
(vsubl_u32): Likewise.
(vsubl_high_s8): Remove unnecessary cast.
(vsubl_high_s16): Likewise.
(vsubl_high_s32): Likewise.
(vsubl_high_u8): Use type-qualified builtin and remove casts.
(vsubl_high_u16): Likewise.
(vsubl_high_u32): Likewise.
(vsubw_s8): Remove unnecessary casts.
(vsubw_s16): Likewise.
(vsubw_s32): Likewise.
(vsubw_u8): Use type-qualified builtin and remove casts.
(vsubw_u16): Likewise.
(vsubw_u32): Likewise.
(vsubw_high_s8): Remove unnecessary cast.
(vsubw_high_s16): Likewise.
(vsubw_high_s32): Likewise.
(vsubw_high_u8): Use type-qualified builtin and remove casts.
(vsubw_high_u16): Likewise.
(vsubw_high_u32): Likewise.


rb15034.patch
Description: rb15034.patch


[PATCH] aarch64: Use type-qualified builtins for U[R]HADD Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement (rounding) halving-add Neon intrinsics. This removes the
need for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use BINOPU type
qualifiers in generator macros for u[r]hadd builtins.
* config/aarch64/arm_neon.h (vhadd_s8): Remove unnecessary
cast.
(vhadd_s16): Likewise.
(vhadd_s32): Likewise.
(vhadd_u8): Use type-qualified builtin and remove casts.
(vhadd_u16): Likewise.
(vhadd_u32): Likewise.
(vhaddq_s8): Remove unnecessary cast.
(vhaddq_s16): Likewise.
(vhaddq_s32): Likewise.
(vhaddq_u8): Use type-qualified builtin and remove casts.
(vhaddq_u16): Likewise.
(vhaddq_u32): Likewise.
(vrhadd_s8): Remove unnecessary cast.
(vrhadd_s16): Likewise.
(vrhadd_s32): Likewise.
(vrhadd_u8): Use type-qualified builtin and remove casts.
(vrhadd_u16): Likewise.
(vrhadd_u32): Likewise.
(vrhaddq_s8): Remove unnecessary cast.
(vrhaddq_s16): Likewise.
(vrhaddq_s32): Likewise.
(vrhaddq_u8): Use type-qualified builtin and remove casts.
(vrhaddq_u16): Likewise.
(vrhaddq_u32): Likewise.


rb15035.patch
Description: rb15035.patch


[PATCH] aarch64: Use type-qualified builtins for UHSUB Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement halving-subtract Neon intrinsics. This removes the need for
many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use BINOPU type
qualifiers in generator macros for uhsub builtins.
* config/aarch64/arm_neon.h (vhsub_s8): Remove unnecessary
cast.
(vhsub_s16): Likewise.
(vhsub_s32): Likewise.
(vhsub_u8): Use type-qualified builtin and remove casts.
(vhsub_u16): Likewise.
(vhsub_u32): Likewise.
(vhsubq_s8): Remove unnecessary cast.
(vhsubq_s16): Likewise.
(vhsubq_s32): Likewise.
(vhsubq_u8): Use type-qualified builtin and remove casts.
(vhsubq_u16): Likewise.
(vhsubq_u32): Likewise.


rb15036.patch
Description: rb15036.patch


[PATCH] aarch64: Use type-qualified builtins for [R]ADDHN[2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement (rounding) halving-narrowing-add Neon intrinsics. This
removes the need for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare unsigned
builtins for [r]addhn[2].
* config/aarch64/arm_neon.h (vaddhn_s16): Remove unnecessary
cast.
(vaddhn_s32): Likewise.
(vaddhn_s64): Likewise.
(vaddhn_u16): Use type-qualified builtin and remove casts.
(vaddhn_u32): Likewise.
(vaddhn_u64): Likewise.
(vraddhn_s16): Remove unnecessary cast.
(vraddhn_s32): Likewise.
(vraddhn_s64): Likewise.
(vraddhn_u16): Use type-qualified builtin and remove casts.
(vraddhn_u32): Likewise.
(vraddhn_u64): Likewise.
(vaddhn_high_s16): Remove unnecessary cast.
(vaddhn_high_s32): Likewise.
(vaddhn_high_s64): Likewise.
(vaddhn_high_u16): Use type-qualified builtin and remove
casts.
(vaddhn_high_u32): Likewise.
(vaddhn_high_u64): Likewise.
(vraddhn_high_s16): Remove unnecessary cast.
(vraddhn_high_s32): Likewise.
(vraddhn_high_s64): Likewise.
(vraddhn_high_u16): Use type-qualified builtin and remove
casts.
(vraddhn_high_u32): Likewise.
(vraddhn_high_u64): Likewise.


rb15037.patch
Description: rb15037.patch


[PATCH] aarch64: Use type-qualified builtins for [R]SUBHN[2] Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement (rounding) halving-narrowing-subtract Neon intrinsics. This
removes the need for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare unsigned
builtins for [r]subhn[2].
* config/aarch64/arm_neon.h (vsubhn_s16): Remove unnecessary
cast.
(vsubhn_s32): Likewise.
(vsubhn_s64): Likewise.
(vsubhn_u16): Use type-qualified builtin and remove casts.
(vsubhn_u32): Likewise.
(vsubhn_u64): Likewise.
(vrsubhn_s16): Remove unnecessary cast.
(vrsubhn_s32): Likewise.
(vrsubhn_s64): Likewise.
(vrsubhn_u16): Use type-qualified builtin and remove casts.
(vrsubhn_u32): Likewise.
(vrsubhn_u64): Likewise.
(vrsubhn_high_s16): Remove unnecessary cast.
(vrsubhn_high_s32): Likewise.
(vrsubhn_high_s64): Likewise.
(vrsubhn_high_u16): Use type-qualified builtin and remove
casts.
(vrsubhn_high_u32): Likewise.
(vrsubhn_high_u64): Likewise.
(vsubhn_high_s16): Remove unnecessary cast.
(vsubhn_high_s32): Likewise.
(vsubhn_high_s64): Likewise.
(vsubhn_high_u16): Use type-qualified builtin and remove
casts.
(vsubhn_high_u32): Likewise.
(vsubhn_high_u64): Likewise.


rb15038.patch
Description: rb15038.patch


[PATCH] aarch64: Use type-qualified builtins for ADDP Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement the pairwise addition Neon intrinsics. This removes the need
for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare unsigned
builtins for pairwise addition.
* config/aarch64/arm_neon.h (vpaddq_u8): Use type-qualified
builtin and remove casts.
(vpaddq_u16): Likewise.
(vpaddq_u32): Likewise.
(vpaddq_u64): Likewise.
(vpadd_u8): Likewise.
(vpadd_u16): Likewise.
(vpadd_u32): Likewise.
(vpaddd_u64): Likewise.


rb15039.patch
Description: rb15039.patch


[PATCH] aarch64: Use type-qualified builtins for ADDV Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned type-qualified builtins and uses them to
implement the vector reduction Neon intrinsics. This removes the need
for many casts in arm_neon.h.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Declare unsigned
builtins for vector reduction.
* config/aarch64/arm_neon.h (vaddv_u8): Use type-qualified
builtin and remove casts.
(vaddv_u16): Likewise.
(vaddv_u32): Likewise.
(vaddvq_u8): Likewise.
(vaddvq_u16): Likewise.
(vaddvq_u32): Likewise.
(vaddvq_u64): Likewise.


rb15057.patch
Description: rb15057.patch


[PATCH] aarch64: Use type-qualified builtins for LD1/ST1 Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned and polynomial type-qualified builtins and
uses them to implement the LD1/ST1 Neon intrinsics. This removes the
need for many casts in arm_neon.h.

The new type-qualified builtins are also lowered to gimple - as the
unqualified builtins are already.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-10  Jonathan Wright  

* config/aarch64/aarch64-builtins.c (TYPES_LOAD1_U): Define.
(TYPES_LOAD1_P): Define.
(TYPES_STORE1_U): Define.
(TYPES_STORE1P): Rename to...
(TYPES_STORE1_P): This.
(get_mem_type_for_load_store): Add unsigned and poly types.
(aarch64_general_gimple_fold_builtin): Add unsigned and poly
type-qualified builtin declarations.
* config/aarch64/aarch64-simd-builtins.def: Declare type-
qualified builtins for LD1/ST1.
* config/aarch64/arm_neon.h (vld1_p8): Use type-qualified
builtin and remove cast.
(vld1_p16): Likewise.
(vld1_u8): Likewise.
(vld1_u16): Likewise.
(vld1_u32): Likewise.
(vld1q_p8): Likewise.
(vld1q_p16): Likewise.
(vld1q_p64): Likewise.
(vld1q_u8): Likewise.
(vld1q_u16): Likewise.
(vld1q_u32): Likewise.
(vld1q_u64): Likewise.
(vst1_p8): Likewise.
(vst1_p16): Likewise.
(vst1_u8): Likewise.
(vst1_u16): Likewise.
(vst1_u32): Likewise.
(vst1q_p8): Likewise.
(vst1q_p16): Likewise.
(vst1q_p64): Likewise.
(vst1q_u8): Likewise.
(vst1q_u16): Likewise.
(vst1q_u32): Likewise.
(vst1q_u64): Likewise.
* config/aarch64/iterators.md (VALLP_NO_DI): New iterator.


rb15058.patch
Description: rb15058.patch


[PATCH] aarch64: Use type-qualified builtins for vcombine_* Neon intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned and polynomial type-qualified builtins for
vcombine_* Neon intrinsics. Using these builtins removes the need for
many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-10  Jonathan Wright  

* config/aarch64/aarch64-builtins.c (TYPES_COMBINE): Delete.
(TYPES_COMBINEP): Delete.
* config/aarch64/aarch64-simd-builtins.def: Declare type-
qualified builtins for vcombine_* intrinsics.
* config/aarch64/arm_neon.h (vcombine_s8): Remove unnecessary
cast.
(vcombine_s16): Likewise.
(vcombine_s32): Likewise.
(vcombine_f32): Likewise.
(vcombine_u8): Use type-qualified builtin and remove casts.
(vcombine_u16): Likewise.
(vcombine_u32): Likewise.
(vcombine_u64): Likewise.
(vcombine_p8): Likewise.
(vcombine_p16): Likewise.
(vcombine_p64): Likewise.
(vcombine_bf16): Remove unnecessary cast.
* config/aarch64/iterators.md (VDC_I): New mode iterator.
(VDC_P): New mode iterator.


rb15059.patch
Description: rb15059.patch


[PATCH] aarch64: Use type-qualified builtins for vget_low/high intrinsics

2021-11-11 Thread Jonathan Wright via Gcc-patches
Hi,

This patch declares unsigned and polynomial type-qualified builtins for
vget_low_*/vget_high_* Neon intrinsics. Using these builtins removes
the need for many casts in arm_neon.h.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-11-10  Jonathan Wright  

* config/aarch64/aarch64-builtins.c (TYPES_UNOPP): Define.
* config/aarch64/aarch64-simd-builtins.def: Declare type-
qualified builtins for vget_low/high.
* config/aarch64/arm_neon.h (vget_low_p8): Use type-qualified
builtin and remove casts.
(vget_low_p16): Likewise.
(vget_low_p64): Likewise.
(vget_low_u8): Likewise.
(vget_low_u16): Likewise.
(vget_low_u32): Likewise.
(vget_low_u64): Likewise.
(vget_high_p8): Likewise.
(vget_high_p16): Likewise.
(vget_high_p64): Likewise.
(vget_high_u8): Likewise.
(vget_high_u16): Likewise.
(vget_high_u32): Likewise.
(vget_high_u64): Likewise.
* config/aarch64/iterators.md (VQ_P): New mode iterator.


rb15060.patch
Description: rb15060.patch


[PATCH] aarch64: Use an expander for quad-word vec_pack_trunc pattern

2021-05-19 Thread Jonathan Wright via Gcc-patches
Hi,

The existing vec_pack_trunc RTL pattern emits an opaque two-
instruction assembly code sequence that prevents proper instruction
scheduling. This commit changes the pattern to an expander that emits
individual xtn and xtn2 instructions.

This commit also consolidates the duplicate truncation patterns.
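
For illustration, a narrowing loop of this shape (hypothetical example)
is vectorized through vec_pack_trunc, and with the expander the xtn and
xtn2 halves are now visible to the scheduler as separate instructions:

  #include <stdint.h>

  /* Illustrative: int -> short narrowing vectorizes via vec_pack_trunc.  */
  void
  narrow (int16_t *restrict dst, const int32_t *restrict src, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (int16_t) src[i];
  }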

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-05-17  Jonathan Wright  

* config/aarch64/aarch64-simd.md (aarch64_simd_vec_pack_trunc_):
Remove as duplicate of...
(aarch64_xtn): This.
(aarch64_xtn2_le): Move position in file.
(aarch64_xtn2_be): Move position in file.
(aarch64_xtn2): Move position in file.
(vec_pack_trunc_): Define as an expander.


rb14480.patch
Description: rb14480.patch


[PATCH] aarch64: Use correct type attributes for RTL generating XTN(2)

2021-05-19 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch corrects the type attribute in RTL patterns that
generate XTN/XTN2 instructions to be "neon_move_narrow_q".

This makes a material difference because these instructions can be
executed on both SIMD pipes in the Cortex-A57 core model, whereas the
"neon_shift_imm_narrow_q" attribute (in use until now) would suggest
to the scheduler that they could only execute on one of the two
pipes.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-05-18  Jonathan Wright  

* config/aarch64/aarch64-simd.md: Use "neon_move_narrow_q"
type attribute in patterns generating XTN(2).


rb14492.patch
Description: rb14492.patch


[PATCH] aarch64: Fix pointer parameter type in LD1 Neon intrinsics

2021-10-14 Thread Jonathan Wright via Gcc-patches
The pointer parameter to load a vector of signed values should itself
be a signed type. This patch fixes two instances of this unsigned-
signed implicit conversion in arm_neon.h.

Tested relevant intrinsics with -Wpointer-sign and warnings no longer
present.
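
For instance, a use like this (illustrative) now compiles cleanly with
-Wpointer-sign:

  #include <arm_neon.h>

  /* Illustrative: vld1_s8_x3 now takes a signed element pointer, so no
     pointer-sign warning is emitted here.  */
  int8x8x3_t
  load3 (const int8_t *p)
  {
    return vld1_s8_x3 (p);
  }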

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-10-14  Jonathan Wright  

* config/aarch64/arm_neon.h (vld1_s8_x3): Use signed type for
pointer parameter.
(vld1_s32_x3): Likewise.


rb14933.patch
Description: rb14933.patch


[PATCH] aarch64: Remove redundant struct type definitions in arm_neon.h

2021-10-21 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch deletes some redundant type definitions in
arm_neon.h. These vector type definitions are an artifact from the initial
commit that added the AArch64 port.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-10-15  Jonathan Wright  

* config/aarch64/arm_neon.h (__STRUCTN): Delete function
macro and all invocations.


rb14942.patch
Description: rb14942.patch


[PATCH 1/6] aarch64: Move Neon vector-tuple type declaration into the compiler

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch declares the Neon vector-tuple types inside the
compiler instead of in the arm_neon.h header. This is a necessary first
step before adding corresponding machine modes to the AArch64
backend.

The vector-tuple types are implemented using a #pragma. This means
initialization of builtin functions that have vector-tuple types as
arguments or return values has to be delayed until the #pragma is
handled.
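
Presumably the header then carries a single pragma in place of the
hand-written tuple typedefs, mirroring the arm_sve.h approach:

  /* Sketch: the compiler registers int8x8x2_t, int8x16x4_t and friends
     (and the builtins taking them) when it handles this pragma.  */
  #pragma GCC aarch64 "arm_neon.h"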

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Note that this patch series cannot be merged until the following has
been accepted: 
https://gcc.gnu.org/pipermail/gcc-patches/2021-October/581948.html

Ok for master with this proviso?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-09-10  Jonathan Wright  

* config/aarch64/aarch64-builtins.c (aarch64_init_simd_builtins):
Factor out main loop to...
(aarch64_init_simd_builtin_functions): This new function.
(register_tuple_type): Define.
(aarch64_scalar_builtin_type_p): Define.
(handle_arm_neon_h): Define.
* config/aarch64/aarch64-c.c (aarch64_pragma_aarch64): Handle
pragma for arm_neon.h.
* config/aarch64/aarch64-protos.h (aarch64_advsimd_struct_mode_p):
Declare.
(handle_arm_neon_h): Likewise.
* config/aarch64/aarch64.c (aarch64_advsimd_struct_mode_p):
Remove static modifier.
* config/aarch64/arm_neon.h (target): Remove Neon vector
structure type definitions.


rb14838.patch
Description: rb14838.patch


[PATCH 2/6] gcc/expr.c: Remove historic workaround for broken SIMD subreg

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

A long time ago, using a parallel to take a subreg of a SIMD register
was broken. This temporary fix[1] (from 2003) spilled these registers
to memory and reloaded the appropriate part to obtain the subreg.

The fix initially existed for the benefit of the PowerPC E500 - a
platform for which GCC removed support a number of years ago.
Regardless, a proper mechanism for taking a subreg of a SIMD register
exists now anyway.

This patch removes the workaround, preventing SIMD registers from
being dumped to memory unnecessarily - something that later passes
sometimes cannot fix.

Bootstrapped and regression tested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

[1] https://gcc.gnu.org/pipermail/gcc-patches/2003-April/102099.html

---

gcc/ChangeLog:

2021-10-11  Jonathan Wright  

* expr.c (emit_group_load_1): Remove historic workaround.


rb14923.patch
Description: rb14923.patch


[PATCH 3/6] gcc/expmed.c: Ensure vector modes are tieable before extraction

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

Extracting a bitfield from a vector can be achieved by casting the
vector to a new type whose elements are the same size as the desired
bitfield, before generating a subreg. However, this is only an
optimization if the original vector can be accessed in the new
machine mode without first being copied - a condition denoted by the
TARGET_MODES_TIEABLE_P hook.

This patch adds a check to make sure that the vector modes are
tieable before attempting to generate a subreg. This is a necessary
prerequisite for a subsequent patch that will introduce new machine
modes for Arm Neon vector-tuple types.

Bootstrapped and regression tested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-10-11  Jonathan Wright  

* expmed.c (extract_bit_field_1): Ensure modes are tieable.


rb14926.patch
Description: rb14926.patch


[PATCH 5/6] gcc/lower_subreg.c: Prevent decomposition if modes are not tieable

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

Preventing decomposition if modes are not tieable is necessary to
stop AArch64 partial Neon structure modes being treated as packed in
registers.

This is a necessary prerequisite for a future AArch64 PCS change to
maintain good code generation.

Bootstrapped and regression tested on:
* x86_64-pc-linux-gnu - no issues.
* aarch64-none-linux-gnu - two test failures which will be fixed by
  the next patch in this series. 

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-10-14  Jonathan Wright  

* lower-subreg.c (simple_move): Prevent decomposition if
modes are not tieable.


rb14936.patch
Description: rb14936.patch


[PATCH 6/6] aarch64: Pass and return Neon vector-tuple types without a parallel

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

Neon vector-tuple types can be passed in registers on function call
and return - there is no need to generate a parallel rtx. This patch
adds cases to detect vector-tuple modes and generates an appropriate
register rtx.

This change greatly improves code generated when passing Neon vector-
tuple types between functions; many new test cases are added to
defend these improvements.
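
A minimal illustration of the kind of code that benefits (the function
name is mine and the expected codegen is hedged): an int32x4x2_t
argument and return value should now travel in consecutive Q registers
rather than taking a round trip through the stack.

#include <arm_neon.h>

int32x4x2_t
pass_through (int32x4x2_t t)
{
  return t;
}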

Bootstrapped and regression tested on aarch64-none-linux-gnu and
aarch64_be-none-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-10-07  Jonathan Wright  

* config/aarch64/aarch64.c (aarch64_function_value): Generate
a register rtx for Neon vector-tuple modes.
(aarch64_layout_arg): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: New code
generation tests.


rb14937.patch
Description: rb14937.patch


[PATCH 4/6] aarch64: Add machine modes for Neon vector-tuple types

2021-10-22 Thread Jonathan Wright via Gcc-patches
Hi,

Until now, GCC has used large integer machine modes (OI, CI and XI)
to model Neon vector-tuple types. This is suboptimal for many
reasons, the most notable being:

 1) Large integer modes are opaque and modifying one vector in the
    tuple requires a lot of inefficient set/get gymnastics. The
    result is a lot of superfluous move instructions.
 2) Large integer modes do not map well to types that are tuples of
    64-bit vectors - we need additional zero-padding which again
    results in superfluous move instructions.

This patch adds new machine modes that better model the C-level Neon
vector-tuple types. The approach is somewhat similar to that already
used for SVE vector-tuple types.
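
A minimal C-level illustration of point 1) above (the function name is
mine and the expected improvement is hedged): updating one vector of a
tuple should no longer need opaque-mode set/get operations and the
extra moves they imply.

#include <arm_neon.h>

int8x16x2_t
set_first (int8x16x2_t t, int8x16_t v)
{
  t.val[0] = v;
  return t;
}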

All of the AArch64 backend patterns and builtins that manipulate Neon
vector tuples are updated to use the new machine modes. This has the
effect of significantly reducing the amount of boiler-plate code in
the arm_neon.h header.

While this patch increases the quality of code generated in many
instances, there is still room for significant improvement - which
will be attempted in subsequent patches.

Bootstrapped and regression tested on aarch64-none-linux-gnu and
aarch64_be-none-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-08-09  Jonathan Wright  
            Richard Sandiford  

* config/aarch64/aarch64-builtins.c (v2x8qi_UP): Define.
(v2x4hi_UP): Likewise.
(v2x4hf_UP): Likewise.
(v2x4bf_UP): Likewise.
(v2x2si_UP): Likewise.
(v2x2sf_UP): Likewise.
(v2x1di_UP): Likewise.
(v2x1df_UP): Likewise.
(v2x16qi_UP): Likewise.
(v2x8hi_UP): Likewise.
(v2x8hf_UP): Likewise.
(v2x8bf_UP): Likewise.
(v2x4si_UP): Likewise.
(v2x4sf_UP): Likewise.
(v2x2di_UP): Likewise.
(v2x2df_UP): Likewise.
(v3x8qi_UP): Likewise.
(v3x4hi_UP): Likewise.
(v3x4hf_UP): Likewise.
(v3x4bf_UP): Likewise.
(v3x2si_UP): Likewise.
(v3x2sf_UP): Likewise.
(v3x1di_UP): Likewise.
(v3x1df_UP): Likewise.
(v3x16qi_UP): Likewise.
(v3x8hi_UP): Likewise.
(v3x8hf_UP): Likewise.
(v3x8bf_UP): Likewise.
(v3x4si_UP): Likewise.
(v3x4sf_UP): Likewise.
(v3x2di_UP): Likewise.
(v3x2df_UP): Likewise.
(v4x8qi_UP): Likewise.
(v4x4hi_UP): Likewise.
(v4x4hf_UP): Likewise.
(v4x4bf_UP): Likewise.
(v4x2si_UP): Likewise.
(v4x2sf_UP): Likewise.
(v4x1di_UP): Likewise.
(v4x1df_UP): Likewise.
(v4x16qi_UP): Likewise.
(v4x8hi_UP): Likewise.
(v4x8hf_UP): Likewise.
(v4x8bf_UP): Likewise.
(v4x4si_UP): Likewise.
(v4x4sf_UP): Likewise.
(v4x2di_UP): Likewise.
(v4x2df_UP): Likewise.
(TYPES_GETREGP): Delete.
(TYPES_SETREGP): Likewise.
(TYPES_LOADSTRUCT_U): Define.
(TYPES_LOADSTRUCT_P): Likewise.
(TYPES_LOADSTRUCT_LANE_U): Likewise.
(TYPES_LOADSTRUCT_LANE_P): Likewise.
(TYPES_STORE1P): Move for consistency.
(TYPES_STORESTRUCT_U): Define.
(TYPES_STORESTRUCT_P): Likewise.
(TYPES_STORESTRUCT_LANE_U): Likewise.
(TYPES_STORESTRUCT_LANE_P): Likewise.
(aarch64_simd_tuple_types): Define.
(aarch64_lookup_simd_builtin_type): Handle tuple type lookup.
(aarch64_init_simd_builtin_functions): Update frontend lookup
for builtin functions after handling arm_neon.h pragma.
(register_tuple_type): Manually set modes of single-integer
tuple types. Record tuple types.
* config/aarch64/aarch64-modes.def
(ADV_SIMD_D_REG_STRUCT_MODES): Define D-register tuple modes.
(ADV_SIMD_Q_REG_STRUCT_MODES): Define Q-register tuple modes.
(SVE_MODES): Give single-vector modes priority over vector-
tuple modes.
(VECTOR_MODES_WITH_PREFIX): Set partial-vector mode order to
be after all single-vector modes.
* config/aarch64/aarch64-simd-builtins.def: Update builtin
generator macros to reflect modifications to the backend
patterns.
* config/aarch64/aarch64-simd.md (aarch64_simd_ld2):
Use vector-tuple mode iterator and rename to...
(aarch64_simd_ld2): This.
(aarch64_simd_ld2r): Use vector-tuple mode iterator and
rename to...
(aarch64_simd_ld2r): This.
(aarch64_vec_load_lanesoi_lane): Use vector-tuple mode
iterator and rename to...
(aarch64_vec_load_lanes_lane): This.
(vec_load_lanesoi): Use vector-tuple mode iterator and
rename to...
(vec_load_lanes): This.
(aarch64_simd_st2): Use vector-tuple mode iterator and
rename to...
(aarch64_simd_st2): This.
(aarch64_vec_store_lanesoi_lane): Use vector-tuple mode
iterator and rename to...
(aarch64_vec_store_lanes_lane): This.
  

[PATCH V2] gcc: Add vec_select -> subreg RTL simplification

2021-07-07 Thread Jonathan Wright via Gcc-patches
Hi,

Version 2 of this patch adds more code generation tests to show the
benefit of this RTL simplification as well as adding a new helper function
'rtx_vec_series_p' to reduce code duplication.

Patch tested as version 1 - ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-06-08  Jonathan Wright  

* combine.c (combine_simplify_rtx): Add vec_select -> subreg
simplification.
* config/aarch64/aarch64.md 
(*zero_extend2_aarch64):
Add Neon to general purpose register case for zero-extend
pattern.
* config/arm/vfp.md (*arm_movsi_vfp): Remove "*" from *t -> r
case to prevent some cases opting to go through memory.
* cse.c (fold_rtx): Add vec_select -> subreg simplification.
* rtl.c (rtx_vec_series_p): Define helper function to
determine whether RTX vector-selection indices are in series.
* rtl.h (rtx_vec_series_p): Define.
* simplify-rtx.c (simplify_context::simplify_binary_operation_1):
Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/extract_zero_extend.c: Remove dump scan
for RTL pattern match.
* gcc.target/aarch64/narrow_high_combine.c: Add new tests.
* gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: Update
scan-assembler regex to look for a scalar register instead of
lane 0 of a vector.
* gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: Likewise.
* gcc.target/aarch64/simd/vmulxd_laneq_f64_1.c: Likewise.
* gcc.target/aarch64/simd/vmulxs_lane_f32_1.c: Likewise.
* gcc.target/aarch64/simd/vmulxs_laneq_f32_1.c: Likewise.
* gcc.target/aarch64/simd/vqdmlalh_lane_s16.c: Likewise.
* gcc.target/aarch64/simd/vqdmlals_lane_s32.c: Likewise.
* gcc.target/aarch64/simd/vqdmlslh_lane_s16.c: Likewise.
* gcc.target/aarch64/simd/vqdmlsls_lane_s32.c: Likewise.
* gcc.target/aarch64/simd/vqdmullh_lane_s16.c: Likewise.
* gcc.target/aarch64/simd/vqdmullh_laneq_s16.c: Likewise.
* gcc.target/aarch64/simd/vqdmulls_lane_s32.c: Likewise.
* gcc.target/aarch64/simd/vqdmulls_laneq_s32.c: Likewise.
* gcc.target/aarch64/sve/dup_lane_1.c: Likewise.
* gcc.target/aarch64/sve/live_1.c: Update scan-assembler regex
cases to look for 'b' and 'h' registers instead of 'w'.
* gcc.target/arm/mve/intrinsics/vgetq_lane_f16.c: Extract
lane 1 as the moves for lane 0 now get optimized away.
* gcc.target/arm/mve/intrinsics/vgetq_lane_f32.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_s16.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_s32.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_s8.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_u16.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_u32.c: Likewise.
* gcc.target/arm/mve/intrinsics/vgetq_lane_u8.c: Likewise.



From: Jonathan Wright
Sent: 02 July 2021 10:53
To: gcc-patches@gcc.gnu.org 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] gcc: Add vec_select -> subreg RTL simplification 
 
Hi,

As subject, this patch adds a new RTL simplification for the case of a
VEC_SELECT selecting the low part of a vector. The simplification
returns a SUBREG.

The primary goal of this patch is to enable better combinations of
Neon RTL patterns - specifically allowing generation of 'write-to-
high-half' narrowing instructions.
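
For example (the function name is mine and the expected codegen is
hedged), the two narrows below should now combine into XTN followed by
XTN2 - the write-to-high-half form - instead of narrowing into
separate registers and then inserting:

#include <arm_neon.h>

int8x16_t
narrow_pair (int16x8_t a, int16x8_t b)
{
  return vcombine_s8 (vmovn_s16 (a), vmovn_s16 (b));
}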

Adding this RTL simplification means that the expected results for a
number of tests need to be updated:
* aarch64 Neon: Update the scan-assembler regex for intrinsics tests
  to expect a scalar register instead of lane 0 of a vector.
* aarch64 SVE: Likewise.
* arm MVE: Use lane 1 instead of lane 0 for lane-extraction
  intrinsics tests (as the move instructions get optimized away for
  lane 0.)

Regression tested and bootstrapped on aarch64-none-linux-gnu,
x86_64-unknown-linux-gnu, arm-none-linux-gnueabihf and
aarch64_be-none-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-06-08  Jonathan Wright  

    * combine.c (combine_simplify_rtx): Add vec_select -> subreg
    simplification.
    * config/aarch64/aarch64.md 
(*zero_extend2_aarch64):
    Add Neon to general purpose register case for zero-extend
    pattern.
    * config/arm/vfp.md (*arm_movsi_vfp): Remove "*" from *t -> r
    case to prevent some cases opting to go through memory.
    * cse.c (fold_rtx): Add vec_select -> subreg simplification.
    * simplify-rtx.c (simplify_context::simplify_binary_operation_1):
    Likewise.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/extract_zero_extend.c: Remove dump scan
    for RTL pattern match.
    * gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: Update
    scan-assembler regex to look for a scalar register instead of
    lane 0 of a vector.
    * gcc.target/aarch64/simd/vmulx_laneq_f64_1.c: Likewise.

[PATCH] aarch64: Use unions for vector tables in vqtbl[234] intrinsics

2021-07-09 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch uses a union instead of constructing a new opaque
vector structure for each of the vqtbl[234] Neon intrinsics in arm_neon.h.
This simplifies the header file and also improves code generation -
superfluous move instructions were emitted for every register
extraction/set in this additional structure.

This change is safe because the C-level vector structure types e.g.
uint8x16x4_t already provide a tie for sequential register allocation
- which is required by the TBL instructions.
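
A minimal usage illustration (the function name is mine and the
expected codegen is hedged): the tuple should now reach the TBL
instruction in sequential registers with no superfluous moves.

#include <arm_neon.h>

uint8x16_t
table_lookup (uint8x16x2_t tab, uint8x16_t idx)
{
  return vqtbl2q_u8 (tab, idx);
}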

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

* config/aarch64/arm_neon.h (vqtbl2_s8): Use union instead of
additional __builtin_aarch64_simd_oi structure.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbl3_s8): Use union instead of additional
__builtin_aarch64_simd_ci structure.
(vqtbl3_u8): Likewise.
(vqtbl3_p8): Likewise.
(vqtbl3q_s8): Likewise.
(vqtbl3q_u8): Likewise.
(vqtbl3q_p8): Likewise.
(vqtbl4_s8): Use union instead of additional
__builtin_aarch64_simd_xi structure.
(vqtbl4_u8): Likewise.
(vqtbl4_p8): Likewise.
(vqtbl4q_s8): Likewise.
(vqtbl4q_u8): Likewise.
(vqtbl4q_p8): Likewise.


rb14639.patch
Description: rb14639.patch


testsuite: aarch64: Fix failing SVE tests on big endian

2021-07-15 Thread Jonathan Wright via Gcc-patches
Hi,

A recent change "gcc: Add vec_select -> subreg RTL simplification"
updated the expected test results for SVE extraction tests. The new
result should only have been changed for little endian. This patch
restores the old expected result for big endian.

Ok for master?

Thanks,
Jonathan

---

gcc/testsuite/ChangeLog:

2021-07-15  Jonathan Wright  

* gcc.target/aarch64/sve/extract_1.c: Split expected results
by big/little endian targets, restoring the old expected
result for big endian.
* gcc.target/aarch64/sve/extract_2.c: Likewise.
* gcc.target/aarch64/sve/extract_3.c: Likewise.
* gcc.target/aarch64/sve/extract_4.c: Likewise.


rb14655.patch
Description: rb14655.patch


Re: [PATCH V2] gcc: Add vec_select -> subreg RTL simplification

2021-07-15 Thread Jonathan Wright via Gcc-patches
Ah, yes - those test results should have only been changed for little endian.

I've submitted a patch to the list restoring the original expected results for 
big
endian.

Thanks,
Jonathan

From: Christophe Lyon 
Sent: 15 July 2021 10:09
To: Richard Sandiford ; Jonathan Wright 
; gcc-patches@gcc.gnu.org ; 
Kyrylo Tkachov 
Subject: Re: [PATCH V2] gcc: Add vec_select -> subreg RTL simplification



On Mon, Jul 12, 2021 at 5:31 PM Richard Sandiford via Gcc-patches 
mailto:gcc-patches@gcc.gnu.org>> wrote:
Jonathan Wright mailto:jonathan.wri...@arm.com>> 
writes:
> Hi,
>
> Version 2 of this patch adds more code generation tests to show the
> benefit of this RTL simplification as well as adding a new helper function
> 'rtx_vec_series_p' to reduce code duplication.
>
> Patch tested as version 1 - ok for master?

Sorry for the slow reply.

> Regression tested and bootstrapped on aarch64-none-linux-gnu,
> x86_64-unknown-linux-gnu, arm-none-linux-gnueabihf and
> aarch64_be-none-linux-gnu - no issues.

I've also tested this on powerpc64le-unknown-linux-gnu, no issues again.

> diff --git a/gcc/combine.c b/gcc/combine.c
> index 
> 6476812a21268e28219d1e302ee1c979d528a6ca..0ff6ca87e4432cfeff1cae1dd219ea81ea0b73e4
>  100644
> --- a/gcc/combine.c
> +++ b/gcc/combine.c
> @@ -6276,6 +6276,26 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, 
> int in_dest,
> - 1,
> 0));
>break;
> +case VEC_SELECT:
> +  {
> + rtx trueop0 = XEXP (x, 0);
> + mode = GET_MODE (trueop0);
> + rtx trueop1 = XEXP (x, 1);
> + int nunits;
> + /* If we select a low-part subreg, return that.  */
> + if (GET_MODE_NUNITS (mode).is_constant (&nunits)
> + && targetm.can_change_mode_class (mode, GET_MODE (x), ALL_REGS))
> +   {
> + int offset = BYTES_BIG_ENDIAN ? nunits - XVECLEN (trueop1, 0) : 0;
> +
> + if (rtx_vec_series_p (trueop1, offset))
> +   {
> + rtx new_rtx = lowpart_subreg (GET_MODE (x), trueop0, mode);
> + if (new_rtx != NULL_RTX)
> +   return new_rtx;
> +   }
> +   }
> +  }

Since this occurs three times, I think it would be worth having
a new predicate:

/* Return true if, for all OP of mode OP_MODE:

 (vec_select:RESULT_MODE OP SEL)

   is equivalent to the lowpart RESULT_MODE of OP.  */

bool
vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel)

containing the GET_MODE_NUNITS (…).is_constant, can_change_mode_class
and rtx_vec_series_p tests.

I think the function belongs in rtlanal.[hc], even though subreg_lowpart_p
is in emit-rtl.c.

> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> aef6da9732d45b3586bad5ba57dafa438374ac3c..f12a0bebd3d6dd3381ac8248cd3fa3f519115105
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1884,15 +1884,16 @@
>  )
>
>  (define_insn "*zero_extend2_aarch64"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w")
> -(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" 
> "r,m,m")))]
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r")
> +(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" 
> "r,m,m,w")))]
>""
>"@
> and\t%0, %1, 
> ldr\t%w0, %1
> -   ldr\t%0, %1"
> -  [(set_attr "type" "logic_imm,load_4,f_loads")
> -   (set_attr "arch" "*,*,fp")]
> +   ldr\t%0, %1
> +   umov\t%w0, %1.[0]"
> +  [(set_attr "type" "logic_imm,load_4,f_loads,neon_to_gp")
> +   (set_attr "arch" "*,*,fp,fp")]

FTR (just to show I thought about it): I don't know whether the umov
can really be considered an fp operation rather than a simd operation,
but since we don't support fp without simd, this is already a distinction
without a difference.  So the pattern is IMO OK as-is.

> diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
> index 
> 55b6c1ac585a4cae0789c3afc0fccfc05a6d3653..93e963696dad30f29a76025696670f8b31bf2c35
>  100644
> --- a/gcc/config/arm/vfp.md
> +++ b/gcc/config/arm/vfp.md
> @@ -224,7 +224,7 @@
>  ;; problems because small constants get converted into adds.
>  (define_insn "*arm_movsi_vfp"
>[(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m 
> ,*t,r,*t,*t, *Uv")
> -  (match_operand:SI 1 "general_operand" "rk, 
> I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))]
> +  (match_operand:SI 1 "general_operand" "rk, 
> I,K,j,mi,rk,r,t,*t,*Uvi,*t"))]
>"TARGET_ARM && TARGET_HARD_FLOAT
> && (   s_register_operand (operands[0], SImode)
> || s_register_operand (operands[1], SImode))"

I'll assume that an Arm maintainer would have spoken up by now if
they didn't want this for some reason.

> diff --git a/gcc/rtl.c b/gcc/rtl.c
> index 
> aaee882f5ca3e37b59c9829e41d0864070c170eb..3e8b3628b0b76b41889b77bb0019f582ee6f5aaa
>  100644
> --- a/gcc/rtl.c
> +++ b/gcc/rtl.c
> @@ -736,6 +736,19 @@ rtvec_all_equal_p (const_r

[PATCH] aarch64: Refactor TBL/TBX RTL patterns

2021-07-19 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch renames the two-source-register TBL/TBX RTL
patterns so that their names better reflect what they do, rather than
confusing them with tbl3 or tbx4 patterns. Also use the correct
"neon_tbl2" type attribute for both patterns.

Rename single-source-register TBL/TBX patterns for consistency.

Bootstrapped and regression tested on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use two variant
generators for all TBL/TBX intrinsics and rename to
consistent forms: qtbl[1234] or qtbx[1234].
* config/aarch64/aarch64-simd.md (aarch64_tbl1):
Rename to...
(aarch64_qtbl1): This.
(aarch64_tbx1): Rename to...
(aarch64_qtbx1): This.
(aarch64_tbl2v16qi): Delete.
(aarch64_tbl3): Rename to...
(aarch64_qtbl2): This.
(aarch64_tbx4): Rename to...
(aarch64_qtbx2): This.
* config/aarch64/aarch64.c (aarch64_expand_vec_perm_1): Use
renamed qtbl1 and qtbl2 RTL patterns.
* config/aarch64/arm_neon.h (vqtbl1_p8): Use renamed qtbl1
RTL pattern.
(vqtbl1_s8): Likewise.
(vqtbl1_u8): Likewise.
(vqtbl1q_p8): Likewise.
(vqtbl1q_s8): Likewise.
(vqtbl1q_u8): Likewise.
(vqtbx1_s8): Use renamed qtbx1 RTL pattern.
(vqtbx1_u8): Likewise.
(vqtbx1_p8): Likewise.
(vqtbx1q_s8): Likewise.
(vqtbx1q_u8): Likewise.
(vqtbx1q_p8): Likewise.
(vtbl1_s8): Use renamed qtbl1 RTL pattern.
(vtbl1_u8): Likewise.
(vtbl1_p8): Likewise.
(vtbl2_s8): Likewise
(vtbl2_u8): Likewise.
(vtbl2_p8): Likewise.
(vtbl3_s8): Use renamed qtbl2 RTL pattern.
(vtbl3_u8): Likewise.
(vtbl3_p8): Likewise.
(vtbl4_s8): Likewise.
(vtbl4_u8): Likewise.
(vtbl4_p8): Likewise.
(vtbx2_s8): Use renamed qtbx2 RTL pattern.
(vtbx2_u8): Likewise.
(vtbx2_p8): Likewise.
(vqtbl2_s8): Use renamed qtbl2 RTL pattern.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbx2_s8): Use renamed qtbx2 RTL pattern.
(vqtbx2_u8): Likewise.
(vqtbx2_p8): Likewise.
(vqtbx2q_s8): Likewise.
(vqtbx2q_u8): Likewise.
(vqtbx2q_p8): Likewise.
(vtbx4_s8): Likewise.
(vtbx4_u8): Likewise.
(vtbx4_p8): Likewise.


rb14671.patch
Description: rb14671.patch


[PATCH] aarch64: Don't include vec_select in SIMD multiply cost

2021-07-20 Thread Jonathan Wright via Gcc-patches
Hi,

The Neon multiply/multiply-accumulate/multiply-subtract instructions
can take various forms - multiplying full vector registers of values
or multiplying one vector by a single element of another. Regardless
of the form used, these instructions have the same cost, and this
should be reflected by the RTL cost function.

This patch adds RTL tree traversal in the Neon multiply cost function
to match the vec_select used by the lane-referencing forms of the
instructions already mentioned. This traversal prevents the cost of
the vec_select from being added into the cost of the multiply -
meaning that these instructions can now be emitted in the combine
pass as they are no longer deemed prohibitively expensive.
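
As an example (the function name is mine and the expected codegen is
hedged), combine should now fuse the duplicated lane into the
multiply-accumulate, giving mla v0.4s, v1.4s, v2.s[3] rather than a
separate DUP followed by MLA:

#include <arm_neon.h>

int32x4_t
mla_lane (int32x4_t acc, int32x4_t a, int32x4_t b)
{
  return vmlaq_s32 (acc, a, vdupq_laneq_s32 (b, 3));
}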

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64.c (aarch64_rtx_mult_cost): Traverse
RTL tree to prevent vec_select from being added into Neon
multiply cost.


rb14675.patch
Description: rb14675.patch


[PATCH] simplify-rtx: Push sign/zero-extension inside vec_duplicate

2021-07-20 Thread Jonathan Wright via Gcc-patches
Hi,

As a general principle, vec_duplicate should be as close to the root
of an expression as possible. Where unary operations have
vec_duplicate as an argument, these operations should be pushed
inside the vec_duplicate.

This patch modifies unary operation simplification to push
sign/zero-extension of a scalar inside vec_duplicate.

This patch also updates all RTL patterns in aarch64-simd.md to use
the new canonical form.
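
A C-level illustration (the function name is mine; the RTL in the
comment paraphrases the canonicalization rather than quoting the
patch): widening multiply-accumulate by a scalar is the sort of
pattern that now matches the canonical form.

#include <arm_neon.h>

/* At the RTL level this relies on
   (vec_duplicate (zero_extend x)) rather than
   (zero_extend (vec_duplicate x)).  */
uint32x4_t
umlal_by_scalar (uint32x4_t acc, uint16x4_t a, uint16_t b)
{
  return vmlal_n_u16 (acc, a, b);
}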

Regression tested and bootstrapped on aarch64-none-linux-gnu and
x86_64-none-linux-gnu - no issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64-simd.md: Push sign/zero-extension
inside vec_duplicate for all patterns.
* simplify-rtx.c (simplify_context::simplify_unary_operation_1):
Push sign/zero-extension inside vec_duplicate.

rb14677.patch
Description: rb14677.patch


[PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbl[234] intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

* config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vqtbl2_u8): Likewise.
(vqtbl2_p8): Likewise.
(vqtbl2q_s8): Likewise.
(vqtbl2q_u8): Likewise.
(vqtbl2q_p8): Likewise.
(vqtbl3_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vqtbl3_u8): Likewise.
(vqtbl3_p8): Likewise.
(vqtbl3q_s8): Likewise.
(vqtbl3q_u8): Likewise.
(vqtbl3q_p8): Likewise.
(vqtbl4_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_xi one vector at a time.
(vqtbl4_u8): Likewise.
(vqtbl4_p8): Likewise.
(vqtbl4q_s8): Likewise.
(vqtbl4q_u8): Likewise.
(vqtbl4q_p8): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: New test.


rb14639.patch
Description: rb14639.patch


[PATCH 2/8] aarch64: Use memcpy to copy vector tables in vqtbx[234] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vqtbx[234] Neon intrinsics in arm_neon.h. This simplifies the header
file and also improves code generation - superfluous move
instructions were emitted for every register extraction/set in this
additional structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vqtbx[234] intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

* config/aarch64/arm_neon.h (vqtbx2_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vqtbx2_u8): Likewise.
(vqtbx2_p8): Likewise.
(vqtbx2q_s8): Likewise.
(vqtbx2q_u8): Likewise.
(vqtbx2q_p8): Likewise.
(vqtbx3_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vqtbx3_u8): Likewise.
(vqtbx3_p8): Likewise.
(vqtbx3q_s8): Likewise.
(vqtbx3q_u8): Likewise.
(vqtbx3q_p8): Likewise.
(vqtbx4_s8): Use __builtin_memcpy instead of constructing
__builtin_aarch64_simd_xi one vector at a time.
(vqtbx4_u8): Likewise.
(vqtbx4_p8): Likewise.
(vqtbx4q_s8): Likewise.
(vqtbx4q_u8): Likewise.
(vqtbx4q_p8): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: New tests.

rb14640.patch
Description: rb14640.patch


[PATCH 3/8] aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vtbl[34] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-08  Jonathan Wright  

* config/aarch64/arm_neon.h (vtbl3_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vtbl3_u8): Likewise.
(vtbl3_p8): Likewise.
(vtbl4_s8): Likewise.
(vtbl4_u8): Likewise.
(vtbl4_p8): Likewise.

rb14673.patch
Description: rb14673.patch


[PATCH 4/8] aarch64: Use memcpy to copy vector tables in vtbx4 intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vtbx4 Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/arm_neon.h (vtbx4_s8): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vtbx4_u8): Likewise.
(vtbx4_p8): Likewise.


rb14674.patch
Description: rb14674.patch


[PATCH 5/8] aarch64: Use memcpy to copy vector tables in vst4[q] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vst4[q] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vst4q intrinsics.
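
A minimal illustration (the function name is mine and the expected
codegen is hedged): this should now compile to a single ST4 with no
preparatory moves into the opaque structure.

#include <arm_neon.h>

void
store_four (int32_t *p, int32x4x4_t v)
{
  vst4q_s32 (p, v);
}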

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-20  Jonathan Wright  

* config/aarch64/arm_neon.h (vst4_s64): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_xi one vector
at a time.
(vst4_u64): Likewise.
(vst4_f64): Likewise.
(vst4_s8): Likewise.
(vst4_p8): Likewise.
(vst4_s16): Likewise.
(vst4_p16): Likewise.
(vst4_s32): Likewise.
(vst4_u8): Likewise.
(vst4_u16): Likewise.
(vst4_u32): Likewise.
(vst4_f16): Likewise.
(vst4_f32): Likewise.
(vst4_p64): Likewise.
(vst4q_s8): Likewise.
(vst4q_p8): Likewise.
(vst4q_s16): Likewise.
(vst4q_p16): Likewise.
(vst4q_s32): Likewise.
(vst4q_s64): Likewise.
(vst4q_u8): Likewise.
(vst4q_u16): Likewise.
(vst4q_u32): Likewise.
(vst4q_u64): Likewise.
(vst4q_f16): Likewise.
(vst4q_f32): Likewise.
(vst4q_f64): Likewise.
(vst4q_p64): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14687.patch
Description: rb14687.patch


[PATCH 6/8] aarch64: Use memcpy to copy vector tables in vst3[q] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vst3[q] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vst3q intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-21  Jonathan Wright  

* config/aarch64/arm_neon.h (vst3_s64): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_ci one vector
at a time.
(vst3_u64): Likewise.
(vst3_f64): Likewise.
(vst3_s8): Likewise.
(vst3_p8): Likewise.
(vst3_s16): Likewise.
(vst3_p16): Likewise.
(vst3_s32): Likewise.
(vst3_u8): Likewise.
(vst3_u16): Likewise.
(vst3_u32): Likewise.
(vst3_f16): Likewise.
(vst3_f32): Likewise.
(vst3_p64): Likewise.
(vst3q_s8): Likewise.
(vst3q_p8): Likewise.
(vst3q_s16): Likewise.
(vst3q_p16): Likewise.
(vst3q_s32): Likewise.
(vst3q_s64): Likewise.
(vst3q_u8): Likewise.
(vst3q_u16): Likewise.
(vst3q_u32): Likewise.
(vst3q_u64): Likewise.
(vst3q_f16): Likewise.
(vst3q_f32): Likewise.
(vst3q_f64): Likewise.
(vst3q_p64): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14688.patch
Description: rb14688.patch


Re: [PATCH 3/8] aarch64: Use memcpy to copy vector tables in vtbl[34] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
I haven't added test cases here because these intrinsics don't map to
a single instruction (they're legacy from Armv7) and would trip the
"scan-assembler not mov" that we're using for the other tests.

Jonathan

From: Richard Sandiford 
Sent: 23 July 2021 10:29
To: Kyrylo Tkachov 
Cc: Jonathan Wright ; gcc-patches@gcc.gnu.org 

Subject: Re: [PATCH 3/8] aarch64: Use memcpy to copy vector tables in vtbl[34] 
intrinsics

Kyrylo Tkachov  writes:
>> -Original Message-
>> From: Jonathan Wright 
>> Sent: 23 July 2021 09:30
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov ; Richard Sandiford
>> 
>> Subject: [PATCH 3/8] aarch64: Use memcpy to copy vector tables in vtbl[34]
>> intrinsics
>>
>> Hi,
>>
>> This patch uses __builtin_memcpy to copy vector structures instead of
>> building a new opaque structure one vector at a time in each of the
>> vtbl[34] Neon intrinsics in arm_neon.h. This simplifies the header file
>> and also improves code generation - superfluous move instructions
>> were emitted for every register extraction/set in this additional
>> structure.
>>
>> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
>> issues.
>>
>> Ok for master?
>
> Ok.

Please add testcases first though. :-)

Thanks,
Richard


[PATCH 7/8] aarch64: Use memcpy to copy vector tables in vst2[q] intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vst2[q] Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are no longer generated for the vst2q intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-21  Jonathan Wright  

* config/aarch64/arm_neon.h (vst2_s64): Use __builtin_memcpy
instead of constructing __builtin_aarch64_simd_oi one vector
at a time.
(vst2_u64): Likewise.
(vst2_f64): Likewise.
(vst2_s8): Likewise.
(vst2_p8): Likewise.
(vst2_s16): Likewise.
(vst2_p16): Likewise.
(vst2_s32): Likewise.
(vst2_u8): Likewise.
(vst2_u16): Likewise.
(vst2_u32): Likewise.
(vst2_f16): Likewise.
(vst2_f32): Likewise.
(vst2_p64): Likewise.
(vst2q_s8): Likewise.
(vst2q_p8): Likewise.
(vst2q_s16): Likewise.
(vst2q_p16): Likewise.
(vst2q_s32): Likewise.
(vst2q_s64): Likewise.
(vst2q_u8): Likewise.
(vst2q_u16): Likewise.
(vst2q_u32): Likewise.
(vst2q_u64): Likewise.
(vst2q_f16): Likewise.
(vst2q_f32): Likewise.
(vst2q_f64): Likewise.
(vst2q_p64): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14689.patch
Description: rb14689.patch


[PATCH 8/8] aarch64: Use memcpy to copy vector tables in vst1[q]_x4 intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
using a union in each of the vst1[q]_x4 Neon intrinsics in arm_neon.h.

Add new code generation tests to verify that superfluous move
instructions are not generated for the vst1q_x4 intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-21  Jonathan Wright  

* config/aarch64/arm_neon.h (vst1_s8_x4): Use
__builtin_memcpy instead of using a union.
(vst1q_s8_x4): Likewise.
(vst1_s16_x4): Likewise.
(vst1q_s16_x4): Likewise.
(vst1_s32_x4): Likewise.
(vst1q_s32_x4): Likewise.
(vst1_u8_x4): Likewise.
(vst1q_u8_x4): Likewise.
(vst1_u16_x4): Likewise.
(vst1q_u16_x4): Likewise.
(vst1_u32_x4): Likewise.
(vst1q_u32_x4): Likewise.
(vst1_f16_x4): Likewise.
(vst1q_f16_x4): Likewise.
(vst1_f32_x4): Likewise.
(vst1q_f32_x4): Likewise.
(vst1_p8_x4): Likewise.
(vst1q_p8_x4): Likewise.
(vst1_p16_x4): Likewise.
(vst1q_p16_x4): Likewise.
(vst1_s64_x4): Likewise.
(vst1_u64_x4): Likewise.
(vst1_p64_x4): Likewise.
(vst1q_s64_x4): Likewise.
(vst1q_u64_x4): Likewise.
(vst1q_p64_x4): Likewise.
(vst1_f64_x4): Likewise.
(vst1q_f64_x4): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14697.patch
Description: rb14697.patch


Re: [PATCH 4/8] aarch64: Use memcpy to copy vector tables in vtbx4 intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Same explanation as for patch 3/8:

I haven't added test cases here because these intrinsics don't map to
a single instruction (they're legacy from Armv7) and would trip the
"scan-assembler not mov" that we're using for the other tests.

Thanks,
Jonathan

From: Richard Sandiford 
Sent: 23 July 2021 10:31
To: Kyrylo Tkachov 
Cc: Jonathan Wright ; gcc-patches@gcc.gnu.org 

Subject: Re: [PATCH 4/8] aarch64: Use memcpy to copy vector tables in vtbx4 
intrinsics

Kyrylo Tkachov  writes:
>> -Original Message-
>> From: Jonathan Wright 
>> Sent: 23 July 2021 10:15
>> To: gcc-patches@gcc.gnu.org
>> Cc: Kyrylo Tkachov ; Richard Sandiford
>> 
>> Subject: [PATCH 4/8] aarch64: Use memcpy to copy vector tables in vtbx4
>> intrinsics
>>
>> Hi,
>>
>> This patch uses __builtin_memcpy to copy vector structures instead of
>> building a new opaque structure one vector at a time in each of the
>> vtbx4 Neon intrinsics in arm_neon.h. This simplifies the header file
>> and also improves code generation - superfluous move instructions
>> were emitted for every register extraction/set in this additional
>> structure.
>>
>> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
>> issues.
>>
>> Ok for master?
>
> Ok.

Here too I think we want some testcases…

Thanks,
Richard


[PATCH] aarch64: Use memcpy to copy vector tables in vst1[q]_x3 intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vst1[q]_x3 Neon intrinsics in arm_neon.h. This simplifies the header file
and also improves code generation - superfluous move instructions
were emitted for every register extraction/set in this additional
structure.

Add new code generation tests to verify that superfluous move
instructions are not generated for the vst1q_x3 intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-23  Jonathan Wright  

* config/aarch64/arm_neon.h (vst1_s64_x3): Use
__builtin_memcpy instead of constructing
__builtin_aarch64_simd_ci one vector at a time.
(vst1_u64_x3): Likewise.
(vst1_f64_x3): Likewise.
(vst1_s8_x3): Likewise.
(vst1_p8_x3): Likewise.
(vst1_s16_x3): Likewise.
(vst1_p16_x3): Likewise.
(vst1_s32_x3): Likewise.
(vst1_u8_x3): Likewise.
(vst1_u16_x3): Likewise.
(vst1_u32_x3): Likewise.
(vst1_f16_x3): Likewise.
(vst1_f32_x3): Likewise.
(vst1_p64_x3): Likewise.
(vst1q_s8_x3): Likewise.
(vst1q_p8_x3): Likewise.
(vst1q_s16_x3): Likewise.
(vst1q_p16_x3): Likewise.
(vst1q_s32_x3): Likewise.
(vst1q_s64_x3): Likewise.
(vst1q_u8_x3): Likewise.
(vst1q_u16_x3): Likewise.
(vst1q_u32_x3): Likewise.
(vst1q_u64_x3): Likewise.
(vst1q_f16_x3): Likewise.
(vst1q_f32_x3): Likewise.
(vst1q_f64_x3): Likewise.
(vst1q_p64_x3): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14700.patch
Description: rb14700.patch


[PATCH] aarch64: Use memcpy to copy vector tables in vst1[q]_x2 intrinsics

2021-07-23 Thread Jonathan Wright via Gcc-patches
Hi,

This patch uses __builtin_memcpy to copy vector structures instead of
building a new opaque structure one vector at a time in each of the
vst1[q]_x2 Neon intrinsics in arm_neon.h. This simplifies the header
file and also improves code generation - superfluous move
instructions were emitted for every register extraction/set in this
additional structure.

Add new code generation tests to verify that superfluous move
instructions are not generated for the vst1q_x2 intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-23  Jonathan Wright  

* config/aarch64/arm_neon.h (vst1_s64_x2): Use
__builtin_memcpy instead of constructing
__builtin_aarch64_simd_oi one vector at a time.
(vst1_u64_x2): Likewise.
(vst1_f64_x2): Likewise.
(vst1_s8_x2): Likewise.
(vst1_p8_x2): Likewise.
(vst1_s16_x2): Likewise.
(vst1_p16_x2): Likewise.
(vst1_s32_x2): Likewise.
(vst1_u8_x2): Likewise.
(vst1_u16_x2): Likewise.
(vst1_u32_x2): Likewise.
(vst1_f16_x2): Likewise.
(vst1_f32_x2): Likewise.
(vst1_p64_x2): Likewise.
(vst1q_s8_x2): Likewise.
(vst1q_p8_x2): Likewise.
(vst1q_s16_x2): Likewise.
(vst1q_p16_x2): Likewise.
(vst1q_s32_x2): Likewise.
(vst1q_s64_x2): Likewise.
(vst1q_u8_x2): Likewise.
(vst1q_u16_x2): Likewise.
(vst1q_u32_x2): Likewise.
(vst1q_u64_x2): Likewise.
(vst1q_f16_x2): Likewise.
(vst1q_f32_x2): Likewise.
(vst1q_f64_x2): Likewise.
(vst1q_p64_x2): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.


rb14701.patch
Description: rb14701.patch


Re: [PATCH V2] simplify-rtx: Push sign/zero-extension inside vec_duplicate

2021-07-26 Thread Jonathan Wright via Gcc-patches
Hi,

This updated patch fixes the two-operators-per-row style issue in the 
aarch64-simd.md RTL patterns as well as integrating the simplify-rtx.c
change as suggested.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64-simd.md: Push sign/zero-extension
inside vec_duplicate for all patterns.
* simplify-rtx.c (simplify_context::simplify_unary_operation_1):
Push sign/zero-extension inside vec_duplicate.



From: Richard Sandiford 
Sent: 22 July 2021 18:36
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Kyrylo Tkachov 

Subject: Re: [PATCH] simplify-rtx: Push sign/zero-extension inside 
vec_duplicate 
 
Jonathan Wright  writes:
> Hi,
>
> As a general principle, vec_duplicate should be as close to the root
> of an expression as possible. Where unary operations have
> vec_duplicate as an argument, these operations should be pushed
> inside the vec_duplicate.
>
> This patch modifies unary operation simplification to push
> sign/zero-extension of a scalar inside vec_duplicate.
>
> This patch also updates all RTL patterns in aarch64-simd.md to use
> the new canonical form.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu and
> x86_64-none-linux-gnu - no issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-19  Jonathan Wright  
>
> * config/aarch64/aarch64-simd.md: Push sign/zero-extension
> inside vec_duplicate for all patterns.
> * simplify-rtx.c (simplify_context::simplify_unary_operation_1):
> Push sign/zero-extension inside vec_duplicate.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 74890989cb3045798bf8d0241467eaaf72238297..99a95a54248041906b9a0ad742d3a0dca9733b35
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2092,14 +2092,14 @@
>  
>  (define_insn "aarch64_mlal_hi_n_insn"
>    [(set (match_operand: 0 "register_operand" "=w")
> -    (plus:
> -  (mult:
> -  (ANY_EXTEND: (vec_select:
> - (match_operand:VQ_HSI 2 "register_operand" "w")
> - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> -  (ANY_EXTEND: (vec_duplicate:
> -    (match_operand: 4 "register_operand" ""
> -  (match_operand: 1 "register_operand" "0")))]
> + (plus:
> +   (mult:
> +   (ANY_EXTEND: (vec_select:
> +  (match_operand:VQ_HSI 2 "register_operand" "w")
> +  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> +  (vec_duplicate: (ANY_EXTEND:
> +  (match_operand: 4 "register_operand" ""
> +   (match_operand: 1 "register_operand" "0")))]

Sorry to nitpick, since this is pre-existing, but I think the pattern
would be easier to read with one operation per line.  I.e.:

    (plus:
  (mult:
    (ANY_EXTEND:
  (vec_select:
    (match_operand:VQ_HSI 2 "register_operand" "w")
    (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
    (vec_duplicate:
  (ANY_EXTEND:
    (match_operand: 4 "register_operand" ""
  (match_operand: 1 "register_operand" "0")))]

Same for the other patterns with similar doubling of operators.
(It looks like you've fixed other indentation problems though, thanks.)

> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 
> 2d169d3f9f70c85d396adaed124b6c52aca98f07..f885816412f7576d2535f827562d2b425a6a553b
>  100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -903,6 +903,18 @@ simplify_context::simplify_unary_operation_1 (rtx_code 
> code, machine_mode mode,
>    rtx temp, elt, base, step;
>    scalar_int_mode inner, int_mode, op_mode, op0_mode;
>  
> +  /* Extending a VEC_DUPLICATE of a scalar should be canonicalized to a
> + VEC_DUPLICATE of an extended scalar. This is outside of the main switch
> + as we may wish to push all unary operations inside VEC_DUPLICATE. */
> +  if ((code == SIGN_EXTEND || code == ZERO_EXTEND)
> +  && GET_CODE (op) == VEC_DUPLICATE
> +  && GET_MODE_NUNITS (GET_MODE (XEXP (op, 0))).to_constant () == 1)
> +    {
> +  rtx x = simplify_gen_unary (code, GET_MODE_INNER (mode),
> +   XEXP (op, 0), GET_MODE (XEXP (op, 0)));
> +  return gen_vec_duplicate (mode, x);
> +    }
> +
>    switch (code)
>  {
>  case NOT:

This is really an extension of the existing code:

  if (VECTOR_MODE_P (mode)
  && vec_duplicate_p (op, &elt)
  && code != VEC_DUPLICATE)
    {
  /* Try applying the operator to ELT and see if that simplifies.
 We can duplicate the result if so.

 The reason we don't use simplify_gen_unary is that it isn't
 necessarily a win to convert things like:


Re: [PATCH V2] aarch64: Don't include vec_select in SIMD multiply cost

2021-07-28 Thread Jonathan Wright via Gcc-patches
Hi,

V2 of the patch addresses the initial review comments, factors out
common code (as we discussed off-list) and adds a set of unit tests
to verify the code generation benefit.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt):
Define.
(aarch64_rtx_mult_cost): Traverse RTL tree to prevent
vec_select cost from being added into Neon multiply cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vmul_element_cost.c: New test.



From: Richard Sandiford 
Sent: 22 July 2021 18:16
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Kyrylo Tkachov 

Subject: Re: [PATCH] aarch64: Don't include vec_select in SIMD multiply cost 
 
Jonathan Wright  writes:
> Hi,
>
> The Neon multiply/multiply-accumulate/multiply-subtract instructions
> can take various forms - multiplying full vector registers of values
> or multiplying one vector by a single element of another. Regardless
> of the form used, these instructions have the same cost, and this
> should be reflected by the RTL cost function.
>
> This patch adds RTL tree traversal in the Neon multiply cost function
> to match the vec_select used by the lane-referencing forms of the
> instructions already mentioned. This traversal prevents the cost of
> the vec_select from being added into the cost of the multiply -
> meaning that these instructions can now be emitted in the combine
> pass as they are no longer deemed prohibitively expensive.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-19  Jonathan Wright  
>
> * config/aarch64/aarch64.c (aarch64_rtx_mult_cost): Traverse
> RTL tree to prevents vec_select from being added into Neon
> multiply cost.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> f5b25a7f7041645921e6ad85714efda73b993492..b368303b0e699229266e6d008e28179c496bf8cd
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -11985,6 +11985,21 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, 
> int outer, bool speed)
>    op0 = XEXP (op0, 0);
>  else if (GET_CODE (op1) == VEC_DUPLICATE)
>    op1 = XEXP (op1, 0);
> +   /* The same argument applies to the VEC_SELECT when using the lane-
> +  referencing forms of the MUL/MLA/MLS instructions. Without the
> +  traversal here, the combine pass deems these patterns too
> +  expensive and subsequently does not emit the lane-referencing
> +  forms of the instructions. In addition, canonical form is for the
> +  VEC_SELECT to be the second argument of the multiply - thus only
> +  op1 is traversed.  */
> +   if (GET_CODE (op1) == VEC_SELECT
> +   && GET_MODE_NUNITS (GET_MODE (op1)).to_constant () == 1)
> + op1 = XEXP (op1, 0);
> +   else if ((GET_CODE (op1) == ZERO_EXTEND
> + || GET_CODE (op1) == SIGN_EXTEND)
> +    && GET_CODE (XEXP (op1, 0)) == VEC_SELECT
> +    && GET_MODE_NUNITS (GET_MODE (op1)).to_constant () == 1)
> + op1 = XEXP (XEXP (op1, 0), 0);

I think this logically belongs in the “GET_CODE (op1) == VEC_DUPLICATE”
if block, since the condition is never true otherwise.  We can probably
skip the GET_MODE_NUNITS tests, but if you'd prefer to keep them, I think
it would be better to add them to the existing VEC_DUPLICATE tests rather
than restrict them to the VEC_SELECT ones.

Also, although this is in Advanced SIMD-specific code, I think it'd be
better to use:

  is_a (GET_MODE (op1))

instead of:

  GET_MODE_NUNITS (GET_MODE (op1)).to_constant () == 1

Do you have a testcase?

Thanks,
Richard

rb14675.patch
Description: rb14675.patch


[PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost

2021-07-28 Thread Jonathan Wright via Gcc-patches
Hi,

The Neon multiply/multiply-accumulate/multiply-subtract instructions
can select the top or bottom half of the operand registers. This
selection does not change the cost of the underlying instruction and
this should be reflected by the RTL cost function.

This patch adds RTL tree traversal in the Neon multiply cost function
to match vec_select high-half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the multiply - meaning that these instructions can now be emitted in
the combine pass as they are no longer deemed prohibitively
expensive.
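
For example (the function name is mine and the expected codegen is
hedged), with the high-half vec_select no longer costed, combine
should emit smull2 v0.4s, v1.8h, v2.8h here instead of an explicit
extract followed by smull:

#include <arm_neon.h>

int32x4_t
mull_high (int16x8_t a, int16x8_t b)
{
  return vmull_s16 (vget_high_s16 (a), vget_high_s16 (b));
}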

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64.c (aarch64_vec_select_high_operand_p):
Define.
(aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
vec_select high-half from being added into Neon multiply
cost.
* rtlanal.c (vec_series_highpart_p): Define.
* rtlanal.h (vec_series_highpart_p): Declare.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vmul_high_cost.c: New test.


rb14704.patch
Description: rb14704.patch


[PATCH] aarch64: Don't include vec_select high-half in SIMD add cost

2021-07-29 Thread Jonathan Wright via Gcc-patches
Hi,

The Neon add-long/add-widen instructions can select the top or bottom
half of the operand registers. This selection does not change the
cost of the underlying instruction and this should be reflected by
the RTL cost function.

This patch adds RTL tree traversal in the Neon add cost function to
match vec_select high-half of its operands. This traversal prevents
the cost of the vec_select from being added into the cost of the
add - meaning that these instructions can now be emitted in the
combine pass as they are no longer deemed prohibitively expensive.
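
A short example, mirroring the multiply-cost case above (the function
name is mine and the expected codegen is hedged): this should now
combine into a single saddl2 rather than an extract plus saddl.

#include <arm_neon.h>

int32x4_t
addl_high (int16x8_t a, int16x8_t b)
{
  return vaddl_s16 (vget_high_s16 (a), vget_high_s16 (b));
}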

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon add cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vaddX_high_cost.c: New test.


rb14710.patch
Description: rb14710.patch


[PATCH] aarch64: Don't include vec_select high-half in SIMD subtract cost

2021-07-29 Thread Jonathan Wright via Gcc-patches
Hi,

The Neon subtract-long/subract-widen instructions can select the top
or bottom half of the operand registers. This selection does not
change the cost of the underlying instruction and this should be
reflected by the RTL cost function.

This patch adds RTL tree traversal in the Neon subtract cost function
to match vec_select high-half of its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the subtract - meaning that these instructions can now be emitted in
the combine pass as they are no longer deemed prohibitively
expensive.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-28  Jonathan Wright  

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon subtract
cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vsubX_high_cost.c: New test.


rb14711.patch
Description: rb14711.patch


[PATCH 1/20] aarch64: Use RTL builtin for vmull[_high]_p8 intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vmull[_high]_p8 Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling and
optimization.

Regression tested and bootstrapped on aarch64-none-linux-gnu and
aarch64_be-none-elf - no issues.

Ok for master?

Thanks,
Jonathan



gcc/ChangeLog:

2021-02-05  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add pmull[2]
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_pmullv8qi): Define.
(aarch64_pmull_hiv16qi_insn): Define.
(aarch64_pmull_hiv16qi): Define.
* config/aarch64/arm_neon.h (vmull_high_p8): Use RTL builtin
instead of inline asm.
(vmull_p8): Likewise.


rb14128.patch
Description: rb14128.patch


[PATCH 2/20] aarch64: Use RTL builtin for vq[r]dmulh[q]_n intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vq[r]dmulh[q]_n Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better scheduling
and optimization.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add sq[r]dmulh_n
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_sqdmulh_n):
Define.
* config/aarch64/arm_neon.h (vqdmulh_n_s16): Use RTL builtin
instead of inline asm.
(vqdmulh_n_s32): Likewise.
(vqdmulhq_n_s16): Likewise.
(vqdmulhq_n_s32): Likewise.
(vqrdmulh_n_s16): Likewise.
(vqrdmulh_n_s32): Likewise.
(vqrdmulhq_n_s16): Likewise.
(vqrdmulhq_n_s32): Likewise.


rb14130.patch
Description: rb14130.patch


[PATCH 3/20] aarch64: Use RTL builtins for vpaddq intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vpaddq Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use VDQ_I iterator
for aarch64_addp builtin macro generator.
* config/aarch64/aarch64-simd.md: Use VDQ_I iterator in
aarch64_addp RTL pattern.
* config/aarch64/arm_neon.h (vpaddq_s8): Use RTL builtin
instead of inline asm.
(vpaddq_s16): Likewise.
(vpaddq_s32): Likewise.
(vpaddq_s64): Likewise.
(vpaddq_u8): Likewise.
(vpaddq_u16): Likewise.
(vpaddq_u32): Likewise.
(vpaddq_u64): Likewise.


rb14136.patch
Description: rb14136.patch


[PATCH 4/20] aarch64: Use RTL builtins for [su]paddl[q] intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the [su]paddl[q] Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better
scheduling and optimization.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-08  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add [su]addlp
builtin generator macros.
* config/aarch64/aarch64-simd.md (aarch64_addlp):
Define.
* config/aarch64/arm_neon.h (vpaddl_s8): Use RTL builtin
instead of inline asm.
(vpaddl_s16): Likewise.
(vpaddl_s32): Likewise.
(vpaddl_u8): Likewise.
(vpaddl_u16): Likewise.
(vpaddl_u32): Likewise.
(vpaddlq_s8): Likewise.
(vpaddlq_s16): Likewise.
(vpaddlq_s32): Likewise.
(vpaddlq_u8): Likewise.
(vpaddlq_u16): Likewise.
(vpaddlq_u32): Likewise.
* config/aarch64/iterators.md: Define [SU]ADDLP unspecs with
appropriate attributes.


rb14137.patch
Description: rb14137.patch


[PATCH 5/20] aarch64: Use RTL builtins for vpadal_[su]32 intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vpadal_[su]32 Neon intrinsics to use
RTL builtins rather than inline assembly code, allowing for better
scheduling and optimization.
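
A minimal usage sketch of the two affected intrinsics (wrapper names and the
instructions in the comments are illustrative only):

  #include <arm_neon.h>

  /* Pairwise widening add-accumulate: the two 32-bit lanes of B are summed
     into a 64-bit value and accumulated into A.  */
  int64x1_t
  padal_s32 (int64x1_t a, int32x2_t b)
  {
    return vpadal_s32 (a, b);   /* expect: sadalp v0.1d, v1.2s  */
  }

  uint64x1_t
  padal_u32 (uint64x1_t a, uint32x2_t b)
  {
    return vpadal_u32 (a, b);   /* expect: uadalp v0.1d, v1.2s  */
  }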

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-09  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use VDQV_L
iterator to generate [su]adalp RTL builtins.
* config/aarch64/aarch64-simd.md: Use VDQV_L iterator in
[su]adalp RTL pattern.
* config/aarch64/arm_neon.h (vpadal_s32): Use RTL builtin
instead of inline asm.
(vpadal_u32): Likewise.


rb14133.patch
Description: rb14133.patch


[PATCH 6/20] aarch64: Use RTL builtins for polynomial vsli[q]_n intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vsli[q]_n_p* Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
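
A minimal usage sketch of two of the affected intrinsics (wrapper names, the
shift counts and the instructions in the comments are illustrative only):

  #include <arm_neon.h>

  /* Shift left and insert on polynomial types: each element of B is shifted
     left by the immediate and inserted into A, preserving A's low bits.  */
  poly8x8_t
  sli_p8 (poly8x8_t a, poly8x8_t b)
  {
    return vsli_n_p8 (a, b, 3);     /* expect: sli v0.8b, v1.8b, #3  */
  }

  poly16x8_t
  sliq_p16 (poly16x8_t a, poly16x8_t b)
  {
    return vsliq_n_p16 (a, b, 5);   /* expect: sli v0.8h, v1.8h, #5  */
  }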

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-10  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Use VALLP mode
iterator for polynomial ssli_n builtin generator macro.
* config/aarch64/arm_neon.h (vsli_n_p8): Use RTL builtin
instead of inline asm.
(vsli_n_p16): Likewise.
(vsliq_n_p8): Likewise.
(vsliq_n_p16): Likewise.
* config/aarch64/iterators.md: Define VALLP mode iterator.


rb14146.patch
Description: rb14146.patch


[PATCH 7/20] aarch64: Use RTL builtins for polynomial vsri[q]_n intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the vsri[q]_n_p* Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
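
A minimal usage sketch of two of the affected intrinsics (wrapper names, the
shift counts and the instructions in the comments are illustrative only):

  #include <arm_neon.h>

  /* Shift right and insert on polynomial types: each element of B is shifted
     right by the immediate and inserted into A, preserving A's high bits.  */
  poly8x8_t
  sri_p8 (poly8x8_t a, poly8x8_t b)
  {
    return vsri_n_p8 (a, b, 3);     /* expect: sri v0.8b, v1.8b, #3  */
  }

  poly16x8_t
  sriq_p16 (poly16x8_t a, poly16x8_t b)
  {
    return vsriq_n_p16 (a, b, 5);   /* expect: sri v0.8h, v1.8h, #5  */
  }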

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-10  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add polynomial
ssri_n builtin generator macro.
* config/aarch64/arm_neon.h (vsri_n_p8): Use RTL builtin
instead of inline asm.
(vsri_n_p16): Likewise.
(vsri_n_p64): Likewise.
(vsriq_n_p8): Likewise.
(vsriq_n_p16): Likewise.
(vsriq_n_p64): Likewise.


rb14147.patch
Description: rb14147.patch


[PATCH 8/20] aarch64: Use RTL builtins for v[q]tbl intrinsics

2021-04-28 Thread Jonathan Wright via Gcc-patches
Hi,

As subject, this patch rewrites the v[q]tbl Neon intrinsics to use RTL
builtins rather than inline assembly code, allowing for better scheduling
and optimization.
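
A minimal usage sketch of two of the affected intrinsics (wrapper names and
the instruction in the comment are illustrative only):

  #include <arm_neon.h>

  /* Table lookups: vqtbl1 indexes a single 128-bit table directly, returning
     zero for out-of-range indices; the legacy vtbl2 variant takes a pair of
     64-bit vectors as its table.  */
  uint8x8_t
  qtbl1 (uint8x16_t table, uint8x8_t idx)
  {
    return vqtbl1_u8 (table, idx);   /* expect: tbl v0.8b, { v0.16b }, v1.8b  */
  }

  uint8x8_t
  tbl2 (uint8x8x2_t table, uint8x8_t idx)
  {
    return vtbl2_u8 (table, idx);
  }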

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-02-12  Jonathan Wright  

* config/aarch64/aarch64-simd-builtins.def: Add tbl1 builtin
generator macros.
* config/aarch64/arm_neon.h (vqtbl1_p8): Use RTL builtin
instead of inline asm.
(vqtbl1_s8): Likewise.
(vqtbl1_u8): Likewise.
(vqtbl1q_p8): Likewise.
(vqtbl1q_s8): Likewise.
(vqtbl1q_u8): Likewise.
(vtbl1_s8): Likewise.
(vtbl1_u8): Likewise.
(vtbl1_p8): Likewise.
(vtbl2_s8): Likewise.
(vtbl2_u8): Likewise.
(vtbl2_p8): Likewise.

rb14154.patch
Description: rb14154.patch

