[gcc r15-3669] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:a92f54f580c37732a5de01e47aed56882231f196

commit r15-3669-ga92f54f580c37732a5de01e47aed56882231f196
Author: Pengxuan Zheng 
Date:   Tue Sep 10 17:59:46 2024 -0700

aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

SVE's INDEX instruction can be used to populate a vector with values starting
at "base" and incremented by "step" for each subsequent element. We can take
advantage of it to generate vector constants when TARGET_SVE is available and
the base and step values are within [-16, 15].

For example, with the following function:

typedef int v4si __attribute__ ((vector_size (16)));
v4si
f_v4si (void)
{
  return (v4si){ 0, 1, 2, 3 };
}

GCC currently generates:

f_v4si:
adrp    x0, .LC4
ldr q0, [x0, #:lo12:.LC4]
ret

.LC4:
.word   0
.word   1
.word   2
.word   3

With this patch, we generate an INDEX instruction instead if TARGET_SVE is
available.

f_v4si:
index   z0.s, #0, #1
ret
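
Any constant vector whose lanes form an arithmetic series with base and step
in [-16, 15] qualifies, not just the 0/1 case above.  An illustrative sketch
(hypothetical function, reusing the v4si typedef above):

/* Base 3, step 2, both within [-16, 15], so with TARGET_SVE this is
   expected to become "index z0.s, #3, #2" instead of a literal-pool
   load.  */
v4si
g_v4si (void)
{
  return (v4si){ 3, 5, 7, 9 };
}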

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE
is available.
(aarch64_output_simd_mov_immediate): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
SVE's INDEX instruction.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_3.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64.cc  | 13 ++-
 .../gcc.target/aarch64/sve/acle/general/dupq_1.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_2.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_3.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_4.c   |  3 +-
 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c  | 99 ++
 6 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6ccf08d1cc0a..92763d403c75 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22987,7 +22987,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
   && CONST_VECTOR_DUPLICATE_P (op))
 n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
+  else if (which == AARCH64_CHECK_MOV
+  && TARGET_SVE
   && const_vec_series_p (op, &base, &step))
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
@@ -25245,6 +25246,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 
   if (which == AARCH64_CHECK_MOV)
 {
+  if (info.insn == simd_immediate_info::INDEX)
+   {
+ gcc_assert (TARGET_SVE);
+ snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+   element_char, INTVAL (info.u.index.base),
+   INTVAL (info.u.index.step));
+ return templ;
+   }
+
   mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
   shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536e..0940bedd0ddb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a2753..218a66013375 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */

[gcc r15-1801] aarch64: Add vector popcount besides QImode [PR113859]

2024-07-02 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:895bbc08d38c2aca3cbbab273a247021fea73930

commit r15-1801-g895bbc08d38c2aca3cbbab273a247021fea73930
Author: Pengxuan Zheng 
Date:   Wed Jun 12 18:23:13 2024 -0700

aarch64: Add vector popcount besides QImode [PR113859]

This patch improves GCC’s vectorization of __builtin_popcount for the aarch64
target by adding popcount patterns for vector modes besides QImode, i.e.,
HImode, SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt     v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt     v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.8b, #0
  movi    v1.8b, #1
  cnt     v3.8b, v2.8b
  udot    v0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt     v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt     v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s
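
For reference, a loop of the following shape is the kind of input that should
now pick up these expansions via the vectorizer (an illustrative sketch, not a
testsuite file; assumes -O3 or -O2 with vectorization enabled):

/* Per-element popcount; expected to vectorize for V4SI to the CNT/UADDLP
   sequence above, or to the UDOT form with TARGET_DOTPROD.  */
void
popcount_v4si (unsigned int *__restrict a, unsigned int *__restrict b)
{
  for (int i = 0; i < 4; i++)
    b[i] = __builtin_popcount (a[i]);
}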

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename
to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-simd.md | 41 ++-
 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c | 58 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c  | 69 ++
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..fd0c5e612b5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_addlp"
+(define_expand "@aarch64_addlp"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+/* Generate a byte popcount.  */
+machine_mode mode =  == 64 ? V8QImode : V16QImode;
+rtx tmp = gen_reg_rtx (mode);
+auto icode = optab_handler (popcount_optab, mode);
+emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+if (TARGET_DOTPROD
+   && (mode == SImode || mode == DImode))
+  {
+   /* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+  1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+   rtx ones = force_reg (mode, CONST1_RTX (mode));
+   auto icode = optab_handler (udot_prod_optab, mode);
+   mode =  == 64 ? V2SImode : V4SImode;
+   rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+   rtx zeros = force_reg (mode, CONST0_RTX (mode));
+   emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+   tmp = dest;
+  }
+
+/* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+   the element size and halves the number of elements.  */
+while (mode != mode)
+  {
+   auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+   mode = insn_data[icode].operand[0].mode;
+   rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+   emit_insn (GEN_FCN (icode) (dest, tmp));
+   tmp = dest;
+  }
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** movi v([0-9]+).16b, 0x1
+** movi v([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udot v\2.4s, v\4.16b, v\

[gcc r15-2659] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-08-01 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:e4b8db26de35239bd621aad9c0361f25d957122b

commit r15-2659-ge4b8db26de35239bd621aad9c0361f25d957122b
Author: Pengxuan Zheng 
Date:   Wed Jul 31 17:00:01 2024 -0700

aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.
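
An illustrative sketch of code that benefits (hypothetical function, not a
testsuite file):

typedef unsigned long long v2di __attribute__ ((vector_size (16)));

/* With TARGET_SVE, each V2DI popcount is expected to expand to the
   predicated CNT above instead of the CNT plus three UADDLPs.  */
v2di
popcount_v2di (v2di x)
{
  return (v2di){ __builtin_popcountll (x[0]), __builtin_popcountll (x[1]) };
}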

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
iterator SVE_VDQ_I.
* config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
(VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-simd.md|  9 +++
 gcc/config/aarch64/aarch64-sve.md | 13 ++--
 gcc/config/aarch64/iterators.md   |  5 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
 4 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 459e11b09a19..816f499e9634 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@
(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+						    p,
+						    operands[1]));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
 rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index c3ed5075c4ed..a5cd42be9d5c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3104,16 +3104,16 @@
 
 ;; Integer unary arithmetic predicated with a PTRUE.
 (define_insn "@aarch64_pred_"
-  [(set (match_operand:SVE_I 0 "register_operand")
-   (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+   (unspec:SVE_VDQ_I
	  [(match_operand:<VPRED> 1 "register_operand")
-  (SVE_INT_UNARY:SVE_I
-(match_operand:SVE_I 2 "register_operand"))]
+  (SVE_INT_UNARY:SVE_VDQ_I
+(match_operand:SVE_VDQ_I 2 "register_operand"))]
  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
+     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
   }
 )
 
@@ -3168,6 +3168,7 @@
   }
 )
 
+
 ;; -
 ;;  [INT] General unary arithmetic corresponding to unspecs
 ;; -
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 95fe8f070f4c..aaa4afefe2ce 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -559,6 +559,9 @@
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
VNx4SI VNx2SI
@@ -2278,6 +2281,8 @@
 (VNx32BF "VNx8BI")
 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+(V8QI "VNx8BI") (V16QI "VNx16BI")
+(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644
index ..8e349efe3907
--- /dev/null
+

[gcc r15-949] MAINTAINERS: Add myself to Write After Approval and DCO

2024-05-31 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7

commit r15-949-g96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7
Author: Pengxuan Zheng 
Date:   Fri May 31 11:07:05 2024 -0700

MAINTAINERS: Add myself to Write After Approval and DCO

ChangeLog:

* MAINTAINERS: Add myself to Write After Approval and DCO.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e2870eef2ef..6444e6ea2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,7 @@ Dennis Zhang

 Yufeng Zhang   
 Qing Zhao  
 Shujing Zhao   
+Pengxuan Zheng 
 Jon Ziegler
 Roman Zippel   
 Josef Zlomek   
@@ -789,3 +790,4 @@ Martin Uecker   

 Jonathan Wakely
 Alexander Westbrooks   
 Chung-Ju Wu
+Pengxuan Zheng 


[gcc r15-950] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

2024-05-31 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:7fb62627cfb3e03811bb667fa7159bbc7f972f00

commit r15-950-g7fb62627cfb3e03811bb667fa7159bbc7f972f00
Author: Pengxuan Zheng 
Date:   Wed May 22 17:38:43 2024 -0700

aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

vget_low_2.c is a test case for little-endian, but we missed the
-mlittle-endian flag in r15-697-ga2e4fe5a53cf75.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
index 44414e1c043..93e9e664ee9 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
 
 #include <arm_neon.h>


[gcc r15-1079] aarch64: Add vector floating point extend pattern [PR113880, PR113869]

2024-06-06 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:230d62a2cdd16c1ec8fe87998ec01081503f010d

commit r15-1079-g230d62a2cdd16c1ec8fe87998ec01081503f010d
Author: Pengxuan Zheng 
Date:   Thu May 30 17:53:23 2024 -0700

aarch64: Add vector floating point extend pattern [PR113880, PR113869]

This patch adds a vector floating point extend pattern for V2SF->V2DF and
V4HF->V4SF conversions by renaming the existing aarch64_float_extend_lo_<Vwide>
pattern to the standard optab one, i.e., extend<mode><Vwide>2. This allows the
vectorizer to vectorize certain floating point widening operations for the
aarch64 target.
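
An illustrative sketch of a widening loop that the vectorizer should now
handle through the standard optab (hypothetical function, not the testsuite
file in the diff below):

/* V2SF->V2DF widening; expected to use FCVTL once extend<mode><Vwide>2
   is exposed.  */
void
widen (float *__restrict a, double *__restrict b, int n)
{
  for (int i = 0; i < n; i++)
    b[i] = a[i];
}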

PR target/113880
PR target/113869

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md (aarch64_float_extend_lo_<Vwide>):
Rename to...
(extend<mode><Vwide>2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/extend-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc|  9 +
 gcc/config/aarch64/aarch64-simd.md|  2 +-
 gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index f8eeccb554d..25189888d17 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
 BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)
 BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 
+/* The builtins below should be expanded through the standard optabs
+   CODE_FOR_extend<mode><Vwide>2.  */
+#undef VAR1
+#define VAR1(F,T,N,M) \
+  constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2;
+
+VAR1 (float_extend_lo_, extend, v2sf, v2df)
+VAR1 (float_extend_lo_, extend, v4hf, v4sf)
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..c5e2c9f00d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3132,7 +3132,7 @@
 DONE;
   }
 )
-(define_insn "aarch64_float_extend_lo_"
+(define_insn "extend2"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
	(float_extend:<VWIDE>
  (match_operand:VDF 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
new file mode 100644
index 000..f6241d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */
+void
+f (float *__restrict a, double *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */
+void
+f1 (_Float16 *__restrict a, float *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}


[gcc r15-1182] aarch64: Add vector floating point trunc pattern

2024-06-11 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:e7cd8ea1fa3e48404954bb7c06e9bcd603f132dd

commit r15-1182-ge7cd8ea1fa3e48404954bb7c06e9bcd603f132dd
Author: Pengxuan Zheng 
Date:   Fri Jun 7 19:52:00 2024 -0700

aarch64: Add vector floating point trunc pattern

This patch is a follow-up of r15-1079-g230d62a2cdd16c to add a vector floating
point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions by renaming the
existing aarch64_float_truncate_lo_<mode> pattern to the standard optab one,
i.e., trunc<Vwide><mode>2. This allows the vectorizer to vectorize certain
floating point narrowing operations for the aarch64 target.
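
An illustrative sketch of a narrowing loop that the vectorizer should now
handle through the standard optab (hypothetical function, not the testsuite
file in the diff below):

/* V2DF->V2SF narrowing; expected to use FCVTN once trunc<Vwide><mode>2
   is exposed.  */
void
narrow (double *__restrict a, float *__restrict b, int n)
{
  for (int i = 0; i < n; i++)
    b[i] = a[i];
}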

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_truncate_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_<mode>):
Rename to...
(trunc<Vwide><mode>2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/trunc-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc   |  7 +++
 gcc/config/aarch64/aarch64-simd.md   |  6 +++---
 gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21 +
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 25189888d17d..d589e59defc2 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 VAR1 (float_extend_lo_, extend, v2sf, v2df)
 VAR1 (float_extend_lo_, extend, v4hf, v4sf)
 
+/* __builtin_aarch64_float_truncate_lo_<mode> should be expanded through the
+   standard optabs CODE_FOR_trunc<Vwide><mode>2.  */
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf
+= CODE_FOR_truncv4sfv4hf2;
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf
+= CODE_FOR_truncv2dfv2sf2;
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5e2c9f00d02..f644bd1731e5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3197,7 +3197,7 @@
 }
 )
 
-(define_insn "aarch64_float_truncate_lo_"
+(define_insn "trunc2"
   [(set (match_operand:VDF 0 "register_operand" "=w")
   (float_truncate:VDF
	    (match_operand:<VWIDE> 1 "register_operand" "w")))]
@@ -3256,7 +3256,7 @@
 int lo = BYTES_BIG_ENDIAN ? 2 : 1;
 int hi = BYTES_BIG_ENDIAN ? 1 : 2;
 
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
+emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo]));
 emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0],
   tmp, operands[hi]));
 DONE;
@@ -3272,7 +3272,7 @@
   {
 rtx tmp = gen_reg_rtx (V2SFmode);
 emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2]));
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
+emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp));
 DONE;
   }
 )
diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
new file mode 100644
index ..05e8af7912de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d} 1 } } */
+void
+f (double *__restrict a, float *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */
+void
+f1 (float *__restrict a, _Float16 *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}


[gcc r15-4579] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-10-23 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:9ffcf1f193b477f417a4c1960cd32696a23b99b4

commit r15-4579-g9ffcf1f193b477f417a4c1960cd32696a23b99b4
Author: Pengxuan Zheng 
Date:   Mon Oct 14 05:37:49 2024 -0700

aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.
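
An illustrative sketch of the scalar case this targets (hypothetical function,
not a testsuite file):

/* With TARGET_SVE, expected to expand to a single predicated CNT on the
   SIMD register holding x, instead of the Advanced SIMD CNT plus ADDV
   byte-summing sequence.  */
int
popcount_u64 (unsigned long long x)
{
  return __builtin_popcountll (x);
}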

Changes since v1:
* v2: Add a new VNx1BI mode and a new test case for V1DI.
* v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg.
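
Without SVE, the V1DI case mentioned above is now expanded through the scalar
popcountdi2 on the DImode lowpart.  An illustrative sketch of source that
exercises it (hypothetical function, not the new test):

typedef unsigned long long v1di __attribute__ ((vector_size (8)));

v1di
popcount_v1di (v1di x)
{
  return (v1di){ __builtin_popcountll (x[0]) };
}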

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function.
* config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
also support V1DI mode.
* config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function.
* config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
(SVE_VDQ_I): Add V1DI.
(bitsize): Likewise.
(VPRED): Likewise.
(VEC_POP_MODE): New mode attribute.
(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: Update test.
* gcc.target/aarch64/popcnt11.c: New test.
* gcc.target/aarch64/popcnt12.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-protos.h   |  1 +
 gcc/config/aarch64/aarch64-simd.md| 15 +--
 gcc/config/aarch64/aarch64.cc | 21 ++
 gcc/config/aarch64/aarch64.md |  9 +
 gcc/config/aarch64/iterators.md   | 16 ++--
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 10 ++---
 gcc/testsuite/gcc.target/aarch64/popcnt11.c   | 58 +++
 gcc/testsuite/gcc.target/aarch64/popcnt12.c   | 20 +
 8 files changed, 139 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 06aa0aac0df6..75f30a52e617 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
 void aarch64_expand_mov_immediate (rtx, rtx);
 rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
 rtx aarch64_ptrue_reg (machine_mode);
+rtx aarch64_ptrue_reg (machine_mode, unsigned int);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 04851524fdea..68839246fd8a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3516,19 +3516,28 @@
 )
 
 (define_expand "popcount2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+   (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
 if (TARGET_SVE)
   {
-	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16);
 	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
 						    p,
 						    operands[1]));
DONE;
   }
 
+    if (<MODE>mode == V1DImode)
+      {
+	rtx out = gen_reg_rtx (DImode);
+	emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
+	emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
     machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3e1d67431566..e6d957d275d1 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode)
   return gen_lowpart (mode, reg);
 }
 
+/* Return an all-true (restricted to the leading VL bits) predicate register of
+   mode MODE.  */
+
+rtx
+aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
+{
+  gcc_assert (aarch64_sve_pred_mode_p (mode));
+
+  rtx_vector_builder builder (VNx16BImode, vl, 2);
+
+  for (int i = 0; i < vl; i++)
+builder.quick_push (CONST1_RTX (BImode));
+
+  for (int i = 0; i < vl; i++)
+builder.quick_push (CONST0_RTX (BImode));
+
+  rtx const_vec = builder.build ();
+  rtx reg = force_reg (VNx16BImode, const_vec);
+  return gen_lowpart (mode, reg);
+}
+
 /* Return an all-false predicate register of mode MODE.  */
 
 rtx
diff --git a/gcc/config/aarch64/aarch64.

[gcc r16-459] Canonicalize vec_merge in simplify_ternary_operation

2025-05-07 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:9b13bea07706a7cae0185f8a860d67209308c050

commit r16-459-g9b13bea07706a7cae0185f8a860d67209308c050
Author: Pengxuan Zheng 
Date:   Thu Feb 6 16:16:32 2025 -0800

Canonicalize vec_merge in simplify_ternary_operation

Similar to the canonicalization done in combine, we canonicalize vec_merge with
swap_commutative_operands_p in simplify_ternary_operation too.
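
The selector bookkeeping is the subtle part: bit i of the mask selects element
i from the first operand, so swapping the two operands requires inverting the
mask over the low nelts bits, which is what the GEN_INT (~sel & mask) below
does.  A minimal sketch of that arithmetic (hypothetical helper, not part of
the patch):

/* E.g. nelts = 4, sel = 0b0110 (elements 1 and 2 from op0) becomes
   0b1001 (elements 0 and 3 from the swapped first operand).  */
unsigned long long
swapped_sel (unsigned long long sel, unsigned int nelts)
{
  unsigned long long mask = (1ULL << nelts) - 1;
  return ~sel & mask;
}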

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_exact_log2_inverse): New.
* config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero<mode>):
Update pattern accordingly.
* config/aarch64/aarch64.cc (aarch64_exact_log2_inverse): New.
* simplify-rtx.cc (simplify_context::simplify_ternary_operation):
Canonicalize vec_merge.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-protos.h |  1 +
 gcc/config/aarch64/aarch64-simd.md  | 10 ++
 gcc/config/aarch64/aarch64.cc   | 10 ++
 gcc/simplify-rtx.cc |  7 +++
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index c83c35c6d71e..c935e7bcf33d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1055,6 +1055,7 @@ void aarch64_subvti_scratch_regs (rtx, rtx, rtx *,
  rtx *, rtx *, rtx *);
 void aarch64_expand_subvti (rtx, rtx, rtx,
rtx, rtx, rtx, rtx, bool);
+int aarch64_exact_log2_inverse (unsigned int, rtx);
 
 
 /* Initialize builtins for SIMD intrinsics.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e5130..1099e742cbf7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1193,12 +1193,14 @@
 (define_insn "aarch64_simd_vec_set_zero"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
(vec_merge:VALL_F16
-   (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "")
-   (match_operand:VALL_F16 3 "register_operand" "0")
+   (match_operand:VALL_F16 1 "register_operand" "0")
+   (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
(match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
+  "TARGET_SIMD && aarch64_exact_log2_inverse (, operands[2]) >= 0"
   {
-int elt = ENDIAN_LANE_N (, exact_log2 (INTVAL (operands[2])));
+int elt = ENDIAN_LANE_N (,
+aarch64_exact_log2_inverse (,
+operands[2]));
 operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
 return "ins\\t%0.[%p2], zr";
   }
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2dc5f4c4b59d..9e3f2885bccb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23914,6 +23914,16 @@ aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
   return true;
 }
 
+/* Return the base 2 logarithm of the bit inverse of OP masked by the lowest
+   NELTS bits, if OP is a power of 2.  Otherwise, returns -1.  */
+
+int
+aarch64_exact_log2_inverse (unsigned int nelts, rtx op)
+{
+  return exact_log2 ((~INTVAL (op))
+& ((HOST_WIDE_INT_1U << nelts) - 1));
+}
+
 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
HIGH (exclusive).  */
 void
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 7bcbe11370fa..b34fd2f4b9ea 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7387,6 +7387,13 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
  return gen_rtx_CONST_VECTOR (mode, v);
}
 
+ if (swap_commutative_operands_p (op0, op1)
+ /* Two operands have same precedence, then first bit of mask
+select first operand.  */
+ || (!swap_commutative_operands_p (op1, op0) && !(sel & 1)))
+   return simplify_gen_ternary (code, mode, mode, op1, op0,
+GEN_INT (~sel & mask));
+
  /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c n)
 if no element from a appears in the result.  */
  if (GET_CODE (op0) == VEC_MERGE)


[gcc r16-701] aarch64: Fix an oversight in aarch64_evpc_reencode

2025-05-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:d77c3bc1c35e3032b91648dbef4e0ef1f6020017

commit r16-701-gd77c3bc1c35e3032b91648dbef4e0ef1f6020017
Author: Pengxuan Zheng 
Date:   Thu May 15 17:52:29 2025 -0700

aarch64: Fix an oversight in aarch64_evpc_reencode

Some fields (e.g., zero_op0_p and zero_op1_p) of the struct "newd" may be left
uninitialized in aarch64_evpc_reencode. This can cause reading of uninitialized
data. I found this oversight when testing my patches on the and/fmov
optimizations. This patch fixes the bug by zero-initializing the struct.

Pushed as obvious after bootstrap/test on aarch64-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_evpc_reencode): Zero initialize
newd.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 901aa6ea68a8..f5552e4b86ce 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26277,7 +26277,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
-  expand_vec_perm_d newd;
+  expand_vec_perm_d newd = {};
 
   /* The subregs that we'd create are not supported for big-endian SVE;
  see aarch64_modes_compatible_p for details.  */


[gcc r16-702] aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165]

2025-05-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:dc501cb0dc857663f7fa762f3dbf0ae60973d2c3

commit r16-702-gdc501cb0dc857663f7fa762f3dbf0ae60973d2c3
Author: Pengxuan Zheng 
Date:   Wed May 7 10:47:37 2025 -0700

aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165]

Certain permutes that blend a vector with zero can be interpreted as an AND of
a mask. This idea was suggested by Richard Sandiford when he was reviewing my
patch which tries to optimize certain vector permutes with the FMOV
instruction for the aarch64 target.

For example, for the aarch64 target, at present:

v4hi
f_v4hi (v4hi x)
{
  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}

generates:

f_v4hi:
uzp1    v0.2d, v0.2d, v0.2d
adrp    x0, .LC0
ldr d31, [x0, #:lo12:.LC0]
tbl v0.8b, {v0.16b}, v31.8b
ret
.LC0:
.byte   -1
.byte   -1
.byte   2
.byte   3
.byte   -1
.byte   -1
.byte   6
.byte   7

With this patch, it generates:

f_v4hi:
mvni    v31.2s, 0xff, msl 8
and v0.8b, v0.8b, v31.8b
ret

This patch also provides a target-independent routine for detecting vector
permute patterns which can be interpreted as AND.
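
Another shape the routine is expected to catch (an illustrative sketch, not
one of the new tests): every odd lane selects from the zero vector, so the
shuffle is equivalent to masking off the odd lanes with an AND.

typedef char v8qi __attribute__ ((vector_size (8)));

v8qi
f_v8qi (v8qi x)
{
  /* Lanes 1, 3, 5 and 7 come from the zero operand; the rest pass
     through, so this should become a single AND with a 0xff/0x00
     byte mask.  */
  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
                            (v8qi){ 0, 9, 2, 11, 4, 13, 6, 15 });
}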

Changes since v1:
* v2: Rework the patch to only perform the optimization for aarch64 by calling
the target-independent routine vec_perm_and_mask.

PR target/100165

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_evpc_and): New.
(aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and.
* optabs.cc (vec_perm_and_mask): New.
* optabs.h (vec_perm_and_mask): New prototype.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/and-be.c: New test.
* gcc.target/aarch64/and-le.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64.cc |  36 +
 gcc/optabs.cc |  44 +++
 gcc/optabs.h  |   4 +
 gcc/testsuite/gcc.target/aarch64/and-be.c | 123 ++
 gcc/testsuite/gcc.target/aarch64/and-le.c | 123 ++
 5 files changed, 330 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f5552e4b86ce..34f9725485d2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26886,6 +26886,40 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the AND instructions.  */
+static bool
+aarch64_evpc_and (struct expand_vec_perm_d *d)
+{
+  /* Either d->op0 or d->op1 should be a vector of all zeros.  */
+  if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+return false;
+
+  machine_mode mode = d->vmode;
+  machine_mode sel_mode;
+  if (!related_int_vector_mode (mode).exists (&sel_mode))
+return false;
+
+  insn_code and_code = optab_handler (and_optab, sel_mode);
+  rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
+  if (and_code == CODE_FOR_nothing || !and_mask)
+return false;
+
+  if (d->testing_p)
+return true;
+
+  class expand_operand ops[3];
+  rtx in = d->zero_op0_p ? d->op1 : d->op0;
+  create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
+  create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
+  create_input_operand (&ops[2], and_mask, sel_mode);
+  expand_insn (and_code, 3, ops);
+  rtx result = gen_lowpart (mode, ops[0].value);
+  if (!rtx_equal_p (d->target, result))
+emit_move_insn (d->target, result);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -26921,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
  else if (aarch64_evpc_uzp (d))
return true;
+ else if (aarch64_evpc_and (d))
+   return true;
  else if (aarch64_evpc_trn (d))
return true;
  else if (aarch64_evpc_sel (d))
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 92d6d50d55a0..5c9450f61450 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -6362,6 +6362,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
   return NULL_RTX;
 }
 
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+   assuming the other vec_perm operand is a constant vector of zeros.  Return
+   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
+   not be modeled as an and.  MODE is the mode of the value being anded.
+   ZERO_OP0_P is true if the first operand of the vec_perm is a con

[gcc r16-703] aarch64: Optimize AND with certain vector of immediates as FMOV [PR100165]

2025-05-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:0417a630811404c2362060b7e15f99e5a4a0d76a

commit r16-703-g0417a630811404c2362060b7e15f99e5a4a0d76a
Author: Pengxuan Zheng 
Date:   Mon May 12 10:12:11 2025 -0700

aarch64: Optimize AND with certain vector of immediates as FMOV [PR100165]

We can optimize AND with certain vectors of immediates as FMOV if the result of
the AND is as if the upper lane of the input vector is set to zero and the
lower lane remains unchanged.

For example, at present:

v4hi
f_v4hi (v4hi x)
{
  return x & (v4hi){ 0x, 0x, 0, 0 };
}

generates:

f_v4hi:
movi    d31, 0xffffffff
and v0.8b, v0.8b, v31.8b
ret

With this patch, it generates:

f_v4hi:
fmov    s0, s0
ret
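
Another case with the same shape (an illustrative sketch, not one of the new
tests): keeping only the low 32 bits of a 64-bit vector matches the
"fmov s0, s0" form, since FMOV implicitly zeroes the rest of the register.

typedef int v2si __attribute__ ((vector_size (8)));

v2si
f_v2si (v2si x)
{
  return x & (v2si){ -1, 0 };
}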

Changes since v1:
* v2: Simplify the mask checking logic by using native_decode_int and address
a few other review comments.

PR target/100165

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_output_fmov): New prototype.
(aarch64_simd_valid_and_imm_fmov): Likewise.
* config/aarch64/aarch64-simd.md (and<mode>3<vczle><vczbe>): Allow FMOV
codegen.
* config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New.
(aarch64_output_fmov): Likewise.
* config/aarch64/constraints.md (Df): New constraint.
* config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update
predicate to support FMOV codegen.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/fmov-1-be.c: New test.
* gcc.target/aarch64/fmov-1-le.c: New test.
* gcc.target/aarch64/fmov-2-be.c: New test.
* gcc.target/aarch64/fmov-2-le.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-protos.h  |   2 +
 gcc/config/aarch64/aarch64-simd.md   |  10 +-
 gcc/config/aarch64/aarch64.cc|  50 +
 gcc/config/aarch64/constraints.md|   7 ++
 gcc/config/aarch64/predicates.md |   3 +-
 gcc/testsuite/gcc.target/aarch64/fmov-1-be.c | 151 +++
 gcc/testsuite/gcc.target/aarch64/fmov-1-le.c | 151 +++
 gcc/testsuite/gcc.target/aarch64/fmov-2-be.c |  90 
 gcc/testsuite/gcc.target/aarch64/fmov-2-le.c |  90 
 9 files changed, 548 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b59eecf5bdff..8f37e56d440e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned);
 char *aarch64_output_simd_orr_imm (rtx, unsigned);
 char *aarch64_output_simd_and_imm (rtx, unsigned);
 char *aarch64_output_simd_xor_imm (rtx, unsigned);
+char *aarch64_output_fmov (rtx);
 
 char *aarch64_output_sve_mov_immediate (rtx);
 char *aarch64_output_sve_ptrues (rtx);
@@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
 bool aarch64_simd_valid_and_imm (rtx);
+bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
 bool aarch64_simd_valid_mov_imm (rtx);
 bool aarch64_simd_valid_orr_imm (rtx);
 bool aarch64_simd_valid_xor_imm (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1099e742cbf7..6e30dc48934c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1117,17 +1117,17 @@
   [(set_attr "type" "neon_fp_abd_")]
 )
 
-;; For AND (vector, register) and BIC (vector, immediate)
+;; For AND (vector, register), BIC (vector, immediate) and FMOV (register)
 (define_insn "and<mode>3<vczle><vczbe>"
   [(set (match_operand:VDQ_I 0 "register_operand")
	(and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
		   (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))]
   "TARGET_SIMD"
-  {@ [ cons: =0 , 1 , 2   ]
-     [ w        , w , w   ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
-     [ w        , 0 , Db  ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
+  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
+     [ w        , w , w  ; neon_logic    ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
+     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
+     [ w        , 0 , Db ; neon_logic    ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
   }
-  [(set_attr "type" "neon_logic")]
 )
 
 ;; For ORR (vector, register) and ORR (vector, immediate)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 34f9725485d2..1da615c8955a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23620,6 +23620,36 @@ aarch64_simd_valid_and_imm (rtx op)
   return aarch64_simd_valid_imm (op, NULL, 

[gcc r16-704] aarch64: Add more vector permute tests for the FMOV optimization [PR100165]

2025-05-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:265fdb3fa91346f1be40111a9f3e8a0838f7d7fd

commit r16-704-g265fdb3fa91346f1be40111a9f3e8a0838f7d7fd
Author: Pengxuan Zheng 
Date:   Mon May 12 10:21:49 2025 -0700

aarch64: Add more vector permute tests for the FMOV optimization [PR100165]

This patch adds more tests for vector permutes which can now be optimized as
FMOV with the generic PERM change and the aarch64 AND patch.

Changes since v1:
* v2: Add -mlittle-endian to the little-endian tests explicitly and rename the
tests accordingly.

PR target/100165

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/fmov-3-be.c: New test.
* gcc.target/aarch64/fmov-3-le.c: New test.
* gcc.target/aarch64/fmov-4-be.c: New test.
* gcc.target/aarch64/fmov-4-le.c: New test.
* gcc.target/aarch64/fmov-5-be.c: New test.
* gcc.target/aarch64/fmov-5-le.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/testsuite/gcc.target/aarch64/fmov-3-be.c |  77 ++
 gcc/testsuite/gcc.target/aarch64/fmov-3-le.c | 129 +++
 gcc/testsuite/gcc.target/aarch64/fmov-4-be.c |  54 ++
 gcc/testsuite/gcc.target/aarch64/fmov-4-le.c |  94 +
 gcc/testsuite/gcc.target/aarch64/fmov-5-be.c | 150 +++
 gcc/testsuite/gcc.target/aarch64/fmov-5-le.c | 150 +++
 6 files changed, 654 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c
new file mode 100644
index ..0bddd96ea000
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8-a")
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
** fmov s0, s0
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 2, 3 });
+}
+
+/*
+** f_v8hi:
** fmov s0, s0
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+   (v8hi){ 8, 9, 10, 11, 12, 13, 6, 7 });
+}
+
+/*
+** f_v4si:
** fmov d0, d0
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
+
+/*
+** g_v4si:
** fmov d0, d0
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 });
+}
+
+/*
+** h_v4si:
** fmov s0, s0
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 4, 5, 6, 3 });
+}
+
+/*
+** f_v4sf:
** fmov d0, d0
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c
new file mode 100644
index ..4545841db36e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c
@@ -0,0 +1,129 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8-a")
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
** fmov s0, s0
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4hi:
+** (?:(?!fmov).)*
+** ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 3, 1, 4, 2 });
+}
+
+/*
+** f_v8hi:
** fmov s0, s0
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+   (v8hi){ 0, 1, 8, 9, 10, 11, 12, 13 });
+}
+
+/*
+** f_v4si:
** fmov d0, d0
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 });
+}
+
+/*
+** g_v4si:
** fmov d0, d0
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 });
+}
+
+/*
+** h_v4si:
** fmov s0, s0
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 5, 6 });
+}
+
+/*
+** f_v4sf:
** fmov d0, d0
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)

[gcc r16-811] aarch64: Carry over zeroness in aarch64_evpc_reencode

2025-05-21 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:84c6988c026114727693cd7cd74b8cd5cdcdeb74

commit r16-811-g84c6988c026114727693cd7cd74b8cd5cdcdeb74
Author: Pengxuan Zheng 
Date:   Tue May 20 17:58:23 2025 -0700

aarch64: Carry over zeroness in aarch64_evpc_reencode

There was a bug in aarch64_evpc_reencode which could leave zero_op0_p and
zero_op1_p of the struct "newd" uninitialized.  r16-701-gd77c3bc1c35e303 fixed
the issue by zero-initializing "newd".  This patch provides an alternative fix
as suggested by Richard Sandiford, based on the fact that the zeroness is
preserved by aarch64_evpc_reencode.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p
and zero_op1_p from d to newd.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1da615c8955a..2b837ec8e673 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26327,7 +26327,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
-  expand_vec_perm_d newd = {};
+  expand_vec_perm_d newd;
 
   /* The subregs that we'd create are not supported for big-endian SVE;
  see aarch64_modes_compatible_p for details.  */
@@ -26353,6 +26353,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
   newd.testing_p = d->testing_p;
   newd.one_vector_p = d->one_vector_p;
+  newd.zero_op0_p = d->zero_op0_p;
+  newd.zero_op1_p = d->zero_op1_p;
 
   newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
newpermindices.nelts_per_input ());