[gcc r15-3669] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-16 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:a92f54f580c37732a5de01e47aed56882231f196

commit r15-3669-ga92f54f580c37732a5de01e47aed56882231f196
Author: Pengxuan Zheng 
Date:   Tue Sep 10 17:59:46 2024 -0700

aarch64: Improve vector constant generation using SVE INDEX instruction 
[PR113328]

SVE's INDEX instruction can be used to populate vectors by values starting from
"base" and incremented by "step" for each subsequent value. We can take
advantage of it to generate vector constants if TARGET_SVE is available and the
base and step values are within [-16, 15].

For example, with the following function:

typedef int v4si __attribute__ ((vector_size (16)));
v4si
f_v4si (void)
{
  return (v4si){ 0, 1, 2, 3 };
}

GCC currently generates:

f_v4si:
adrp x0, .LC4
ldr q0, [x0, #:lo12:.LC4]
ret

.LC4:
.word   0
.word   1
.word   2
.word   3

With this patch, we generate an INDEX instruction instead if TARGET_SVE is
available.

f_v4si:
index   z0.s, #0, #1
ret

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
available.
(aarch64_output_simd_mov_immediate): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
SVE's INDEX instruction.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_3.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64.cc  | 13 ++-
 .../gcc.target/aarch64/sve/acle/general/dupq_1.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_2.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_3.c   |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_4.c   |  3 +-
 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c  | 99 ++
 6 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6ccf08d1cc0a..92763d403c75 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22987,7 +22987,8 @@ aarch64_simd_valid_immediate (rtx op, 
simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
   && CONST_VECTOR_DUPLICATE_P (op))
 n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
+  else if (which == AARCH64_CHECK_MOV
+  && TARGET_SVE
   && const_vec_series_p (op, &base, &step))
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
@@ -25245,6 +25246,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, 
unsigned width,
 
   if (which == AARCH64_CHECK_MOV)
 {
+  if (info.insn == simd_immediate_info::INDEX)
+   {
+ gcc_assert (TARGET_SVE);
+ snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+   element_char, INTVAL (info.u.index.base),
+   INTVAL (info.u.index.step));
+ return templ;
+   }
+
   mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
   shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536e..0940bedd0ddb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a2753..218a66013375 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */

[gcc r15-1801] aarch64: Add vector popcount besides QImode [PR113859]

2024-07-02 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:895bbc08d38c2aca3cbbab273a247021fea73930

commit r15-1801-g895bbc08d38c2aca3cbbab273a247021fea73930
Author: Pengxuan Zheng 
Date:   Wed Jun 12 18:23:13 2024 -0700

aarch64: Add vector popcount besides QImode [PR113859]

This patch improves GCC’s vectorization of __builtin_popcount for the aarch64
target by adding popcount patterns for vector modes besides QImode, i.e.,
HImode, SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.8b, #0
  movi    v1.8b, #1
  cnt v3.8b, v2.8b
  udot    v0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-simd.md | 41 ++-
 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c | 58 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c  | 69 ++
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..fd0c5e612b5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_addlp"
+(define_expand "@aarch64_addlp"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+/* Generate a byte popcount.  */
+machine_mode mode =  == 64 ? V8QImode : V16QImode;
+rtx tmp = gen_reg_rtx (mode);
+auto icode = optab_handler (popcount_optab, mode);
+emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+if (TARGET_DOTPROD
+   && (mode == SImode || mode == DImode))
+  {
+   /* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+  1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+   rtx ones = force_reg (mode, CONST1_RTX (mode));
+   auto icode = optab_handler (udot_prod_optab, mode);
+   mode =  == 64 ? V2SImode : V4SImode;
+   rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+   rtx zeros = force_reg (mode, CONST0_RTX (mode));
+   emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+   tmp = dest;
+  }
+
+/* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+   the element size and halves the number of elements.  */
+while (mode != mode)
+  {
+   auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+   mode = insn_data[icode].operand[0].mode;
+   rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+   emit_insn (GEN_FCN (icode) (dest, tmp));
+   tmp = dest;
+  }
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udotv\2.4s, v\4.16b, v\

[gcc r15-2659] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-08-01 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:e4b8db26de35239bd621aad9c0361f25d957122b

commit r15-2659-ge4b8db26de35239bd621aad9c0361f25d957122b
Author: Pengxuan Zheng 
Date:   Wed Jul 31 17:00:01 2024 -0700

aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE, and
those changes will be included in a separate patch.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
iterator SVE_VDQ_I.
* config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
(VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-simd.md|  9 +++
 gcc/config/aarch64/aarch64-sve.md | 13 ++--
 gcc/config/aarch64/iterators.md   |  5 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
 4 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 459e11b09a19..816f499e9634 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@
(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+if (TARGET_SVE)
+  {
+   rtx p = aarch64_ptrue_reg (mode);
+   emit_insn (gen_aarch64_pred_popcount (operands[0],
+   p,
+   operands[1]));
+   DONE;
+  }
+
 /* Generate a byte popcount.  */
 machine_mode mode =  == 64 ? V8QImode : V16QImode;
 rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index c3ed5075c4ed..a5cd42be9d5c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3104,16 +3104,16 @@
 
 ;; Integer unary arithmetic predicated with a PTRUE.
 (define_insn "@aarch64_pred_"
-  [(set (match_operand:SVE_I 0 "register_operand")
-   (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+   (unspec:SVE_VDQ_I
  [(match_operand: 1 "register_operand")
-  (SVE_INT_UNARY:SVE_I
-(match_operand:SVE_I 2 "register_operand"))]
+  (SVE_INT_UNARY:SVE_VDQ_I
+(match_operand:SVE_VDQ_I 2 "register_operand"))]
  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
- [ w, Upl , 0 ; *  ] \t%0., %1/m, 
%2.
- [ ?&w  , Upl , w ; yes] movprfx\t%0, 
%2\;\t%0., %1/m, %2.
+ [ w, Upl , 0 ; *  ] \t%Z0., %1/m, 
%Z2.
+ [ ?&w  , Upl , w ; yes] movprfx\t%Z0, 
%Z2\;\t%Z0., %1/m, %Z2.
   }
 )
 
@@ -3168,6 +3168,7 @@
   }
 )
 
+
 ;; -
 ;;  [INT] General unary arithmetic corresponding to unspecs
 ;; -
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 95fe8f070f4c..aaa4afefe2ce 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -559,6 +559,9 @@
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
VNx4SI VNx2SI
@@ -2278,6 +2281,8 @@
 (VNx32BF "VNx8BI")
 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+(V8QI "VNx8BI") (V16QI "VNx16BI")
+(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644
index ..8e349efe3907
--- /dev/null
+

[gcc r15-949] MAINTAINERS: Add myself to Write After Approval and DCO

2024-05-31 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7

commit r15-949-g96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7
Author: Pengxuan Zheng 
Date:   Fri May 31 11:07:05 2024 -0700

MAINTAINERS: Add myself to Write After Approval and DCO

ChangeLog:

* MAINTAINERS: Add myself to Write After Approval and DCO.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e2870eef2ef..6444e6ea2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,7 @@ Dennis Zhang

 Yufeng Zhang   
 Qing Zhao  
 Shujing Zhao   
+Pengxuan Zheng 
 Jon Ziegler
 Roman Zippel   
 Josef Zlomek   
@@ -789,3 +790,4 @@ Martin Uecker   

 Jonathan Wakely
 Alexander Westbrooks   
 Chung-Ju Wu
+Pengxuan Zheng 


[gcc r15-950] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

2024-05-31 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:7fb62627cfb3e03811bb667fa7159bbc7f972f00

commit r15-950-g7fb62627cfb3e03811bb667fa7159bbc7f972f00
Author: Pengxuan Zheng 
Date:   Wed May 22 17:38:43 2024 -0700

aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

vget_low_2.c is a test case for little-endian, but we missed the -mlittle-endian
flag in r15-697-ga2e4fe5a53cf75.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c 
b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
index 44414e1c043..93e9e664ee9 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
 
 #include 


[gcc r15-1079] aarch64: Add vector floating point extend pattern [PR113880, PR113869]

2024-06-06 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:230d62a2cdd16c1ec8fe87998ec01081503f010d

commit r15-1079-g230d62a2cdd16c1ec8fe87998ec01081503f010d
Author: Pengxuan Zheng 
Date:   Thu May 30 17:53:23 2024 -0700

aarch64: Add vector floating point extend pattern [PR113880, PR113869]

This patch adds a vector floating point extend pattern for V2SF->V2DF and
V4HF->V4SF conversions by renaming the existing aarch64_float_extend_lo_<Vwide>
pattern to the standard optab one, i.e., extend<mode><Vwide>2. This allows the
vectorizer to vectorize certain floating point widening operations for the
aarch64 target.

PR target/113880
PR target/113869

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md (aarch64_float_extend_lo_<Vwide>): Rename
to...
(extend<mode><Vwide>2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/extend-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc|  9 +
 gcc/config/aarch64/aarch64-simd.md|  2 +-
 gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index f8eeccb554d..25189888d17 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
 BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)
 BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 
+/* The builtins below should be expanded through the standard optabs
+   CODE_FOR_extend2. */
+#undef VAR1
+#define VAR1(F,T,N,M) \
+  constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2;
+
+VAR1 (float_extend_lo_, extend, v2sf, v2df)
+VAR1 (float_extend_lo_, extend, v4hf, v4sf)
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..c5e2c9f00d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3132,7 +3132,7 @@
 DONE;
   }
 )
-(define_insn "aarch64_float_extend_lo_"
+(define_insn "extend2"
   [(set (match_operand: 0 "register_operand" "=w")
(float_extend:
  (match_operand:VDF 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c 
b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
new file mode 100644
index 000..f6241d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */
+void
+f (float *__restrict a, double *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */
+void
+f1 (_Float16 *__restrict a, float *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}


[gcc r15-1182] aarch64: Add vector floating point trunc pattern

2024-06-11 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:e7cd8ea1fa3e48404954bb7c06e9bcd603f132dd

commit r15-1182-ge7cd8ea1fa3e48404954bb7c06e9bcd603f132dd
Author: Pengxuan Zheng 
Date:   Fri Jun 7 19:52:00 2024 -0700

aarch64: Add vector floating point trunc pattern

This patch is a follow-up of r15-1079-g230d62a2cdd16c to add a vector floating
point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions by renaming the
existing aarch64_float_truncate_lo_<mode> pattern to the standard optab one,
i.e., trunc<Vwide><mode>2. This allows the vectorizer to vectorize certain
floating point narrowing operations for the aarch64 target.

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_truncate_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_<mode>):
Rename to...
(trunc<Vwide><mode>2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/trunc-vec.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc   |  7 +++
 gcc/config/aarch64/aarch64-simd.md   |  6 +++---
 gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21 +
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 25189888d17d..d589e59defc2 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 VAR1 (float_extend_lo_, extend, v2sf, v2df)
 VAR1 (float_extend_lo_, extend, v4hf, v4sf)
 
+/* __builtin_aarch64_float_truncate_lo_ should be expanded through the
+   standard optabs CODE_FOR_trunc2. */
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf
+= CODE_FOR_truncv4sfv4hf2;
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf
+= CODE_FOR_truncv2dfv2sf2;
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index c5e2c9f00d02..f644bd1731e5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3197,7 +3197,7 @@
 }
 )
 
-(define_insn "aarch64_float_truncate_lo_"
+(define_insn "trunc2"
   [(set (match_operand:VDF 0 "register_operand" "=w")
   (float_truncate:VDF
(match_operand: 1 "register_operand" "w")))]
@@ -3256,7 +3256,7 @@
 int lo = BYTES_BIG_ENDIAN ? 2 : 1;
 int hi = BYTES_BIG_ENDIAN ? 1 : 2;
 
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
+emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo]));
 emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0],
   tmp, operands[hi]));
 DONE;
@@ -3272,7 +3272,7 @@
   {
 rtx tmp = gen_reg_rtx (V2SFmode);
 emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2]));
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
+emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp));
 DONE;
   }
 )
diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c 
b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
new file mode 100644
index ..05e8af7912de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d} 1 } } */
+void
+f (double *__restrict a, float *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */
+void
+f1 (float *__restrict a, _Float16 *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}


[gcc r15-4579] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-10-23 Thread Pengxuan Zheng via Gcc-cvs
https://gcc.gnu.org/g:9ffcf1f193b477f417a4c1960cd32696a23b99b4

commit r15-4579-g9ffcf1f193b477f417a4c1960cd32696a23b99b4
Author: Pengxuan Zheng 
Date:   Mon Oct 14 05:37:49 2024 -0700

aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.

Changes since v1:
* v2: Add a new VNx1BI mode and a new test case for V1DI.
* v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function.
* config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
also support V1DI mode.
* config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function.
* config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
(SVE_VDQ_I): Add V1DI.
(bitsize): Likewise.
(VPRED): Likewise.
(VEC_POP_MODE): New mode attribute.
(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: Update test.
* gcc.target/aarch64/popcnt11.c: New test.
* gcc.target/aarch64/popcnt12.c: New test.

Signed-off-by: Pengxuan Zheng 

Diff:
---
 gcc/config/aarch64/aarch64-protos.h   |  1 +
 gcc/config/aarch64/aarch64-simd.md| 15 +--
 gcc/config/aarch64/aarch64.cc | 21 ++
 gcc/config/aarch64/aarch64.md |  9 +
 gcc/config/aarch64/iterators.md   | 16 ++--
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 10 ++---
 gcc/testsuite/gcc.target/aarch64/popcnt11.c   | 58 +++
 gcc/testsuite/gcc.target/aarch64/popcnt12.c   | 20 +
 8 files changed, 139 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 06aa0aac0df6..75f30a52e617 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
 void aarch64_expand_mov_immediate (rtx, rtx);
 rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
 rtx aarch64_ptrue_reg (machine_mode);
+rtx aarch64_ptrue_reg (machine_mode, unsigned int);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 04851524fdea..68839246fd8a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3516,19 +3516,28 @@
 )
 
 (define_expand "popcount2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+   (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
 if (TARGET_SVE)
   {
-   rtx p = aarch64_ptrue_reg (mode);
+   rtx p = aarch64_ptrue_reg (mode,  == 64 ? 8 : 16);
emit_insn (gen_aarch64_pred_popcount (operands[0],
p,
operands[1]));
DONE;
   }
 
+if (mode == V1DImode)
+  {
+   rtx out = gen_reg_rtx (DImode);
+   emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
+   emit_move_insn (operands[0], gen_lowpart (mode, out));
+   DONE;
+  }
+
 /* Generate a byte popcount.  */
 machine_mode mode =  == 64 ? V8QImode : V16QImode;
 machine_mode mode2 =  == 64 ? V2SImode : V4SImode;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3e1d67431566..e6d957d275d1 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode)
   return gen_lowpart (mode, reg);
 }
 
+/* Return an all-true (restricted to the leading VL bits) predicate register of
+   mode MODE.  */
+
+rtx
+aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
+{
+  gcc_assert (aarch64_sve_pred_mode_p (mode));
+
+  rtx_vector_builder builder (VNx16BImode, vl, 2);
+
+  for (int i = 0; i < vl; i++)
+builder.quick_push (CONST1_RTX (BImode));
+
+  for (int i = 0; i < vl; i++)
+builder.quick_push (CONST0_RTX (BImode));
+
+  rtx const_vec = builder.build ();
+  rtx reg = force_reg (VNx16BImode, const_vec);
+  return gen_lowpart (mode, reg);
+}
+
 /* Return an all-false predicate register of mode MODE.  */
 
 rtx
diff --git a/gcc/config/aarch64/aarch64.