[RISC-V] RISC-V: Add implication for M extension.

2024-10-11 Thread Tsung Chun Lin

From 114731cd9cf28ad313de05a507b7253fb9bef3cb Mon Sep 17 00:00:00 2001
From: Tsung Chun Lin 
Date: Tue, 8 Oct 2024 17:40:59 -0600
Subject: [RISC-V] RISC-V: Add implication for M extension.

The M extension implies Zmmul.

gcc/ChangeLog:

	* common/config/riscv/riscv-common.cc (riscv_implied_info): Add rule
	so that M implies Zmmul.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/attribute-15.c: Add _zmmul1p0 to arch string.
	* gcc.target/riscv/attribute-16.c: Ditto.
	* gcc.target/riscv/attribute-17.c: Ditto.
	* gcc.target/riscv/attribute-18.c: Ditto.
	* gcc.target/riscv/attribute-19.c: Ditto.
	* gcc.target/riscv/pr110696.c: Ditto.
	* gcc.target/riscv/target-attr-01.c: Ditto.
	* gcc.target/riscv/target-attr-02.c: Ditto.
	* gcc.target/riscv/target-attr-03.c: Ditto.
	* gcc.target/riscv/target-attr-04.c: Ditto.
	* gcc.target/riscv/target-attr-08.c: Ditto.
	* gcc.target/riscv/target-attr-11.c: Ditto.
	* gcc.target/riscv/target-attr-14.c: Ditto.
	* gcc.target/riscv/target-attr-15.c: Ditto.
	* gcc.target/riscv/target-attr-16.c: Ditto.
---
 gcc/common/config/riscv/riscv-common.cc | 2 ++
 gcc/testsuite/gcc.target/riscv/attribute-15.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/attribute-16.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/attribute-17.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/attribute-18.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/attribute-19.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/pr110696.c   | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-01.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-02.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-03.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-04.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-08.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-11.c | 2 +-
 gcc/testsuite/gcc.target/riscv/target-attr-14.c | 4 ++--
 gcc/testsuite/gcc.target/riscv/target-attr-15.c | 4 ++--
 gcc/testsuite/gcc.target/riscv/target-attr-16.c | 4 ++--
 16 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/gcc/common/config/riscv/riscv-common.cc b/gcc/common/config/riscv/riscv-common.cc
index 2adebe0b6f2..60595a3e356 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -75,6 +75,8 @@ struct riscv_implied_info_t
 /* Implied ISA info, must end with NULL sentinel.  */
 static const riscv_implied_info_t riscv_implied_info[] =
 {
+  {"m", "zmmul"},
+
   {"d", "f"},
   {"f", "zicsr"},
   {"d", "zicsr"},
diff --git a/gcc/testsuite/gcc.target/riscv/attribute-15.c b/gcc/testsuite/gcc.target/riscv/attribute-15.c
index ac6caaecd4f..d7a70e86aa1 100644
--- a/gcc/testsuite/gcc.target/riscv/attribute-15.c
+++ b/gcc/testsuite/gcc.target/riscv/attribute-15.c
@@ -3,4 +3,4 @@
 int foo()
 {
 }
-/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p0_m2p0_a2p0_f2p0_d2p0_c2p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
+/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p0_m2p0_a2p0_f2p0_d2p0_c2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/attribute-16.c b/gcc/testsuite/gcc.target/riscv/attribute-16.c
index 539e426ca97..4818cbe90d4 100644
--- a/gcc/testsuite/gcc.target/riscv/attribute-16.c
+++ b/gcc/testsuite/gcc.target/riscv/attribute-16.c
@@ -3,4 +3,4 @@
 int foo()
 {
 }
-/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p1_m2p0_a2p0_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
+/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p1_m2p0_a2p0_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/attribute-17.c b/gcc/testsuite/gcc.target/riscv/attribute-17.c
index 30928cb5b68..64b11b6a28c 100644
--- a/gcc/testsuite/gcc.target/riscv/attribute-17.c
+++ b/gcc/testsuite/gcc.target/riscv/attribute-17.c
@@ -3,4 +3,4 @@
 int foo()
 {
 }
-/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
+/* { dg-final { scan-assembler ".attribute arch, \"rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0\"" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/attribute-18.c b/gcc/testsuite/gcc.target/riscv/attribute-18.c
index 9f7199f331a..43ae37b5089 100644
--- a/gcc/testsuite/gcc.target/riscv/attribute-18.c
+++ b/gcc/testsuite/gcc.target/riscv/attribute-18.c
@@ -1,4 +1,4 @@
 /* { dg-do compile } */
 /* { dg-options "-mriscv-attribute -march=rv64imafdc -mabi=lp64d -misa-spec=2.2" } */
 int foo() {}
-/* { dg-final { scan-assembler ".attribute arch, \"rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0\"" } } */
+/* { dg-final { scan-assembler ".attribute arch, \"rv64i2p0_m2p0_a2p0_f2p0_d2p

Re: [PATCH v1 1/2] Match: Support form 2 for scalar signed integer SAT_TRUNC

2024-10-11 Thread Richard Biener
On Wed, Oct 9, 2024 at 5:30 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to support the form 2 of the scalar signed
> integer SAT_TRUNC.  Aka below example:
>
> Form 2:
>   #define DEF_SAT_S_TRUNC_FMT_2(NT, WT, NT_MIN, NT_MAX) \
>   NT __attribute__((noinline))  \
>   sat_s_trunc_##WT##_to_##NT##_fmt_2 (WT x) \
>   { \
> NT trunc = (NT)x;   \
> return (WT)NT_MIN < x && x < (WT)NT_MAX \
>   ? trunc   \
>   : x < 0 ? NT_MIN : NT_MAX;\
>   }
>
> DEF_SAT_S_TRUNC_FMT_2(int8_t, int16_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_trunc_int16_t_to_int8_t_fmt_2 (int16_t x)
>6   │ {
>7   │   int8_t trunc;
>8   │   unsigned short x.0_1;
>9   │   unsigned short _2;
>   10   │   int8_t _3;
>   11   │   _Bool _7;
>   12   │   signed char _8;
>   13   │   signed char _9;
>   14   │   signed char _10;
>   15   │
>   16   │ ;;   basic block 2, loop depth 0
>   17   │ ;;pred:   ENTRY
>   18   │   x.0_1 = (unsigned short) x_4(D);
>   19   │   _2 = x.0_1 + 127;
>   20   │   if (_2 > 253)
>   21   │ goto ; [50.00%]
>   22   │   else
>   23   │ goto ; [50.00%]
>   24   │ ;;succ:   4
>   25   │ ;;3
>   26   │
>   27   │ ;;   basic block 3, loop depth 0
>   28   │ ;;pred:   2
>   29   │   trunc_5 = (int8_t) x_4(D);
>   30   │   goto ; [100.00%]
>   31   │ ;;succ:   5
>   32   │
>   33   │ ;;   basic block 4, loop depth 0
>   34   │ ;;pred:   2
>   35   │   _7 = x_4(D) < 0;
>   36   │   _8 = (signed char) _7;
>   37   │   _9 = -_8;
>   38   │   _10 = _9 ^ 127;
>   39   │ ;;succ:   5
>   40   │
>   41   │ ;;   basic block 5, loop depth 0
>   42   │ ;;pred:   3
>   43   │ ;;4
>   44   │   # _3 = PHI 
>   45   │   return _3;
>   46   │ ;;succ:   EXIT
>   47   │
>   48   │ }
>
> After this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_trunc_int16_t_to_int8_t_fmt_2 (int16_t x)
>6   │ {
>7   │   int8_t _3;
>8   │
>9   │ ;;   basic block 2, loop depth 0
>   10   │ ;;pred:   ENTRY
>   11   │   _3 = .SAT_TRUNC (x_4(D)); [tail call]
>   12   │   return _3;
>   13   │ ;;succ:   EXIT
>   14   │
>   15   │ }
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

> gcc/ChangeLog:
>
> * match.pd: Add case 2 matching pattern for signed SAT_TRUNC.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 21 +
>  1 file changed, 13 insertions(+), 8 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6a924f409d9..70fdd10926f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3461,7 +3461,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>}
>(if (wi::eq_p (trunc_max, int_cst_1) && wi::eq_p (max, int_cst_2))
>
> -/* Signed saturation truncate, case 1, sizeof (WT) > sizeof (NT).
> +/* Signed saturation truncate, case 1 and case 2, sizeof (WT) > sizeof (NT).
> SAT_S_TRUNC(X) = (unsigned)X + NT_MAX + 1  > Unsigned_MAX ? (NT)X.  */
>  (match (signed_integer_sat_trunc @0)
>   (cond^ (gt (plus:c (convert@4 @0) INTEGER_CST@1) INTEGER_CST@2)
> @@ -3471,17 +3471,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>&& !TYPE_UNSIGNED (TREE_TYPE (@0)) && TYPE_UNSIGNED (TREE_TYPE (@4)))
>   (with
>{
> -   unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0));
> -   unsigned otype_precision = TYPE_PRECISION (type);
> -   wide_int offset = wi::uhwi (HOST_WIDE_INT_1U << (otype_precision - 1), 
> itype_precision);
> -   wide_int trunc_max = wi::mask (otype_precision, false, itype_precision);
> -   wide_int max = wi::mask (otype_precision - 1, false, otype_precision);
> +   unsigned itype_prec = TYPE_PRECISION (TREE_TYPE (@0));
> +   unsigned otype_prec = TYPE_PRECISION (type);
> +   wide_int offset = wi::uhwi (HOST_WIDE_INT_1U << (otype_prec - 1),
> +  itype_prec); // Aka 128 for int8_t
> +   wide_int limit_0 = wi::mask (otype_prec, false, itype_prec); // Aka 255
> +   wide_int limit_1 = wi::uhwi ((HOST_WIDE_INT_1U << otype_prec) - 3,
> +   itype_prec); // Aka 253
> +   wide_int otype_max = wi::mask (otype_prec - 1, false, otype_prec);
> +   wide_int itype_max = wi::mask (otype_prec - 1, false, itype_prec);
> wide_int int_cst_1 = wi::to_wide (@1);
> wide_int int_cst_2 = wi::to_wide (@2);
> wide_int int_cst_3 = wi::to_wide (@3);
>}
> -  (if (wi::eq_p (int_cst_1, offset) && wi::eq_p (int_cst_2, trunc_max)
> -   && wi::eq_p (int_cst_3, max))
> +  (if (((wi::eq_p (int_cst_1, offset) && wi::eq_p (int_cst_2, limit_0))
> +|| (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, 

Re: [PATCH v1 1/2] Match: Support form 4 for scalar signed integer SAT_TRUNC

2024-10-11 Thread Richard Biener
On Thu, Oct 10, 2024 at 8:55 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to support the form 4 of the scalar signed
> integer SAT_TRUNC.  Aka below example:
>
> Form 4:
>   #define DEF_SAT_S_TRUNC_FMT_4(NT, WT, NT_MIN, NT_MAX) \
>   NT __attribute__((noinline))  \
>   sat_s_trunc_##WT##_to_##NT##_fmt_4 (WT x) \
>   { \
> NT trunc = (NT)x;   \
> return (WT)NT_MIN <= x && x < (WT)NT_MAX\
>   ? trunc   \
>   : x < 0 ? NT_MIN : NT_MAX;\
>   }
>
> DEF_SAT_S_TRUNC_FMT_4(int8_t, int16_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_trunc_int16_t_to_int8_t_fmt_4 (int16_t x)
>6   │ {
>7   │   int8_t trunc;
>8   │   unsigned short x.0_1;
>9   │   unsigned short _2;
>   10   │   int8_t _3;
>   11   │   _Bool _7;
>   12   │   signed char _8;
>   13   │   signed char _9;
>   14   │   signed char _10;
>   15   │
>   16   │ ;;   basic block 2, loop depth 0
>   17   │ ;;pred:   ENTRY
>   18   │   x.0_1 = (unsigned short) x_4(D);
>   19   │   _2 = x.0_1 + 128;
>   20   │   if (_2 > 254)
>   21   │ goto ; [50.00%]
>   22   │   else
>   23   │ goto ; [50.00%]
>   24   │ ;;succ:   4
>   25   │ ;;3
>   26   │
>   27   │ ;;   basic block 3, loop depth 0
>   28   │ ;;pred:   2
>   29   │   trunc_5 = (int8_t) x_4(D);
>   30   │   goto ; [100.00%]
>   31   │ ;;succ:   5
>   32   │
>   33   │ ;;   basic block 4, loop depth 0
>   34   │ ;;pred:   2
>   35   │   _7 = x_4(D) < 0;
>   36   │   _8 = (signed char) _7;
>   37   │   _9 = -_8;
>   38   │   _10 = _9 ^ 127;
>   39   │ ;;succ:   5
>   40   │
>   41   │ ;;   basic block 5, loop depth 0
>   42   │ ;;pred:   3
>   43   │ ;;4
>   44   │   # _3 = PHI 
>   45   │   return _3;
>   46   │ ;;succ:   EXIT
>   47   │
>   48   │ }
>
> After this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_trunc_int16_t_to_int8_t_fmt_4 (int16_t x)
>6   │ {
>7   │   int8_t _3;
>8   │
>9   │ ;;   basic block 2, loop depth 0
>   10   │ ;;pred:   ENTRY
>   11   │   _3 = .SAT_TRUNC (x_4(D)); [tail call]
>   12   │   return _3;
>   13   │ ;;succ:   EXIT
>   14   │
>   15   │ }
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

> gcc/ChangeLog:
>
> * match.pd: Add case 4 matching pattern for signed SAT_TRUNC.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5e20651c8ce..6bd515fdd87 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3488,6 +3488,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>}
>(if (((wi::eq_p (int_cst_1, offset) && wi::eq_p (int_cst_2, limit_0))
>  || (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, limit_2))
> +|| (wi::eq_p (int_cst_1, offset) && wi::eq_p (int_cst_2, limit_2))
>  || (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, 
> limit_1)))
> && wi::eq_p (int_cst_3, otype_max))
>
> --
> 2.43.0
>


Re: [PATCH v1 1/2] Match: Support form 3 for scalar signed integer SAT_TRUNC

2024-10-11 Thread Richard Biener
On Wed, Oct 9, 2024 at 4:40 PM  wrote:
>
> From: Pan Li 
>
> This patch would like to support the form 3 of the scalar signed
> integer SAT_TRUNC.  Aka below example:
>
> Form 3:
>   #define DEF_SAT_S_TRUNC_FMT_3(NT, WT, NT_MIN, NT_MAX) \
>   NT __attribute__((noinline))  \
>   sat_s_trunc_##WT##_to_##NT##_fmt_3 (WT x) \
>   { \
> NT trunc = (NT)x;   \
> return (WT)NT_MIN < x && x <= (WT)NT_MAX\
>   ? trunc   \
>   : x < 0 ? NT_MIN : NT_MAX;\
>   }
>
> DEF_SAT_S_TRUNC_FMT_3(int8_t, int16_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_sub_int8_t_fmt_3 (int8_t x, int8_t y)
>6   │ {
>7   │   signed char _1;
>8   │   signed char _2;
>9   │   int8_t _3;
>   10   │   __complex__ signed char _6;
>   11   │   _Bool _8;
>   12   │   signed char _9;
>   13   │   signed char _10;
>   14   │   signed char _11;
>   15   │
>   16   │ ;;   basic block 2, loop depth 0
>   17   │ ;;pred:   ENTRY
>   18   │   _6 = .SUB_OVERFLOW (x_4(D), y_5(D));
>   19   │   _2 = IMAGPART_EXPR <_6>;
>   20   │   if (_2 != 0)
>   21   │ goto ; [50.00%]
>   22   │   else
>   23   │ goto ; [50.00%]
>   24   │ ;;succ:   4
>   25   │ ;;3
>   26   │
>   27   │ ;;   basic block 3, loop depth 0
>   28   │ ;;pred:   2
>   29   │   _1 = REALPART_EXPR <_6>;
>   30   │   goto ; [100.00%]
>   31   │ ;;succ:   5
>   32   │
>   33   │ ;;   basic block 4, loop depth 0
>   34   │ ;;pred:   2
>   35   │   _8 = x_4(D) < 0;
>   36   │   _9 = (signed char) _8;
>   37   │   _10 = -_9;
>   38   │   _11 = _10 ^ 127;
>   39   │ ;;succ:   5
>   40   │
>   41   │ ;;   basic block 5, loop depth 0
>   42   │ ;;pred:   3
>   43   │ ;;4
>   44   │   # _3 = PHI <_1(3), _11(4)>
>   45   │   return _3;
>   46   │ ;;succ:   EXIT
>   47   │
>   48   │ }
>
> After this patch:
>4   │ __attribute__((noinline))
>5   │ int8_t sat_s_trunc_int16_t_to_int8_t_fmt_3 (int16_t x)
>6   │ {
>7   │   int8_t _3;
>8   │
>9   │ ;;   basic block 2, loop depth 0
>   10   │ ;;pred:   ENTRY
>   11   │   _3 = .SAT_TRUNC (x_4(D)); [tail call]
>   12   │   return _3;
>   13   │ ;;succ:   EXIT
>   14   │
>   15   │ }
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

> gcc/ChangeLog:
>
> * match.pd: Add case 3 matching pattern for signed SAT_TRUNC.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 70fdd10926f..5e20651c8ce 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3478,6 +3478,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> wide_int limit_0 = wi::mask (otype_prec, false, itype_prec); // Aka 255
> wide_int limit_1 = wi::uhwi ((HOST_WIDE_INT_1U << otype_prec) - 3,
> itype_prec); // Aka 253
> +   wide_int limit_2 = wi::uhwi ((HOST_WIDE_INT_1U << otype_prec) - 2,
> +   itype_prec); // Aka 254
> wide_int otype_max = wi::mask (otype_prec - 1, false, otype_prec);
> wide_int itype_max = wi::mask (otype_prec - 1, false, itype_prec);
> wide_int int_cst_1 = wi::to_wide (@1);
> @@ -3485,6 +3487,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> wide_int int_cst_3 = wi::to_wide (@3);
>}
>(if (((wi::eq_p (int_cst_1, offset) && wi::eq_p (int_cst_2, limit_0))
> +|| (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, limit_2))
>  || (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, 
> limit_1)))
> && wi::eq_p (int_cst_3, otype_max))
>
> --
> 2.43.0
>


Re: [PATCH v1 2/4] Vect: Try the pattern of vector signed integer SAT_SUB

2024-10-11 Thread Richard Biener
On Fri, Oct 11, 2024 at 8:24 AM  wrote:
>
> From: Pan Li 
>
> Almost the same as vector unsigned integer SAT_SUB, try to match
> the signed version during the vector pattern matching.
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

> gcc/ChangeLog:
>
> * tree-vect-patterns.cc (gimple_signed_integer_sat_sub): Add new
> func decl for signed SAT_SUB.
> (vect_recog_sat_sub_pattern_transform): Update comments.
> (vect_recog_sat_sub_pattern): Try the vector signed SAT_SUB
> pattern.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/tree-vect-patterns.cc | 26 +-
>  1 file changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 9bf8526ac99..746f100a084 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4538,6 +4538,7 @@ extern bool gimple_unsigned_integer_sat_sub (tree, 
> tree*, tree (*)(tree));
>  extern bool gimple_unsigned_integer_sat_trunc (tree, tree*, tree (*)(tree));
>
>  extern bool gimple_signed_integer_sat_add (tree, tree*, tree (*)(tree));
> +extern bool gimple_signed_integer_sat_sub (tree, tree*, tree (*)(tree));
>
>  static gimple *
>  vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info 
> stmt_info,
> @@ -4684,6 +4685,7 @@ vect_recog_sat_sub_pattern_transform (vec_info *vinfo,
>
>  /*
>   * Try to detect saturation sub pattern (SAT_ADD), aka below gimple:
> + * Unsigned:
>   *   _7 = _1 >= _2;
>   *   _8 = _1 - _2;
>   *   _10 = (long unsigned int) _7;
> @@ -4691,6 +4693,27 @@ vect_recog_sat_sub_pattern_transform (vec_info *vinfo,
>   *
>   * And then simplied to
>   *   _9 = .SAT_SUB (_1, _2);
> + *
> + * Signed:
> + *   x.0_4 = (unsigned char) x_16;
> + *   y.1_5 = (unsigned char) y_18;
> + *   _6 = x.0_4 - y.1_5;
> + *   minus_19 = (int8_t) _6;
> + *   _7 = x_16 ^ y_18;
> + *   _8 = x_16 ^ minus_19;
> + *   _44 = _7 < 0;
> + *   _23 = x_16 < 0;
> + *   _24 = (signed char) _23;
> + *   _58 = (unsigned char) _24;
> + *   _59 = -_58;
> + *   _25 = (signed char) _59;
> + *   _26 = _25 ^ 127;
> + *   _42 = _8 < 0;
> + *   _41 = _42 & _44;
> + *   iftmp.2_11 = _41 ? _26 : minus_19;
> + *
> + * And then simplied to
> + *   iftmp.2_11 = .SAT_SUB (x_16, y_18);
>   */
>
>  static gimple *
> @@ -4705,7 +4728,8 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, 
> stmt_vec_info stmt_vinfo,
>tree ops[2];
>tree lhs = gimple_assign_lhs (last_stmt);
>
> -  if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL))
> +  if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)
> +  || gimple_signed_integer_sat_sub (lhs, ops, NULL))
>  {
>vect_recog_sat_sub_pattern_transform (vinfo, stmt_vinfo, lhs, ops);
>gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo,
> --
> 2.43.0
>


Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

2024-10-11 Thread Richard Biener
On Fri, Oct 11, 2024 at 8:24 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to support the form 1 of the vector signed
> integer SAT_SUB.  Aka below example:
>
> Form 1:
>   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX) \
>   void __attribute__((noinline))   \
>   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
>   {\
> unsigned i;\
> for (i = 0; i < limit; i++)\
>   {\
> T x = op_1[i]; \
> T y = op_2[i]; \
> T minus = (UT)x - (UT)y;   \
> out[i] = (x ^ y) >= 0  \
>   ? minus  \
>   : (minus ^ x) >= 0   \
> ? minus\
> : x < 0 ? MIN : MAX;   \
>   }\
>   }
>
> DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
>   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... 
> }, _108, 0);
>   93   │   _69 = vect_x_16.11_80 >> 7;
>   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR char>(vect_x_16.11_80);
>   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... 
> }, _108, 0);
>   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
>   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
>   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR char>(vect_y_18.15_85);
>   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
>  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR char>(vect__6.17_87);
>  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
>  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
>  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
>  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, 
> vect_minus_19.18_88);
>  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
>  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
>  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
>  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
>  109   │   ivtmp_107 = ivtmp_106 - _108;
>
> After this patch:
>   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
>   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... 
> }, _102, 0);
>   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... 
> }, _102, 0);
>   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
>   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, 
> vect_patt_38.15_94);
>   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
>   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
>   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
>   89   │   ivtmp_101 = ivtmp_100 - _102;
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

I wonder since we now can match many different variants of writing
signed and unsigned
saturation add and sub whether it makes sense to canonicalize to the "cheapest"
variant when the target doesn't support .SAT_SUB/ADD?  Are there any
"sub-patterns"
not forming the full saturation add/sub that can be
simplified/canonicalized in such
way maybe?

> gcc/ChangeLog:
>
> * match.pd: Add case 1 matching pattern for vector signed SAT_SUB.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 16 
>  1 file changed, 16 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 8a7569ce387..a3c298d3a22 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3401,6 +3401,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
>&& types_match (type, @0, @1
>
> +/* Signed saturation sub, case 4:
> +   T minus = (T)((UT)X - (UT)Y);
> +   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
> +
> +   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
> +(match (signed_integer_sat_sub @0 @1)
> + (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
> +(nop_convert @1
> +  integer_zerop)
> +  (lt (bit_xor:c @0 @1) integer_zerop))
> +   (bit_xor:c (nop_convert (negate (nop_convert (convert
> + 

RE: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector reductions

2024-10-11 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Friday, October 11, 2024 7:52 AM
> To: Richard Sandiford 
> Cc: Jennifer Schmitz ; gcc-patches@gcc.gnu.org; Richard
> Earnshaw ; Kyrylo Tkachov
> ; Tamar Christina 
> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> reductions
> 
> On Thu, 10 Oct 2024, Richard Sandiford wrote:
> 
> > Jennifer Schmitz  writes:
> > > This patch implements the optabs reduc_and_scal_,
> > > reduc_ior_scal_, and reduc_xor_scal_ for ASIMD modes V8QI,
> > > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
> logical
> > > vector reduction operations.
> > > Previously, either only vector registers or only general purpose 
> > > registers (GPR)
> > > were used. Now, vector registers are used for the reduction from 128 to 64
> bits;
> > > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR 
> > > are
> used
> > > for the rest of the reduction steps.
> > >
> > > For example, the test case (V8HI)
> > > int16_t foo (int16_t *a)
> > > {
> > >   int16_t b = -1;
> > >   for (int i = 0; i < 8; ++i)
> > > b &= a[i];
> > >   return b;
> > > }
> > >
> > > was previously compiled to (-O2):
> > > foo:
> > >   ldr q0, [x0]
> > >   moviv30.4s, 0
> > >   ext v29.16b, v0.16b, v30.16b, #8
> > >   and v29.16b, v29.16b, v0.16b
> > >   ext v31.16b, v29.16b, v30.16b, #4
> > >   and v31.16b, v31.16b, v29.16b
> > >   ext v30.16b, v31.16b, v30.16b, #2
> > >   and v30.16b, v30.16b, v31.16b
> > >   umovw0, v30.h[0]
> > >   ret
> > >
> > > With patch, it is compiled to:
> > > foo:
> > >   ldr q31, [x0]
> > >   ext v30.16b, v31.16b, v31.16b, #8
> > >   and v31.8b, v30.8b, v31.8b
> > >   fmovx0, d31
> > >   and x0, x0, x0, lsr 32
> > >   and w0, w0, w0, lsr 16
> > >   ret
> > >
> > > For modes V4SI and V2DI, the pattern was not implemented, because the
> > > current codegen (using only base instructions) is already efficient.
> > >
> > > Note that the PR initially suggested to use SVE reduction ops. However,
> > > they have higher latency than the proposed sequence, which is why using
> > > neon and base instructions is preferable.
> > >
> > > Test cases were added for 8/16-bit integers for all implemented modes and 
> > > all
> > > three operations to check the produced assembly.
> > >
> > > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> > > because for aarch64 vector types, either the logical reduction optabs are
> > > implemented or the codegen for reduction operations is good as it is.
> > > This was motivated by failure of a scan-tree-dump directive in the test 
> > > cases
> > > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> > >
> > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no
> regression.
> > > OK for mainline?
> > >
> > > Signed-off-by: Jennifer Schmitz 
> > >
> > > gcc/
> > >   PR target/113816
> > >   * config/aarch64/aarch64-simd.md (reduc__scal_):
> > >   Implement for logical bitwise operations for VDQV_E.
> > >
> > > gcc/testsuite/
> > >   PR target/113816
> > >   * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > >   * gcc.target/aarch64/simd/logical_reduc.c: New test.
> > >   * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > > ---
> > >  gcc/config/aarch64/aarch64-simd.md|  55 +
> > >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++
> > >  .../gcc.target/aarch64/vect-reduc-or_1.c  |   2 +-
> > >  gcc/testsuite/lib/target-supports.exp |   4 +-
> > >  4 files changed, 267 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > > index 23c03a96371..00286b8b020 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -3608,6 +3608,61 @@
> > >}
> > >  )
> > >
> > > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, 
> > > V16QI,
> > > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively 
> > > operating
> > > +;; on the two halves of the input.
> > > +;; If the input has 128 bits, the first operation is performed in vector
> > > +;; registers.  From 64 bits down, the reduction steps are performed in 
> > > general
> > > +;; purpose registers.
> > > +;; For example, for V8HI and operation AND, the intended sequence is:
> > > +;; EXT  v1.16b, v0.16b, v0.16b, #8
> > > +;; AND  v0.8b, v1.8b, v0.8b
> > > +;; FMOV x0, d0
> > > +;; AND  x0, x0, x0, 32
> > > +;; AND  w0, w0, w0, 16
> > > +;;
> > > +;; For V8QI and operation AND, the sequence is:
> > > +;; AND  x0, x0, x0, lsr 32
> > > +;; AND  w0, w0, w0, lsr, 16
> > > +;; AND  w0, w0, w0, lsr, 8
> > > +
> > > +(define_expand "reduc__scal_"
> > > + [(match_operand: 0 "register_operand")
>

[PATCH] c: Implement C2Y N3355 - Named Loops [PR117022]

2024-10-11 Thread Jakub Jelinek
Hi!

The following patch implements the C2Y N3355 - Named Loops paper.

I've tried to implement it lazily, rather than proactively e.g. push
labels to a vector just in case the following statement is iteration
statement, switch statement or one of the loop pragmas followed by
iteration statement the patch just notes the last statement in
cur_stmt_list if any before c_parser_label/c_parser_all_labels and
passes it down to the iteration/switch statement parsing routines,
which then search backward for LABEL_EXPRs before they reach the given
stop statement.

The patch then adds one extra argument to
{FOR,WHILE,DO,BREAK,CONTINUE,SWITCH}_STMT, which is set to a canonical
name LABEL_DECL (the last named label before the construct).
If one just refers to the innermost construct with a fancy name,
it is in the end parsed the same as break/continue without an identifier
(i.e. NULL_TREE argument), and if a loop or switch has name(s) but
break/continue to that isn't used, the name is set to NULL_TREE.
At c-gimplify.cc time the name is then pushed into a hash map mapping
it to a pair of labels.

I've implemented it also for ObjC foreach loops (which have break/continue
handled during parsing, not during c-gimplify.cc).

As for OpenMP/OpenACC, the patch right now pretends no OpenMP loop
has a name, until something different is decided in the standard.
As shown in the testcases, most break identifier/continue identifier
cases aren't really useful in OpenMP code, a break identifier or
continue identifier jumping out of an OpenMP region is certainly invalid
(such regions have to be single entry single exit, so escaping it
through goto/break lab/continue lab violates that), similarly break
is disallowed in the innermost OpenMP nested loop, just continue
is allowed, so the only thing that would make sense for OpenMP (second
gomp testcase) would be allowing to give name to the innermost
loop in OpenMP canonical loop nest (except that labels aren't allowed
in the syntax right now in between the loops) and only continue to
that label.  For collapse(1) loops that would be a label before
the #pragma or [[omp::directive (parallel for)]] etc.  And of course,
what already works fine in the patch is break/continue to non-OpenMP loops
nested in OpenMP loops.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-10-11  Jakub Jelinek  

PR c/117022
gcc/c-family/
* c-common.def (FOR_STMT, WHILE_STMT, DO_STMT, BREAK_STMT,
CONTINUE_STMT, SWITCH_STMT): Add an extra operand, *_NAME
and document it.
* c-common.h (bc_hash_map_t): New typedef.
(struct bc_state): Add bc_hash_map member.
(WHILE_NAME, DO_NAME, FOR_NAME, BREAK_NAME, CONTINUE_NAME,
SWITCH_STMT_NAME): Define.
* c-pretty-print.cc (c_pretty_printer::statement): Print
BREAK_STMT or CONTINUE_STMT operand if any.
* c-gimplify.cc (bc_hash_map): New static variable.
(note_named_bc, release_named_bc): New functions.
(save_bc_state): Save and clear bc_hash_map.
(restore_bc_state): Assert NULL and restore bc_hash_map.
(genericize_c_loop): Add NAME argument, call note_named_bc
and release_named_bc if non-NULL around the body walk.
(genericize_for_stmt, genericize_while_stmt, genericize_do_stmt):
Adjust callers of it.
(genericize_switch_stmt): Rename break_block variable to blab.
Call note_named_bc and release_named_bc if SWITCH_STMT_NAME is
non-NULL around the body walk.
(genericize_continue_stmt): Handle non-NULL CONTINUE_NAME.
(genericize_break_stmt): Handle non-NULL BREAK_NAME.
(c_genericize): Delete and clear bc_hash_map.
gcc/c/
* c-tree.h: Implement C2Y N3355 - Named loops.
(C_DECL_LOOP_NAME, C_DECL_SWITCH_NAME, C_DECL_LOOP_SWITCH_NAME_VALID,
C_DECL_LOOP_SWITCH_NAME_USED, IN_NAMED_STMT): Define.
(c_get_loop_names, c_release_loop_names, c_finish_bc_name): Declare.
(c_start_switch): Add NAME argument.
(c_finish_bc_stmt): Likewise.
* c-lang.h (struct language_function): Add loop_names and
loop_names_hash members.
* c-parser.cc (c_parser_external_declaration,
c_parser_declaration_or_fndef, c_parser_struct_or_union_specifier,
c_parser_parameter_declaration): Adjust c_parser_pragma caller.
(get_before_labels): New function.
(c_parser_compound_statement_nostart): Call get_before_labels when
needed, adjust c_parser_pragma and c_parser_statement_after_labels
callers.
(c_parser_statement): Call get_before_labels first and pass it to
c_parser_statement_after_labels.
(c_parser_bc_name): New function.
(c_parser_statement_after_labels): Add BEFORE_LABELS argument.  Pass
it down to c_parser_switch_statement, c_parser_while_statement,
c_parser_do_statement, c_parser_for_statement and c_parser_pragma.
Call c_parser_bc_name

RE: [PATCH]middle-end: support SLP early break

2024-10-11 Thread Richard Biener
On Thu, 10 Oct 2024, Tamar Christina wrote:

> > > e.g. if (a != 0) where a is loop invariant.  For instance test_memcmp_1_1
> > > in /gcc.dg/memcmp-1.c is such loop.  Technically we should be able to
> > > vectorize such loops,  but while we can represent externals in the SLP 
> > > tree,
> > > we can't start discovery at them, as no stmt_info for them.
> > >
> > > In principle all I need here is an empty SLP tree, since all codegen is 
> > > driven
> > > by the roots for such invariant compares.  However vect_build_slp_tree
> > > doesn't accept empty stmts.
> > 
> > The externals would have SLP nodes of course but the requirement
> > currently is that the SLP instance root is an internal def.
> > 
> > > I believe we are able to vectorize such loops today,  so perhaps instead 
> > > of
> > > failing we should support building an SLP instance with only roots?
> > 
> > It might be tempting but I don't think this is generally useful.
> > 
> > > In which case should I try to fit it into vect_build_slp_tree or just 
> > > special
> > > case it for the gcond discovery?
> > 
> > The issue is that you have two operands you technically would like to
> > see code-generated - the 'a' and the '0' vector invariants, but the
> > SLP instance only has a single root.  You could (as I suggested)
> > simply only build the SLP node for the (invariant) LHS of the gcond,
> > not by using vect_build_slp_tree but instead by manually building
> > the SLP tree for the invariant - see what vect_build_slp_tree_2 does
> > here:
> > 
> 
> Done,
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> Will test more targets closer to commit.
> 
> Ok for master?
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop.cc (vect_analyze_loop_2): Handle SLP trees with no
>   children.
>   * tree-vectorizer.h (enum slp_instance_kind): Add slp_inst_kind_gcond.
>   (LOOP_VINFO_EARLY_BREAKS_LIVE_IVS): New.
>   (vectorizable_early_exit): Expose.
>   (class _loop_vec_info): Add early_break_live_stmts.
>   * tree-vect-slp.cc (vect_build_slp_instance, vect_analyze_slp_instance):
>   Support gcond instances.
>   (vect_analyze_slp): Analyze gcond roots and early break live statements.
>   (maybe_push_to_hybrid_worklist): Don't sink gconds.
>   (vect_slp_analyze_operations): Support gconds.
>   (vect_slp_check_for_roots): Update comments.
>   (vectorize_slp_instance_root_stmt): Support gconds.
>   (vect_schedule_slp): Pass vinfo to vectorize_slp_instance_root_stmt.
>   * tree-vect-stmts.cc (vect_stmt_relevant_p): Record early break live
>   statements.
>   (vectorizable_early_exit): Support SLP.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/vect/vect-early-break_126.c: New test.
>   * gcc.dg/vect/vect-early-break_127.c: New test.
>   * gcc.dg/vect/vect-early-break_128.c: New test.
> 
> -- inline copy of patch --
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c 
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c
> new file mode 100644
> index 
> ..4bfc9880f9fc869bf616123ff509d13be17ffacf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
> +
> +#define N 1024
> +unsigned vect_a[N];
> +unsigned vect_b[N];
> + 
> +unsigned test4(unsigned x)
> +{
> + unsigned ret = 0;
> + for (int i = 0; i < N; i++)
> + {
> +   vect_b[i] = x + i;
> +   if (vect_a[i] > x)
> + {
> +   ret *= vect_a[i];
> +   return vect_a[i];
> + }
> +   vect_a[i] = x;
> +   ret += vect_a[i] + vect_b[i];
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c 
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c
> new file mode 100644
> index 
> ..67cb5d34a77192e5d7d72c35df8e83535ef184ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
> +
> +#ifndef N
> +#define N 800
> +#endif
> +unsigned vect_a[N];
> +unsigned vect_b[N];
> +  
> +unsigned test4(unsigned x)
> +{
> + unsigned ret = 0;
> + for (int i = 0; i < N; i++)
> + {
> +   vect_b[i] = x + i;
> +   if (vect_a[i]*2 != x)
> + break;
> +   vect_a[i] = x;
> +   
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_128.c 
> b

RE: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector reductions

2024-10-11 Thread Richard Biener
On Fri, 11 Oct 2024, Tamar Christina wrote:

> > -Original Message-
> > From: Richard Biener 
> > Sent: Friday, October 11, 2024 7:52 AM
> > To: Richard Sandiford 
> > Cc: Jennifer Schmitz ; gcc-patches@gcc.gnu.org; Richard
> > Earnshaw ; Kyrylo Tkachov
> > ; Tamar Christina 
> > Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> > reductions
> > 
> > On Thu, 10 Oct 2024, Richard Sandiford wrote:
> > 
> > > Jennifer Schmitz  writes:
> > > > This patch implements the optabs reduc_and_scal_,
> > > > reduc_ior_scal_, and reduc_xor_scal_ for ASIMD modes V8QI,
> > > > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
> > logical
> > > > vector reduction operations.
> > > > Previously, either only vector registers or only general purpose 
> > > > registers (GPR)
> > > > were used. Now, vector registers are used for the reduction from 128 to 
> > > > 64
> > bits;
> > > > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit 
> > > > GPR are
> > used
> > > > for the rest of the reduction steps.
> > > >
> > > > For example, the test case (V8HI)
> > > > int16_t foo (int16_t *a)
> > > > {
> > > >   int16_t b = -1;
> > > >   for (int i = 0; i < 8; ++i)
> > > > b &= a[i];
> > > >   return b;
> > > > }
> > > >
> > > > was previously compiled to (-O2):
> > > > foo:
> > > > ldr q0, [x0]
> > > > moviv30.4s, 0
> > > > ext v29.16b, v0.16b, v30.16b, #8
> > > > and v29.16b, v29.16b, v0.16b
> > > > ext v31.16b, v29.16b, v30.16b, #4
> > > > and v31.16b, v31.16b, v29.16b
> > > > ext v30.16b, v31.16b, v30.16b, #2
> > > > and v30.16b, v30.16b, v31.16b
> > > > umovw0, v30.h[0]
> > > > ret
> > > >
> > > > With patch, it is compiled to:
> > > > foo:
> > > > ldr q31, [x0]
> > > > ext v30.16b, v31.16b, v31.16b, #8
> > > > and v31.8b, v30.8b, v31.8b
> > > > fmovx0, d31
> > > > and x0, x0, x0, lsr 32
> > > > and w0, w0, w0, lsr 16
> > > > ret
> > > >
> > > > For modes V4SI and V2DI, the pattern was not implemented, because the
> > > > current codegen (using only base instructions) is already efficient.
> > > >
> > > > Note that the PR initially suggested to use SVE reduction ops. However,
> > > > they have higher latency than the proposed sequence, which is why using
> > > > neon and base instructions is preferable.
> > > >
> > > > Test cases were added for 8/16-bit integers for all implemented modes 
> > > > and all
> > > > three operations to check the produced assembly.
> > > >
> > > > We also added [istarget aarch64*-*-*] to the selector 
> > > > vect_logical_reduc,
> > > > because for aarch64 vector types, either the logical reduction optabs 
> > > > are
> > > > implemented or the codegen for reduction operations is good as it is.
> > > > This was motivated by failure of a scan-tree-dump directive in the test 
> > > > cases
> > > > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> > > >
> > > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no
> > regression.
> > > > OK for mainline?
> > > >
> > > > Signed-off-by: Jennifer Schmitz 
> > > >
> > > > gcc/
> > > > PR target/113816
> > > > * config/aarch64/aarch64-simd.md (reduc__scal_):
> > > > Implement for logical bitwise operations for VDQV_E.
> > > >
> > > > gcc/testsuite/
> > > > PR target/113816
> > > > * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > > > * gcc.target/aarch64/simd/logical_reduc.c: New test.
> > > > * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > > > ---
> > > >  gcc/config/aarch64/aarch64-simd.md|  55 +
> > > >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++
> > > >  .../gcc.target/aarch64/vect-reduc-or_1.c  |   2 +-
> > > >  gcc/testsuite/lib/target-supports.exp |   4 +-
> > > >  4 files changed, 267 insertions(+), 2 deletions(-)
> > > >  create mode 100644 
> > > > gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > >
> > > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > > > index 23c03a96371..00286b8b020 100644
> > > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > > @@ -3608,6 +3608,61 @@
> > > >}
> > > >  )
> > > >
> > > > +;; Emit a sequence for bitwise logical reductions over vectors for 
> > > > V8QI, V16QI,
> > > > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively 
> > > > operating
> > > > +;; on the two halves of the input.
> > > > +;; If the input has 128 bits, the first operation is performed in 
> > > > vector
> > > > +;; registers.  From 64 bits down, the reduction steps are performed in 
> > > > general
> > > > +;; purpose registers.
> > > > +;; For example, for V8HI and operation AND, the 

RE: [PATCH]middle-end: support SLP early break

2024-10-11 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Friday, October 11, 2024 8:11 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com
> Subject: RE: [PATCH]middle-end: support SLP early break
> 
> On Thu, 10 Oct 2024, Tamar Christina wrote:
> 
> > > > e.g. if (a != 0) where a is loop invariant.  For instance 
> > > > test_memcmp_1_1
> > > > in /gcc.dg/memcmp-1.c is such loop.  Technically we should be able to
> > > > vectorize such loops,  but while we can represent externals in the SLP 
> > > > tree,
> > > > we can't start discovery at them, as no stmt_info for them.
> > > >
> > > > In principle all I need here is an empty SLP tree, since all codegen is 
> > > > driven
> > > > by the roots for such invariant compares.  However vect_build_slp_tree
> > > > doesn't accept empty stmts.
> > >
> > > The externals would have SLP nodes of course but the requirement
> > > currently is that the SLP instance root is an internal def.
> > >
> > > > I believe we are able to vectorize such loops today,  so perhaps 
> > > > instead of
> > > > failing we should support building an SLP instance with only roots?
> > >
> > > It might be tempting but I don't think this is generally useful.
> > >
> > > > In which case should I try to fit it into vect_build_slp_tree or just 
> > > > special
> > > > case it for the gcond discovery?
> > >
> > > The issue is that you have two operands you technically would like to
> > > see code-generated - the 'a' and the '0' vector invariants, but the
> > > SLP instance only has a single root.  You could (as I suggested)
> > > simply only build the SLP node for the (invariant) LHS of the gcond,
> > > not by using vect_build_slp_tree but instead by manually building
> > > the SLP tree for the invariant - see what vect_build_slp_tree_2 does
> > > here:
> > >
> >
> > Done,
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > Will test more targets closer to commit.
> >
> > Ok for master?
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-loop.cc (vect_analyze_loop_2): Handle SLP trees with no
> > children.
> > * tree-vectorizer.h (enum slp_instance_kind): Add slp_inst_kind_gcond.
> > (LOOP_VINFO_EARLY_BREAKS_LIVE_IVS): New.
> > (vectorizable_early_exit): Expose.
> > (class _loop_vec_info): Add early_break_live_stmts.
> > * tree-vect-slp.cc (vect_build_slp_instance, vect_analyze_slp_instance):
> > Support gcond instances.
> > (vect_analyze_slp): Analyze gcond roots and early break live statements.
> > (maybe_push_to_hybrid_worklist): Don't sink gconds.
> > (vect_slp_analyze_operations): Support gconds.
> > (vect_slp_check_for_roots): Update comments.
> > (vectorize_slp_instance_root_stmt): Support gconds.
> > (vect_schedule_slp): Pass vinfo to vectorize_slp_instance_root_stmt.
> > * tree-vect-stmts.cc (vect_stmt_relevant_p): Record early break live
> > statements.
> > (vectorizable_early_exit): Support SLP.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/vect/vect-early-break_126.c: New test.
> > * gcc.dg/vect/vect-early-break_127.c: New test.
> > * gcc.dg/vect/vect-early-break_128.c: New test.
> >
> > -- inline copy of patch --
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c
> > new file mode 100644
> > index
> ..4bfc9880f9fc869bf616123
> ff509d13be17ffacf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_126.c
> > @@ -0,0 +1,28 @@
> > +/* { dg-do compile } */
> > +/* { dg-add-options vect_early_break } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> > +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
> > +
> > +#define N 1024
> > +unsigned vect_a[N];
> > +unsigned vect_b[N];
> > +
> > +unsigned test4(unsigned x)
> > +{
> > + unsigned ret = 0;
> > + for (int i = 0; i < N; i++)
> > + {
> > +   vect_b[i] = x + i;
> > +   if (vect_a[i] > x)
> > + {
> > +   ret *= vect_a[i];
> > +   return vect_a[i];
> > + }
> > +   vect_a[i] = x;
> > +   ret += vect_a[i] + vect_b[i];
> > + }
> > + return ret;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c
> > new file mode 100644
> > index
> ..67cb5d34a77192e5d7d72
> c35df8e83535ef184ab
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_127.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-add-options vect_early_break } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> > +/* { dg-final { scan-tree-dump "L

[PATCH] i386: Fix up spaceship expanders for -mtune=i[45]86 [PR117053]

2024-10-11 Thread Jakub Jelinek
Hi!

The adjusted and new spaceship expanders ICE with -mtune=i486 or
-mtune=i586.
The problem is that in that case TARGET_ZERO_EXTEND_WITH_AND is true
and zero_extendqisi2 isn't allowed in that case, and we can't use
the replacement AND, because that clobbers flags and we want to use them
again.

The following patch fixes that by using in those cases roughly what
we want to expand it to after peephole2 optimizations, i.e. xor
before the comparison, *setcc_qi_slp and sbbl $0 (or for signed
int case xoring of 2 regs, two *setcc_qi_slp, subl).
For *setcc_qi_slp, it uses the setcc_si_slp hacks with UNSPEC that
were in use for the floating point jp case (so such code is IMHO
undesirable for the !TARGET_ZERO_EXTEND_WITH_AND case as we want to
give combiner more liberty in that case).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-10-11  Jakub Jelinek  

PR target/117053
* config/i386/i386-expand.cc (ix86_expand_fp_spaceship): Handle
TARGET_ZERO_EXTEND_WITH_AND differently.
(ix86_expand_int_spaceship): Likewise.

* g++.target/i386/pr116896-3.C: New test.

--- gcc/config/i386/i386-expand.cc.jj   2024-10-08 10:44:30.144935903 +0200
+++ gcc/config/i386/i386-expand.cc  2024-10-10 20:16:05.192669243 +0200
@@ -3150,7 +3150,9 @@ ix86_expand_fp_spaceship (rtx dest, rtx
 {
   gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
   rtx zero = NULL_RTX;
-  if (op2 != const0_rtx && TARGET_IEEE_FP && GET_MODE (dest) == SImode)
+  if (op2 != const0_rtx
+  && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
+  && GET_MODE (dest) == SImode)
 zero = force_reg (SImode, const0_rtx);
   rtx gt = ix86_expand_fp_compare (GT, op0, op1);
   rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
@@ -3190,15 +3192,20 @@ ix86_expand_fp_spaceship (rtx dest, rtx
 }
   else
 {
-  rtx lt_tmp = gen_reg_rtx (QImode);
-  ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
-const0_rtx);
-  if (GET_MODE (dest) != QImode)
+  rtx lt_tmp = NULL_RTX;
+  if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
{
- tmp = gen_reg_rtx (GET_MODE (dest));
- emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
-   lt_tmp)));
- lt_tmp = tmp;
+ lt_tmp = gen_reg_rtx (QImode);
+ ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
+const0_rtx);
+ if (GET_MODE (dest) != QImode)
+   {
+ tmp = gen_reg_rtx (GET_MODE (dest));
+ emit_insn (gen_rtx_SET (tmp,
+ gen_rtx_ZERO_EXTEND (GET_MODE (dest),
+  lt_tmp)));
+ lt_tmp = tmp;
+   }
}
   rtx gt_tmp;
   if (zero)
@@ -3206,7 +3213,9 @@ ix86_expand_fp_spaceship (rtx dest, rtx
  /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
 before the floating point comparison and use setcc_si_slp
 pattern to hide it from the combiner, so that it doesn't
-undo it.  */
+undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
+the ZERO_EXTEND normally emitted would need to be AND
+with flags clobber.  */
  tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
  PUT_MODE (tmp, QImode);
  emit_insn (gen_setcc_si_slp (zero, tmp, zero));
@@ -3225,10 +3234,23 @@ ix86_expand_fp_spaceship (rtx dest, rtx
  gt_tmp = tmp;
}
}
-  tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
-0, OPTAB_DIRECT);
-  if (!rtx_equal_p (tmp, dest))
-   emit_move_insn (dest, tmp);
+  if (lt_tmp)
+   {
+ tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
+dest, 0, OPTAB_DIRECT);
+ if (!rtx_equal_p (tmp, dest))
+   emit_move_insn (dest, tmp);
+   }
+  else
+   {
+ /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
+do ZERO_EXTEND without clobbering flags.  */
+ tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
+ PUT_MODE (tmp, SImode);
+ emit_insn (gen_subsi3_carry (dest, gt_tmp,
+  force_reg (GET_MODE (dest), const0_rtx),
+  XEXP (gt, 0), tmp));
+   }
 }
   emit_jump (lend);
   if (l2)
@@ -3246,6 +3268,14 @@ void
 ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
 {
   gcc_assert (INTVAL (op2));
+  rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
+  if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
+{
+  zero1 = force_reg (SImode, const0_rtx);
+  if (INTVAL (op2) != 1)
+   zer

[PATCH 2/2] PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction

2024-10-11 Thread Kyrylo Tkachov
The pattern for the Advanced SIMD XAR instruction isn't very
optimization-friendly at the moment.
In the testcase from the PR once simlify-rtx has done its work it
generates the RTL:
(set (reg:V2DI 119 [ _14 ])
(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
(reg:V2DI 116 [ *m1_01_8(D) ]))
(const_vector:V2DI [
(const_int 32 [0x20]) repeated x2
])))

which fails to match our XAR pattern because the pattern expects:
1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
documentation the preferred form of rotate-by-immediate is ROTATE, which
I take to mean it's the canonical form.
ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
one canonical representation.
2) A CONST_INT shift amount whereas the midend asks for a repeated vector
constant.

These issues are fixed by introducing a dedicated expander for the
aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translates
the intrinsic-level CONST_INT immediate (the right-rotate amount) into
a repeated vector constant subtracted from 64 to give the corresponding
left-rotate amount that is fed to the new representation for the XAR
define_insn that uses the ROTATE RTL code.  This is a similar approach
to how we handle the discrepancy between intrinsic-level and RTL-level
vector lane numbers for big-endian.

With this patch and [1/2] the arithmetic parts of the testcase now simplify
to just one XAR instruction.

Bootstrapped and tested on aarch64-none-linux-gnu.
I’ll push it after patch approval of [1/2] leaving some time for comments.

I’ll note that the SVE2 patterns for XAR should also be improved in a similar
way, but that is a separate patch.

Thanks,
Kyrill 

Signed-off-by: Kyrylo Tkachov 

gcc/
PR target/117048
* config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
define_expand.
(*aarch64_xarqv2di_insn): Define.

gcc/testsuite/
PR target/117048
* g++.target/aarch64/pr117048.C: New test.



0002-PR-target-117048-aarch64-Use-more-canonical-and-opti.patch
Description: 0002-PR-target-117048-aarch64-Use-more-canonical-and-opti.patch


[PATCH 1/2] PR 117048: simplify-rtx: Extend (x << C1) | (X >> C2) --> ROTATE transformation to vector operands

2024-10-11 Thread Kyrylo Tkachov
Hi all,

In the testcase from patch [2/2] we want to match a vector rotate operation from
an IOR of left and right shifts by immediate.  simplify-rtx has code for just
that, but it looks like it's prepared to handle only scalar operands.
In practice most of the code works for vector modes as well except the shift
amounts are checked to be CONST_INT rather than vector constants that we have
here.  This is easily extended by using unwrap_const_vec_duplicate to extract
the repeating constant shift amount.  With this change combine now tries
matching the simpler and expected:
(set (reg:V2DI 119 [ _14 ])
(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
(reg:V2DI 116 [ *m1_01_8(D) ]))
(const_vector:V2DI [
(const_int 32 [0x20]) repeated x2
])))
instead of the previous:
(set (reg:V2DI 119 [ _14 ])
(ior:V2DI (ashift:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
(reg:V2DI 116 [ *m1_01_8(D) ]))
(const_vector:V2DI [
(const_int 32 [0x20]) repeated x2
]))
(lshiftrt:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
(reg:V2DI 116 [ *m1_01_8(D) ]))
(const_vector:V2DI [
(const_int 32 [0x20]) repeated x2
]

To actually fix the PR the aarch64 backend needs some adjustment as well
which is done in patch [2/2], which adds the testcase as well.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for mainline?
Thanks,
Kyrill

Signed-off-by: Kyrylo Tkachov 

PR target/117048
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
Handle vector constants in (x << C1) | (x >> C2) -> ROTATE
simplification.



0001-PR-117048-simplify-rtx-Extend-x-C1-X-C2-ROTATE-trans.patch
Description: 0001-PR-117048-simplify-rtx-Extend-x-C1-X-C2-ROTATE-trans.patch


Re: [PATCH] middle-end: [PR middle-end/116926] Allow widening optabs for vec-mode -> scalar-mode

2024-10-11 Thread Richard Biener
On Thu, Oct 10, 2024 at 5:25 PM Victor Do Nascimento
 wrote:
>
> The recent refactoring of the dot_prod optab to convert-type exposed a
> limitation in how `find_widening_optab_handler_and_mode' is currently
> implemented, owing to the fact that, while the function expects the
>
>   GET_MODE_CLASS (from_mode) == GET_MODE_CLASS (to_mode)
>
> condition to hold, the c6x backend implements a dot product from V2HI
> to SI, which triggers an ICE.
>
> Consequently, this patch adds some logic to allow widening optabs
> which accumulate vector elements to a single scalar.
>
> Regression tested on x86_64 and aarch64 with no new regressions.
> Fixes failing unit tests on c6x, as validated for the tic6x-unknown-elf
> target.
>
> Ok for master?
>
> gcc/ChangeLog:
>
> PR middle-end/116926
> * optabs-query.cc (find_widening_optab_handler_and_mode): Add
> handling of vector -> scalar optab handling.
> ---
>  gcc/optabs-query.cc | 13 +
>  1 file changed, 13 insertions(+)
>
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index c3134d6a2ce..8a9092ffec7 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -485,6 +485,19 @@ find_widening_optab_handler_and_mode (optab op, 
> machine_mode to_mode,
>if (GET_MODE_CLASS (limit_mode) == MODE_PARTIAL_INT)
> limit_mode = GET_MODE_WIDER_MODE (limit_mode).require ();
>  }
> +  else if (GET_MODE_CLASS (from_mode) != GET_MODE_CLASS (to_mode))
> +{
> +  gcc_checking_assert (VECTOR_MODE_P (from_mode)
> +  && !VECTOR_MODE_P (to_mode)
> +  && GET_MODE_INNER (from_mode) < to_mode);
> +  enum insn_code handler = convert_optab_handler (op, to_mode, 
> from_mode);
> +  if (handler != CODE_FOR_nothing)
> +   {
> + if (found_mode)
> +   *found_mode = from_mode;
> + return handler;
> +   }

   else if (is_a  (to_mode))
 {
gcc_checking_assert (VECTOR_MODE_P (from_mode)
&& GET_MODE_INNER
(from_mode) < to_mode);
limit_mode = from_mode;
 }
   else
...

would also work?

Thanks,
Richard.

> +}
>else
>  gcc_checking_assert (GET_MODE_CLASS (from_mode) == GET_MODE_CLASS 
> (to_mode)
>  && from_mode < to_mode);
> --
> 2.34.1
>


Re: [PATCH] i386: Fix up spaceship expanders for -mtune=i[45]86 [PR117053]

2024-10-11 Thread Uros Bizjak
On Fri, Oct 11, 2024 at 9:16 AM Jakub Jelinek  wrote:
>
> Hi!
>
> The adjusted and new spaceship expanders ICE with -mtune=i486 or
> -mtune=i586.
> The problem is that in that case TARGET_ZERO_EXTEND_WITH_AND is true
> and zero_extendqisi2 isn't allowed in that case, and we can't use
> the replacement AND, because that clobbers flags and we want to use them
> again.
>
> The following patch fixes that by using in those cases roughly what
> we want to expand it to after peephole2 optimizations, i.e. xor
> before the comparison, *setcc_qi_slp and sbbl $0 (or for signed
> int case xoring of 2 regs, two *setcc_qi_slp, subl).
> For *setcc_qi_slp, it uses the setcc_si_slp hacks with UNSPEC that
> were in use for the floating point jp case (so such code is IMHO
> undesirable for the !TARGET_ZERO_EXTEND_WITH_AND case as we want to
> give combiner more liberty in that case).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2024-10-11  Jakub Jelinek  
>
> PR target/117053
> * config/i386/i386-expand.cc (ix86_expand_fp_spaceship): Handle
> TARGET_ZERO_EXTEND_WITH_AND differently.
> (ix86_expand_int_spaceship): Likewise.
>
> * g++.target/i386/pr116896-3.C: New test.

OK.

Thanks,
Uros.

>
> --- gcc/config/i386/i386-expand.cc.jj   2024-10-08 10:44:30.144935903 +0200
> +++ gcc/config/i386/i386-expand.cc  2024-10-10 20:16:05.192669243 +0200
> @@ -3150,7 +3150,9 @@ ix86_expand_fp_spaceship (rtx dest, rtx
>  {
>gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
>rtx zero = NULL_RTX;
> -  if (op2 != const0_rtx && TARGET_IEEE_FP && GET_MODE (dest) == SImode)
> +  if (op2 != const0_rtx
> +  && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
> +  && GET_MODE (dest) == SImode)
>  zero = force_reg (SImode, const0_rtx);
>rtx gt = ix86_expand_fp_compare (GT, op0, op1);
>rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
> @@ -3190,15 +3192,20 @@ ix86_expand_fp_spaceship (rtx dest, rtx
>  }
>else
>  {
> -  rtx lt_tmp = gen_reg_rtx (QImode);
> -  ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
> -const0_rtx);
> -  if (GET_MODE (dest) != QImode)
> +  rtx lt_tmp = NULL_RTX;
> +  if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
> {
> - tmp = gen_reg_rtx (GET_MODE (dest));
> - emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
> -   lt_tmp)));
> - lt_tmp = tmp;
> + lt_tmp = gen_reg_rtx (QImode);
> + ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
> +const0_rtx);
> + if (GET_MODE (dest) != QImode)
> +   {
> + tmp = gen_reg_rtx (GET_MODE (dest));
> + emit_insn (gen_rtx_SET (tmp,
> + gen_rtx_ZERO_EXTEND (GET_MODE (dest),
> +  lt_tmp)));
> + lt_tmp = tmp;
> +   }
> }
>rtx gt_tmp;
>if (zero)
> @@ -3206,7 +3213,9 @@ ix86_expand_fp_spaceship (rtx dest, rtx
>   /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
>  before the floating point comparison and use setcc_si_slp
>  pattern to hide it from the combiner, so that it doesn't
> -undo it.  */
> +undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
> +the ZERO_EXTEND normally emitted would need to be AND
> +with flags clobber.  */
>   tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
>   PUT_MODE (tmp, QImode);
>   emit_insn (gen_setcc_si_slp (zero, tmp, zero));
> @@ -3225,10 +3234,23 @@ ix86_expand_fp_spaceship (rtx dest, rtx
>   gt_tmp = tmp;
> }
> }
> -  tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, 
> dest,
> -0, OPTAB_DIRECT);
> -  if (!rtx_equal_p (tmp, dest))
> -   emit_move_insn (dest, tmp);
> +  if (lt_tmp)
> +   {
> + tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
> +dest, 0, OPTAB_DIRECT);
> + if (!rtx_equal_p (tmp, dest))
> +   emit_move_insn (dest, tmp);
> +   }
> +  else
> +   {
> + /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
> +do ZERO_EXTEND without clobbering flags.  */
> + tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
> + PUT_MODE (tmp, SImode);
> + emit_insn (gen_subsi3_carry (dest, gt_tmp,
> +  force_reg (GET_MODE (dest), 
> const0_rtx),
> +  XEXP (gt, 0), tmp));
> +   }
>  }
>emit_jump (lend);
>if (l2)
> @@ -3246,6 +3268,14 @@ void
>  

Re: [to-be-committed][RISC-V] Slightly improve broadcasting small constants into vectors

2024-10-11 Thread Jeff Law




On 10/11/24 5:40 PM, Andrew Waterman wrote:

Whether or not we should use vmv.v.i vs vmv.s.x for loading [-16..15]
into the 0th element is probably uarch dependent.  The tradeoff is
loading the GPR vs the broadcast in the vector unit.  I didn't bother
with this case.


Note that this tradeoff is only interesting when LMUL is small.  When
LMUL is large, vmv.v.i does a lot more work than vmv.s.x (writing
multiple vector registers versus just one).

Very true and I would expect LMUL <= 1 to be the most common case.


Mostly it's a matter of spotting something dumb and fixing it rather 
than having to answer questions later about dumb codegen.  I doubt any 
of these cases matter in practice.



Jeff



Re: [to-be-committed][RISC-V] Slightly improve broadcasting small constants into vectors

2024-10-11 Thread Andrew Waterman
On Fri, Oct 11, 2024 at 6:26 AM Jeff Law  wrote:
>
> I probably spent way more time on this than it's worth...
>
> I was looking at the code we generate for vector SAD and noticed that we
> were being a bit silly.  Specifically:
>
>  li  a4,0# 272   [c=4 l=4]  *movsi_internal/1
>
> Followed shortly by:
>
>  vmv.s.x v3,a4   # 261   [c=4 l=4]  *pred_broadcastrvvm1si/6
>
> And no other uses of a4.  We could have used x0 trivially.
>
> First we adjust the expander so that it doesn't force the constant into
> a register.  In the matching pattern we change the appropriate source
> constraints from "r" to "rJ" and the output template is changed to use
> %z for the operand.  The net is we drop the li completely and emit
> vmv.s.x,v3,x0.
>
> But wait, there's more.  If we're broadcasting a constant in the range
> [-16..15] into a vector, we currently load the constant into a register
> and use vmv.v.r.  We can instead use vmv.v.i, which avoids loading the
> constant into a GPR.  For that case we again avoid forcing the constant
> into a register in the expander and adjust the output template to emit
> vmv.v.x or vmv.v.i based on whether or not the appropriate operand is a
> constant or general purpose register.  So again, we'll drop a load
> immediate into a scalar for this case.
>
> Whether or not we should use vmv.v.i vs vmv.s.x for loading [-16..15]
> into the 0th element is probably uarch dependent.  The tradeoff is
> loading the GPR vs the broadcast in the vector unit.  I didn't bother
> with this case.

Note that this tradeoff is only interesting when LMUL is small.  When
LMUL is large, vmv.v.i does a lot more work than vmv.s.x (writing
multiple vector registers versus just one).

>
> Tested in my tester (which tests rv64gcv as a default codegen option).
> Will wait for the pre-commit tester to render a verdict.
>
> Jeff


[PATCH] gcc.target/i386/invariant-ternlog-1.c: Also scan (%edx)

2024-10-11 Thread H.J. Lu
Since x32 uses (%edx), instead of (%rdx), also scan (%edx).

* gcc.target/i386/invariant-ternlog-1.c: Also scan (%edx).


-- 
H.J.
From eb27c432407702cf18c460dac08696d80851e729 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Sat, 12 Oct 2024 05:04:33 +0800
Subject: [PATCH] gcc.target/i386/invariant-ternlog-1.c: Also scan (%edx)

Since x32 uses (%edx), instead of (%rdx), also scan (%edx).

	* gcc.target/i386/invariant-ternlog-1.c: Also scan (%edx).

Signed-off-by: H.J. Lu 
---
 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
index bf67ed7e43d..77b95d3bb33 100644
--- a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
+++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
 /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
-/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%(?:r|e)dx\)} 2 { target { ! ia32 } } } } */
 
 #include 
 
-- 
2.47.0



Re: [PATCH v1 3/4] RISC-V: Implement vector SAT_SUB for signed integer

2024-10-11 Thread 钟居哲
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2024-10-11 14:22
To: gcc-patches
CC: richard.guenther; Tamar.Christina; juzhe.zhong; kito.cheng; jeffreyalaw; 
rdapp.gcc; Pan Li
Subject: [PATCH v1 3/4] RISC-V: Implement vector SAT_SUB for signed integer
From: Pan Li 
 
This patch would like to implement the sssub for vector signed integer.
 
Form 1:
  #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX) \
  void __attribute__((noinline))   \
  vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T minus = (UT)x - (UT)y;   \
out[i] = (x ^ y) >= 0  \
  ? minus  \
  : (minus ^ x) >= 0   \
? minus\
: x < 0 ? MIN : MAX;   \
  }\
  }
 
DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
 
Before this patch:
  28   │ vle8.v  v1,0(a1)
  29   │ vle8.v  v2,0(a2)
  30   │ sub a3,a3,a5
  31   │ add a1,a1,a5
  32   │ add a2,a2,a5
  33   │ vsra.vi v4,v1,7
  34   │ vsub.vv v3,v1,v2
  35   │ vxor.vv v2,v1,v2
  36   │ vxor.vv v0,v1,v3
  37   │ vmslt.viv2,v2,0
  38   │ vmslt.viv0,v0,0
  39   │ vmand.mmv0,v0,v2
  40   │ vxor.vv v3,v4,v5,v0.t
  41   │ vse8.v  v3,0(a0)
  42   │ add a0,a0,a5
 
After this patch:
  25   │ vle8.v  v1,0(a1)
  26   │ vle8.v  v2,0(a2)
  27   │ sub a3,a3,a5
  28   │ add a1,a1,a5
  29   │ add a2,a2,a5
  30   │ vssub.vvv1,v1,v2
  31   │ vse8.v  v1,0(a0)
  32   │ add a0,a0,a5
 
The below test suites are passed for this patch.
* The rv64gcv fully regression test.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (sssub3): Add new pattern for
signed SAT_SUB.
* config/riscv/riscv-protos.h (expand_vec_sssub): Add new func
decl to expand sssub to vssub.
* config/riscv/riscv-v.cc (expand_vec_sssub): Add new func
impl to expand sssub to vssub.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md | 11 +++
gcc/config/riscv/riscv-protos.h |  1 +
gcc/config/riscv/riscv-v.cc |  9 +
3 files changed, 21 insertions(+)
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 836cdd4491f..7dc78a48874 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2734,6 +2734,17 @@ (define_expand "ussub3"
   }
)
+(define_expand "sssub3"
+  [(match_operand:V_VLSI 0 "register_operand")
+   (match_operand:V_VLSI 1 "register_operand")
+   (match_operand:V_VLSI 2 "register_operand")]
+  "TARGET_VECTOR"
+  {
+riscv_vector::expand_vec_sssub (operands[0], operands[1], operands[2], 
mode);
+DONE;
+  }
+)
+
(define_expand "ustrunc2"
   [(match_operand: 0 "register_operand")
(match_operand:VWEXTI   1 "register_operand")]
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 1e6d10a1402..b2f5d72f494 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -649,6 +649,7 @@ void expand_vec_lfloor (rtx, rtx, machine_mode, 
machine_mode);
void expand_vec_usadd (rtx, rtx, rtx, machine_mode);
void expand_vec_ssadd (rtx, rtx, rtx, machine_mode);
void expand_vec_ussub (rtx, rtx, rtx, machine_mode);
+void expand_vec_sssub (rtx, rtx, rtx, machine_mode);
void expand_vec_double_ustrunc (rtx, rtx, machine_mode);
void expand_vec_quad_ustrunc (rtx, rtx, machine_mode, machine_mode);
void expand_vec_oct_ustrunc (rtx, rtx, machine_mode, machine_mode,
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index ca3a80cceb9..fba35652cc2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4902,6 +4902,15 @@ expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, 
machine_mode vec_mode)
   emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
}
+/* Expand the standard name sssub3 for vector mode,  we can leverage
+   the vector fixed point vector single-width saturating subtract directly.  */
+
+void
+expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode);
+}
+
/* Expand the standard name ustrunc2 for double vector mode,  like
DI => SI.  we can leverage the vector fixed point vector narrowing
fixed-point cl

Re: [PATCH] [PR86710][PR116826] match.pd: Fold logarithmic identities.

2024-10-11 Thread Jennifer Schmitz


> On 8 Oct 2024, at 10:44, Richard Biener  wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Thu, 3 Oct 2024, Jennifer Schmitz wrote:
> 
>> 
>> 
>>> On 1 Oct 2024, at 14:27, Richard Biener  wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> On Tue, 1 Oct 2024, Jennifer Schmitz wrote:
>>> 
 This patch implements 4 rules for logarithmic identities in match.pd
 under -funsafe-math-optimizations:
 1) logN(1.0/a) -> -logN(a). This avoids the division instruction.
 2) logN(C/a) -> logN(C) - logN(a), where C is a real constant. Same as 1).
 3) logN(a) + logN(b) -> logN(a*b). This reduces the number of calls to
 log function.
 4) logN(a) - logN(b) -> logN(a/b). Same as 3).
 Tests were added for float, double, and long double.
 
 The patch was bootstrapped and regtested on aarch64-linux-gnu and
 x86_64-linux-gnu, no regression.
 Additionally, SPEC 2017 fprate was run. While the transform does not seem
 to be triggered, we also see no non-noise impact on performance.
 OK for mainline?
>>> 
>>> Since log can set errno we have the builtins affect global memory and
>>> thus have VDEFs, this posses issues for match.pd which does not assign
>>> new VDEFs upon materializing the result, esp. for the case where
>>> you duplicate a call.  There's a similar issue for -frounding-math
>>> where intermediate FP status changes can be lost.  match.pd simply
>>> follows the SSA use-def chains without regarding memory side-effects.
>>> 
>>> The transforms are guarded by flag_unsafe_math_optimizations but here
>>> I think we need !HONOR_SIGN_DEPENDENT_ROUNDING, !flag_trapping_math
>>> (exception state might be different for logN(a) - logN(b) -> logN(a/b),
>>> at least WRT INEXACT?), and !flag_errno_math (because of the VDEFs).
>>> 
>>> +  /* Simplify logN(C/a) into logN(C)-logN(a).  */
>>> +  (simplify
>>> +   (logs (rdiv:s REAL_CST@0 @1))
>>> +(minus (logs @0) (logs @1)))
>>> 
>>> I think you want
>>> 
>>>(minus (logs! @0) (logs @1))
>>> 
>>> here to make sure we constant-fold.
>>> 
>>> +  (simplify
>>> +   (minus (logs:s @0) (logs:s @1))
>>> +(logs (rdiv @0 @1
>>> 
>>> I think that's somewhat dangerous for @1 == 0 given log for
>>> zero arg results in -HUGE_VAL but a FP division by zero gives a NaN.
>>> I'm not exactly sure whether !HONOR_INFINITIES && !HONOR_NANS
>>> is good enough here.
>>> 
>>> Your testcases probably all trigger during GENERIC folding,
>>> bypassing the VDEF issue - you might want to try assigning
>>> the comparison operands to temporaries to run into the actual
>>> issues.
>> Dear Richard,
>> Thanks for the review and suggesting the additional flags. I added
>> - !HONOR_SIGN_DEPENDENT_ROUNDING
>> - !flag_trapping_math
>> - !flag_errno_math
>> - !HONOR_INFINITIES
>> - !HONOR_NANS
>> as guard before the patterns.
>> Can we add anything else to account for HUGE_VAL or will !HONOR_INFINITIES 
>> && !HONOR_NANS be enough? Or do you have a suggestion how I can check this?
>> I validated again on aarch64 and x86_64.
> 
> Can you change your patch attachment format to be at least text/plain
> and ideally just inline it or use git send-mail?  This way reviewing
> is much easier.
> 
> I'll quote it here for reference:
> 
> (if (flag_unsafe_math_optimizations)
> 
> [...]
> 
> + (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
> +  && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
> +  && ! flag_trapping_math
> +  && ! flag_errno_math)
> +  (for logs (LOG LOG2 LOG10)
> +   /* Simplify logN(1.0/a) into -logN(a).  */
> +   (simplify
> +(logs (rdiv:s real_onep@0 @1))
> + (negate (logs @1)))
> +
> +   /* Simplify logN(C/a) into logN(C)-logN(a).  */
> +   (simplify
> +(logs (rdiv:s REAL_CST@0 @1))
> + (minus (logs! @0) (logs @1)))
> +
> +   /* Simplify logN(a)+logN(b) into logN(a*b).  */
> +   (simplify
> +(plus (logs:s @0) (logs:s @1))
> + (logs (mult @0 @1)))
> +
> +   /* Simplify logN(a)-logN(b) into logN(a/b).  */
> +   (simplify
> +(minus (logs:s @0) (logs:s @1))
> + (logs (rdiv @0 @1)
> 
> I'm OK with the extra guards added but I'm also not a IEEE FP expert
> here.  In the previous review I did mention the extra constraints
> for specific sub-patterns IIRC.
> 
> As a general note we might want to implement a HONOR_ERRNO (combined_fn).
> Probably a bad name, builtin_can_set_errno (combined_fn) might be better,
> for example above I'm not sure whether all of log(), log2() and log10()
> can set errno and how we generally should handle errno when
> -fno-math-errno isn't given and GCC wasn't configured specifically
> to target for example glibc.  For glibc it might be nice if we could
> tell it we're not interested in errno from math functions and in this
> way convey -fno-math-errno to it for example via a crtfastmath like
> mechanism (or by calling alternate entry points).
> 
> So, the patch is OK in case Joseph doesn't have any comments during
> t

[PATCH] SVE intrinsics: Fold svmul with constant power-of-2 operand to svlsl

2024-10-11 Thread Jennifer Schmitz
Previously submitted in 
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663435.html

For svmul, if one of the operands is a constant vector with a uniform
power of 2, this patch folds the multiplication to a left-shift by
immediate (svlsl).
Because the shift amount in svlsl is the second operand, the order of the
operands is switched, if the first operand contained the powers of 2. However,
this switching is not valid for some predications: If the predication is
_m and the predicate is not ptrue, the result of svlsl might not be the
same as for svmul. Therefore, we do not apply the fold in this case.
The transform is also not applied to INTMIN for signed integers and to
constant vectors of 1 (this case is partially covered by constant folding
already and the missing cases will be addressed by the follow-up patch
suggested in
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663275.html).

Tests were added in the existing test harness to check the produced assembly
- when the first or second operand contains the power of 2
- when the second operand is a vector or scalar (_n)
- for _m, _z, _x predication
- for _m with ptrue or non-ptrue
- for intmin for signed integer types
- for the maximum power of 2 for signed and unsigned integer types.
Note that we used 4 as a power of 2, instead of 2, because a recent
patch optimizes left-shifts by 1 to an add instruction. But since we
wanted to highlight the change to an lsl instruction we used a higher
power of 2.
To also check correctness, runtime tests were added.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svmul_impl::fold):
Implement fold to svlsl for power-of-2 operands.

gcc/testsuite/
* gcc.target/aarch64/sve/acle/asm/mul_s8.c: New test.
* gcc.target/aarch64/sve/acle/asm/mul_s16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s64.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u8.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u64.c: Likewise.
* gcc.target/aarch64/sve/mul_const_run.c: Likewise.
---
 .../aarch64/aarch64-sve-builtins-base.cc  |  36 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s16.c | 353 +++--
 .../gcc.target/aarch64/sve/acle/asm/mul_s32.c | 353 +++--
 .../gcc.target/aarch64/sve/acle/asm/mul_s64.c | 361 --
 .../gcc.target/aarch64/sve/acle/asm/mul_s8.c  | 353 +++--
 .../gcc.target/aarch64/sve/acle/asm/mul_u16.c | 322 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_u32.c | 322 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_u64.c | 332 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_u8.c  | 327 ++--
 .../gcc.target/aarch64/sve/mul_const_run.c| 101 +
 10 files changed, 2620 insertions(+), 240 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index afce52a7e8d..0ba350edfe5 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2035,7 +2035,41 @@ public:
|| is_ptrue (pg, f.type_suffix (0).element_bytes)))
   return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));
 
-return NULL;
+/* If one of the operands is a uniform power of 2, fold to a left shift
+   by immediate.  */
+tree op1_cst = uniform_integer_cst_p (op1);
+tree op2_cst = uniform_integer_cst_p (op2);
+tree shift_op1, shift_op2;
+if (op1_cst && integer_pow2p (op1_cst)
+   && (f.pred != PRED_m
+   || is_ptrue (pg, f.type_suffix (0).element_bytes)))
+  {
+   shift_op1 = op2;
+   shift_op2 = op1_cst;
+  }
+else if (op2_cst && integer_pow2p (op2_cst))
+  {
+   shift_op1 = op1;
+   shift_op2 = op2_cst;
+  }
+else
+  return NULL;
+
+if ((f.type_suffix (0).unsigned_p && tree_to_uhwi (shift_op2) == 1)
+   || (!f.type_suffix (0).unsigned_p
+   && (tree_int_cst_sign_bit (shift_op2)
+   || tree_to_shwi (shift_op2) == 1)))
+  return NULL;
+
+shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)),
+ tree_log2 (shift_op2));
+function_instance instance ("svlsl", functions::svlsl,
+   shapes::binary_uint_opt_n, MODE_n,
+   f.type_suffix_ids, GROUP_none, f.pred);
+gcall *call = f.redirect_call (instance);
+gimple_call_set_arg (call, 1, shift_op1);
+gimple_call_set_arg (call, 2, shift_op2);
+return call;
   }
 };
 
dif

[PATCH] gcc.target/i386/pr53533-[13].c: Adjust assembly scan

2024-10-11 Thread H.J. Lu
Before

1089d083117 Simplify (B * v + C) * D -> BD* v + CD when B,C,D are all INTEGER_CS
T.

the loop was

.L2:
movl (%rdi,%rdx), %eax
addl $12345, %eax
imull $-1564285888, %eax, %eax
leal -333519936(%rax), %eax
movl %eax, (%rsi,%rdx)
addq $4, %rdx
cmpq $1024, %rdx
jne .L2

There were 1 addl and 1 leal. 1 addq was to update the loop counter.  The
optimized loop is

.L2:
imull $-1564285888, (%rdi,%rax), %edx
subl $1269844480, %edx
movl %edx, (%rsi,%rax)
addq $4, %rax
cmpq $1024, %rax
jne .L2

1 addl is changed to subl and leal is removed. Adjust assembly scan to
check for 1 subl and 1 addl/addq as well as lea removal.

* gcc.target/i386/pr53533-1.c: Adjust assembly scan.
* gcc.target/i386/pr53533-3.c: Likewise.

-- 
H.J.
From 2e3984e83900772710c5a652f1f7ad0e9a46e489 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Sat, 12 Oct 2024 05:53:49 +0800
Subject: [PATCH] gcc.target/i386/pr53533-[13].c: Adjust assembly scan

Before

1089d083117 Simplify (B * v + C) * D -> BD* v + CD when B,C,D are all INTEGER_CST.

the loop was

.L2:
	movl	(%rdi,%rdx), %eax
	addl	$12345, %eax
	imull	$-1564285888, %eax, %eax
	leal	-333519936(%rax), %eax
	movl	%eax, (%rsi,%rdx)
	addq	$4, %rdx
	cmpq	$1024, %rdx
	jne	.L2

There were 1 addl and 1 leal. 1 addq was to update the loop counter.  The
optimized loop is

.L2:
	imull	$-1564285888, (%rdi,%rax), %edx
	subl	$1269844480, %edx
	movl	%edx, (%rsi,%rax)
	addq	$4, %rax
	cmpq	$1024, %rax
	jne	.L2

1 addl is changed to subl and leal is removed. Adjust assembly scan to
check for 1 subl and 1 addl/addq as well as lea removal.

	* gcc.target/i386/pr53533-1.c: Adjust assembly scan.
	* gcc.target/i386/pr53533-3.c: Likewise.

Signed-off-by: H.J. Lu 
---
 gcc/testsuite/gcc.target/i386/pr53533-1.c | 4 +++-
 gcc/testsuite/gcc.target/i386/pr53533-3.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr53533-1.c b/gcc/testsuite/gcc.target/i386/pr53533-1.c
index 095de665366..11d12015145 100644
--- a/gcc/testsuite/gcc.target/i386/pr53533-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr53533-1.c
@@ -1,7 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-O1" } */
 /* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
-/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "subl\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "add(?:l|q)\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-not "leal" } } */
 
 void
 __attribute__((noipa))
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-3.c b/gcc/testsuite/gcc.target/i386/pr53533-3.c
index 3b260d134e9..347fa828eb7 100644
--- a/gcc/testsuite/gcc.target/i386/pr53533-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr53533-3.c
@@ -1,7 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-O1 -fwrapv" } */
 /* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
-/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "subl\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "add(?:l|q)\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-not "leal" } } */
 
 void
 __attribute__((noipa))
-- 
2.47.0



[PATCH] gcc.target/i386/pr55583.c: Use long long for 64-bit integer

2024-10-11 Thread H.J. Lu
Since long is 32-bit for x32, use long long for 64-bit integer.

* gcc.target/i386/pr55583.c: Use long long for 64-bit integer.


-- 
H.J.
From 97a61d75d338ce330f411fa0058949c4346b7119 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Sat, 12 Oct 2024 06:15:28 +0800
Subject: [PATCH] gcc.target/i386/pr55583.c: Use long long for 64-bit integer

Since long is 32-bit for x32, use long long for 64-bit integer.

	* gcc.target/i386/pr55583.c: Use long long for 64-bit integer.

Signed-off-by: H.J. Lu 
---
 gcc/testsuite/gcc.target/i386/pr55583.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr55583.c b/gcc/testsuite/gcc.target/i386/pr55583.c
index 1c128b5d929..ea6a2d54c49 100644
--- a/gcc/testsuite/gcc.target/i386/pr55583.c
+++ b/gcc/testsuite/gcc.target/i386/pr55583.c
@@ -5,11 +5,11 @@
 /* { dg-final { scan-assembler-times {(?n)shldl?[\t ]*\$2} 1 { target ia32 } } } */
 /* { dg-final { scan-assembler-times {(?n)shld[ql]?[\t ]*\$2} 2 { target { ! ia32 } } } } */
 
-typedef unsigned long  u64;
+typedef unsigned long long u64;
 typedef unsigned int   u32;
 typedef unsigned short u16;
 
-long  a, b;
+long long  a, b;
 int   c, d;
 short e, f;
 const int n = 2;
@@ -17,7 +17,7 @@ const int n = 2;
 void test64r () { b = ((u64)b >> n) | (a << (64 - n)); }
 void test32r () { d = ((u32)d >> n) | (c << (32 - n)); }
 
-unsigned long  ua, ub;
+unsigned long long ua, ub;
 unsigned int   uc, ud;
 unsigned short ue, uf;
 
-- 
2.47.0



[PATCH] gcc.target/i386/pr115749.c: Use word_mode integer

2024-10-11 Thread H.J. Lu
Use word_mode integer with func so that 64-bit integer is used with
x32.

* gcc.target/i386/pr115749.c (uword): New.
(func): Replace unsigned long with uword.

-- 
H.J.
From bef1df8952cb373dda768c5370fd70479b7ba785 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Sat, 12 Oct 2024 05:22:52 +0800
Subject: [PATCH] gcc.target/i386/pr115749.c: Use word_mode integer

Use word_mode integer with func so that 64-bit integer is used with
x32.

	* gcc.target/i386/pr115749.c (uword): New.
	(func): Replace unsigned long with uword.

Signed-off-by: H.J. Lu 
---
 gcc/testsuite/gcc.target/i386/pr115749.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr115749.c b/gcc/testsuite/gcc.target/i386/pr115749.c
index 82505d603ef..e7946d77861 100644
--- a/gcc/testsuite/gcc.target/i386/pr115749.c
+++ b/gcc/testsuite/gcc.target/i386/pr115749.c
@@ -4,7 +4,9 @@
 /* { dg-final { scan-assembler-times "imul" 2 } } */
 /* { dg-final { scan-assembler-not "sal" } } */
 
-unsigned long func(unsigned long x)
+typedef unsigned int uword __attribute__ ((mode (word)));
+
+uword func(uword x)
 {
 return x % 240;
 }
-- 
2.47.0



[Patch] Fortran: Unify gfc_get_location handling; fix expr->ts bug

2024-10-11 Thread Tobias Burnus
This unifies the two locus to location_t conversion functions, preparing 
for some changes I want to do later.


In principle, I had the patch this morning; however, the assert is now 
exercised more often than before - and it triggered rather unexpected 
when running the testsuite.


Turned out that partially overriding a derived-type spec with a string 
length is not really a good idea ...


Otherwise, it is a rather simple patch.

Nonetheless, are there any comments, suggestions, concerns?

Tobias

PS: I have added the assert to gfc_resolve_code, gfc_resolve_expr and 
resolve_symbol without additional fails; this does not guarantee that 
there isn't any issue with other locations in the code, but at least it 
makes it less likely. As there was no good reason not to include that 
assert, I have done so for the attached patch.
Fortran: Unify gfc_get_location handling; fix expr->ts bug

This commit reduces code duplication by moving gfc_get_location
from trans.cc to error.cc.  The gcc_assert is now used more often
and revealed a bug in gfc_match_array_constructor where the union
expr->ts.u.derived of a derived type is partially overwritten by
the assignment expr->ts.u.cl->... as a ts.type == BT_CHARACTER check
was missing.

gcc/fortran/ChangeLog:

	* array.cc (gfc_match_array_constructor): Only update the
	character length if the expression is of character type.
	* error.cc (gfc_get_location_with_offset): New; split off
	from ...
	(gfc_format_decoder): ... here; call it.
	* gfortran.h (gfc_get_location_with_offset): New prototype.
	(gfc_get_location): New inline function.
	* trans.cc (gfc_get_location): Remove function definition.
	* trans.h (gfc_get_location): Remove declaration.

 gcc/fortran/array.cc   |  2 +-
 gcc/fortran/error.cc   | 34 --
 gcc/fortran/gfortran.h |  7 +++
 gcc/fortran/trans.cc   | 12 
 gcc/fortran/trans.h|  4 
 5 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/gcc/fortran/array.cc b/gcc/fortran/array.cc
index ed8cb54803b..773c5b72c85 100644
--- a/gcc/fortran/array.cc
+++ b/gcc/fortran/array.cc
@@ -1390,7 +1390,7 @@ done:
 expr = gfc_get_array_expr (BT_UNKNOWN, 0, &where);
 
   expr->value.constructor = head;
-  if (expr->ts.u.cl)
+  if (expr->ts.type == BT_CHARACTER && expr->ts.u.cl)
 expr->ts.u.cl->length_from_typespec = seen_ts;
 
   *result = expr;
diff --git a/gcc/fortran/error.cc b/gcc/fortran/error.cc
index e6c05aa149f..5165d7c4628 100644
--- a/gcc/fortran/error.cc
+++ b/gcc/fortran/error.cc
@@ -50,6 +50,21 @@ static gfc_error_buffer error_buffer;
 static output_buffer *pp_error_buffer, *pp_warning_buffer;
 static int warningcount_buffered, werrorcount_buffered;
 
+
+/* Return a location_t suitable for 'tree' for a gfortran locus.  During
+   parsing in gfortran, loc->lb->location contains only the line number
+   and LOCATION_COLUMN is 0; hence, the column has to be added when generating
+   locations for 'tree'.  */
+
+location_t
+gfc_get_location_with_offset (locus *loc, unsigned offset)
+{
+  gcc_assert (loc->nextc >= loc->lb->line);
+  return linemap_position_for_loc_and_offset (line_table, loc->lb->location,
+	  loc->nextc - loc->lb->line
+	  + offset);
+}
+
 /* Return buffered_p.  */
 bool
 gfc_buffered_p (void)
@@ -411,6 +426,7 @@ gfc_format_decoder (pretty_printer *pp, text_info *text, const char *spec,
 		int precision, bool wide, bool set_locus, bool hash,
 		bool *quoted, pp_token_list &formatted_token_list)
 {
+  unsigned offset = 0;
   switch (*spec)
 {
 case 'C':
@@ -419,21 +435,19 @@ gfc_format_decoder (pretty_printer *pp, text_info *text, const char *spec,
 	static const char *result[2] = { "(1)", "(2)" };
 	locus *loc;
 	if (*spec == 'C')
-	  loc = &gfc_current_locus;
+	  {
+	loc = &gfc_current_locus;
+	/* Point %C first offending character not the last good one. */
+	if (*loc->nextc != '\0')
+	  offset++;
+	  }
 	else
 	  loc = va_arg (*text->m_args_ptr, locus *);
-	gcc_assert (loc->nextc - loc->lb->line >= 0);
-	unsigned int offset = loc->nextc - loc->lb->line;
-	if (*spec == 'C' && *loc->nextc != '\0')
-	  /* Point %C first offending character not the last good one. */
-	  offset++;
+
 	/* If location[0] != UNKNOWN_LOCATION means that we already
 	   processed one of %C/%L.  */
 	int loc_num = text->get_location (0) == UNKNOWN_LOCATION ? 0 : 1;
-	location_t src_loc
-	  = linemap_position_for_loc_and_offset (line_table,
-		 loc->lb->location,
-		 offset);
+	location_t src_loc = gfc_get_location_with_offset (loc, offset);
 	text->set_location (loc_num, src_loc, SHOW_RANGE_WITH_CARET);
 	/* Colorize the markers to match the color choices of
 	   diagnostic_show_locus (the initial location has a color given
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 917866a7ef0..e0ca7c114f7 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -3434,6 +3434,13 @@ const char * gfc_get_string (const char *, ...) ATTRIBU

RE: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

2024-10-11 Thread Li, Pan2
Thanks Richard for explaining.

> Yes.  The different variants seem to have different complexity and generic
> expansion might prefer one or another version.  So I wasn't suggesting to
> expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
> to the cheapest (smallest) variant.

Got it. There are many variants but we can simplify them to the cheapest one.
I will have a try after all saturation alu supported.

Pan

-Original Message-
From: Richard Biener  
Sent: Friday, October 11, 2024 6:27 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; tamar.christ...@arm.com; juzhe.zh...@rivai.ai; 
kito.ch...@gmail.com; jeffreya...@gmail.com; rdapp@gmail.com
Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer 
SAT_SUB

On Fri, Oct 11, 2024 at 11:44 AM Li, Pan2  wrote:
>
> Thanks Richard for reviewing and comments.
>
> > I wonder since we now can match many different variants of writing
> > signed and unsigned
> > saturation add and sub whether it makes sense to canonicalize to the 
> > "cheapest"
> > variant when the target doesn't support .SAT_SUB/ADD?
>
> I think it is a good point. But sorry, not sure if I get the point here. Like 
> what is the purpose of
> the "cheapest" variant regardless of target support it or not. You mean for a 
> "cheapest" variant
> we can expand it in the middle end? Instead of leave it to the target.

Yes.  The different variants seem to have different complexity and generic
expansion might prefer one or another version.  So I wasn't suggesting to
expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
to the cheapest (smallest) variant.

> > Are there any
> > "sub-patterns"
> > not forming the full saturation add/sub that can be
> > simplified/canonicalized in such
> > way maybe?
>
> Yes, you are right. There will be some common sub-pattern for so many 
> saturation alu variants.
> Like x < 0 ? MIN : MAX. I plan to refine this part after all saturation alu 
> are supported
> (to make sure we have full picture).

Yeah, having a full picture is good.

Richard.

> Pan
>
> -Original Message-
> From: Richard Biener 
> Sent: Friday, October 11, 2024 5:10 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; tamar.christ...@arm.com; juzhe.zh...@rivai.ai; 
> kito.ch...@gmail.com; jeffreya...@gmail.com; rdapp@gmail.com
> Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer 
> SAT_SUB
>
> On Fri, Oct 11, 2024 at 8:24 AM  wrote:
> >
> > From: Pan Li 
> >
> > This patch would like to support the form 1 of the vector signed
> > integer SAT_SUB.  Aka below example:
> >
> > Form 1:
> >   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX) \
> >   void __attribute__((noinline))   \
> >   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
> >   {\
> > unsigned i;\
> > for (i = 0; i < limit; i++)\
> >   {\
> > T x = op_1[i]; \
> > T y = op_2[i]; \
> > T minus = (UT)x - (UT)y;   \
> > out[i] = (x ^ y) >= 0  \
> >   ? minus  \
> >   : (minus ^ x) >= 0   \
> > ? minus\
> > : x < 0 ? MIN : MAX;   \
> >   }\
> >   }
> >
> > DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
> >
> > Before this patch:
> >   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
> >   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... 
> > }, _108, 0);
> >   93   │   _69 = vect_x_16.11_80 >> 7;
> >   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR > char>(vect_x_16.11_80);
> >   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, 
> > ... }, _108, 0);
> >   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
> >   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
> >   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR > char>(vect_y_18.15_85);
> >   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
> >  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR > char>(vect__6.17_87);
> >  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
> >  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
> >  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
> >  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, 
> > vect_minus_19.18_88);
> >  105  

[PATCH] phiopt: Clobbers can sometimes get in the way of phiopt [PR117096]

2024-10-11 Thread Andrew Pinski
Clobbers in a condition don't cause any improvements and are getting
in the way of doing match and simplify with phiopt. Need to ignore/skip
over them when seeing if there is only one statement that can be moved.
Also since clobbers have vops, skipping over the vops phi node is needed
to be done.

Bootstrapped and tested on x86_64-linux-gnu.

PR tree-optimization/117096

gcc/ChangeLog:

* tree-ssa-phiopt.cc (single_non_singleton_phi_for_edges): Skip
over vops rather than return false.
(empty_bb_or_one_feeding_into_p): Skip over clobbers too.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/phiopt-2.C: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/g++.dg/tree-ssa/phiopt-2.C | 46 
 gcc/tree-ssa-phiopt.cc   | 14 
 2 files changed, 54 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/tree-ssa/phiopt-2.C

diff --git a/gcc/testsuite/g++.dg/tree-ssa/phiopt-2.C 
b/gcc/testsuite/g++.dg/tree-ssa/phiopt-2.C
new file mode 100644
index 000..9bc8fb6e8dc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/phiopt-2.C
@@ -0,0 +1,46 @@
+// { dg-do compile }
+// { dg-options "-O1 -fdump-tree-phiopt1-details" }
+// PR tree-optimization/117096
+// Clobbers should NOT make a difference
+// when it comes to phiopt
+
+struct s1{
+  unsigned b;
+
+  // Declare as always inline just in case someone turns down the inlining 
limits
+  // we also want to inline early.
+  __attribute__((always_inline)) inline
+   s1() : b(0) {}
+};
+void g();
+int f(signed a, int c)
+{
+  signed b = 0;
+  if (a < 0)
+  {
+s1();
+b = a;
+  }
+  else { s1(); }
+  g();
+  return b;
+}
+
+
+int f1(signed a, int c)
+{
+  signed b = 0;
+  if (a < 0)
+  {
+s1 t;
+b = a;
+  }
+  else { s1 t; }
+  g();
+  return b;
+}
+
+/* both ifs should be converted into MIN_EXPR */
+/* { dg-final { scan-tree-dump-not "if " "phiopt1" }  }*/
+/* { dg-final { scan-tree-dump-times "MIN_EXPR " 2 "phiopt1" }  }*/
+
diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index f3ee3a80c0f..b22086f4de8 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -72,9 +72,10 @@ single_non_singleton_phi_for_edges (gimple_seq seq, edge e0, 
edge e1)
   gimple_phi_arg_def (p, e1->dest_idx)))
continue;
 
-  /* Punt on virtual phis with different arguments from the edges.  */
+  /* Skip virtual phis with different arguments
+ from the edges as clobbers might cause the vop to be different.  */
   if (virtual_operand_p (gimple_phi_result (p)))
-   return NULL;
+   continue;
 
   /* If we already have a PHI that has the two edge arguments are
 different, then return it is not a singleton for these PHIs. */
@@ -663,9 +664,10 @@ empty_bb_or_one_feeding_into_p (basic_block bb,
 {
   gimple *s = gsi_stmt (gsi);
   gsi_next_nondebug (&gsi);
-  /* Skip over Predict and nop statements. */
+  /* Skip over Predict, clobber and nop statements. */
   if (gimple_code (s) == GIMPLE_PREDICT
- || gimple_code (s) == GIMPLE_NOP)
+ || gimple_code (s) == GIMPLE_NOP
+ || gimple_clobber_p (s))
continue;
   /* If there is more one statement return false. */
   if (stmt_to_move)
@@ -673,8 +675,8 @@ empty_bb_or_one_feeding_into_p (basic_block bb,
   stmt_to_move = s;
 }
 
-  /* The only statement here was a Predict or a nop statement
- so return true. */
+  /* The only statement here was a Predict, nop,
+ or a clobber statement so return true. */
   if (!stmt_to_move)
 return true;
 
-- 
2.43.0



[PATCH] middle-end/117086 - fixup vec_cond simplifications

2024-10-11 Thread Richard Biener
The following adds missing checks for a vector type result type
to simplifications that end up creating a vec_cond.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

PR middle-end/117086
* match.pd ((op (vec_cond ...) ..) -> (vec_cond ...)): Add
missing checks for VECTOR_TYPE_P (type).

* gcc.dg/torture/pr117086.c: New testcase.
---
 gcc/match.pd| 45 +
 gcc/testsuite/gcc.dg/torture/pr117086.c | 12 +++
 2 files changed, 36 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr117086.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 8a7569ce387..d8be7a8f6f7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5690,35 +5690,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* (c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e) */
  (simplify
   (op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
-  (if (TREE_CODE_CLASS (op) != tcc_comparison
-   || types_match (type, TREE_TYPE (@1))
-   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
-   || (optimize_vectors_before_lowering_p ()
-  /* The following is optimistic on the side of non-support, we are
- missing the legacy vcond{,u,eq} cases.  Do this only when
- lowering will be able to fixup..  */
-  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
-  TREE_TYPE (@0), ERROR_MARK)))
+  (if (VECTOR_TYPE_P (type)
+   && (TREE_CODE_CLASS (op) != tcc_comparison
+  || types_match (type, TREE_TYPE (@1))
+  || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
+  || (optimize_vectors_before_lowering_p ()
+  /* The following is optimistic on the side of non-support, we are
+ missing the legacy vcond{,u,eq} cases.  Do this only when
+ lowering will be able to fixup..  */
+  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
+  TREE_TYPE (@0), ERROR_MARK
(vec_cond @0 (op! @1 @3) (op! @2 @4
 
 /* (c ? a : b) op d  -->  c ? (a op d) : (b op d) */
  (simplify
   (op (vec_cond:s @0 @1 @2) @3)
-  (if (TREE_CODE_CLASS (op) != tcc_comparison
-   || types_match (type, TREE_TYPE (@1))
-   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
-   || (optimize_vectors_before_lowering_p ()
-  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
-  TREE_TYPE (@0), ERROR_MARK)))
+  (if (VECTOR_TYPE_P (type)
+   && (TREE_CODE_CLASS (op) != tcc_comparison
+  || types_match (type, TREE_TYPE (@1))
+  || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
+  || (optimize_vectors_before_lowering_p ()
+  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
+  TREE_TYPE (@0), ERROR_MARK
(vec_cond @0 (op! @1 @3) (op! @2 @3
  (simplify
   (op @3 (vec_cond:s @0 @1 @2))
-  (if (TREE_CODE_CLASS (op) != tcc_comparison
-   || types_match (type, TREE_TYPE (@1))
-   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
-   || (optimize_vectors_before_lowering_p ()
-  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
-  TREE_TYPE (@0), ERROR_MARK)))
+  (if (VECTOR_TYPE_P (type)
+   && (TREE_CODE_CLASS (op) != tcc_comparison
+  || types_match (type, TREE_TYPE (@1))
+  || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK)
+  || (optimize_vectors_before_lowering_p ()
+  && !expand_vec_cond_expr_p (TREE_TYPE (@1),
+  TREE_TYPE (@0), ERROR_MARK
(vec_cond @0 (op! @3 @1) (op! @3 @2)
 
 #if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/torture/pr117086.c 
b/gcc/testsuite/gcc.dg/torture/pr117086.c
new file mode 100644
index 000..cee19c91de2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr117086.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=znver5" { target { x86_64-*-* i?86-*-* } } 
} */
+
+void fancy_abort(const char *, int, const char *);
+int usage_insns_0_0;
+void inherit_in_ebb() {
+  int abis = 0;
+  for (int i = 0; i < 8; ++i)
+if (i > usage_insns_0_0)
+  abis |= i;
+  abis ? fancy_abort("", 0, __FUNCTION__), 0 : 0;
+}
-- 
2.43.0


Re: Ping: [PATCH] d,ada/spec: only sub nostd{inc,lib} rather than nostd{inc,lib}*

2024-10-11 Thread Marc Poulhiès
Arsen Arsenović  writes:

> Ping on this patch.

Hello Arsen, and sorry for taking so long to reply.
The patch is OK for the Ada part.

Cheers,
Marc


Re: [PATCH] Add 'cobol' to Makefile.def

2024-10-11 Thread Richard Biener
On Thu, Oct 10, 2024 at 9:07 PM James K. Lowden
 wrote:
>
> Hello,
>
> I just joined the list to begin contributing patches for the COBOL
> front end we've been touting for the last 4 years.  It's my first
> attempt.  Please tell me if you'd like to see something different.
>
> What follows mimics to some degree the output of "git format-patch".  I
> don't think I can use that command literally, but if I can and that
> would be better, I'm happy to follow instructions.

I checked and the patch applies using 'git am', so I think it works as-is
from a technical side.

What's missing here is a short description (what would be the git
commit message) which should also contain a properly formatted
ChangeLog part.  For this patch for example

\t* Makefile.def: Add libgcobol module and cobol language.

Specifically this patch on its own splits at unwanted granularity
and should at least come together with adding stubs in
the libgcobol toplevel directory and the gcc/cobol directory and
corresponding changes to Makefile.tpl and regenerating Makefile.in.

Possibly separate patches for libgcobol and gcc/cobol both with
changes to Makefile.def would work as well.  The idea is that
the parts when applied in series produce trees that configure
and build OK.

Richard.

> My plan is to send patches for one file at a time, starting from
> the top of the tree.  Very soon we'll get to the front end proper, in
> gcc/cobol.  After we work our way through those, there is a runtime
> library.  After that I have tests and documentation.  And then we'll be
> done. Right?  ;-)
>
> This patch adds "cobol" as a language and subdirectory.
>
> --jkl
>
>
> From 216ec55cdb2ad95728612d4b9b5550324e9b506fpatch 4 Oct 2024 12:01:22 -0400
> From: "James K. Lowden" 
> Date: Thu Oct 10 14:28:48 EDT 2024
> Subject: [PATCH]  Add 'cobol' to 1 file
>
> ---
> a/Makefile.def | +++
> 1 file changed, 7 insertions(+), 0 deletions(-)
> diff --git a/Makefile.def b/Makefile.def
> index 19954e7d731..1192e852c7a 100644
> --- a/Makefile.def
> +++ b/Makefile.def
> @@ -209,6 +209,7 @@ target_modules = { module= libgomp; bootstrap= true; 
> lib_path=.libs; };
>  target_modules = { module= libitm; lib_path=.libs; };
>  target_modules = { module= libatomic; bootstrap=true; lib_path=.libs; };
>  target_modules = { module= libgrust; };
> +target_modules = { module= libgcobol; };
>
>  // These are (some of) the make targets to be done in each subdirectory.
>  // Not all; these are the ones which don't have special options.
> @@ -324,6 +325,7 @@ flags_to_pass = { flag= CXXFLAGS_FOR_TARGET ; };
>  flags_to_pass = { flag= DLLTOOL_FOR_TARGET ; };
>  flags_to_pass = { flag= DSYMUTIL_FOR_TARGET ; };
>  flags_to_pass = { flag= FLAGS_FOR_TARGET ; };
> +flags_to_pass = { flag= GCOBOL_FOR_TARGET ; };
>  flags_to_pass = { flag= GFORTRAN_FOR_TARGET ; };
>  flags_to_pass = { flag= GOC_FOR_TARGET ; };
>  flags_to_pass = { flag= GOCFLAGS_FOR_TARGET ; };
> @@ -655,6 +657,7 @@ lang_env_dependencies = { module=libgcc; no_gcc=true; 
> no_c=true; };
>  // built newlib on some targets (e.g. Cygwin).  It still needs
>  // a dependency on libgcc for native targets to configure.
>  lang_env_dependencies = { module=libiberty; no_c=true; };
> +lang_env_dependencies = { module=libgcobol; cxx=true; };
>
>  dependencies = { module=configure-target-fastjar; on=configure-target-zlib; 
> };
>  dependencies = { module=all-target-fastjar; on=all-target-zlib; };
> @@ -690,6 +693,7 @@ dependencies = { module=install-target-libvtv; 
> on=install-target-libgcc; };
>  dependencies = { module=install-target-libitm; on=install-target-libgcc; };
>  dependencies = { module=install-target-libobjc; on=install-target-libgcc; };
>  dependencies = { module=install-target-libstdc++-v3; 
> on=install-target-libgcc; };
> +dependencies = { module=install-target-libgcobol; 
> on=install-target-libstdc++-v3; };
>
>  // Target modules in the 'src' repository.
>  lang_env_dependencies = { module=libtermcap; };
> @@ -727,6 +731,8 @@ languages = { language=d;   gcc-check-target=check-d;
> lib-check-target=check-target-libphobos; };
>  languages = { language=jit;gcc-check-target=check-jit; };
>  languages = { language=rust;   gcc-check-target=check-rust; };
> +languages = { language=cobol;  gcc-check-target=check-cobol;
> +   lib-check-target=check-target-libgcobol; };
>
>  // Toplevel bootstrap
>  bootstrap_stage = { id=1 ; };
>
>
>


Re: [PATCH] libgccjit: Add support for creating temporary variables

2024-10-11 Thread Antoni Boucher

Hi, David.
Can you please review the updated patch?
Can I merge it?
Thanks.

Le 2024-02-29 à 16 h 11, Antoni Boucher a écrit :

Hi and thanks for the review!
Here's the updated patch.

Le 2024-01-24 à 09 h 54, David Malcolm a écrit :

On Fri, 2024-01-19 at 16:54 -0500, Antoni Boucher wrote:

Hi.
This patch adds a new way to create local variable that won't
generate
debug info: it is to be used for compiler-generated variables.
Thanks for the review.


Thanks for the patch.

diff --git a/gcc/jit/docs/topics/compatibility.rst b/gcc/jit/docs/ 
topics/compatibility.rst

index cbf5b414d8c..5d62e264a00 100644
--- a/gcc/jit/docs/topics/compatibility.rst
+++ b/gcc/jit/docs/topics/compatibility.rst
@@ -390,3 +390,12 @@ on functions and variables:
    * :func:`gcc_jit_function_add_string_attribute`
    * :func:`gcc_jit_function_add_integer_array_attribute`
    * :func:`gcc_jit_lvalue_add_string_attribute`
+
+.. _LIBGCCJIT_ABI_27:
+
+``LIBGCCJIT_ABI_27``
+
+``LIBGCCJIT_ABI_27`` covers the addition of a functions to create a new


"functions" -> "function"


+temporary variable:
+
+  * :func:`gcc_jit_function_new_temp`
diff --git a/gcc/jit/docs/topics/functions.rst b/gcc/jit/docs/topics/ 
functions.rst

index 804605ea939..230caf42466 100644
--- a/gcc/jit/docs/topics/functions.rst
+++ b/gcc/jit/docs/topics/functions.rst
@@ -171,6 +171,26 @@ Functions
 underlying string, so it is valid to pass in a pointer to an on- 
stack

 buffer.
+.. function:: gcc_jit_lvalue *\
+  gcc_jit_function_new_temp (gcc_jit_function *func,\
+ gcc_jit_location *loc,\
+ gcc_jit_type *type)
+
+   Create a new local variable within the function, of the given type.
+   This function is similar to :func:`gcc_jit_function_new_local`, but
+   it is to be used for compiler-generated variables (as opposed to
+   user-defined variables in the language to be compiled) and these
+   variables won't show up in the debug info.
+
+   The parameter ``type`` must be non-`void`.
+
+   This entrypoint was added in :ref:`LIBGCCJIT_ABI_26`; you can test
+   for its presence using


The ABI number is inconsistent here (it's 27 above and in the .map
file), but obviously you can fix this when you eventually commit this
based on what the ABI number actually is.

[...snip...]


diff --git a/gcc/jit/jit-playback.cc b/gcc/jit/jit-playback.cc
index 84df6c100e6..cb6b2f66276 100644
--- a/gcc/jit/jit-playback.cc
+++ b/gcc/jit/jit-playback.cc
@@ -31,6 +31,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "toplev.h"
  #include "tree-cfg.h"
  #include "convert.h"
+#include "gimple-expr.h"
  #include "stor-layout.h"
  #include "print-tree.h"
  #include "gimplify.h"
@@ -1950,13 +1951,27 @@ new_local (location *loc,
 type *type,
 const char *name,
 const std::vector> &attributes)
+   std::string>> &attributes,
+   bool is_temp)
  {
    gcc_assert (type);
-  gcc_assert (name);
-  tree inner = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+  tree inner;
+  if (is_temp)
+  {
+    inner = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+    create_tmp_var_name ("JITTMP"),
+    type->as_tree ());
+    DECL_ARTIFICIAL (inner) = 1;
+    DECL_IGNORED_P (inner) = 1;
+    DECL_NAMELESS (inner) = 1;


We could assert that "name" is null in the is_temp branch.

An alternative approach might be to drop "is_temp", and instead make
"name" being null signify that it's a temporary, if you prefer that
approach.  Would client code ever want to specify a name prefix for a
temporary?


No, I don't think anyone would want a different prefix.





+  }
+  else
+  {
+    gcc_assert (name);
+    inner = build_decl (UNKNOWN_LOCATION, VAR_DECL,
 get_identifier (name),
 type->as_tree ());
+  }
    DECL_CONTEXT (inner) = this->m_inner_fndecl;
    /* Prepend to BIND_EXPR_VARS: */


[...snip...]

Thanks again for the patch.  Looks good to me as-is (apart from the
grammar and ABI number nits), but what do you think of eliminating
"is_temp" in favor of the "name" ptr being null?  I think it's your
call.

Dave





[PATCH] This is a test3, please ignore

2024-10-11 Thread Christophe Lyon
CI-tag: skip
-- >8 --

This is a test patch, please ignore.

---
 README | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README b/README
index be15bc2b44e..7a3d7cfeb74 100644
--- a/README
+++ b/README
@@ -1,3 +1,5 @@
+THIS IS A TEST -- IGNORE
+
 This directory contains the GNU Compiler Collection (GCC).
 
 The GNU Compiler Collection is free software.  See the files whose
-- 
2.34.1



Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector reductions

2024-10-11 Thread Jennifer Schmitz


> On 11 Oct 2024, at 12:08, Richard Sandiford  wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Tamar Christina  writes:
>>> -Original Message-
>>> From: Richard Biener 
>>> Sent: Friday, October 11, 2024 7:52 AM
>>> To: Richard Sandiford 
>>> Cc: Jennifer Schmitz ; gcc-patches@gcc.gnu.org; Richard
>>> Earnshaw ; Kyrylo Tkachov
>>> ; Tamar Christina 
>>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>>> reductions
>>> 
>>> On Thu, 10 Oct 2024, Richard Sandiford wrote:
>>> 
 Jennifer Schmitz  writes:
> This patch implements the optabs reduc_and_scal_,
> reduc_ior_scal_, and reduc_xor_scal_ for ASIMD modes V8QI,
> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
>>> logical
> vector reduction operations.
> Previously, either only vector registers or only general purpose 
> registers (GPR)
> were used. Now, vector registers are used for the reduction from 128 to 64
>>> bits;
> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR 
> are
>>> used
> for the rest of the reduction steps.
> 
> For example, the test case (V8HI)
> int16_t foo (int16_t *a)
> {
>  int16_t b = -1;
>  for (int i = 0; i < 8; ++i)
>b &= a[i];
>  return b;
> }
> 
> was previously compiled to (-O2):
> foo:
> ldr q0, [x0]
> moviv30.4s, 0
> ext v29.16b, v0.16b, v30.16b, #8
> and v29.16b, v29.16b, v0.16b
> ext v31.16b, v29.16b, v30.16b, #4
> and v31.16b, v31.16b, v29.16b
> ext v30.16b, v31.16b, v30.16b, #2
> and v30.16b, v30.16b, v31.16b
> umovw0, v30.h[0]
> ret
> 
> With patch, it is compiled to:
> foo:
> ldr q31, [x0]
> ext v30.16b, v31.16b, v31.16b, #8
> and v31.8b, v30.8b, v31.8b
> fmovx0, d31
> and x0, x0, x0, lsr 32
> and w0, w0, w0, lsr 16
> ret
> 
> For modes V4SI and V2DI, the pattern was not implemented, because the
> current codegen (using only base instructions) is already efficient.
> 
> Note that the PR initially suggested to use SVE reduction ops. However,
> they have higher latency than the proposed sequence, which is why using
> neon and base instructions is preferable.
> 
> Test cases were added for 8/16-bit integers for all implemented modes and 
> all
> three operations to check the produced assembly.
> 
> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> because for aarch64 vector types, either the logical reduction optabs are
> implemented or the codegen for reduction operations is good as it is.
> This was motivated by failure of a scan-tree-dump directive in the test 
> cases
> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>>> regression.
> OK for mainline?
> 
> Signed-off-by: Jennifer Schmitz 
> 
> gcc/
> PR target/113816
> * config/aarch64/aarch64-simd.md (reduc__scal_):
> Implement for logical bitwise operations for VDQV_E.
> 
> gcc/testsuite/
> PR target/113816
> * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> * gcc.target/aarch64/simd/logical_reduc.c: New test.
> * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> ---
> gcc/config/aarch64/aarch64-simd.md|  55 +
> .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++
> .../gcc.target/aarch64/vect-reduc-or_1.c  |   2 +-
> gcc/testsuite/lib/target-supports.exp |   4 +-
> 4 files changed, 267 insertions(+), 2 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>> b/gcc/config/aarch64/aarch64-simd.md
> index 23c03a96371..00286b8b020 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3608,6 +3608,61 @@
>   }
> )
> 
> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, 
> V16QI,
> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively 
> operating
> +;; on the two halves of the input.
> +;; If the input has 128 bits, the first operation is performed in vector
> +;; registers.  From 64 bits down, the reduction steps are performed in 
> general
> +;; purpose registers.
> +;; For example, for V8HI and operation AND, the intended sequence is:
> +;; EXT  v1.16b, v0.16b, v0.16b, #8
> +;; AND  v0.8b, v1.8b, v0.8b
> +;; FMOV x0, d0
> +;; AND  x0, x0, x0, 32
> +;; AND  w0, w0, w0, 16
> +;;
> +;; For V8QI and operation AND, the sequence is:
> +;; AND  x0, x0, x0, lsr 32
> +;; AND  w0, w0, w0, 

Re: [Patch] Fortran: Dead-function removal in error.cc (shrinking by 40%)

2024-10-11 Thread David Malcolm
On Fri, 2024-10-11 at 15:34 +0100, Paul Richard Thomas wrote:
> Hi Tobias,
> 
> Good catch! It looks 'obvious' to me too :-)
> 
> Regards
> 
> Paul
> 
> 
> On Fri, 11 Oct 2024 at 14:08, Tobias Burnus 
> wrote:
> 
> > I always found error.cc rather confusing but I just realized that
> > we can reduce number of lines in that file by 40% - and remove a
> > lot of
> > (apparent) complexity.
> > 
> > The removed code is from the old days, when gfortran handled a lot
> > of
> > diagnostic itself, also because it wanted to show lines with
> > carets,
> > while the C/C++ diagnostic did not support this.
> > 
> > Well, that changed and gfortran mostly converted to the common
> > diagnostic
> > code, but somehow the old code remained - without actually being
> > used.
> > 
> > Thus, this patch simply removes it.
> > 
> > 
> > I regard the change as trivial and obvious and intend to
> > commit
> > it as obvious. Nonetheless, any comments, suggestions, review
> > remarks?
> > 
> > 
> > Tobias
> > 
> > 
> > PS: I also wanted to reduce code duplication, but an assert that
> > was
> > previously
> > only in one code path triggered, showing at least one case where
> > 'locus' is
> > broken. Something to fix first before sending in that part ...
> > 
> > There are also some other changes in the pipeline:
> > * I want to move support range-based locations, which is also a
> > good
> > opportunity to fix some misplaced '1' (e.g. which point at white
> > space
> > instead of the actual declaration or ...).
> > 
> > * David wants to improve json/sarif output, including stderr +
> > sarif/json
> > output at the same time, but that has issues with
> > delayed/suppressed/buffered
> > diagnostic in gfortran (because of the try & error parsing* in
> > Fortran)
> > → https://gcc.gnu.org/PR116613 for the former and
> > https://gcc.gnu.org/105916
> > for the buffering issue.

Thanks for the patch; I was planning to take a look at the SARIF
buffering issue later today/Monday from the gcc/diagnostic.cc/h side
(perhaps introducing an idea of "pending diagnostics" there), so any
simplifications on the fortran side are most welcome!  My knowledge of
Fortran is almost zero, sorry.

Dave



Re: [Patch] Fortran: Dead-function removal in error.cc (shrinking by 40%)

2024-10-11 Thread Paul Richard Thomas
Hi Tobias,

Good catch! It looks 'obvious' to me too :-)

Regards

Paul


On Fri, 11 Oct 2024 at 14:08, Tobias Burnus  wrote:

> I always found error.cc rather confusing but I just realized that
> we can reduce number of lines in that file by 40% - and remove a lot of
> (apparent) complexity.
>
> The removed code is from the old days, when gfortran handled a lot of
> diagnostic itself, also because it wanted to show lines with carets,
> while the C/C++ diagnostic did not support this.
>
> Well, that changed and gfortran mostly converted to the common diagnostic
> code, but somehow the old code remained - without actually being used.
>
> Thus, this patch simply removes it.
>
>
> I regard the change as trivial and obvious and intend to commit
> it as obvious. Nonetheless, any comments, suggestions, review remarks?
>
>
> Tobias
>
>
> PS: I also wanted to reduce code duplication, but an assert that was
> previously
> only in one code path triggered, showing at least one case where 'locus' is
> broken. Something to fix first before sending in that part ...
>
> There are also some other changes in the pipeline:
> * I want to move support range-based locations, which is also a good
> opportunity to fix some misplaced '1' (e.g. which point at white space
> instead of the actual declaration or ...).
>
> * David wants to improve json/sarif output, including stderr + sarif/json
> output at the same time, but that has issues with
> delayed/suppressed/buffered
> diagnostic in gfortran (because of the try & error parsing* in Fortran)
> → https://gcc.gnu.org/PR116613 for the former and
> https://gcc.gnu.org/105916
> for the buffering issue.
>
> [(*) e.g., in fixed-form Fortran where spaces have no meaning, the question
> when parsing is whether 'd o i = ...' is a 'do i =' loop or a 'doi = '
> assignment.
> If the statement ends without finding a ',' it was an assignment...
> To avoid bogus errors, the diagnostic has to be suppressed at times.]
>


Re: [PATCH] c++: Fix mangling of otherwise unattached class-scope lambdas [PR116568]

2024-10-11 Thread Jason Merrill

On 9/5/24 11:02 AM, Nathaniel Shead wrote:

Bootstrapped and regtested (so far just dg.exp) on x86_64-pc-linux-gnu,
OK for trunk if full regtest passes?  Or would it be better to try to
implement all the rules mentioned in the linked pull request for one
commit; I admit I haven't looked very closely yet at how else we
diverge?


I see a few things in that pull request:

1. Mangling of multiple levels of template arguments, r14-4544.
2. Mangling of lambdas in general class scope, this patch.
3. Mangling explicit lambda template parameters, r13-3527.
4. Mangling lambdas in expressions: looks like we use 'tl' instead of 'L'.
5. A note to check that a lambda in the signature of a namespace-scope 
function doesn't match a textually identical declaration in another TU; 
we make the function internal, but I don't see a testcase.


Fixing #4 as a quick followup would be good.


This is a step closer to implementing the suggested changes for
https://github.com/itanium-cxx-abi/cxx-abi/pull/85.

The main purpose of the patch is to solve testcase PR c++/116568, caused
by lambda expressions within the templates not correctly having the
extra mangling scope attached.

While I was working on this I found some other cases where the mangling
of lambdas was incorrect and causing issues, notably the testcase
lambda-ctx3.C which currently emits the same mangling for the base class
and member lambdas, causing mysterious assembler errors.  Fixing this
ended up also improving the situation for PR c++/107741 as well, though
it doesn't seem easily possible to fix the A::x case at this time so
I've left that as an XFAIL.


It looks pretty straightforward to split grokfield into start and finish 
functions?  But that certainly doesn't need to be part of this patch, 
which is OK as is.



PR c++/107741
PR c++/116568

gcc/cp/ChangeLog:

* cp-tree.h (LAMBDA_EXPR_EXTRA_SCOPE): Adjust comment.
* parser.cc (cp_parser_class_head): Start (and do not finish)
lambda scope for all valid types.
(cp_parser_class_specifier): Finish lambda scope after parsing
members instead.
(cp_parser_member_declaration): Adjust comment to mention
missing lambda scoping for static member initializers.
* pt.cc (instantiate_class_template): Add lambda scoping.
(instantiate_template): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/abi/lambda-ctx2.C: New test.
* g++.dg/abi/lambda-ctx3.C: New test.
* g++.dg/modules/lambda-8.h: New test.
* g++.dg/modules/lambda-8_a.H: New test.
* g++.dg/modules/lambda-8_b.C: New test.

Signed-off-by: Nathaniel Shead 
---
  gcc/cp/cp-tree.h  |  3 +-
  gcc/cp/parser.cc  | 31 +
  gcc/cp/pt.cc  | 12 +++-
  gcc/testsuite/g++.dg/abi/lambda-ctx2.C| 34 +++
  gcc/testsuite/g++.dg/abi/lambda-ctx3.C| 21 ++
  gcc/testsuite/g++.dg/modules/lambda-8.h   |  7 +
  gcc/testsuite/g++.dg/modules/lambda-8_a.H |  5 
  gcc/testsuite/g++.dg/modules/lambda-8_b.C |  5 
  8 files changed, 104 insertions(+), 14 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/abi/lambda-ctx2.C
  create mode 100644 gcc/testsuite/g++.dg/abi/lambda-ctx3.C
  create mode 100644 gcc/testsuite/g++.dg/modules/lambda-8.h
  create mode 100644 gcc/testsuite/g++.dg/modules/lambda-8_a.H
  create mode 100644 gcc/testsuite/g++.dg/modules/lambda-8_b.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 2eeb5e3e8b1..af1e254745b 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -1513,7 +1513,8 @@ enum cp_lambda_default_capture_mode_type {
(((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->locus)
  
  /* The mangling scope for the lambda: FUNCTION_DECL, PARM_DECL, VAR_DECL,

-   FIELD_DECL or NULL_TREE.  If this is NULL_TREE, we have no linkage.  */
+   FIELD_DECL, TYPE_DECL, or NULL_TREE.  If this is NULL_TREE, we have no
+   linkage.  */
  #define LAMBDA_EXPR_EXTRA_SCOPE(NODE) \
(((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->extra_scope)
  
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc

index 5654bc00e4d..6e5228757a5 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -27051,6 +27051,8 @@ cp_parser_class_specifier (cp_parser* parser)
if (!braces.require_open (parser))
  {
pop_deferring_access_checks ();
+  if (type != error_mark_node)
+   finish_lambda_scope ();
return error_mark_node;
  }
  
@@ -27115,7 +27117,10 @@ cp_parser_class_specifier (cp_parser* parser)

if (cp_parser_allow_gnu_extensions_p (parser))
  attributes = cp_parser_gnu_attributes_opt (parser);
if (type != error_mark_node)
-type = finish_struct (type, attributes);
+{
+  type = finish_struct (type, attributes);
+  finish_lambda_scope ();
+}
if (nested_name_specifier_p)
  pop_inner_scope (old_scope, scope);
  
@@ -27955,6 +27960,12 @@ c

[committed] libstdc++: Add missing whitespace in dg-do directives

2024-10-11 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

libstdc++-v3/ChangeLog:

* testsuite/22_locale/time_get/get/char/5.cc: Fix dg-do
directive.
* testsuite/22_locale/time_get/get/wchar_t/5.cc: Likewise.
---
 libstdc++-v3/testsuite/22_locale/time_get/get/char/5.cc| 2 +-
 libstdc++-v3/testsuite/22_locale/time_get/get/wchar_t/5.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/testsuite/22_locale/time_get/get/char/5.cc 
b/libstdc++-v3/testsuite/22_locale/time_get/get/char/5.cc
index 61fc329cf3c..ca11154b49c 100644
--- a/libstdc++-v3/testsuite/22_locale/time_get/get/char/5.cc
+++ b/libstdc++-v3/testsuite/22_locale/time_get/get/char/5.cc
@@ -1,4 +1,4 @@
-// { dg-do run { target c++11} }
+// { dg-do run { target c++11 } }
 
 #include 
 #include 
diff --git a/libstdc++-v3/testsuite/22_locale/time_get/get/wchar_t/5.cc 
b/libstdc++-v3/testsuite/22_locale/time_get/get/wchar_t/5.cc
index 5f350b043d9..254ae96acd1 100644
--- a/libstdc++-v3/testsuite/22_locale/time_get/get/wchar_t/5.cc
+++ b/libstdc++-v3/testsuite/22_locale/time_get/get/wchar_t/5.cc
@@ -1,4 +1,4 @@
-// { dg-do run { target c++11} }
+// { dg-do run { target c++11 } }
 
 #include 
 #include 
-- 
2.46.2



[committed] libstdc++: Use appropriate feature test macro for std::byte

2024-10-11 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk.

-- >8 --

libstdc++-v3/ChangeLog:

* include/bits/cpp_type_traits.h (__is_byte): Guard with
__glibcxx_byte macro instead of checking __cplusplus.
---
 libstdc++-v3/include/bits/cpp_type_traits.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 19bf1edf647..060652afb18 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -414,7 +414,7 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   typedef __true_type __type;
 };
 
-#if __cplusplus >= 201703L
+#ifdef __glibcxx_byte // C++ >= 17
   enum class byte : unsigned char;
 
   template<>
-- 
2.46.2



Re: [PATCH v3 2/5] openmp: Add support for iterators in map clauses (C/C++)

2024-10-11 Thread Jakub Jelinek
On Fri, Oct 04, 2024 at 03:56:01PM +0100, Kwok Cheung Yeung wrote:
> This patch modifies the C and C++ parsers to accept an iterator as a map
> type modifier, storing it in the OMP_CLAUSE_ITERATOR argument of the clause.
> When finishing clauses, any clauses generated from a clause with iterators
> also has the iterator applied to them.
> 
> During gimplification, check_omp_map_iterators is called to check that all
> iterator variables are referenced at some point with a clause.
> Gimplification of the clause decl and size are delayed until iterator
> expansion as they may reference iterator variables.

Any kind of delaying of gimplification feels wrong.
You can arrange for the iterator var to be kept as is, or certain forms of
trees still be allowed through, but arbitrary expressions in there is
definitely wrong.
One could have map(iterator(i=0:1), to: y[foo (bar (i))]) or similar, and
you don't want to gimplify the calls or worse say some FE-ish trees after
gimplification.

> +/* Callback for walk_tree to find a VAR_DECL (stored in DATA) in the
> +   tree TP.  */
> +
> +static tree
> +find_var_decl (tree *tp, int *, void *data)

Please put omp_ somewhere in the name.

> +{
> +  tree t = *tp;
> +
> +  if (TREE_CODE (t) == VAR_DECL && t == (tree) data)

TREE_CODE (x) == VAR_DECL should be VAR_P (x), but why
are you testing it when you do t == (tree) data?
That alone should be enough, no?

> +  for (tree it = OMP_CLAUSE_ITERATORS (c); it; it = TREE_CHAIN (it))
> +{
> +  tree var = TREE_VEC_ELT (it, 0);
> +  tree t = walk_tree (&OMP_CLAUSE_DECL (c), find_var_decl, var, NULL);
> +  if (t == NULL_TREE)
> + t = walk_tree (&OMP_CLAUSE_SIZE (c), find_var_decl, var, NULL);
> +  if (t == NULL_TREE)
> + {
> +   error_at (OMP_CLAUSE_LOCATION (c),
> + "iterator variable %qD not used in clause expression",
> + var);

Where do you see in OpenMP standard a restriction that iterator variable has
to be used in the clause expression?
Sure, the iterators without it are kind of pointless, but unless there is
something in the standard that says it is invalid, we shouldn't reject it.
E.g. one can use iterator which has a single iteration, then why would one
use the iterator in the expression (sure, why would one use the iterator in
that case), or zero iterations.

> @@ -14168,7 +14217,11 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, 
> gimple_seq body, tree *list_p,
>   : TYPE_SIZE_UNIT (TREE_TYPE (decl));
>   }
> gimplify_omp_ctxp = ctx->outer_context;
> -   if (gimplify_expr (&OMP_CLAUSE_SIZE (c), pre_p, NULL,
> +   if (OMP_CLAUSE_ITERATORS (c))
> + /* Gimplify the OMP_CLAUSE_SIZE later, when the iterator is
> +gimplified.  */
> + ;

See above.  At least partial gimplification is a must IMHO.

> +   else if (gimplify_expr (&OMP_CLAUSE_SIZE (c), pre_p, NULL,
> is_gimple_val, fb_rvalue) == GS_ERROR)
>   {
> gimplify_omp_ctxp = ctx;
> @@ -14333,6 +14386,11 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, 
> gimple_seq body, tree *list_p,
> if (code == OMP_TARGET && OMP_CLAUSE_MAP_IN_REDUCTION (c))
>   break;
>  
> +   /* Do not gimplify the declaration yet for clauses with
> +  iterators.  */
> +   if (OMP_CLAUSE_ITERATORS (c))
> + break;

Likewise.

> --- a/gcc/omp-low.cc
> +++ b/gcc/omp-low.cc
> @@ -12607,6 +12607,163 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p, 
> omp_context *ctx)
>  }
>  }
>  
> +extern tree compute_iterator_count (tree it, gimple_seq *pre_p);
> +extern tree *build_iterator_loop (tree it, gimple_seq *pre_p, tree 
> *last_bind);

Such declarations belong to some header file.

Jakub



Re: [PATCH] middle-end: [PR middle-end/116926] Allow widening optabs for vec-mode -> scalar-mode

2024-10-11 Thread Victor Do Nascimento

On 10/11/24 08:28, Richard Biener wrote:

On Thu, Oct 10, 2024 at 5:25 PM Victor Do Nascimento
 wrote:


The recent refactoring of the dot_prod optab to convert-type exposed a
limitation in how `find_widening_optab_handler_and_mode' is currently
implemented, owing to the fact that, while the function expects the

   GET_MODE_CLASS (from_mode) == GET_MODE_CLASS (to_mode)

condition to hold, the c6x backend implements a dot product from V2HI
to SI, which triggers an ICE.

Consequently, this patch adds some logic to allow widening optabs
which accumulate vector elements to a single scalar.

Regression tested on x86_64 and aarch64 with no new regressions.
Fixes failing unit tests on c6x, as validated for the tic6x-unknown-elf
target.

Ok for master?

gcc/ChangeLog:

 PR middle-end/116926
 * optabs-query.cc (find_widening_optab_handler_and_mode): Add
 handling of vector -> scalar optab handling.
---
  gcc/optabs-query.cc | 13 +
  1 file changed, 13 insertions(+)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index c3134d6a2ce..8a9092ffec7 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -485,6 +485,19 @@ find_widening_optab_handler_and_mode (optab op, 
machine_mode to_mode,
if (GET_MODE_CLASS (limit_mode) == MODE_PARTIAL_INT)
 limit_mode = GET_MODE_WIDER_MODE (limit_mode).require ();
  }
+  else if (GET_MODE_CLASS (from_mode) != GET_MODE_CLASS (to_mode))
+{
+  gcc_checking_assert (VECTOR_MODE_P (from_mode)
+  && !VECTOR_MODE_P (to_mode)
+  && GET_MODE_INNER (from_mode) < to_mode);
+  enum insn_code handler = convert_optab_handler (op, to_mode, from_mode);
+  if (handler != CODE_FOR_nothing)
+   {
+ if (found_mode)
+   *found_mode = from_mode;
+ return handler;
+   }


else if (is_a  (to_mode))
  {
 gcc_checking_assert (VECTOR_MODE_P (from_mode)
 && GET_MODE_INNER
(from_mode) < to_mode);
 limit_mode = from_mode;
  }
else
...

would also work?


You're absolutely right, much better.

Patch resubmitted.

Many thanks,
Victor.


Thanks,
Richard.


+}
else
  gcc_checking_assert (GET_MODE_CLASS (from_mode) == GET_MODE_CLASS 
(to_mode)
  && from_mode < to_mode);
--
2.34.1



Re: [PATCH] libcpp, genmatch: Use gcc_diag instead of printf for libcpp diagnostics

2024-10-11 Thread Joseph Myers
On Fri, 13 Sep 2024, Jakub Jelinek wrote:

> @@ -1533,7 +1536,7 @@ do_linemarker (cpp_reader *pfile)
>/* Unlike #line, there does not seem to be a way to get an EOF
>here.  So, it should be safe to always spell the token.  */
>cpp_error (pfile, CPP_DL_ERROR,
> -  "\"%s\" after # is not a positive integer",
> +  "%qs after # is not a positive integer",
>cpp_token_as_text (pfile, token));

Should be %<#%>, I think.

The libcpp, c-family and testsuite changes are OK with that fixed 
(together with reversion of "libcpp: Use ' instead of %< and %> 
[PR117039]" and any other fixes for recent libcpp changes, or consequent 
testsuite changes for changes to this patch).

-- 
Joseph S. Myers
josmy...@redhat.com



[PATCH v2] middle-end: [PR middle-end/116926] Allow widening optabs for vec-mode -> scalar-mode

2024-10-11 Thread Victor Do Nascimento
The recent refactoring of the dot_prod optab to convert-type exposed a
limitation in how `find_widening_optab_handler_and_mode' is currently
implemented, owing to the fact that, while the function expects the

  GET_MODE_CLASS (from_mode) == GET_MODE_CLASS (to_mode)

condition to hold, the c6x backend implements a dot product from V2HI
to SI, which triggers an ICE.

Consequently, this patch adds some logic to allow widening optabs
which accumulate vector elements to a single scalar.

Regression tested on x86_64 and aarch64 with no new regressions.
Fixes failing unit tests on c6x, as validated for the tic6x-unknown-elf
target.

Ok for master?

gcc/ChangeLog:

PR middle-end/116926
* optabs-query.cc (find_widening_optab_handler_and_mode): Add
handling of vector -> scalar optab handling.
---
 gcc/optabs-query.cc | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index c3134d6a2ce..cc52bc0f5ea 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -485,6 +485,12 @@ find_widening_optab_handler_and_mode (optab op, 
machine_mode to_mode,
   if (GET_MODE_CLASS (limit_mode) == MODE_PARTIAL_INT)
limit_mode = GET_MODE_WIDER_MODE (limit_mode).require ();
 }
+  else if (is_a  (to_mode))
+{
+  gcc_checking_assert (VECTOR_MODE_P (from_mode)
+  && GET_MODE_INNER (from_mode) < to_mode);
+  limit_mode = from_mode;
+}
   else
 gcc_checking_assert (GET_MODE_CLASS (from_mode) == GET_MODE_CLASS (to_mode)
 && from_mode < to_mode);
-- 
2.34.1



Re: [PATCH][aarch64][libstdc++] Use shufflevector instead of shuffle in opt_random.h

2024-10-11 Thread Christophe Lyon
On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely  wrote:
>
> On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus  wrote:
> >
> > This patch modifies the implementation of the vectorized Mersenne
> > Twister random number generator to use __builtin_shufflevector instead
> > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> >
> > To make the implementation fully compatible with Clang, Clang will need
> > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > currently it does not. This looks like an oversight in Clang and so will
> > be addressed separately.
> >
> > I see no codegen change with this patch.
>
> I'm not qualified to review this myself, but I'd at least like to see
> the CI checks passing:
> https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7...@nvidia.com/
> Apparently the patch couldn't be applied.
>
> Please configure your email client (thunderbird?) to not munge the
> patch, or attach it rather than sending inline. Or just use
> git-send-email :-)
>
Hi!

The problem is not with how the patch was sent: patchwork managed to
see it as a patch, and the CI tried to apply it.
The problem is that for some reason, git was not able to apply the
patch to our current baseline.
Unfortunately, we do not go as far as calling 'git am
--show-current-patch=diff' or something else to provide more info in
our CI logs, so we can only guess that something went wrong. Maybe
your patch is based against a too old revision of GCC?

Thanks,

Christophe


>
> >
> > Bootstrapped and tested on aarch64-none-linux-gnu.
> >
> > Signed-off-by: Ricardo Jesus 
> >
> > 2024-09-05  Ricardo Jesus  
> >
> > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> > of __builtin_shuffle with __builtin_shufflevector.
> > (__aarch64_lsl_128): Move shift amount to a template parameter.
> > (__aarch64_lsr_128): Move shift amount to a template parameter.
> > (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> > and __aarch64_lsr_128.
> > ---
> >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++
> >   1 file changed, 16 insertions(+), 12 deletions(-)
> >
> > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > index 7f756d1572f..7eb816abcd0 100644
> > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > @@ -35,13 +35,13 @@
> >   #ifdef __ARM_NEON
> >
> >   #ifdef __ARM_BIG_ENDIAN
> > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > -{16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > +16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > +24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> >   #else
> > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > -{_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> > +_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > +_C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> >   #endif
> >
> >   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > namespace {
> >   // Logical Shift right 128-bits by c * 8 bits
> >
> > -__extension__ extern __inline __Uint32x4_t
> > +__extension__
> > +template
> > +extern __inline __Uint32x4_t
> >   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -__aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> > +__aarch64_lsr_128 (__Uint8x16_t __a)
> >   {
> > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> >0, 0, 0, 0, 0, 0, 0, 0};
> > @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >
> >   // Logical Shift left 128-bits by c * 8 bits
> >
> > -__extension__ extern __inline __Uint32x4_t
> > +__extension__
> > +template
> > +extern __inline __Uint32x4_t
> >   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -__aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> > +__aarch64_lsl_128 (__Uint8x16_t __a)
> >   {
> > const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> >0, 0, 0, 0, 0, 0, 0, 0};
> > @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >__Uint32x4_t __e)
> >   {
> > __Uint32x4_t __y = (__b >> __sr1);
> > -  __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> > +  __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
> >
> > __

Re: [PATCH] libstdc++: improve std::atomic compatibility with Clang

2024-10-11 Thread Jonathan Wakely
On Fri, 11 Oct 2024 at 20:35, Jonathan Wakely  wrote:
>
> On Fri, 11 Oct 2024 at 20:33, Jonathan Wakely  wrote:
> >
> > On Sat, 21 Sept 2024 at 10:43, Giuseppe D'Angelo
> >  wrote:
> > >
> > > Hello,
> > >
> > > The attached patch modifies std::atomic's primary template. The goal is
> > > to improve compatibility with Clang, while also possibly making it more
> > > complaint with the changes introduced by P0883 / C++20.
> > >
> > > Simplifying, std::atomic has a `T t = T()` NDSMI and a defaulted default
> > > constructor. The crux of the problem is that Clang seems to be stricter
> > > than GCC when that constructor is considered / instantiated.
> > >
> > > Given a non-default constructible type NDC, doing something like
> > >
> > > constexpr bool b = std::is_default_constructible_v>;
> > >
> > > causes a hard error on Clang because it will "see" the call to `NDC()`
> > > in the NDSMI. The code is instead accepted by GCC. This hard error will
> > > happen anywhere one "mentions" std::atomic's default constructor,
> > > for instance in libstdc++'s C++20 std::pair implementation (uses them in
> > > the explicit(bool) bits). You can play with this here:
> > >
> > > https://gcc.godbolt.org/z/xcr4zK8hx
> > >
> > > PR116769 argues that Clang's behavior is the correct one here, so this
> > > patch improves compat with Clang by removing the defaulted default
> > > constructor.
> >
> > GCC's behaviour seems much more useful.
> >
> > > A related issue is: what's the value of `b` above? std::atomic's default
> > > constructor is not constrained, so it should be `true`. Right now we're
> > > reporting `false` instead.
> >
> > Good, that's the correct answer :-)
> > I don't understand why anybody would want the NSDMI to be ignored and
> > give the wrong answer, or be instantiated and give a hard error.
> >
> >
> > > Thoughts?
> >
> > Your patch changes the value of
> > is_trivially_default_constructible_v> for
> > C++11/14/17.
> >
> > Currently that is true for <= 17 and true for >= 20. You patch makes
> > it always false.
> >
> > If we did this instead, I think all compilers would handle it
> > correctly and we wouldn't change any behaviour for C++17 down:
> >
> > atomic() = default;
> > #ifdef __cpp_concepts >= 202002
>
> We might not require 202002 here, that's the value for P0848R3 and we
> might not need that to conditionally delete the default ctor. I don't
> remember the details.
>
> It might be sufficient to just check that __cpp_concepts is defined.
>
> > atomic() requires (!std::is_constructible_v<_Tp>) = delete;
> > #endif
> >
> > For C++17 there's no NSDMI and the default constructor does the right
> > thing (getting deleted if T is not default constructible).
> > For C++20 the default constructor is deleted if the NSDMI would be
> > ill-formed, which is consistent with the C++17 behaviour.
> > The triviality of the constructor is unchanged.


Or alternatively, and arguably simpler, we could get rid of the ugly
_GLIBCXX20_INIT macro and just do this:

diff --git a/libstdc++-v3/include/bits/version.def
b/libstdc++-v3/include/bits/version.def
index 6783f9aa2a5..942a9c342f2 100644
--- a/libstdc++-v3/include/bits/version.def
+++ b/libstdc++-v3/include/bits/version.def
@@ -766,6 +766,7 @@ ftms = {
  values = {
v = 201911;
cxxmin = 20;
+extra_cond = "__cpp_concepts";
  };
};

diff --git a/libstdc++-v3/include/std/atomic b/libstdc++-v3/include/std/atomic
index c0568d3320b..a555624a2c9 100644
--- a/libstdc++-v3/include/std/atomic
+++ b/libstdc++-v3/include/std/atomic
@@ -189,14 +189,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // __cpp_lib_atomic_wait
  };

-/// @cond undocumented
-#if __cpp_lib_atomic_value_initialization
-# define _GLIBCXX20_INIT(I) = I
-#else
-# define _GLIBCXX20_INIT(I)
-#endif
-/// @endcond
-
  /**
   *  @brief Generic atomic type, primary class template.
   *
@@ -216,7 +208,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  static constexpr int _S_alignment
= _S_min_alignment > alignof(_Tp) ? _S_min_alignment : alignof(_Tp);

-  alignas(_S_alignment) _Tp _M_i _GLIBCXX20_INIT(_Tp());
+  alignas(_S_alignment) _Tp _M_i;

  static_assert(__is_trivially_copyable(_Tp),
   "std::atomic requires a trivially copyable type");
@@ -232,7 +224,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif

public:
+#if __cpp_lib_atomic_value_initialization
+  atomic() noexcept (is_nothrow_default_constructible_v<_Tp>)
+  requires (is_default_constructible_v<_Tp>)
+  : _M_i() { }
+#else
  atomic() = default;
+#endif
  ~atomic() noexcept = default;
  atomic(const atomic&) = delete;
  atomic& operator=(const atomic&) = delete;

This more closely matches what's in the standard, except that we have
a requires-clause on the default constructor. It seems like a defect
that the standard doesn't do this, instead requiring
is_constructible> to be true, but Mandating an error if
you try to use it.



Re: [PATCH][aarch64][libstdc++] Use shufflevector instead of shuffle in opt_random.h

2024-10-11 Thread Christophe Lyon
On Fri, 11 Oct 2024 at 21:00, Jonathan Wakely  wrote:
>
> On Fri, 11 Oct 2024 at 19:52, Christophe Lyon
>  wrote:
> >
> > On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely  wrote:
> > >
> > > On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus  wrote:
> > > >
> > > > This patch modifies the implementation of the vectorized Mersenne
> > > > Twister random number generator to use __builtin_shufflevector instead
> > > > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> > > >
> > > > To make the implementation fully compatible with Clang, Clang will need
> > > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > > > currently it does not. This looks like an oversight in Clang and so will
> > > > be addressed separately.
> > > >
> > > > I see no codegen change with this patch.
> > >
> > > I'm not qualified to review this myself, but I'd at least like to see
> > > the CI checks passing:
> > > https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7...@nvidia.com/
> > > Apparently the patch couldn't be applied.
> > >
> > > Please configure your email client (thunderbird?) to not munge the
> > > patch, or attach it rather than sending inline. Or just use
> > > git-send-email :-)
> > >
> > Hi!
> >
> > The problem is not with how the patch was sent: patchwork managed to
> > see it as a patch, and the CI tried to apply it.
> > The problem is that for some reason, git was not able to apply the
> > patch to our current baseline.
> > Unfortunately, we do not go as far as calling 'git am
> > --show-current-patch=diff' or something else to provide more info in
> > our CI logs, so we can only guess that something went wrong. Maybe
> > your patch is based against a too old revision of GCC?
>
> No, that file hasn't changed anywhere near the patch. The problem is
> that the patch was munged by thunderbird, adding line breaks where
> they corrupt the patch:
>
>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: git diff header lacks filename information when removing 1
> leading pathname component (line 6)
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> I fixed that manually, but it still fails:

I see. Thanks for checking manually!
I didn't go further than looking at the build logs.

Thanks,

Christophe

>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> That's because of an incorrect number of space characters on the
> unchanged context lines around the +/- diffs. I fixed that manually,
> and failed at the next chunk:
>
> Applying: Use shufflevector instead of shuffle in opt_random.h
> error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52
> error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
> does not apply
> Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h
>
> So the problem is how the patch was sent.
>
> >
> > Thanks,
> >
> > Christophe
> >
> >
> > >
> > > >
> > > > Bootstrapped and tested on aarch64-none-linux-gnu.
> > > >
> > > > Signed-off-by: Ricardo Jesus 
> > > >
> > > > 2024-09-05  Ricardo Jesus  
> > > >
> > > > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> > > > of __builtin_shuffle with __builtin_shufflevector.
> > > > (__aarch64_lsl_128): Move shift amount to a template parameter.
> > > > (__aarch64_lsr_128): Move shift amount to a template parameter.
> > > > (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> > > > and __aarch64_lsr_128.
> > > > ---
> > > >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++
> > > >   1 file changed, 16 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > index 7f756d1572f..7eb816abcd0 100644
> > > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > > @@ -35,13 +35,13 @@
> > > >   #ifdef __ARM_NEON
> > > >
> > > >   #ifdef __ARM_BIG_ENDIAN
> > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > > > -{16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > > > +16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > > +24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> > > >   #else
> > > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > > > -{_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > > - _C

[PATCH v4 1/2] libstdc++: Enable memcpy optimizations for distinct integral types [PR93059]

2024-10-11 Thread Jonathan Wakely
Sigh, the v3 patch had a stray "#pragma GCC" in it somehow, so now
here's v4. Unchanged apart from removing that.

Testing powerpc64le-linux ...

-- >8 --

Currently we only optimize std::copy, std::copy_n etc. to memmove when
the source and destination types are the same. This means that we fail
to optimize copying between distinct 1-byte types, e.g. copying from a
buffer of unsigned char to a buffer of char8_t or vice versa.

This patch adds more partial specializations of the __memcpyable trait
so that we allow memcpy between integers of equal widths. This will
enable memmove for copies between narrow character types and also
between same-width types like int and unsigned.

Enabling the optimization needs to be based on the width of the integer
type, not just the size in bytes. This is because some targets define
non-standard integral types such as __int20 in msp430, which has padding
bits. It would not be safe to memcpy between e.g. __int20 and int32_t,
even though sizeof(__int20) == sizeof(int32_t). A new trait is
introduced to define the width, __memcpyable_integer, and then the
__memcpyable trait compares the widths.

It's safe to copy between signed and unsigned integers of the same
width, because GCC only supports two's complement integers.

I initially thought it would be useful to define the specialization
__memcpyable_integer to enable copying between narrow character
types and std::byte. But that isn't possible with std::copy, because
is_assignable is false. Optimized copies using memmove
will already happen for copying std::byte to std::byte, because
__memcpyable is true.

libstdc++-v3/ChangeLog:

PR libstdc++/93059
* include/bits/cpp_type_traits.h (__memcpyable): Add partial
specialization for pointers to distinct types.
(__memcpyable_integer): New trait to control which types can use
cross-type memcpy optimizations.
---
 libstdc++-v3/include/bits/cpp_type_traits.h | 89 -
 1 file changed, 87 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 060652afb18..2f9ce75e82c 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -434,8 +434,6 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
 };
 #endif
 
-  template struct iterator_traits;
-
   // A type that is safe for use with memcpy, memmove, memcmp etc.
   template
 struct __is_nonvolatile_trivially_copyable
@@ -459,16 +457,103 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   enum { __value = 0 };
 };
 
+  // Allow memcpy when source and destination are pointers to the same type.
   template
 struct __memcpyable<_Tp*, _Tp*>
 : __is_nonvolatile_trivially_copyable<_Tp>
 { };
 
+  // Source pointer can be const.
   template
 struct __memcpyable<_Tp*, const _Tp*>
 : __is_nonvolatile_trivially_copyable<_Tp>
 { };
 
+  template struct __memcpyable_integer;
+
+  // For heterogeneous types, allow memcpy between equal-sized integers.
+  template
+struct __memcpyable<_Tp*, _Up*>
+{
+  enum {
+   __value = __memcpyable_integer<_Tp>::__width != 0
+   && ((int)__memcpyable_integer<_Tp>::__width
+ == (int)__memcpyable_integer<_Up>::__width)
+  };
+};
+
+  // Specialization for const U* because __is_integer is never true.
+  template
+struct __memcpyable<_Tp*, const _Up*>
+: __memcpyable<_Tp*, _Up*>
+{ };
+
+  template
+struct __memcpyable_integer
+{
+  enum {
+   __width = __is_integer<_Tp>::__value ? (sizeof(_Tp) * __CHAR_BIT__) : 0
+  };
+};
+
+  // Cannot memcpy volatile memory.
+  template
+struct __memcpyable_integer
+{ enum { __width = 0 }; };
+
+  // Specializations for __intNN types with padding bits.
+#if defined __GLIBCXX_TYPE_INT_N_0 && __GLIBCXX_BITSIZE_INT_N_0 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_0>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_1 && __GLIBCXX_BITSIZE_INT_N_1 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_1>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_1 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_1 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_2 && __GLIBCXX_BITSIZE_INT_N_2 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_2>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_2 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_2 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_3 && __GLIBCXX_BITSIZE_INT_N_3 % __CHAR_BIT__
+  __extension_

Re: ping: [PATCH] libcpp: Support extended characters for #pragma {push,pop}_macro [PR109704]

2024-10-11 Thread Joseph Myers
On Wed, 25 Sep 2024, Lewis Hyatt wrote:

> Hello-
> 
> May I please ping this one? Is there something maybe sub-optimal about
> how I organized it? I can adjust or break it into two maybe if that's
> helpful. Or else, if it's just that #pragma push_macro is not widely
> used or cared about... I think it's still worth fixing clear gaps in
> the support for UTF-8 input, and this is I think the last one, or at
> least, the last obvious one. Thanks!
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642926.html

This patch is OK.

-- 
Joseph S. Myers
josmy...@redhat.com



Re: [PATCH] libcpp: Fix _Pragma("GCC system_header") [PR114436]

2024-10-11 Thread Joseph Myers
On Sat, 23 Mar 2024, Lewis Hyatt wrote:

> Hello-
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114436
> 
> This is a small fix for the issue mentioned in the PR that _Pragma("GCC
> system_header") does not work completely. I believe it was always the case
> since _Pragma() support was first added. bootstrap + regtested all languages
> on x86-64 Linux. Is it OK now or in stage 1 please? Thanks!
> 
> -Lewis
> 
> -- >8 --
> 
> _Pragma("GCC system_header") currently takes effect only partially. It does
> succeed in updating the line_map, so that checks like in_system_header_at()
> return correctly, but it does not update pfile->buffer->sysp.  One result is
> that a subsequent #include does not set up the system header state properly
> for the newly included file, as pointed out in the PR. Fix by propagating
> the new system header state back to the buffer after processing the pragma.

OK.

-- 
Joseph S. Myers
josmy...@redhat.com



[PATCH v8] Provide new GCC builtin __builtin_counted_by_ref [PR116016]

2024-10-11 Thread Qing Zhao
Hi, This is the 8th version of the patch.

Compared to the 7th version, the major changes are several documentation
wording issues raised by Joseph.

The 7th version is at:
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/664032.html

bootstrapped and regress tested on both X86 and aarch64. no issue.

Okay for the trunk?

thanks.

Qing.

With the addition of the 'counted_by' attribute and its wide roll-out
within the Linux kernel, a use case has been found that would be very
nice to have for object allocators: being able to set the counted_by
counter variable without knowing its name.

For example, given:

  struct foo {
...
int counter;
...
struct bar array[] __attribute__((counted_by (counter)));
  } *p;

The existing Linux object allocators are roughly:

  #define MAX(A, B) (A > B) ? (A) : (B)
  #define alloc(P, FAM, COUNT) ({ \
__auto_type __p = &(P); \
size_t __size = MAX (sizeof(*P),
 __builtin_offsetof (__typeof(*P), FAM)
 + sizeof (*(P->FAM)) * COUNT); \
*__p = kmalloc(__size); \
  })

Right now, any addition of a counted_by annotation must also
include an open-coded assignment of the counter variable after
the allocation:

  p = alloc(p, array, how_many);
  p->counter = how_many;

In order to avoid the tedious and error-prone work of manually adding
the open-coded counted-by intializations everywhere in the Linux
kernel, a new GCC builtin __builtin_counted_by_ref will be very useful
to be added to help the adoption of the counted-by attribute.

 -- Built-in Function: TYPE __builtin_counted_by_ref (PTR)
 The built-in function '__builtin_counted_by_ref' checks whether the
 array object pointed by the pointer PTR has another object
 associated with it that represents the number of elements in the
 array object through the 'counted_by' attribute (i.e.  the
 counted-by object).  If so, returns a pointer to the corresponding
 counted-by object.  If such counted-by object does not exist,
 returns a null pointer.

 This built-in function is only available in C for now.

 The argument PTR must be a pointer to an array.  The TYPE of the
 returned value is a pointer type pointing to the corresponding
 type of the counted-by object or a void pointer type in case of a
 null pointer being returned.

With this new builtin, the central allocator could be updated to:

  #define MAX(A, B) (A > B) ? (A) : (B)
  #define alloc(P, FAM, COUNT) ({ \
__auto_type __p = &(P); \
__auto_type __c = (COUNT); \
size_t __size = MAX (sizeof (*(*__p)),\
 __builtin_offsetof (__typeof(*(*__p)),FAM) \
 + sizeof (*((*__p)->FAM)) * __c); \
if ((*__p = kmalloc(__size))) { \
  __auto_type ret = __builtin_counted_by_ref((*__p)->FAM); \
  *_Generic(ret, void *: &(size_t){0}, default: ret) = __c; \
} \
  })

And then structs can gain the counted_by attribute without needing
additional open-coded counter assignments for each struct, and
unannotated structs could still use the same allocator.

PR c/116016

gcc/c-family/ChangeLog:

* c-common.cc: Add new __builtin_counted_by_ref.
* c-common.h (enum rid): Add RID_BUILTIN_COUNTED_BY_REF.

gcc/c/ChangeLog:

* c-decl.cc (names_builtin_p): Add RID_BUILTIN_COUNTED_BY_REF.
* c-parser.cc (has_counted_by_object): New routine.
(get_counted_by_ref): New routine.
(c_parser_postfix_expression): Handle New RID_BUILTIN_COUNTED_BY_REF.
* c-tree.h: New routine handle_counted_by_for_component_ref.
* c-typeck.cc (handle_counted_by_for_component_ref): New routine.
(build_component_ref): Call the new routine.

gcc/ChangeLog:

* doc/extend.texi: Add documentation for __builtin_counted_by_ref.

gcc/testsuite/ChangeLog:

* gcc.dg/builtin-counted-by-ref-1.c: New test.
* gcc.dg/builtin-counted-by-ref.c: New test.
---
 gcc/c-family/c-common.cc  |   1 +
 gcc/c-family/c-common.h   |   1 +
 gcc/c/c-decl.cc   |   1 +
 gcc/c/c-parser.cc |  79 ++
 gcc/c/c-tree.h|   1 +
 gcc/c/c-typeck.cc |  33 +++--
 gcc/doc/extend.texi   |  55 +++
 .../gcc.dg/builtin-counted-by-ref-1.c | 135 ++
 gcc/testsuite/gcc.dg/builtin-counted-by-ref.c |  61 
 9 files changed, 358 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/builtin-counted-by-ref-1.c
 create mode 100644 gcc/testsuite/gcc.dg/builtin-counted-by-ref.c

diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index ec6a5da892d..8ad9b998e7b 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -430,6 +430,7 @@ const struct c_common_resword c_common_reswords[] =
   { "__builtin_choose_expr", RID_CHOOSE_EXPR, D_CONLY

ping Re: RFC PATCH: contrib/test_summary mode for submitting testsuite results to bunsen

2024-10-11 Thread Frank Ch. Eigler
Hi -

(ping)


commit 23c3100e992029994f33eb4a1465570b476c1df4 (HEAD -> master)
Author: Frank Ch. Eigler 
Date:   Mon Sep 23 18:03:31 2024 -0400

contrib/test_summary: Add bunsen uploading mode

This makes it easy for someone to push gcc dejagnu/autoconf test
results to a bunsen [1] system, as an alternative or supplement to
sending a subset by email to .  Bunsen
allows minimum-infrastructure archiving, indexing, and analysis of
test results.

% contrib/test_summary -b
echo 'master' > .bunsen.source.gitbranch &&
echo 'basepoints/gcc-15-3524-ga523c2ba5862' > .bunsen.source.gitdescribe &&
echo 'a523c2ba58621c3630a1cd890d6db82879f92c90' > .bunsen.source.gitname &&
echo 'git://gcc.gnu.org/git/gcc.git' > .bunsen.source.gitrepo &&
(find . -name '*.log' -o -name '*.sum' -o -name '.bunsen.*' | 
t-upload-git-sh 'ssh://sourceware.org/git/bunsendb.git/' 
'fche/gcc/x86_64-pc-linux-gnu/x86_64-pc-linux-gnu/20240923-1817')

Commit access to the sourceware bunsen database [2] is available on
request [3], so uploads automatically show up in the web interface
[4], but one may also operate a private copy of the system to use it
entirely locally.  A unique tag name is generated from one's userid,
the gcc host/target triplets, and a timestamp, but these defaults may
be overridden with contrib/test_summary options.  The git
commit/tag/push machinery is wrapped into a tiny "t-upload-git-push"
shell script, which may be downloaded from bunsen.git into your $PATH.

[1] https://sourceware.org/bunsen/
[2] https://sourceware.org/git/bunsendb.git
[3] 
[4] https://builder.sourceware.org/testruns/

https://inbox.sourceware.org/bunsen/20240913201848.gc25...@redhat.com/

ChangeLog:

   * Makefile.tpl, Makefile.in: Add bunsen-report.log target.

contrib/ChangeLog:

   * test_summary: Add -b (bunsen) mode to report all test results
 into a https://sourceware.org/bunsen/-like system instead of
 emailing extracts.

Signed-Off-By: Frank Ch. Eigler 

diff --git a/Makefile.in b/Makefile.in
index 966d60454960..8c352f7a2956 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -2852,6 +2852,11 @@ mail-report-with-warnings.log: warning.log
chmod +x $@
echo If you really want to send e-mail, run ./$@ now
 
+bunsen-report.log:
+   $(srcdir)/contrib/test_summary -b >$@
+   chmod +x $@
+   echo If you really want to send to bunsen, run ./$@ now
+
 # Local Vim config
 
 $(srcdir)/.local.vimrc:
diff --git a/Makefile.tpl b/Makefile.tpl
index da38dca697ad..9816fcd6f5b2 100644
--- a/Makefile.tpl
+++ b/Makefile.tpl
@@ -1034,6 +1034,11 @@ mail-report-with-warnings.log: warning.log
chmod +x $@
echo If you really want to send e-mail, run ./$@ now
 
+bunsen-report.log:
+   $(srcdir)/contrib/test_summary -b >$@
+   chmod +x $@
+   echo If you really want to send to bunsen, run ./$@ now
+
 # Local Vim config
 
 $(srcdir)/.local.vimrc:
diff --git a/contrib/test_summary b/contrib/test_summary
index 5760b053ec27..b4a9c92b753e 100755
--- a/contrib/test_summary
+++ b/contrib/test_summary
@@ -39,6 +39,10 @@ if test x"$1" = "x-h"; then
  should be selected from the log files.
  -f: force reports to be mailed; if omitted, only reports that differ
  from the sent.* version are sent.
+ -b: instead of emailing, push test logs into a bunsen git repo
+ -bg REPO: specify the bunsen git repo to override default
+ -bi TAG1: specify the bunsen tag prefix (user name)
+ -bt TAG2: specify the bunsen tag suffix (build name)
 _EOF
   exit 0
 fi
@@ -57,6 +61,10 @@ fi
 : ${filesuffix=}; export filesuffix
 : ${move=true}; export move
 : ${forcemail=false}; export forcemail
+: ${bunsen=false};
+: ${bunsengit=ssh://sourceware.org/git/bunsendb.git/};
+: ${bunsentag1=`whoami`};
+: ${bunsentag2=gcc/`grep ^host= config.log | tr -d "'" | cut -f2 -d=`/`grep 
^target= config.log | tr -d "'" | cut -f2 -d=`/`date +%Y%m%d-%H%M`};
 while true; do
 case "$1" in 
   -o) filesuffix=.sent; move=false; : ${mailto=nobody}; shift;;
@@ -64,10 +72,31 @@ while true; do
   -p) prepend_logs=${prepend_logs+"$prepend_logs "}"$2"; shift 2;;
   -i) append_logs=${append_logs+"$append_logs "}"$2"; shift 2;;
   -m) mailto=$2; forcemail=true; shift 2;;
+  -b) bunsen=true; shift;;
+  -bg) bunsengit=$2; shift 2;;
+  -bi) bunsentag1=$2; shift 2;;
+  -bt) bunsentag2=$2; shift 2;;
   -f) unset mailto; forcemail=true; shift;;
   *) break;;
 esac
 done
+if [ "x$bunsen" = "xtrue" ]; then
+gitsrcdir=`dirname "$0"` # this script, contrib/test_summary
+gitsrcdir=`dirname "$gitsrcdir"` # and the parent directory
+if [ -d "$gitsrcdir/.git" ]; then # is this a git-hosted source tree?
+# gather basic build metadata for sourceware-buildbot-style .bunsen da
+gitbranch=`cd "$gitsrcdir"; git rev-parse --abbrev-ref HEAD`
+

Re: [PATCH] g++.target/i386/pr105953.C: Skip for x32

2024-10-11 Thread H.J. Lu
On Thu, Oct 10, 2024 at 7:16 PM H.J. Lu  wrote:
>
> Since -mabi=ms isn't supported for x32, skip g++.target/i386/pr105953.C
> for x32.
>
> * g++.target/i386/pr105953.C: Skip for x32.
>
>
> --
> H.J.

I am checking in this patch.

-- 
H.J.


Re: [PATCH] gcc.target/i386/pr115407.c: Only run for lp64

2024-10-11 Thread H.J. Lu
On Thu, Oct 10, 2024 at 7:34 PM H.J. Lu  wrote:
>
> On Thu, Oct 10, 2024 at 7:13 PM H.J. Lu  wrote:
> >
> > Since -mcmodel=large is valid only for lp64, run pr115407.c only for
> > lp64.
> >
> > * gcc.target/i386/pr115407.c: Only run for lp64.
> >
> > --
> > H.J.
>
> This time is the correct patch.
>
> --
> H.J.

I am checking in this patch.

-- 
H.J.


Re: [PATCH] gcc.target/i386: Replace long with long long

2024-10-11 Thread H.J. Lu
On Thu, Oct 10, 2024 at 7:32 PM H.J. Lu  wrote:
>
> On Thu, Oct 10, 2024 at 7:14 PM H.J. Lu  wrote:
> >
> > Since long is 64-bit for x32, replace long with long long for x32.
> >
> > * gcc.target/i386/bmi2-pr112526.c: Replace long with long long.
> > * gcc.target/i386/pr105854.c: Likewise.
> > * gcc.target/i386/pr112943.c: Likewise.
> > * gcc.target/i386/pr67325.c: Likewise.
> > * gcc.target/i386/pr97971.c: Likewise.
> >
> > --
> > H.J.
>
> This time is the correct patch.
>
> --
> H.J.

I am checking in this patch.

-- 
H.J.


Re: [PATCH] libstdc++: improve std::atomic compatibility with Clang

2024-10-11 Thread Jonathan Wakely
On Sat, 21 Sept 2024 at 10:43, Giuseppe D'Angelo
 wrote:
>
> Hello,
>
> The attached patch modifies std::atomic's primary template. The goal is
> to improve compatibility with Clang, while also possibly making it more
> compliant with the changes introduced by P0883 / C++20.
>
> Simplifying, std::atomic has a `T t = T()` NSDMI and a defaulted default
> constructor. The crux of the problem is that Clang seems to be stricter
> than GCC when that constructor is considered / instantiated.
>
> Given a non-default constructible type NDC, doing something like
>
> constexpr bool b = std::is_default_constructible_v<std::atomic<NDC>>;
>
> causes a hard error on Clang because it will "see" the call to `NDC()`
> in the NSDMI. The code is instead accepted by GCC. This hard error will
> happen anywhere one "mentions" std::atomic's default constructor,
> for instance in libstdc++'s C++20 std::pair implementation (uses them in
> the explicit(bool) bits). You can play with this here:
>
> https://gcc.godbolt.org/z/xcr4zK8hx
>
> PR116769 argues that Clang's behavior is the correct one here, so this
> patch improves compat with Clang by removing the defaulted default
> constructor.

GCC's behaviour seems much more useful.

> A related issue is: what's the value of `b` above? std::atomic's default
> constructor is not constrained, so it should be `true`. Right now we're
> reporting `false` instead.

Good, that's the correct answer :-)
I don't understand why anybody would want the NSDMI to be ignored and
give the wrong answer, or be instantiated and give a hard error.


> Thoughts?

Your patch changes the value of
is_trivially_default_constructible_v> for
C++11/14/17.

Currently that is true for <= 17 and true for >= 20. Your patch makes
it always false.

If we did this instead, I think all compilers would handle it
correctly and we wouldn't change any behaviour for C++17 down:

atomic() = default;
#ifdef __cpp_concepts >= 202002
atomic() requires (!std::is_constructible_v<_Tp>) = delete;
#endif

For C++17 there's no NSDMI and the default constructor does the right
thing (getting deleted if T is not default constructible).
For C++20 the default constructor is deleted if the NSDMI would be
ill-formed, which is consistent with the C++17 behaviour.
The triviality of the constructor is unchanged.



Re: [PATCH] libstdc++: improve std::atomic compatibility with Clang

2024-10-11 Thread Jonathan Wakely
On Fri, 11 Oct 2024 at 20:33, Jonathan Wakely  wrote:
>
> On Sat, 21 Sept 2024 at 10:43, Giuseppe D'Angelo
>  wrote:
> >
> > Hello,
> >
> > The attached patch modifies std::atomic's primary template. The goal is
> > to improve compatibility with Clang, while also possibly making it more
> > compliant with the changes introduced by P0883 / C++20.
> >
> > Simplifying, std::atomic has a `T t = T()` NSDMI and a defaulted default
> > constructor. The crux of the problem is that Clang seems to be stricter
> > than GCC when that constructor is considered / instantiated.
> >
> > Given a non-default constructible type NDC, doing something like
> >
> > constexpr bool b = std::is_default_constructible_v<std::atomic<NDC>>;
> >
> > causes a hard error on Clang because it will "see" the call to `NDC()`
> > in the NSDMI. The code is instead accepted by GCC. This hard error will
> > happen anywhere one "mentions" std::atomic's default constructor,
> > for instance in libstdc++'s C++20 std::pair implementation (uses them in
> > the explicit(bool) bits). You can play with this here:
> >
> > https://gcc.godbolt.org/z/xcr4zK8hx
> >
> > PR116769 argues that Clang's behavior is the correct one here, so this
> > patch improves compat with Clang by removing the defaulted default
> > constructor.
>
> GCC's behaviour seems much more useful.
>
> > A related issue is: what's the value of `b` above? std::atomic's default
> > constructor is not constrained, so it should be `true`. Right now we're
> > reporting `false` instead.
>
> Good, that's the correct answer :-)
> I don't understand why anybody would want the NSDMI to be ignored and
> give the wrong answer, or be instantiated and give a hard error.
>
>
> > Thoughts?
>
> Your patch changes the value of
> is_trivially_default_constructible_v> for
> C++11/14/17.
>
> Currently that is true for <= 17 and true for >= 20. Your patch makes
> it always false.
>
> If we did this instead, I think all compilers would handle it
> correctly and we wouldn't change any behaviour for C++17 down:
>
> atomic() = default;
> #ifdef __cpp_concepts >= 202002

We might not require 202002 here, that's the value for P0848R3 and we
might not need that to conditionally delete the default ctor. I don't
remember the details.

It might be sufficient to just check that __cpp_concepts is defined.

> atomic() requires (!std::is_constructible_v<_Tp>) = delete;
> #endif
>
> For C++17 there's no NSDMI and the default constructor does the right
> thing (getting deleted if T is not default constructible).
> For C++20 the default constructor is deleted if the NSDMI would be
> ill-formed, which is consistent with the C++17 behaviour.
> The triviality of the constructor is unchanged.



Re: [patch, Fortran, RFC] Introduce GFC_STD_UNSIGNED

2024-10-11 Thread Jerry Delisle
Good to go.

On Fri, Oct 11, 2024, 9:06 AM Thomas Koenig  wrote:

> Am 11.10.24 um 18:00 schrieb Thomas Koenig:
> > Hello world,
> >
> > the attached patch creates an unsigned "standard" for the
> > gfc_option.allow_std field.
> >
> > One of the main reason why people want UNSIGNED for Fortran is
> > interfacing for C.
> >
> > This is a preparation for further work on the ISO_C_BINDING constants.
> > That, we do via iso-c-binding.def , whose last field is a standard
> > for the constant to be defined for the standard in question, which is
> > then checked.  I could try and invent a different method for this,
> > but I'd rather not.
> >
> > So, OK for trunk? Other, better ideas?
>
> ChangeLog was missing, here it is. Also regression-tested.
>
>
> gcc/fortran/ChangeLog:
>
> * intrinsic.cc (add_functions): Convert uint and
> selected_unsigned_kind to GFC_STD_UNSIGNED.
> (gfc_check_intrinsic_standard): Handle GFC_STD_UNSIGNED.
> * libgfortran.h (GFC_STD_UNSIGNED): Add.
> * options.cc (gfc_post_options): Set GFC_STD_UNSIGNED
> if -funsigned is set.
>


Re: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction

2024-10-11 Thread Richard Sandiford
Kyrylo Tkachov  writes:
>> Kyrylo Tkachov  writes:
>>> The pattern for the Advanced SIMD XAR instruction isn't very
>>> optimization-friendly at the moment.
>>> In the testcase from the PR once simplify-rtx has done its work it
>>> generates the RTL:
>>> (set (reg:V2DI 119 [ _14 ])
>>>(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
>>>(reg:V2DI 116 [ *m1_01_8(D) ]))
>>>(const_vector:V2DI [
>>>(const_int 32 [0x20]) repeated x2
>>>])))
>>> 
>>> which fails to match our XAR pattern because the pattern expects:
>>> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
>>> documentation the preferred form of rotate-by-immediate is ROTATE, which
>>> I take to mean it's the canonical form.
>>> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
>>> one canonical representation.
>>> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
>>> constant.
>> 
>> Following on from the 1/2 review, I'm surprised that the middle end
>> requires a vector.  I would have expected a scalar shift to work.
>> 
>> I agree it should be rotate rather than rotatert though.  Out of curiosity,
>> where do things go wrong if we just fix that, but keep the scalar shift
>> amount?
>
> The vector constant comes out of the test case using intrinsics such as 
> vshlq_u64 that take a vector as a shift amount.
> Our pattern for vector shift by immediate 
> aarch64_simd_imm_shl expresses the shift amount
> as a vector so I suppose it all comes to that.
> The standard ashl3 expander does take a scalar shift amount but 
> explicitly creates a vector constant for the RTL passes.
> So it seems that we are de facto standardized on using vectors.

OK, thanks, makes sense.

Richard

> Naively, I’d hope recog would try both forms and save us the trouble of 
> worrying about it, but I think we’ve been reluctant to complicate recog that 
> way in the past.
>
>> 
>> No objection to switching to vectors in principle though, especially if it
>> matches what we do elsewhere.
>
> Thanks, I’ll adjust patch 1/2 in the meantime
> Kyrill
>
>> 
>> Thanks,
>> Richard
>> 
>>> 
>>> These issues are fixed by introducing a dedicated expander for the
>>> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
>>> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
>>> a repeated vector constant subtracted from 64 to give the corresponding
>>> left-rotate amount that is fed to the new representation for the XAR
>>> define_insn that uses the ROTATE RTL code.  This is a similar approach
>>> to have we handle the discrepancy between intrinsic-level and RTL-level
>>> vector lane numbers for big-endian.
>>> 
>>> With this patch and [1/2] the arithmetic parts of the testcase now simplify
>>> to just one XAR instruction.
>>> 
>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>> I’ll push it after patch approval of [1/2] leaving some time for comments.
>>> 
>>> I’ll note that the SVE2 patterns for XAR should also be improved in a 
>>> similar
>>> way, but that is a separate patch.
>>> 
>>> Thanks,
>>> Kyrill
>>> 
>>> Signed-off-by: Kyrylo Tkachov 
>>> 
>>> gcc/
>>>  PR target/117048
>>>  * config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
>>>  define_expand.
>>>  (*aarch64_xarqv2di_insn): Define.
>>> 
>>> gcc/testsuite/
>>>  PR target/117048
>>>  * g++.target/aarch64/pr117048.C: New test.
>>> 
>>> From 4f699bf239a563a05e88da5958c44a643718852c Mon Sep 17 00:00:00 2001
>>> From: Kyrylo Tkachov 
>>> Date: Wed, 9 Oct 2024 09:40:33 -0700
>>> Subject: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and
>>> optimization-friendly representation for XAR instruction
>>> 
>>> The pattern for the Advanced SIMD XAR instruction isn't very
>>> optimization-friendly at the moment.
>>> In the testcase from the PR once simplify-rtx has done its work it
>>> generates the RTL:
>>> (set (reg:V2DI 119 [ _14 ])
>>>(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
>>>(reg:V2DI 116 [ *m1_01_8(D) ]))
>>>(const_vector:V2DI [
>>>(const_int 32 [0x20]) repeated x2
>>>])))
>>> 
>>> which fails to match our XAR pattern because the pattern expects:
>>> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
>>> documentation the preferred form of rotate-by-immediate is ROTATE, which
>>> I take to mean it's the canonical form.
>>> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
>>> one canonical representation.
>>> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
>>> constant.
>>> 
>>> These issues are fixed by introducing a dedicated expander for the
>>> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
>>> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
>>> a repeated vector constant subtracted from 64 to give the corresponding
>

[PATCH 0/5] Provide better definitions of NULL

2024-10-11 Thread Alejandro Colomar
Hi,

This is just an untested draft.  If there's rough agreement that this is
wanted, I'll test it, write changelog, etc.

The intention of this change is to help improve the common C/C++
language subset, promoting the use of NULL in both languages as the null
pointer constant, expanding to nullptr in C++, and to ((void *)0) in C.

While C23 added nullptr, it was a terrible mistake, and it doesn't do
any good to the C language.  (See the link below for more details.)
Instead, it's C++ which should reconcile with C.  Let's work in that
direction.

Link: 

Have a lovely day!
Alex

Alejandro Colomar (5):
  gcc/ginclude/stddef.h: Indent nested cpp conditionals
  gcc/ginclude/stddef.h: Invert conditional
  gcc/ginclude/stddef.h: Define NULL as nullptr if possible
  Don't define NULL as 0 in C
  libgm2/libm2pim/wrapc.cc: Define NULL as nullptr

 gcc/ginclude/stddef.h | 24 +++
 .../gcc.c-torture/execute/pr68143_1.c |  2 +-
 gcc/testsuite/gcc.c-torture/execute/pr70566.c |  2 +-
 gcc/testsuite/gcc.dg/tm/20100615.c|  2 +-
 gcc/testsuite/gcc.target/aarch64/pr91927.c|  2 +-
 libgm2/libm2pim/wrapc.cc  |  4 +---
 libiberty/alloca.c|  2 +-
 libiberty/argv.c  |  2 +-
 libiberty/getopt1.c   |  2 +-
 9 files changed, 22 insertions(+), 20 deletions(-)

Range-diff:
-:  --- > 1:  8db5b662bdf gcc/ginclude/stddef.h: Indent nested cpp 
conditionals
-:  --- > 2:  74c07a6be6e gcc/ginclude/stddef.h: Invert conditional
-:  --- > 3:  f0e9337f2e3 gcc/ginclude/stddef.h: Define NULL as nullptr 
if possible
-:  --- > 4:  30dc1a7973f Don't define NULL as 0 in C
-:  --- > 5:  e1836aac591 libgm2/libm2pim/wrapc.cc: Define NULL as 
nullptr
-- 
2.45.2



signature.asc
Description: PGP signature


[PATCH 1/5] gcc/ginclude/stddef.h: Indent nested cpp conditionals

2024-10-11 Thread Alejandro Colomar
This is in preparation for the following commits.

Signed-off-by: Alejandro Colomar 
---
 gcc/ginclude/stddef.h | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/ginclude/stddef.h b/gcc/ginclude/stddef.h
index 349213108ce..f2c4f28673c 100644
--- a/gcc/ginclude/stddef.h
+++ b/gcc/ginclude/stddef.h
@@ -396,16 +396,16 @@ typedef __WINT_TYPE__ wint_t;
 /* A null pointer constant.  */
 
 #if defined (_STDDEF_H) || defined (__need_NULL)
-#undef NULL/* in case  has defined it. */
-#ifdef __GNUG__
-#define NULL __null
-#else   /* G++ */
-#ifndef __cplusplus
-#define NULL ((void *)0)
-#else   /* C++ */
-#define NULL 0
-#endif  /* C++ */
-#endif  /* G++ */
+# undef NULL   /* in case  has defined it. */
+# ifdef __GNUG__
+#  define NULL __null
+# else   /* G++ */
+#  ifndef __cplusplus
+#   define NULL ((void *)0)
+#  else   /* C++ */
+#   define NULL 0
+#  endif  /* C++ */
+# endif  /* G++ */
 #endif /* NULL not defined and  or need NULL.  */
 #undef __need_NULL
 
-- 
2.45.2



signature.asc
Description: PGP signature


[PATCH 4/5] Don't define NULL as 0 in C

2024-10-11 Thread Alejandro Colomar
That was insane.

Link: 
Signed-off-by: Alejandro Colomar 
---
 gcc/testsuite/gcc.c-torture/execute/pr68143_1.c | 2 +-
 gcc/testsuite/gcc.c-torture/execute/pr70566.c   | 2 +-
 gcc/testsuite/gcc.dg/tm/20100615.c  | 2 +-
 gcc/testsuite/gcc.target/aarch64/pr91927.c  | 2 +-
 libiberty/alloca.c  | 2 +-
 libiberty/argv.c| 2 +-
 libiberty/getopt1.c | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68143_1.c 
b/gcc/testsuite/gcc.c-torture/execute/pr68143_1.c
index cbfbbc2458b..87978906e6d 100644
--- a/gcc/testsuite/gcc.c-torture/execute/pr68143_1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr68143_1.c
@@ -1,4 +1,4 @@
-#define NULL 0
+#define NULL ((void *) 0)
 
 struct stuff
 {
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr70566.c 
b/gcc/testsuite/gcc.c-torture/execute/pr70566.c
index f47106e70c7..4a52d0789f2 100644
--- a/gcc/testsuite/gcc.c-torture/execute/pr70566.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr70566.c
@@ -1,6 +1,6 @@
 /* PR target/70566.  */
 
-#define NULL 0
+#define NULL ((void *) 0)
 
 struct mystruct
 {
diff --git a/gcc/testsuite/gcc.dg/tm/20100615.c 
b/gcc/testsuite/gcc.dg/tm/20100615.c
index 26964d43367..7b5ab0d827d 100644
--- a/gcc/testsuite/gcc.dg/tm/20100615.c
+++ b/gcc/testsuite/gcc.dg/tm/20100615.c
@@ -6,7 +6,7 @@
 /* { dg-final { scan-assembler-not "tm_clone_table" { target { ! *-*-darwin*  
} } } } */
 /* { dg-final { scan-assembler-not "__DATA,__tm_clone_table" { target 
*-*-darwin*  } } } */
 
-#define NULL 0
+#define NULL ((void *) 0)
 extern void *malloc (__SIZE_TYPE__);
 
 __attribute__((transaction_pure))
diff --git a/gcc/testsuite/gcc.target/aarch64/pr91927.c 
b/gcc/testsuite/gcc.target/aarch64/pr91927.c
index f5cde1a5336..e0c0574c949 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr91927.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr91927.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mstrict-align -O3" } */
 
-#define NULL 0
+#define NULL ((void *) 0)
 
 typedef unsigned uint32_t;
 typedef struct __attribute__((__packed__))
diff --git a/libiberty/alloca.c b/libiberty/alloca.c
index b75f7560f94..3bf0b17b41c 100644
--- a/libiberty/alloca.c
+++ b/libiberty/alloca.c
@@ -78,7 +78,7 @@ static long i00afunc ();
 #endif
 
 #ifndef NULL
-#defineNULL0
+#defineNULL ((void *) 0)
 #endif
 
 /* Define STACK_DIRECTION if you know the direction of stack
diff --git a/libiberty/argv.c b/libiberty/argv.c
index f889432a868..3cb79a09eca 100644
--- a/libiberty/argv.c
+++ b/libiberty/argv.c
@@ -44,7 +44,7 @@ Boston, MA 02110-1301, USA.  */
 #endif
 
 #ifndef NULL
-#define NULL 0
+#define NULL ((void *) 0)
 #endif
 
 #ifndef EOS
diff --git a/libiberty/getopt1.c b/libiberty/getopt1.c
index 7db3d167757..cb80c5ed3d8 100644
--- a/libiberty/getopt1.c
+++ b/libiberty/getopt1.c
@@ -61,7 +61,7 @@
 #endif
 
 #ifndefNULL
-#define NULL 0
+#define NULL ((void *) 0)
 #endif
 
 int
-- 
2.45.2



signature.asc
Description: PGP signature


[PATCH 2/5] gcc/ginclude/stddef.h: Invert conditional

2024-10-11 Thread Alejandro Colomar
This is in preparation for the following commit.

Signed-off-by: Alejandro Colomar 
---
 gcc/ginclude/stddef.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/ginclude/stddef.h b/gcc/ginclude/stddef.h
index f2c4f28673c..4d04b0163d5 100644
--- a/gcc/ginclude/stddef.h
+++ b/gcc/ginclude/stddef.h
@@ -400,10 +400,10 @@ typedef __WINT_TYPE__ wint_t;
 # ifdef __GNUG__
 #  define NULL __null
 # else   /* G++ */
-#  ifndef __cplusplus
-#   define NULL ((void *)0)
-#  else   /* C++ */
+#  if defined(__cplusplus)
 #   define NULL 0
+#  else   /* C++ */
+#   define NULL ((void *)0)
 #  endif  /* C++ */
 # endif  /* G++ */
 #endif /* NULL not defined and  or need NULL.  */
-- 
2.45.2



signature.asc
Description: PGP signature


[PATCH 5/5] libgm2/libm2pim/wrapc.cc: Define NULL as nullptr

2024-10-11 Thread Alejandro Colomar
For internal C++ code, unconditionally define NULL as nullptr.
We already require a C++11 compiler to bootstrap GCC anyway.

Link: 
Signed-off-by: Alejandro Colomar 
---
 libgm2/libm2pim/wrapc.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libgm2/libm2pim/wrapc.cc b/libgm2/libm2pim/wrapc.cc
index 5c31f1e2687..cdd1cf0d0fe 100644
--- a/libgm2/libm2pim/wrapc.cc
+++ b/libgm2/libm2pim/wrapc.cc
@@ -63,10 +63,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 #include 
 #endif
 
-/* Define a generic NULL if one hasn't already been defined.  */
-
 #if !defined(NULL)
-#define NULL 0
+#define NULL nullptr
 #endif
 
 /* strtime returns the address of a string which describes the
-- 
2.45.2



signature.asc
Description: PGP signature


[PATCH 3/5] gcc/ginclude/stddef.h: Define NULL as nullptr if possible

2024-10-11 Thread Alejandro Colomar
0 is a terrible definition of NULL.  If possible, that is, for C++11 or
later, define is as nullptr.

Link: 
Signed-off-by: Alejandro Colomar 
---
 gcc/ginclude/stddef.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/ginclude/stddef.h b/gcc/ginclude/stddef.h
index 4d04b0163d5..3d9e9ae53b5 100644
--- a/gcc/ginclude/stddef.h
+++ b/gcc/ginclude/stddef.h
@@ -401,7 +401,11 @@ typedef __WINT_TYPE__ wint_t;
 #  define NULL __null
 # else   /* G++ */
 #  if defined(__cplusplus)
-#   define NULL 0
+#   if (__cplusplus >= 201103L)
+#define NULL nullptr
+#   else
+#define NULL 0
+#   endif
 #  else   /* C++ */
 #   define NULL ((void *)0)
 #  endif  /* C++ */
-- 
2.45.2



signature.asc
Description: PGP signature


Re: [PATCH 1/2] libstdc++: Enable memcpy optimizations for distinct integral types [PR93059]

2024-10-11 Thread Jonathan Wakely
On Fri, 11 Oct 2024 at 07:48, Jonathan Wakely  wrote:
>
> Tested x86_64-linux.
>
> -- >8 --
>
> Currently we only optimize std::copy, std::copy_n etc. to memmove when
> the source and destination types are the same. This means that we fail
> to optimize copying between distinct 1-byte types, e.g. copying from a
> buffer of std::byte to a buffer of unsigned char.
>
> This patch adds more partial specializations of the __memcpyable trait
> so that we allow memcpy between integers of equal widths. This will
> enable memmove for copying std::byte to unsigned char, and copying int
> to unsigned, and long to long long (for I32LP64) or long to int (for
> ILP32).
>
> Enabling the optimization needs to be based on the width of the integer
> type, not just the size in bytes. This is because some targets define
> non-standard integral types such as __int20 in msp430, which has padding
> bits. It would not be safe to memcpy between e.g. __int20 and int32_t,
> even though sizeof(__int20) == sizeof(int32_t). A new trait is
> introduced to define the width, __memcpyable_integer, and then the
> __memcpyable trait compares the widths.
>
> It's safe to copy between signed and unsigned integers of the same
> width, because GCC only supports two's complement integers.
>
> We can also add the specialization __memcpyable_integer to enable
> copying between narrow character types and std::byte.

Actually we can't enable memcpy for std::copy doing byte<->char,
because the assignment that std::copy is supposed to use would be
ill-formed.

So even if the __memcpyable trait says it's OK, it won't compile.


>
> libstdc++-v3/ChangeLog:
>
> PR libstdc++/93059
> * include/bits/cpp_type_traits.h (__memcpyable): Add partial
> specialization for pointers to distinct types.
> (__memcpyable_integer): New trait to control which types can use
> cross-type memcpy optimizations.
> ---
>  libstdc++-v3/include/bits/cpp_type_traits.h | 88 -
>  1 file changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
> b/libstdc++-v3/include/bits/cpp_type_traits.h
> index 19bf1edf647..8d386a36e62 100644
> --- a/libstdc++-v3/include/bits/cpp_type_traits.h
> +++ b/libstdc++-v3/include/bits/cpp_type_traits.h
> @@ -414,7 +414,7 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
>typedef __true_type __type;
>  };
>
> -#if __cplusplus >= 201703L
> +#ifdef __glibcxx_byte // C++ >= 17
>enum class byte : unsigned char;
>
>template<>
> @@ -434,8 +434,6 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
>  };
>  #endif
>
> -  template struct iterator_traits;
> -
>// A type that is safe for use with memcpy, memmove, memcmp etc.
>template
>  struct __is_nonvolatile_trivially_copyable
> @@ -459,16 +457,100 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
>enum { __value = 0 };
>  };
>
> +  // Allow memcpy when source and destination are pointers to the same type.
>template
>  struct __memcpyable<_Tp*, _Tp*>
>  : __is_nonvolatile_trivially_copyable<_Tp>
>  { };
>
> +  // Source pointer can be const.
>template
>  struct __memcpyable<_Tp*, const _Tp*>
>  : __is_nonvolatile_trivially_copyable<_Tp>
>  { };
>
> +  template struct __memcpyable_integer;
> +
> +  // For heterogeneous types, allow memcpy between equal-sized integers.
> +  template
> +struct __memcpyable<_Tp*, _Up*>
> +{
> +  enum {
> +   __value = __memcpyable_integer<_Tp>::__width != 0
> +   && ((int)__memcpyable_integer<_Tp>::__width
> + == (int)__memcpyable_integer<_Up>::__width)
> +  };
> +};
> +
> +  // Specialization for const U* because __is_integer is never true.
> +  template
> +struct __memcpyable<_Tp*, const _Up*>
> +: __memcpyable<_Tp*, _Up*>
> +{ };
> +
> +  template
> +struct __memcpyable_integer
> +{
> +  enum {
> +   __width = __is_integer<_Tp>::__value ? (sizeof(_Tp) * __CHAR_BIT__) : > 0
> +  };
> +};
> +
> +  // Cannot memcpy volatile memory.
> +  template
> +struct __memcpyable_integer
> +{ enum { __width = 0 }; };
> +
> +#ifdef __glibcxx_byte // C++ >= 17
> +  // std::byte is not an integer, but is safe to memcpy to/from char.
> +  template<>
> +struct __memcpyable_integer
> +{ enum { __width = __CHAR_BIT__ }; };
> +#endif
> +
> +  // Specializations for __intNN types with padding bits.
> +#if defined __GLIBCXX_TYPE_INT_N_0 && __GLIBCXX_BITSIZE_INT_N_0 % 
> __CHAR_BIT__
> +  template<>
> +struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_0>
> +{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
> +  template<>
> +struct __memcpyable_integer
> +{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
> +#endif
> +#if defined __GLIBCXX_TYPE_INT_N_1 && __GLIBCXX_BITSIZE_INT_N_1 % 
> __CHAR_BIT__
> +  template<>
> +struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_1>
> +{ enum { __width = __GLIBCXX_BITSIZE_INT_N_1 }; };
> +  templa

Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector reductions

2024-10-11 Thread Jeff Law




On 10/11/24 4:08 AM, Richard Sandiford wrote:

Tamar Christina  writes:

-Original Message-
From: Richard Biener 
Sent: Friday, October 11, 2024 7:52 AM
To: Richard Sandiford 
Cc: Jennifer Schmitz ; gcc-patches@gcc.gnu.org; Richard
Earnshaw ; Kyrylo Tkachov
; Tamar Christina 
Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
reductions

On Thu, 10 Oct 2024, Richard Sandiford wrote:


Jennifer Schmitz  writes:

This patch implements the optabs reduc_and_scal_,
reduc_ior_scal_, and reduc_xor_scal_ for ASIMD modes V8QI,
V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise

logical

vector reduction operations.
Previously, either only vector registers or only general purpose registers (GPR)
were used. Now, vector registers are used for the reduction from 128 to 64

bits;

64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are

used

for the rest of the reduction steps.

For example, the test case (V8HI)
int16_t foo (int16_t *a)
{
   int16_t b = -1;
   for (int i = 0; i < 8; ++i)
 b &= a[i];
   return b;
}

was previously compiled to (-O2):
foo:
ldr q0, [x0]
moviv30.4s, 0
ext v29.16b, v0.16b, v30.16b, #8
and v29.16b, v29.16b, v0.16b
ext v31.16b, v29.16b, v30.16b, #4
and v31.16b, v31.16b, v29.16b
ext v30.16b, v31.16b, v30.16b, #2
and v30.16b, v30.16b, v31.16b
umovw0, v30.h[0]
ret

With patch, it is compiled to:
foo:
ldr q31, [x0]
ext v30.16b, v31.16b, v31.16b, #8
and v31.8b, v30.8b, v31.8b
fmovx0, d31
and x0, x0, x0, lsr 32
and w0, w0, w0, lsr 16
ret

For modes V4SI and V2DI, the pattern was not implemented, because the
current codegen (using only base instructions) is already efficient.

Note that the PR initially suggested to use SVE reduction ops. However,
they have higher latency than the proposed sequence, which is why using
neon and base instructions is preferable.

Test cases were added for 8/16-bit integers for all implemented modes and all
three operations to check the produced assembly.

We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
because for aarch64 vector types, either the logical reduction optabs are
implemented or the codegen for reduction operations is good as it is.
This was motivated by failure of a scan-tree-dump directive in the test cases
gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no

regression.

OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR target/113816
* config/aarch64/aarch64-simd.md (reduc__scal_):
Implement for logical bitwise operations for VDQV_E.

gcc/testsuite/
PR target/113816
* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
* gcc.target/aarch64/simd/logical_reduc.c: New test.
* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
---
  gcc/config/aarch64/aarch64-simd.md|  55 +
  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++
  .../gcc.target/aarch64/vect-reduc-or_1.c  |   2 +-
  gcc/testsuite/lib/target-supports.exp |   4 +-
  4 files changed, 267 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c

diff --git a/gcc/config/aarch64/aarch64-simd.md

b/gcc/config/aarch64/aarch64-simd.md

index 23c03a96371..00286b8b020 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3608,6 +3608,61 @@
}
  )

+;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
+;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
+;; on the two halves of the input.
+;; If the input has 128 bits, the first operation is performed in vector
+;; registers.  From 64 bits down, the reduction steps are performed in general
+;; purpose registers.
+;; For example, for V8HI and operation AND, the intended sequence is:
+;; EXT  v1.16b, v0.16b, v0.16b, #8
+;; AND  v0.8b, v1.8b, v0.8b
+;; FMOV x0, d0
+;; AND  x0, x0, x0, lsr 32
+;; AND  w0, w0, w0, lsr 16
+;;
+;; For V8QI and operation AND, the sequence is:
+;; AND  x0, x0, x0, lsr 32
+;; AND  w0, w0, w0, lsr 16
+;; AND  w0, w0, w0, lsr 8
+
+(define_expand "reduc_<optab>_scal_<mode>"
+ [(match_operand: 0 "register_operand")
+  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
+  "TARGET_SIMD"
+  {
+rtx dst = operands[1];
+rtx tdi = gen_reg_rtx (DImode);
+rtx tsi = lowpart_subreg (SImode, tdi, DImode);
+rtx op1_lo;
+if (known_eq (GET_MODE_SIZE (mode), 16))
+  {
+   rtx t0 = gen_reg_rtx (mode);
+   rtx t1 = gen_reg_rtx (DImode);
+   rtx t2 = gen_reg_rtx (DImode);
+   rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (mode));
+   emit_insn (gen_aarc

Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

2024-10-11 Thread Richard Biener
On Fri, Oct 11, 2024 at 11:44 AM Li, Pan2  wrote:
>
> Thanks Richard for reviewing and comments.
>
> > I wonder since we now can match many different variants of writing
> > signed and unsigned
> > saturation add and sub whether it makes sense to canonicalize to the 
> > "cheapest"
> > variant when the target doesn't support .SAT_SUB/ADD?
>
> I think it is a good point. But sorry, not sure if I get the point here. Like 
> what is the purpose of
> the "cheapest" variant regardless of target support it or not. You mean for a 
> "cheapest" variant
> we can expand it in the middle end? Instead of leave it to the target.

Yes.  The different variants seem to have different complexity and generic
expansion might prefer one or another version.  So I wasn't suggesting to
expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
to the cheapest (smallest) variant.

> > Are there any
> > "sub-patterns"
> > not forming the full saturation add/sub that can be
> > simplified/canonicalized in such
> > way maybe?
>
> Yes, you are right. There will be some common sub-pattern for so many 
> saturation alu variants.
> Like x < 0 ? MIN : MAX. I plan to refine this part after all saturation alu 
> are supported
> (to make sure we have full picture).

Yeah, having a full picture is good.

Richard.

> Pan
>
> -Original Message-
> From: Richard Biener 
> Sent: Friday, October 11, 2024 5:10 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; tamar.christ...@arm.com; juzhe.zh...@rivai.ai; 
> kito.ch...@gmail.com; jeffreya...@gmail.com; rdapp@gmail.com
> Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer 
> SAT_SUB
>
> On Fri, Oct 11, 2024 at 8:24 AM  wrote:
> >
> > From: Pan Li 
> >
> > This patch would like to support the form 1 of the vector signed
> > integer SAT_SUB.  Aka below example:
> >
> > Form 1:
> >   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX) \
> >   void __attribute__((noinline))   \
> >   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
> >   {\
> > unsigned i;\
> > for (i = 0; i < limit; i++)\
> >   {\
> > T x = op_1[i]; \
> > T y = op_2[i]; \
> > T minus = (UT)x - (UT)y;   \
> > out[i] = (x ^ y) >= 0  \
> >   ? minus  \
> >   : (minus ^ x) >= 0   \
> > ? minus\
> > : x < 0 ? MIN : MAX;   \
> >   }\
> >   }
> >
> > DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
> >
> > Before this patch:
> >   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
> >   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... 
> > }, _108, 0);
> >   93   │   _69 = vect_x_16.11_80 >> 7;
> >   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR > char>(vect_x_16.11_80);
> >   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, 
> > ... }, _108, 0);
> >   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
> >   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
> >   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR > char>(vect_y_18.15_85);
> >   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
> >  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR > char>(vect__6.17_87);
> >  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
> >  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
> >  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
> >  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, 
> > vect_minus_19.18_88);
> >  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
> >  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
> >  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
> >  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
> >  109   │   ivtmp_107 = ivtmp_106 - _108;
> >
> > After this patch:
> >   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
> >   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... 
> > }, _102, 0);
> >   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, 
> > ... }, _102, 0);
> >   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
> >   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, 
> > vect_patt_38.15_94);
> >   86 

Re: [PATCH 1/2] PR 117048: simplify-rtx: Extend (x << C1) | (X >> C2) --> ROTATE transformation to vector operands

2024-10-11 Thread Richard Sandiford
Kyrylo Tkachov  writes:
> Hi all,
>
> In the testcase from patch [2/2] we want to match a vector rotate operation 
> from
> an IOR of left and right shifts by immediate.  simplify-rtx has code for just
> that but it looks like it's prepared to handle only scalar operands.
> In practice most of the code works for vector modes as well except the shift
> amounts are checked to be CONST_INT rather than vector constants that we have
> here.  This is easily extended by using unwrap_const_vec_duplicate to extract
> the repeating constant shift amount.

FWIW, shifting a vector by a scalar is valid rtl (at least AIUI), so the
current code does handle that case.  But I agree it's missing shifting a
vector by a vector.

I suppose a fancy version would be to check the rotate condition for each
individual element of the vector shift amount.  Checking the duplicate
case is definitely a good (and strict) improvement over the status quo
though.

> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index e8e60404ef6..7ff14594daa 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -3477,12 +3477,16 @@ simplify_context::simplify_binary_operation_1 
> (rtx_code code,
>   }
>  
>if (GET_CODE (opleft) == ASHIFT && GET_CODE (opright) == LSHIFTRT
> -  && rtx_equal_p (XEXP (opleft, 0), XEXP (opright, 0))
> -  && CONST_INT_P (XEXP (opleft, 1))
> -  && CONST_INT_P (XEXP (opright, 1))
> -  && (INTVAL (XEXP (opleft, 1)) + INTVAL (XEXP (opright, 1))
> +   && rtx_equal_p (XEXP (opleft, 0), XEXP (opright, 0)))
> + {
> +   rtx leftcst = unwrap_const_vec_duplicate (XEXP (opleft, 1));
> +   rtx rightcst = unwrap_const_vec_duplicate (XEXP (opright, 1));
> +
> +   if (CONST_INT_P (leftcst) && CONST_INT_P (rightcst)
> +   && (INTVAL (leftcst) + INTVAL (rightcst)
> == GET_MODE_UNIT_PRECISION (mode)))

Nit: looks like some reindentation might be missing here.

> -return gen_rtx_ROTATE (mode, XEXP (opright, 0), XEXP (opleft, 1));
> + return gen_rtx_ROTATE (mode, XEXP (opright, 0), XEXP (opleft, 1));
> + }

Looks good.  So referring back to the above, vector shifts will retain a
scalar shift amount if they started with a scalar shift amount, and a
vector shift amount if they started with a vector shift amount.

OK with formatting tweak, thanks.

Richard

>  
>/* Same, but for ashift that has been "simplified" to a wider mode
>  by simplify_shift_const.  */


Re: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction

2024-10-11 Thread Richard Sandiford
Kyrylo Tkachov  writes:
> The pattern for the Advanced SIMD XAR instruction isn't very
> optimization-friendly at the moment.
> In the testcase from the PR once simplify-rtx has done its work it
> generates the RTL:
> (set (reg:V2DI 119 [ _14 ])
> (rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
> (reg:V2DI 116 [ *m1_01_8(D) ]))
> (const_vector:V2DI [
> (const_int 32 [0x20]) repeated x2
> ])))
>
> which fails to match our XAR pattern because the pattern expects:
> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
> documentation the preferred form of rotate-by-immediate is ROTATE, which
> I take to mean it's the canonical form.
> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
> one canonical representation.
> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
> constant.

Following on from the 1/2 review, I'm surprised that the middle end
requires a vector.  I would have expected a scalar shift to work.

I agree it should be rotate rather than rotatert though.  Out of curiosity,
where do things go wrong if we just fix that, but keep the scalar shift
amount?

No objection to switching to vectors in principle though, especially if it
matches what we do elsewhere.

Thanks,
Richard

>
> These issues are fixed by introducing a dedicated expander for the
> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
> a repeated vector constant subtracted from 64 to give the corresponding
> left-rotate amount that is fed to the new representation for the XAR
> define_insn that uses the ROTATE RTL code.  This is a similar approach
> to how we handle the discrepancy between intrinsic-level and RTL-level
> vector lane numbers for big-endian.
>
> With this patch and [1/2] the arithmetic parts of the testcase now simplify
> to just one XAR instruction.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
> I’ll push it after patch approval of [1/2] leaving some time for comments.
>
> I’ll note that the SVE2 patterns for XAR should also be improved in a similar
> way, but that is a separate patch.
>
> Thanks,
> Kyrill 
>
> Signed-off-by: Kyrylo Tkachov 
>
> gcc/
>   PR target/117048
>   * config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
>   define_expand.
>   (*aarch64_xarqv2di_insn): Define.
>
> gcc/testsuite/
>   PR target/117048
>   * g++.target/aarch64/pr117048.C: New test.
>
> From 4f699bf239a563a05e88da5958c44a643718852c Mon Sep 17 00:00:00 2001
> From: Kyrylo Tkachov 
> Date: Wed, 9 Oct 2024 09:40:33 -0700
> Subject: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and
>  optimization-friendly representation for XAR instruction
>
> The pattern for the Advanced SIMD XAR instruction isn't very
> optimization-friendly at the moment.
> In the testcase from the PR once simplify-rtx has done its work it
> generates the RTL:
> (set (reg:V2DI 119 [ _14 ])
> (rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
> (reg:V2DI 116 [ *m1_01_8(D) ]))
> (const_vector:V2DI [
> (const_int 32 [0x20]) repeated x2
> ])))
>
> which fails to match our XAR pattern because the pattern expects:
> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
> documentation the preferred form of rotate-by-immediate is ROTATE, which
> I take to mean it's the canonical form.
> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
> one canonical representation.
> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
> constant.
>
> These issues are fixed by introducing a dedicated expander for the
> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
> a repeated vector constant subtracted from 64 to give the corresponding
> left-rotate amount that is fed to the new representation for the XAR
> define_insn that uses the ROTATE RTL code.  This is a similar approach
> to how we handle the discrepancy between intrinsic-level and RTL-level
> vector lane numbers for big-endian.
>
> With this patch and [1/2] the arithmetic parts of the testcase now simplify
> to just one XAR instruction.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Kyrylo Tkachov 
>
> gcc/
>   PR target/117048
>   * config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
>   define_expand.
>   (*aarch64_xarqv2di_insn): Define.
>
> gcc/testsuite/
>   PR target/117048
>   * g++.target/aarch64/pr117048.C: New test.
> ---
>  gcc/config/aarch64/aarch64-simd.md  | 33 +---
>  gcc/testsuite/g++.target/aarch64/pr117048.C | 34 +
>  2 files changed, 63 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/g+

Re: [PATCH] c: Implement C2Y N3355 - Named Loops [PR117022]

2024-10-11 Thread Joseph Myers
There should definitely be a test that -std=c23 -pedantic-errors gives 
errors for these constructs (I'd say also test that -std=c23 
-pedantic-errors -Wno-c23-c2y-compat doesn't diagnose them, while -std=c2y 
-Wc23-c2y-compat does).  Not yet reviewed the rest of the patch.

-- 
Joseph S. Myers
josmy...@redhat.com



RE: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

2024-10-11 Thread Li, Pan2
Thanks Richard for reviewing and comments.

> I wonder since we now can match many different variants of writing
> signed and unsigned
> saturation add and sub whether it makes sense to canonicalize to the 
> "cheapest"
> variant when the target doesn't support .SAT_SUB/ADD?  

I think it is a good point. But sorry, not sure if I get the point here. Like 
what is the purpose of 
the "cheapest" variant regardless of target support it or not. You mean for a 
"cheapest" variant
we can expand it in the middle end? Instead of leave it to the target.

> Are there any
> "sub-patterns"
> not forming the full saturation add/sub that can be
> simplified/canonicalized in such
> way maybe?

Yes, you are right. There will be some common sub-pattern for so many 
saturation alu variants.
Like x < 0 ? MIN : MAX. I plan to refine this part after all saturation alu are 
supported
(to make sure we have full picture).

Pan

-Original Message-
From: Richard Biener  
Sent: Friday, October 11, 2024 5:10 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; tamar.christ...@arm.com; juzhe.zh...@rivai.ai; 
kito.ch...@gmail.com; jeffreya...@gmail.com; rdapp@gmail.com
Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer 
SAT_SUB

On Fri, Oct 11, 2024 at 8:24 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to support the form 1 of the vector signed
> integer SAT_SUB.  Aka below example:
>
> Form 1:
>   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX) \
>   void __attribute__((noinline))   \
>   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
>   {\
> unsigned i;\
> for (i = 0; i < limit; i++)\
>   {\
> T x = op_1[i]; \
> T y = op_2[i]; \
> T minus = (UT)x - (UT)y;   \
> out[i] = (x ^ y) >= 0  \
>   ? minus  \
>   : (minus ^ x) >= 0   \
> ? minus\
> : x < 0 ? MIN : MAX;   \
>   }\
>   }
>
> DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
>   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... 
> }, _108, 0);
>   93   │   _69 = vect_x_16.11_80 >> 7;
>   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR char>(vect_x_16.11_80);
>   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... 
> }, _108, 0);
>   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
>   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
>   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR char>(vect_y_18.15_85);
>   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
>  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR char>(vect__6.17_87);
>  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
>  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
>  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
>  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, 
> vect_minus_19.18_88);
>  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
>  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
>  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
>  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
>  109   │   ivtmp_107 = ivtmp_106 - _108;
>
> After this patch:
>   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
>   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... 
> }, _102, 0);
>   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... 
> }, _102, 0);
>   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
>   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, 
> vect_patt_38.15_94);
>   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
>   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
>   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
>   89   │   ivtmp_101 = ivtmp_100 - _102;
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

I wonder since we now can match many different variants of writing
signed and unsigned
saturation add and sub whether it makes sense to canonicalize to the "cheapest"
variant when the target d

Re: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and optimization-friendly representation for XAR instruction

2024-10-11 Thread Kyrylo Tkachov


> On 11 Oct 2024, at 12:36, Richard Sandiford  wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Kyrylo Tkachov  writes:
>> The pattern for the Advanced SIMD XAR instruction isn't very
>> optimization-friendly at the moment.
>> In the testcase from the PR once simplify-rtx has done its work it
>> generates the RTL:
>> (set (reg:V2DI 119 [ _14 ])
>>(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
>>(reg:V2DI 116 [ *m1_01_8(D) ]))
>>(const_vector:V2DI [
>>(const_int 32 [0x20]) repeated x2
>>])))
>> 
>> which fails to match our XAR pattern because the pattern expects:
>> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
>> documentation the preferred form of rotate-by-immediate is ROTATE, which
>> I take to mean it's the canonical form.
>> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
>> one canonical representation.
>> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
>> constant.
> 
> Following on from the 1/2 review, I'm surprised that the middle end
> requires a vector.  I would have expected a scalar shift to work.
> 
> I agree it should be rotate rather than rotatert though.  Out of curiosity,
> where do things go wrong if we just fix that, but keep the scalar shift
> amount?

The vector constant comes out of the test case using intrinsics such as 
vshlq_u64 that take a vector as a shift amount.
Our pattern for vector shift by immediate 
aarch64_simd_imm_shl expresses the shift amount
as a vector so I suppose it all comes to that.
The standard ashl<mode>3 expander does take a scalar shift amount but 
explicitly creates a vector constant for the RTL passes.
So it seems that we are de facto standardized on using vectors.
Naively, I’d hope recog would try both forms and save us the trouble of 
worrying about it, but I think we’ve been reluctant to complicate recog that 
way in the past.

> 
> No objection to switching to vectors in principle though, especially if it
> matches what we do elsewhere.

Thanks, I’ll adjust patch 1/2 in the meantime
Kyrill

> 
> Thanks,
> Richard
> 
>> 
>> These issues are fixed by introducing a dedicated expander for the
>> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
>> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
>> a repeated vector constant subtracted from 64 to give the corresponding
>> left-rotate amount that is fed to the new representation for the XAR
>> define_insn that uses the ROTATE RTL code.  This is a similar approach
>> to how we handle the discrepancy between intrinsic-level and RTL-level
>> vector lane numbers for big-endian.
>> 
>> With this patch and [1/2] the arithmetic parts of the testcase now simplify
>> to just one XAR instruction.
>> 
>> Bootstrapped and tested on aarch64-none-linux-gnu.
>> I’ll push it after patch approval of [1/2] leaving some time for comments.
>> 
>> I’ll note that the SVE2 patterns for XAR should also be improved in a similar
>> way, but that is a separate patch.
>> 
>> Thanks,
>> Kyrill
>> 
>> Signed-off-by: Kyrylo Tkachov 
>> 
>> gcc/
>>  PR target/117048
>>  * config/aarch64/aarch64-simd.md (aarch64_xarqv2di): Redefine into a
>>  define_expand.
>>  (*aarch64_xarqv2di_insn): Define.
>> 
>> gcc/testsuite/
>>  PR target/117048
>>  * g++.target/aarch64/pr117048.C: New test.
>> 
>> From 4f699bf239a563a05e88da5958c44a643718852c Mon Sep 17 00:00:00 2001
>> From: Kyrylo Tkachov 
>> Date: Wed, 9 Oct 2024 09:40:33 -0700
>> Subject: [PATCH 2/2] PR target/117048 aarch64: Use more canonical and
>> optimization-friendly representation for XAR instruction
>> 
>> The pattern for the Advanced SIMD XAR instruction isn't very
>> optimization-friendly at the moment.
>> In the testcase from the PR once simplify-rtx has done its work it
>> generates the RTL:
>> (set (reg:V2DI 119 [ _14 ])
>>(rotate:V2DI (xor:V2DI (reg:V2DI 114 [ vect__1.12_16 ])
>>(reg:V2DI 116 [ *m1_01_8(D) ]))
>>(const_vector:V2DI [
>>(const_int 32 [0x20]) repeated x2
>>])))
>> 
>> which fails to match our XAR pattern because the pattern expects:
>> 1) A ROTATERT instead of the ROTATE.  However, according to the RTL ops
>> documentation the preferred form of rotate-by-immediate is ROTATE, which
>> I take to mean it's the canonical form.
>> ROTATE (x, C) <-> ROTATERT (x, MODE_WIDTH - C) so it's better to match just
>> one canonical representation.
>> 2) A CONST_INT shift amount whereas the midend asks for a repeated vector
>> constant.
>> 
>> These issues are fixed by introducing a dedicated expander for the
>> aarch64_xarqv2di name, needed by the arm_neon.h intrinsic, that translate
>> the intrinsic-level CONST_INT immediate (the right-rotate amount) into
>> a repeated vector constant subtracted from 64 to give the corresponding
>> left-rotate amount that is fed to the new representation for the XA

[PATCH v3 4/5] c++/modules: Check linkage for exported declarations

2024-10-11 Thread Nathaniel Shead
Updated to use 'unnamed namespace' instead, added some more details to
the internal linkage namespace diagnostic.

-- >8 --

By [module.interface] p3, if an exported declaration is not within a
header unit, it shall not declare a name with internal linkage.

Unfortunately we cannot just do this within set_originating_module,
since at the locations its called the linkage for declarations are not
always fully determined yet.  We could move the calls but this causes
the checking assertion to fail as the originating module declaration may
have moved, and in general for some kinds of declarations it's not
always obvious where it should be moved to.

This patch instead introduces a new function to check that the linkage
of a declaration within a module is correct, to be called for all
declarations once their linkage is fully determined.

As a drive-by fix this patch also improves the source location of
namespace aliases to point at the identifier rather than the terminating
semicolon.

gcc/cp/ChangeLog:

* cp-tree.h (check_module_decl_linkage): Declare.
* decl2.cc (finish_static_data_member_decl): Check linkage.
* module.cc (set_originating_module): Adjust comment.
(check_module_decl_linkage): New function.
* name-lookup.cc (do_namespace_alias): Build alias with
specified location, check linkage.
(pushtag): Check linkage.
(push_namespace): Slightly clarify error message.
* name-lookup.h (do_namespace_alias): Add location parameter.
* parser.cc (cp_parser_namespace_alias_definition): Pass
identifier location to do_namespace_alias.
(cp_parser_alias_declaration): Check linkage.
(cp_parser_init_declarator): Check linkage.
(cp_parser_function_definition_after_declarator): Check linkage.
(cp_parser_save_member_function_body): Check linkage.
* pt.cc (finish_concept_definition): Mark as public, check
linkage.

libcc1/ChangeLog:

* libcp1plugin.cc (plugin_add_namespace_alias): Call
do_namespace_alias with input_location.

gcc/testsuite/ChangeLog:

* g++.dg/modules/export-3.C: Adjust error message.
* g++.dg/modules/export-6.C: New test.

Signed-off-by: Nathaniel Shead 
Reviewed-by: Jason Merrill 
---
 gcc/cp/cp-tree.h|  1 +
 gcc/cp/decl2.cc |  1 +
 gcc/cp/module.cc| 29 +---
 gcc/cp/name-lookup.cc   | 20 +++---
 gcc/cp/name-lookup.h|  2 +-
 gcc/cp/parser.cc|  9 ++-
 gcc/cp/pt.cc|  2 ++
 gcc/testsuite/g++.dg/modules/export-3.C |  2 +-
 gcc/testsuite/g++.dg/modules/export-6.C | 35 +
 libcc1/libcp1plugin.cc  |  2 +-
 10 files changed, 92 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/modules/export-6.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index a71d0704f8b..f57a9b0b6ce 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7456,6 +7456,7 @@ extern void set_originating_module (tree, bool friend_p = 
false);
 extern tree get_originating_module_decl (tree) ATTRIBUTE_PURE;
 extern int get_originating_module (tree, bool for_mangle = false) 
ATTRIBUTE_PURE;
 extern unsigned get_importing_module (tree, bool = false) ATTRIBUTE_PURE;
+extern void check_module_decl_linkage (tree);
 
 /* Where current instance of the decl got declared/defined/instantiated.  */
 extern void set_instantiating_module (tree);
diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 0279372488c..97ce4473b1c 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -1019,6 +1019,7 @@ finish_static_data_member_decl (tree decl,
 }
 
   cp_finish_decl (decl, init, init_const_expr_p, asmspec_tree, flags);
+  check_module_decl_linkage (decl);
 }
 
 /* DECLARATOR and DECLSPECS correspond to a class member.  The other
diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index f75e211e0e1..456b73d59b6 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -19926,11 +19926,34 @@ set_originating_module (tree decl, bool friend_p 
ATTRIBUTE_UNUSED)
   DECL_MODULE_ATTACH_P (decl) = true;
 }
 
-  if (!module_exporting_p ())
+  /* It is ill-formed to export a declaration with internal linkage.  However,
+ at the point this function is called we don't yet always know whether this
+ declaration has internal linkage; instead we defer this check for callers
+ to do once visibility has been determined.  */
+  if (module_exporting_p ())
+DECL_MODULE_EXPORT_P (decl) = true;
+}
+
+/* Checks whether DECL within a module unit has valid linkage for its kind.
+   Must be called after visibility for DECL has been finalised.  */
+
+void
+check_module_decl_linkage (tree decl)
+{
+  if (!module_has_cmi_p ())
 return;
 
-  // FIXME: Check ill-formed linkage
-  DECL_MODULE_EXPORT_P (decl) = true;
+  /* An internal-linkage declaration cannot be

Re: [PATCH 4/4] c++: enable modules by default in c++20

2024-10-11 Thread Jakub Jelinek
On Wed, Oct 09, 2024 at 07:06:26PM -0400, Patrick Palka wrote:
> On Wed, 9 Oct 2024, Jason Merrill wrote:
> 
> > Tested x86_64-pc-linux-gnu, will apply to trunk with the rest of the patch
> > series.
> > 
> > -- 8< --
> > 
> > At this point there doesn't seem to be much reason not to have modules
> > support enabled by default in C++20, and it's good get more test coverage to
> > find corner case bugs like some I fixed recently.
> 
> Not sure how much we care about PCH anymore, but won't this effectively
> disable PCH in C++20 and later due to
> 
>   /* C++ modules and PCH don't play together.  */
>   if (flag_modules)
> return 2;
> 
> in c_common_valid_pch?

Is it known why those 3 lines were added there?

Is it just somebody who uses modules doesn't need PCH, modules obsolete PCH,
or some code in module.cc lacking GTY(()) markups needed for PCH
save/restore, something else?

If it is just a precaution, perhaps we should just remove it and add a few
tests, if it is known that some cases just don't work with PCH, perhaps
only return 2; if e.g. some module keywords (or anything related to it; or
whatever is known not to work with PCH) are seen in the PCH header rather
than just because -fmodules is on, that option doesn't imply one actually
uses modules, just that one could.

Jakub



Re: pair-fusion: Assume alias conflict if common address reg changes [PR116783]

2024-10-11 Thread Richard Sandiford
Alex Coplan  writes:
> Hi,
>
> As the PR shows, pair-fusion was tricking memory_modified_in_insn_p into
> returning false when a common base register (in this case, x1) was
> modified between the mem and the store insn.  This lead to wrong code as
> the accesses really did alias.
>
> To avoid this sort of problem, this patch avoids invoking RTL alias
> analysis altogether (and assume an alias conflict) if the two insns to
> be compared share a common address register R, and the insns see different
> definitions of R (i.e. it was modified in between).
>
> Bootstrapped/regtested on aarch64-linux-gnu (all languages, both regular
> bootstrap and LTO+PGO bootstrap).  OK for trunk?

Sorry for the slow review.  The patch looks good to me, but...

> @@ -2544,11 +2624,37 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
> unsigned access_size,
>  && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
>};
>  
> +  // Maximum number of distinct regnos we expect to appear in a single
> +  // MEM (and thus in a candidate insn).
> +  static constexpr int max_mem_regs = 2;
> +  auto_vec addr_use_vec[2];
> +  use_array addr_uses[2];
> +
> +  // Collect the lists of register uses that occur in the candidate MEMs.
> +  for (int i = 0; i < 2; i++)
> +{
> +  // N.B. it's safe for us to ignore uses that only occur in notes
> +  // here (e.g. in a REG_EQUIV expression) since we only pass the
> +  // MEM down to the alias machinery, so it can't see any insn-level
> +  // notes.
> +  for (auto use : insns[i]->uses ())
> + if (use->is_reg ()
> + && use->includes_address_uses ()
> + && !use->only_occurs_in_notes ())
> +   {
> + gcc_checking_assert (addr_use_vec[i].length () < max_mem_regs);
> + addr_use_vec[i].quick_push (use);

...if possible, I think it would be better to just use safe_push here,
without the assert.  There'd then be no need to split max_mem_regs out;
it could just be hard-coded in the addr_use_vec declaration.

Or does that not work for some reason?  I'm getting a sense of deja vu...

If it doesn't work, an alternative would be to use access_array_builder.

OK for trunk and backports if using safe_push works.

Thanks,
Richard

> +   }
> +  addr_uses[i] = use_array (addr_use_vec[i]);
> +}
> +


>store_walker
> -forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
> +forward_store_walker (mem_defs[0], cand_mems[0], addr_uses[0], insns[1],
> +   tombstone_p);
>  
>store_walker
> -backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
> +backward_store_walker (mem_defs[1], cand_mems[1], addr_uses[1], insns[0],
> +tombstone_p);
>  
>alias_walker *walkers[4] = {};
>if (mem_defs[0])
> @@ -2562,8 +2668,10 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
> unsigned access_size,
>  {
>// We want to find any loads hanging off the first store.
>mem_defs[0] = memory_access (insns[0]->defs ());
> -  load_walker forward_load_walker (mem_defs[0], insns[0], 
> insns[1]);
> -  load_walker backward_load_walker (mem_defs[1], insns[1], 
> insns[0]);
> +  load_walker forward_load_walker (mem_defs[0], insns[0],
> +   addr_uses[0], insns[1]);
> +  load_walker backward_load_walker (mem_defs[1], insns[1],
> +   addr_uses[1], insns[0]);
>walkers[2] = &forward_load_walker;
>walkers[3] = &backward_load_walker;
>m_pass->do_alias_analysis (alias_hazards, walkers, load_p);
> diff --git a/gcc/testsuite/g++.dg/torture/pr116783.C 
> b/gcc/testsuite/g++.dg/torture/pr116783.C
> new file mode 100644
> index 000..6d59159459d
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/torture/pr116783.C
> @@ -0,0 +1,98 @@
> +// { dg-do run }
> +// { dg-additional-options "-fstack-protector-strong 
> -fno-late-combine-instructions" }
> +// { dg-require-effective-target fstack_protector }
> +// { dg-require-effective-target c++11 }
> +
> +struct Private {
> +  char data[24]{};
> +  long moved_from : 4;
> +  Private() : moved_from (0) {}
> +};
> +
> +struct QVariant {
> +  __attribute__((noipa))
> +  ~QVariant() {
> +if (!d.moved_from && d.data[0] != 42)
> +  __builtin_abort ();
> +  }
> +  __attribute__((noipa))
> +  QVariant() {
> +d.data[0] = 42;
> +  }
> +  __attribute__((noipa))
> +  QVariant(QVariant &other) : d(other.d) {}
> +  QVariant(QVariant &&other) : d(other.d) {
> +other.d = Private();
> +other.d.moved_from = true;
> +  }
> +  QVariant &operator=(QVariant);
> +  Private d;
> +};
> +
> +QVariant id (QVariant v) { return v; }
> +QVariant &QVariant::operator=(QVariant other)
> +{
> +  id(other);
> +  return *this;
> +}
> +
> +template  struct QList {
> +  T d;
> +  struct const_iterator {
> +T *ptr;
> +T &operator*() { return *ptr; }
> +__attribute__((noipa))
> +bool operato

[PATCH] [RFC] target/117072 - more RTL FMA canonicalization

2024-10-11 Thread Richard Biener
The following helps the x86 backend by canonicalizing FMAs to have
any negation done to one of the commutative multiplication operands
be done to a register (and not a memory operand).  Likewise to
put a register operand first and a memory operand second;
swap_commutative_operands_p seems to treat REG_P and MEM_P the
same but comments indicate "complex expressions should be first".

In particular this does (fma MEM REG REG) -> (fma REG MEM REG) and
(fma (neg MEM) REG REG) -> (fma (neg REG) MEM REG) which are the
reasons for the testsuite regressions in gcc.target/i386/cond_op_fma*.c

Bootstrapped and tested on x86_64-unknown-linux-gnu.

I'm not quite sure this is the correct approach - simplify-rtx
doesn't seem to do "only canonicalization" but the existing FMA
case looks odd in that context.

Should the target simply reject cases with wrong "canonicalization"
or does it need to cope with all variants in the patterns that fail
matching during combine without the change?

Thanks,
Richard.

PR target/117072
* simplify-rtx.cc (simplify_context::simplify_ternary_operation):
Adjust FMA canonicalization.
---
 gcc/simplify-rtx.cc | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index e8e60404ef6..8b4fa0d7aa4 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -6830,10 +6830,21 @@ simplify_context::simplify_ternary_operation (rtx_code 
code, machine_mode mode,
op0 = tem, op1 = XEXP (op1, 0), any_change = true;
}
 
-  /* Canonicalize the two multiplication operands.  */
+  /* Canonicalize the two multiplication operands.  A negation
+should go first and if possible the negation should be
+to a register.  */
   /* a * -b + c  =>  -b * a + c.  */
-  if (swap_commutative_operands_p (op0, op1))
+  if (swap_commutative_operands_p (op0, op1)
+ || (REG_P (op1) && !REG_P (op0) && GET_CODE (op0) != NEG))
std::swap (op0, op1), any_change = true;
+  else if (GET_CODE (op0) == NEG && !REG_P (XEXP (op0, 0))
+  && REG_P (op1))
+   {
+ op0 = XEXP (op0, 0);
+ op1 = simplify_gen_unary (NEG, mode, op1, mode);
+ std::swap (op0, op1);
+ any_change = true;
+   }
 
   if (any_change)
return gen_rtx_FMA (mode, op0, op1, op2);
-- 
2.43.0


Re: pair-fusion: Assume alias conflict if common address reg changes [PR116783]

2024-10-11 Thread Richard Biener
On Fri, 11 Oct 2024, Richard Sandiford wrote:

> Alex Coplan  writes:
> > Hi,
> >
> > As the PR shows, pair-fusion was tricking memory_modified_in_insn_p into
> > returning false when a common base register (in this case, x1) was
> > modified between the mem and the store insn.  This lead to wrong code as
> > the accesses really did alias.
> >
> > To avoid this sort of problem, this patch avoids invoking RTL alias
> > analysis altogether (and assume an alias conflict) if the two insns to
> > be compared share a common address register R, and the insns see different
> > definitions of R (i.e. it was modified in between).
> >
> > Bootstrapped/regtested on aarch64-linux-gnu (all languages, both regular
> > bootstrap and LTO+PGO bootstrap).  OK for trunk?
> 
> Sorry for the slow review.  The patch looks good to me, but...
> 
> > @@ -2544,11 +2624,37 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
> > unsigned access_size,
> >&& bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
> >};
> >  
> > +  // Maximum number of distinct regnos we expect to appear in a single
> > +  // MEM (and thus in a candidate insn).
> > +  static constexpr int max_mem_regs = 2;
> > +  auto_vec addr_use_vec[2];
> > +  use_array addr_uses[2];
> > +
> > +  // Collect the lists of register uses that occur in the candidate MEMs.
> > +  for (int i = 0; i < 2; i++)
> > +{
> > +  // N.B. it's safe for us to ignore uses that only occur in notes
> > +  // here (e.g. in a REG_EQUIV expression) since we only pass the
> > +  // MEM down to the alias machinery, so it can't see any insn-level
> > +  // notes.
> > +  for (auto use : insns[i]->uses ())
> > +   if (use->is_reg ()
> > +   && use->includes_address_uses ()
> > +   && !use->only_occurs_in_notes ())
> > + {
> > +   gcc_checking_assert (addr_use_vec[i].length () < max_mem_regs);
> > +   addr_use_vec[i].quick_push (use);
> 
> ...if possible, I think it would be better to just use safe_push here,
> without the assert.  There'd then be no need to split max_mem_regs out;
> it could just be hard-coded in the addr_use_vec declaration.
> 
> Or does that not work for some reason?  I'm getting a sense of deja vu...

safe_push should work but as I understand the desire is to rely
on fully on-stack pre-allocated vectors?

> If it doesn't work, an alternative would be to use access_array_builder.
> 
> OK for trunk and backports if using safe_push works.
> 
> Thanks,
> Richard
> 
> > + }
> > +  addr_uses[i] = use_array (addr_use_vec[i]);
> > +}
> > +
> 
> 
> >store_walker
> > -forward_store_walker (mem_defs[0], cand_mems[0], insns[1], 
> > tombstone_p);
> > +forward_store_walker (mem_defs[0], cand_mems[0], addr_uses[0], 
> > insns[1],
> > + tombstone_p);
> >  
> >store_walker
> > -backward_store_walker (mem_defs[1], cand_mems[1], insns[0], 
> > tombstone_p);
> > +backward_store_walker (mem_defs[1], cand_mems[1], addr_uses[1], 
> > insns[0],
> > +  tombstone_p);
> >  
> >alias_walker *walkers[4] = {};
> >if (mem_defs[0])
> > @@ -2562,8 +2668,10 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
> > unsigned access_size,
> >  {
> >// We want to find any loads hanging off the first store.
> >mem_defs[0] = memory_access (insns[0]->defs ());
> > -  load_walker forward_load_walker (mem_defs[0], insns[0], 
> > insns[1]);
> > -  load_walker backward_load_walker (mem_defs[1], insns[1], 
> > insns[0]);
> > +  load_walker forward_load_walker (mem_defs[0], insns[0],
> > + addr_uses[0], insns[1]);
> > +  load_walker backward_load_walker (mem_defs[1], insns[1],
> > + addr_uses[1], insns[0]);
> >walkers[2] = &forward_load_walker;
> >walkers[3] = &backward_load_walker;
> >m_pass->do_alias_analysis (alias_hazards, walkers, load_p);
> > diff --git a/gcc/testsuite/g++.dg/torture/pr116783.C 
> > b/gcc/testsuite/g++.dg/torture/pr116783.C
> > new file mode 100644
> > index 000..6d59159459d
> > --- /dev/null
> > +++ b/gcc/testsuite/g++.dg/torture/pr116783.C
> > @@ -0,0 +1,98 @@
> > +// { dg-do run }
> > +// { dg-additional-options "-fstack-protector-strong 
> > -fno-late-combine-instructions" }
> > +// { dg-require-effective-target fstack_protector }
> > +// { dg-require-effective-target c++11 }
> > +
> > +struct Private {
> > +  char data[24]{};
> > +  long moved_from : 4;
> > +  Private() : moved_from (0) {}
> > +};
> > +
> > +struct QVariant {
> > +  __attribute__((noipa))
> > +  ~QVariant() {
> > +if (!d.moved_from && d.data[0] != 42)
> > +  __builtin_abort ();
> > +  }
> > +  __attribute__((noipa))
> > +  QVariant() {
> > +d.data[0] = 42;
> > +  }
> > +  __attribute__((noipa))
> > +  QVariant(QVariant &other) : d(other.d) {}
> > +  QVariant(QVariant &&other) : d(other.d) {
> > +other.d = Private();
> > +other.d.m

[Patch] Fortran: Dead-function removal in error.cc (shrinking by 40%)

2024-10-11 Thread Tobias Burnus

I always found error.cc rather confusing, but I just realized that
we can reduce the number of lines in that file by 40% - and remove a lot of
(apparent) complexity.

The removed code is from the old days, when gfortran handled a lot of
diagnostic itself, also because it wanted to show lines with carets,
while the C/C++ diagnostic did not support this.

Well, that changed and gfortran mostly converted to the common diagnostic
code, but somehow the old code remained - without actually being used.

Thus, this patch simply removes it.


I regard the change as trivial and obvious and intend to commit
it as such. Nonetheless, any comments, suggestions, or review remarks?


Tobias


PS: I also wanted to reduce code duplication, but an assert that was previously
only in one code path triggered, showing at least one case where 'locus' is
broken. Something to fix first before sending in that part ...

There are also some other changes in the pipeline:
* I want to add support for range-based locations, which is also a good
opportunity to fix some misplaced '1' (e.g. which point at white space
instead of the actual declaration or ...).

* David wants to improve json/sarif output, including stderr + sarif/json
output at the same time, but that has issues with delayed/suppressed/buffered
diagnostics in gfortran (because of the trial & error parsing* in Fortran)
→ https://gcc.gnu.org/PR116613 for the former and https://gcc.gnu.org/105916
for the buffering issue.

[(*) e.g., in fixed-form Fortran where spaces have no meaning, the question
when parsing is whether 'd o i = ...' is a 'do i =' loop or a 'doi = ' 
assignment.
If the statement ends without finding a ',' it was an assignment...
To avoid bogus errors, the diagnostic has to be suppressed at times.]
Fortran: Dead-function removal in error.cc (shrinking by 40%)

This patch removes a large number of unused static functions from error.cc,
which previously were used for diagnostic but have been replaced by the common
diagnostic code.

gcc/fortran/ChangeLog:

	* error.cc (error_char, error_string, error_uinteger, error_integer,
	error_hwuint, error_hwint, gfc_widechar_display_length,
	gfc_wide_display_length, error_printf, show_locus, show_loci):
	Remove unused static functions.
	(IBUF_LEN, MAX_ARGS): Remove now unused #define.

diff --git a/gcc/fortran/error.cc b/gcc/fortran/error.cc
index 2c29537a4ff..5165d7c4628 100644
--- a/gcc/fortran/error.cc
+++ b/gcc/fortran/error.cc
@@ -128,136 +143,6 @@ gfc_buffer_error (bool flag)
 }
 
 
-/* Add a single character to the error buffer or output depending on
-   buffered_p.  */
-
-static void
-error_char (char)
-{
-  /* FIXME: Unused function to be removed in a subsequent patch.  */
-}
-
-
-/* Copy a string to wherever it needs to go.  */
-
-static void
-error_string (const char *p)
-{
-  while (*p)
-error_char (*p++);
-}
-
-
-/* Print a formatted integer to the error buffer or output.  */
-
-#define IBUF_LEN 60
-
-static void
-error_uinteger (unsigned long long int i)
-{
-  char *p, int_buf[IBUF_LEN];
-
-  p = int_buf + IBUF_LEN - 1;
-  *p-- = '\0';
-
-  if (i == 0)
-*p-- = '0';
-
-  while (i > 0)
-{
-  *p-- = i % 10 + '0';
-  i = i / 10;
-}
-
-  error_string (p + 1);
-}
-
-static void
-error_integer (long long int i)
-{
-  unsigned long long int u;
-
-  if (i < 0)
-{
-  u = (unsigned long long int) -i;
-  error_char ('-');
-}
-  else
-u = i;
-
-  error_uinteger (u);
-}
-
-
-static void
-error_hwuint (unsigned HOST_WIDE_INT i)
-{
-  char *p, int_buf[IBUF_LEN];
-
-  p = int_buf + IBUF_LEN - 1;
-  *p-- = '\0';
-
-  if (i == 0)
-*p-- = '0';
-
-  while (i > 0)
-{
-  *p-- = i % 10 + '0';
-  i = i / 10;
-}
-
-  error_string (p + 1);
-}
-
-static void
-error_hwint (HOST_WIDE_INT i)
-{
-  unsigned HOST_WIDE_INT u;
-
-  if (i < 0)
-{
-  u = (unsigned HOST_WIDE_INT) -i;
-  error_char ('-');
-}
-  else
-u = i;
-
-  error_uinteger (u);
-}
-
-
-static size_t
-gfc_widechar_display_length (gfc_char_t c)
-{
-  if (gfc_wide_is_printable (c) || c == '\t')
-/* Printable ASCII character, or tabulation (output as a space).  */
-return 1;
-  else if (c < ((gfc_char_t) 1 << 8))
-/* Displayed as \x??  */
-return 4;
-  else if (c < ((gfc_char_t) 1 << 16))
-/* Displayed as \u  */
-return 6;
-  else
-/* Displayed as \U  */
-return 10;
-}
-
-
-/* Length of the ASCII representation of the wide string, escaping wide
-   characters as print_wide_char_into_buffer() does.  */
-
-static size_t
-gfc_wide_display_length (const gfc_char_t *str)
-{
-  size_t i, len;
-
-  for (i = 0, len = 0; str[i]; i++)
-len += gfc_widechar_display_length (str[i]);
-
-  return len;
-}
-
 static int
 print_wide_char_into_buffer (gfc_char_t c, char *buf)
 {
@@ -332,593 +217,6 @@ gfc_print_wide_char (gfc_char_t c)
 }
 
 
-/* Show the file, where it was included, and the source line, give a
-   locus.  Calls error_printf() recursively, but the recursion is 

[PATCH] tree-optimization/117080 - Add SLP_TREE_MEMORY_ACCESS_TYPE

2024-10-11 Thread Richard Biener
It turns out target costing code looks at STMT_VINFO_MEMORY_ACCESS_TYPE
to identify operations from (emulated) gathers for example.  This
doesn't work for SLP loads since we do not set STMT_VINFO_MEMORY_ACCESS_TYPE
there as the vectorization strategy might differ between different
stmt uses.  It seems we got away with setting it for stores though.
The following adds a memory_access_type field to slp_tree and sets it
from load and store vectorization code.  All the costing doesn't record
the SLP node (that was only done selectively for some corner case).  The
costing is really in need of a big overhaul, the following just massages
the two relevant ops to fix gcc.dg/target/pr88531-2[bc].c FAILs when
switching on SLP for non-grouped stores.  In particular currently
we either have a SLP node or a stmt_info in the cost hook but not both.

So the following mitigates this, postponing a rewrite of costing to
next stage1.  Other targets look possibly affected as well but are
left to respective maintainers to update.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

(this is a re-post/test from a patch from June)

PR tree-optimization/117080
* tree-vectorizer.h (_slp_tree::memory_access_type): Add.
(SLP_TREE_MEMORY_ACCESS_TYPE): New.
(record_stmt_cost): Add another overload.
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
memory_access_type.
* tree-vect-stmts.cc (vectorizable_store): Set
SLP_TREE_MEMORY_ACCESS_TYPE.
(vectorizable_load): Likewise.  Also record the SLP node
when costing emulated gather offset decompose and vector
composition.
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Also
recognize SLP emulated gather/scatter.
---
 gcc/config/i386/i386.cc |  22 ++---
 gcc/tree-vect-slp.cc|   1 +
 gcc/tree-vect-stmts.cc  |  16 +--
 gcc/tree-vectorizer.h   | 102 
 4 files changed, 91 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index ab0ade3790f..a1f0ae7a7e1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25201,13 +25201,21 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  (AGU and load ports).  Try to account for this by scaling the
  construction cost by the number of elements involved.  */
   if ((kind == vec_construct || kind == vec_to_scalar)
-  && stmt_info
-  && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
-  && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
-  && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
-  != INTEGER_CST))
- || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
+  && ((stmt_info
+  && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
+  || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
+  && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+   && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+   != INTEGER_CST))
+  || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
+  == VMAT_GATHER_SCATTER)))
+ || (node
+ && ((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+   (SLP_TREE_REPRESENTATIVE (node
+ != INTEGER_CST))
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
+ == VMAT_GATHER_SCATTER)
 {
   stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
   stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 914b0b61b4d..83cb39fc214 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -124,6 +124,7 @@ _slp_tree::_slp_tree ()
   this->ldst_lanes = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
+  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
   SLP_TREE_REF_COUNT (this) = 1;
   this->failed = NULL;
   this->max_nunits = 1;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ad4a3141ab8..4f6905f1541 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8350,6 +8350,8 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p) /* transformation not required.  */
 {
   STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
+  if (slp_node)
+   SLP_TREE_MEMORY_ACCESS_TYPE (slp_node) = memory_access_type;
 
   if (loop_vinfo
  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
@@ -8390,7 +8392,10 @@ vectorizable_store (vec_info *vinfo,
  && first_stmt_info != stmt_info)
return true;
 }
-  gcc_assert (memory_access_type == STMT_VINFO_MEMO

Re: [PATCH 1/2] PR 117048: simplify-rtx: Extend (x << C1) | (X >> C2) --> ROTATE transformation to vector operands

2024-10-11 Thread Kyrylo Tkachov


> On 11 Oct 2024, at 12:28, Richard Sandiford  wrote:
>
> External email: Use caution opening links or attachments
>
>
> Kyrylo Tkachov  writes:
>> Hi all,
>>
>> In the testcase from patch [2/2] we want to match a vector rotate operation 
>> from
>> an IOR of left and right shifts by immediate.  simplify-rtx has code for just
>> that but it looks like it's prepared to do handle only scalar operands.
>> In practice most of the code works for vector modes as well except the shift
>> amounts are checked to be CONST_INT rather than vector constants that we have
>> here.  This is easily extended by using unwrap_const_vec_duplicate to extract
>> the repeating constant shift amount.
>
> FWIW, shifting a vector by a scalar is valid rtl (at least AIUI), so the
> current code does handle that case.  But I agree it's missing shifting a
> vector by a vector.
>
> I suppose a fancy version would be to check the rotate condition for each
> individual element of the vector shift amount.  Checking the duplicate
> case is definitely a good (and strict) improvement over the status quo
> though.
>
>> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
>> index e8e60404ef6..7ff14594daa 100644
>> --- a/gcc/simplify-rtx.cc
>> +++ b/gcc/simplify-rtx.cc
>> @@ -3477,12 +3477,16 @@ simplify_context::simplify_binary_operation_1 
>> (rtx_code code,
>>  }
>>
>>   if (GET_CODE (opleft) == ASHIFT && GET_CODE (opright) == LSHIFTRT
>> -  && rtx_equal_p (XEXP (opleft, 0), XEXP (opright, 0))
>> -  && CONST_INT_P (XEXP (opleft, 1))
>> -  && CONST_INT_P (XEXP (opright, 1))
>> -  && (INTVAL (XEXP (opleft, 1)) + INTVAL (XEXP (opright, 1))
>> +   && rtx_equal_p (XEXP (opleft, 0), XEXP (opright, 0)))
>> + {
>> +   rtx leftcst = unwrap_const_vec_duplicate (XEXP (opleft, 1));
>> +   rtx rightcst = unwrap_const_vec_duplicate (XEXP (opright, 1));
>> +
>> +   if (CONST_INT_P (leftcst) && CONST_INT_P (rightcst)
>> +   && (INTVAL (leftcst) + INTVAL (rightcst)
>>== GET_MODE_UNIT_PRECISION (mode)))
>
> Nit: looks like some reindentation might be missing here.
>
>> -return gen_rtx_ROTATE (mode, XEXP (opright, 0), XEXP (opleft, 1));
>> + return gen_rtx_ROTATE (mode, XEXP (opright, 0), XEXP (opleft, 1));
>> + }
>
> Looks good.  So referring back to the above, vector shifts will retain a
> scalar shift amount if they started with a scalar shift amount, and a
> vector shift amount if they started with a vector shift amount.
>
> OK with formatting tweak, thanks.

Thanks for the context and the discussion on the series.
I’ll push the attached (with the fixed indentation).
Kyrill




0001-PR-117048-simplify-rtx-Extend-x-C1-X-C2-ROTATE-trans-v2.patch
Description: 0001-PR-117048-simplify-rtx-Extend-x-C1-X-C2-ROTATE-trans-v2.patch


[PATCH] c, v2: Implement C2Y N3355 - Named Loops [PR117022]

2024-10-11 Thread Jakub Jelinek
On Fri, Oct 11, 2024 at 02:19:08PM +, Joseph Myers wrote:
> There should definitely be a test that -std=c23 -pedantic-errors gives 
> errors for these constructs (I'd say also test that -std=c23 
> -pedantic-errors -Wno-c23-c2y-compat doesn't diagnose them, while -std=c2y 
> -Wc23-c2y-compat does).  Not yet reviewed the rest of the patch.

Added those now.  I've additionally added a testcase to make sure
/* FALLTHRU */ comments don't break it (thankfully they don't, in that
case just a flag is set on the label), but that revealed that there was
a -Wunused-value warning if some labels are just used to name loops and
used in break/continue statement and nowhere else.  And another test
to make sure [[fallthrough]]; does break it, the labels before that aren't
in the same labeled-statement anymore.
So added two-liner if (label) TREE_USED (lab) = 1; to consider break lab;
or continue lab; as uses of the label.

No other changes than those 2 lines in c-decl.cc and testsuite additions.

2024-10-11  Jakub Jelinek  

PR c/117022
gcc/c-family/
* c-common.def (FOR_STMT, WHILE_STMT, DO_STMT, BREAK_STMT,
CONTINUE_STMT, SWITCH_STMT): Add an extra operand, *_NAME
and document it.
* c-common.h (bc_hash_map_t): New typedef.
(struct bc_state): Add bc_hash_map member.
(WHILE_NAME, DO_NAME, FOR_NAME, BREAK_NAME, CONTINUE_NAME,
SWITCH_STMT_NAME): Define.
* c-pretty-print.cc (c_pretty_printer::statement): Print
BREAK_STMT or CONTINUE_STMT operand if any.
* c-gimplify.cc (bc_hash_map): New static variable.
(note_named_bc, release_named_bc): New functions.
(save_bc_state): Save and clear bc_hash_map.
(restore_bc_state): Assert NULL and restore bc_hash_map.
(genericize_c_loop): Add NAME argument, call note_named_bc
and release_named_bc if non-NULL around the body walk.
(genericize_for_stmt, genericize_while_stmt, genericize_do_stmt):
Adjust callers of it.
(genericize_switch_stmt): Rename break_block variable to blab.
Call note_named_bc and release_named_bc if SWITCH_STMT_NAME is
non-NULL around the body walk.
(genericize_continue_stmt): Handle non-NULL CONTINUE_NAME.
(genericize_break_stmt): Handle non-NULL BREAK_NAME.
(c_genericize): Delete and clear bc_hash_map.
gcc/c/
* c-tree.h: Implement C2Y N3355 - Named loops.
(C_DECL_LOOP_NAME, C_DECL_SWITCH_NAME, C_DECL_LOOP_SWITCH_NAME_VALID,
C_DECL_LOOP_SWITCH_NAME_USED, IN_NAMED_STMT): Define.
(c_get_loop_names, c_release_loop_names, c_finish_bc_name): Declare.
(c_start_switch): Add NAME argument.
(c_finish_bc_stmt): Likewise.
* c-lang.h (struct language_function): Add loop_names and
loop_names_hash members.
* c-parser.cc (c_parser_external_declaration,
c_parser_declaration_or_fndef, c_parser_struct_or_union_specifier,
c_parser_parameter_declaration): Adjust c_parser_pragma caller.
(get_before_labels): New function.
(c_parser_compound_statement_nostart): Call get_before_labels when
needed, adjust c_parser_pragma and c_parser_statement_after_labels
callers.
(c_parser_statement): Call get_before_labels first and pass it to
c_parser_statement_after_labels.
(c_parser_bc_name): New function.
(c_parser_statement_after_labels): Add BEFORE_LABELS argument.  Pass
it down to c_parser_switch_statement, c_parser_while_statement,
c_parser_do_statement, c_parser_for_statement and c_parser_pragma.
Call c_parser_bc_name for RID_BREAK and RID_CONTINUE and pass it as
another argument to c_finish_bc_stmt.
(c_parser_if_body, c_parser_else_body): Call get_before_labels
early and pass it to c_parser_statement_after_labels.
(c_parser_switch_statement): Add BEFORE_LABELS argument.  Call
c_get_loop_names, if named, pass switch_name to c_start_switch,
mark it valid and set IN_NAMED_STMT bit in in_statement before
parsing body, otherwise clear IN_NAMED_STMT bit before that parsing.
Run c_release_loop_names at the end.
(c_parser_while_statement, c_parser_do_statement,
c_parser_for_statement): Add BEFORE_LABELS argument.  Call
c_get_loop_names, if named, mark it valid and set IN_NAMED_STMT bit
in in_statement before parsing body, otherwise clear IN_NAMED_STMT
before that parsing, arrange for the loop name if used to be another
*_STMT argument.
(c_parser_objc_class_instance_variables,
c_parser_objc_methodprotolist): Adjust c_parser_pragma callers.
(c_parser_pragma): Add BEFORE_LABELS argument.  Pass it down to
c_parser_for_statement, c_parser_while_statement or
c_parser_do_statement.
(c_parser_omp_loop_nest, c_maybe_parse_omp_decl): Adjust
c_parser_pragma callers.
* c-decl.cc (

Re: [patch, Fortran, RFC] Introduce GFC_STD_UNSIGNED

2024-10-11 Thread Thomas Koenig

Am 11.10.24 um 18:00 schrieb Thomas Koenig:

Hello world,

the attached patch creates an unsigned "standard" for the
gfc_option.allow_std field.

One of the main reasons why people want UNSIGNED for Fortran is
interfacing with C.

This is a preparation for further work on the ISO_C_BINDING constants.
That, we do via iso-c-binding.def , whose last field is a standard
for the constant to be defined for the standard in question, which is
then checked.  I could try and invent a different method for this,
but I'd rather not.

So, OK for trunk? Other, better ideas?


ChangeLog was missing, here it is. Also regression-tested.


gcc/fortran/ChangeLog:

* intrinsic.cc (add_functions): Convert uint and
selected_unsigned_kind to GFC_STD_UNSIGNED.
(gfc_check_intrinsic_standard): Handle GFC_STD_UNSIGNED.
* libgfortran.h (GFC_STD_UNSIGNED): Add.
* options.cc (gfc_post_options): Set GFC_STD_UNSIGNED
if -funsigned is set.


Android: Fix build for Android

2024-10-11 Thread yxj-github-437
This is a patch to fix target android

0001-Android-Fix-build-for-Android.patch
Description: Binary data


Re: [PATCH v7] Provide new GCC builtin __builtin_counted_by_ref [PR116016]

2024-10-11 Thread Joseph Myers
On Fri, 27 Sep 2024, Qing Zhao wrote:

> + if (TREE_CODE (TREE_TYPE (ref)) != ARRAY_TYPE)
> +   {
> + error_at (loc, "the argument must be an array"
> +"%<__builtin_counted_by_ref%>");

This diagnostic is missing a space before %<__builtin_counted_by_ref%>.  
It's also ungrammatical; something better would be

"the argument to %<__builtin_counted_by_ref%> must be an array"

or similar.

> +@defbuiltin{@var{type} __builtin_counted_by_ref (@var{ptr})}
> +The built-in function @code{__builtin_counted_by_ref} checks whether the 
> array
> +object pointed by the pointer @var{ptr} has another object associated with it
> +that represents the number of elements in the array object through the
> +@code{counted_by} attribute (i.e. the counted-by object). If so, returns a
> +pointer to the corresponding counted-by object.
> +If such counted-by object does not exist, returns a NULL pointer.

This should be "null pointer" (describing the value returned rather than 
referring to the macro NULL which would be @code{NULL} in the manual).

> +The argument @var{ptr} must be a pointer to an array.
> +The @var{type} of the returned value must be a pointer type pointing to the

"is a pointer type" rather than "must be"; this is describing the type 
returned by a built-in function, not a requirement on a type passed by the 
user.

> +corresponding type of the counted-by object or a VOID pointer type in case
> +of a NULL pointer being returned.

"null pointer", and "void" not "VOID".

> +returns a void NULL pointer.

Likewise, "null pointer", maybe in the form "null pointer to @code{void}".

-- 
Joseph S. Myers
josmy...@redhat.com



RISC-V: Add implication for M extension.

2024-10-11 Thread Patrick O'Neill
Sending this with a text attachment to see if patchworks accepts a 
'text/x-patch' attachment type.

Review can still happen on the original thread:
https://inbox.sourceware.org/gcc-patches/cafryf0mmb9hs9cng4w8y5n06d+4tfuvgrrrnvuvtsropman...@mail.gmail.com/T/#u

Sorry for the technical difficulties Tsung Chun!

- Patrick
From 9b37cebb78d6f0e4fa1c6555ccdda4b946daca88 Mon Sep 17 00:00:00 2001
From: Patrick O'Neill 
Date: Tue, 23 Jan 2024 16:36:53 -0800
Subject: [PATCH] RISC-V: Add regression test for vsetvl bug pr113429

The reduced testcase for pr113429 (cam4 failure) needed additional
modules so it wasn't committed.
The fuzzer found a c testcase that was also fixed with pr113429's fix.
Adding it as a regression test.

	PR target/113429

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/vsetvl/pr113429.c: New test.

Signed-off-by: Patrick O'Neill 
---
 .../gcc.target/riscv/rvv/vsetvl/pr113429.c| 70 +++
 1 file changed, 70 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113429.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113429.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113429.c
new file mode 100644
index 000..05c3eeecb94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113429.c
@@ -0,0 +1,70 @@
+/* { dg-do run } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+
+long a;
+int b, c, d, e, f, g;
+short h, i, j;
+static int k = 3;
+static int l = 6;
+int m[5][7];
+signed char n;
+int *const o = &c;
+
+signed char(p)(signed char p1, signed char q) {
+  return p1 / q;
+}
+
+void s(unsigned p1) {
+  b = (b ^ p1) & 255;
+}
+
+static long t() {
+  long u;
+  signed char v;
+  d = 1;
+  for (; d <= 4; d++) {
+j = 0;
+for (; j <= 4; j++) {
+  v = 0;
+  for (; v <= 4; v++) {
+if (m[v][v])
+  continue;
+c = 0;
+for (; c <= 4; c++) {
+  n = 0;
+  for (; n <= 4; n++) {
+int *w = &e;
+long r = v;
+u = r == 0 ? a : a % r;
+h |= u;
+*w = g;
+--m[n][c];
+f &= *o;
+  }
+}
+if (p((i < 3) ^ 9, k))
+  ;
+else if (v)
+  return 0;
+  }
+}
+  }
+  return 1;
+}
+
+static char x() {
+  for (;;) {
+t();
+if (l)
+  return 0;
+  }
+}
+
+int main() {
+  x();
+  s(e & 255);
+  if (b == 0)
+return 0;
+  else
+return 1;
+}
-- 
2.34.1



[PATCH v3 1/2] libstdc++: Enable memcpy optimizations for distinct integral types [PR93059]

2024-10-11 Thread Jonathan Wakely
The __memcpyable_integer specializations for __int128 etc. need the
__extension__ keyword to avoid -Wpedantic warnings.

[PATCH v2 2/2] is unchanged, so I haven't resent it.

-- >8 --

Currently we only optimize std::copy, std::copy_n etc. to memmove when
the source and destination types are the same. This means that we fail
to optimize copying between distinct 1-byte types, e.g. copying from a
buffer of unsigned char to a buffer of char8_t or vice versa.

This patch adds more partial specializations of the __memcpyable trait
so that we allow memcpy between integers of equal widths. This will
enable memmove for copies between narrow character types and also
between same-width types like int and unsigned.

Enabling the optimization needs to be based on the width of the integer
type, not just the size in bytes. This is because some targets define
non-standard integral types such as __int20 in msp430, which has padding
bits. It would not be safe to memcpy between e.g. __int20 and int32_t,
even though sizeof(__int20) == sizeof(int32_t). A new trait is
introduced to define the width, __memcpyable_integer, and then the
__memcpyable trait compares the widths.

It's safe to copy between signed and unsigned integers of the same
width, because GCC only supports two's complement integers.

I initially thought it would be useful to define the specialization
__memcpyable_integer to enable copying between narrow character
types and std::byte. But that isn't possible with std::copy, because
is_assignable is false. Optimized copies using memmove
will already happen for copying std::byte to std::byte, because
__memcpyable is true.

libstdc++-v3/ChangeLog:

PR libstdc++/93059
* include/bits/cpp_type_traits.h (__memcpyable): Add partial
specialization for pointers to distinct types.
(__memcpyable_integer): New trait to control which types can use
cross-type memcpy optimizations.
---
 libstdc++-v3/include/bits/cpp_type_traits.h | 90 -
 1 file changed, 88 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 060652afb18..84ad5bbd1e9 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -434,8 +434,6 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
 };
 #endif
 
-  template struct iterator_traits;
-
   // A type that is safe for use with memcpy, memmove, memcmp etc.
   template
 struct __is_nonvolatile_trivially_copyable
@@ -459,16 +457,104 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   enum { __value = 0 };
 };
 
+  // Allow memcpy when source and destination are pointers to the same type.
   template
 struct __memcpyable<_Tp*, _Tp*>
 : __is_nonvolatile_trivially_copyable<_Tp>
 { };
 
+  // Source pointer can be const.
   template
 struct __memcpyable<_Tp*, const _Tp*>
 : __is_nonvolatile_trivially_copyable<_Tp>
 { };
 
+  template struct __memcpyable_integer;
+
+  // For heterogeneous types, allow memcpy between equal-sized integers.
+  template
+struct __memcpyable<_Tp*, _Up*>
+{
+  enum {
+   __value = __memcpyable_integer<_Tp>::__width != 0
+   && ((int)__memcpyable_integer<_Tp>::__width
+ == (int)__memcpyable_integer<_Up>::__width)
+  };
+};
+
+  // Specialization for const U* because __is_integer is never true.
+  template
+struct __memcpyable<_Tp*, const _Up*>
+: __memcpyable<_Tp*, _Up*>
+{ };
+
+  template
+struct __memcpyable_integer
+{
+  enum {
+   __width = __is_integer<_Tp>::__value ? (sizeof(_Tp) * __CHAR_BIT__) : 0
+  };
+};
+
+  // Cannot memcpy volatile memory.
+  template
+struct __memcpyable_integer
+{ enum { __width = 0 }; };
+
+  // Specializations for __intNN types with padding bits.
+#if defined __GLIBCXX_TYPE_INT_N_0 && __GLIBCXX_BITSIZE_INT_N_0 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_0>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_0 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_1 && __GLIBCXX_BITSIZE_INT_N_1 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_1>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_1 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_1 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_2 && __GLIBCXX_BITSIZE_INT_N_2 % __CHAR_BIT__
+  __extension__
+  template<>
+struct __memcpyable_integer<__GLIBCXX_TYPE_INT_N_2>
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_2 }; };
+  __extension__
+  template<>
+struct __memcpyable_integer
+{ enum { __width = __GLIBCXX_BITSIZE_INT_N_2 }; };
+#endif
+#if defined __GLIBCXX_TYPE_INT_N_3 && __GLIBCXX_BITSIZE_INT_N_3 %

Re: RISC-V: Add implication for M extension.

2024-10-11 Thread Patrick O'Neill

On 10/11/24 09:16, Patrick O'Neill wrote:

Sending this with a text attachment to see if patchworks accepts a 
'text/x-patch' attachment type.

Review can still happen on the original thread:
https://inbox.sourceware.org/gcc-patches/cafryf0mmb9hs9cng4w8y5n06d+4tfuvgrrrnvuvtsropman...@mail.gmail.com/T/#u 



Sorry for the technical difficulties Tsung Chun!

- Patrick


Seems like that did the trick. Pre-commit results will be on this page 
within 3 hours:

https://patchwork.sourceware.org/project/gcc/patch/f0d75be3-85e0-4948-8e94-481479eef...@rivosinc.com/

Thanks!
Patrick


Re: [PATCH][aarch64][libstdc++] Use shufflevector instead of shuffle in opt_random.h

2024-10-11 Thread Jonathan Wakely
On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus  wrote:
>
> This patch modifies the implementation of the vectorized Mersenne
> Twister random number generator to use __builtin_shufflevector instead
> of __builtin_shuffle. This makes it (almost) compatible with Clang.
>
> To make the implementation fully compatible with Clang, Clang will need
> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> currently it does not. This looks like an oversight in Clang and so will
> be addressed separately.
>
> I see no codegen change with this patch.

I'm not qualified to review this myself, but I'd at least like to see
the CI checks passing:
https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7...@nvidia.com/
Apparently the patch couldn't be applied.

Please configure your email client (thunderbird?) to not munge the
patch, or attach it rather than sending inline. Or just use
git-send-email :-)


>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Ricardo Jesus 
>
> 2024-09-05  Ricardo Jesus  
>
> * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> of __builtin_shuffle with __builtin_shufflevector.
> (__aarch64_lsl_128): Move shift amount to a template parameter.
> (__aarch64_lsr_128): Move shift amount to a template parameter.
> (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> and __aarch64_lsr_128.
> ---
>   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++
>   1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> index 7f756d1572f..7eb816abcd0 100644
> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> @@ -35,13 +35,13 @@
>   #ifdef __ARM_NEON
>
>   #ifdef __ARM_BIG_ENDIAN
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> -{16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> +16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> +24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
>   #else
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> -{_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> +_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> +_C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
>   #endif
>
>   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> namespace {
>   // Logical Shift right 128-bits by c * 8 bits
>
> -__extension__ extern __inline __Uint32x4_t
> +__extension__
> +template
> +extern __inline __Uint32x4_t
>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> +__aarch64_lsr_128 (__Uint8x16_t __a)
>   {
> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>0, 0, 0, 0, 0, 0, 0, 0};
> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
>   // Logical Shift left 128-bits by c * 8 bits
>
> -__extension__ extern __inline __Uint32x4_t
> +__extension__
> +template
> +extern __inline __Uint32x4_t
>   __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -__aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> +__aarch64_lsl_128 (__Uint8x16_t __a)
>   {
> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>0, 0, 0, 0, 0, 0, 0, 0};
> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>__Uint32x4_t __e)
>   {
> __Uint32x4_t __y = (__b >> __sr1);
> -  __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> +  __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
>
> __Uint32x4_t __v = __d << __sl1;
>
> __z = __z ^ __a;
> __z = __z ^ __v;
>
> -  __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> +  __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
>
> __y = __y & __e;
> __z = __z ^ __x;
> --
> 2.44.0
>



[patch, Fortran, RFC] Introduce GFC_STD_UNSIGNED

2024-10-11 Thread Thomas Koenig

Hello world,

the attached patch creates an unsigned "standard" for the
gfc_option.allow_std field.

One of the main reason why people want UNSIGNED for Fortran is
interfacing for C.

This is a preparation for further work on the ISO_C_BINDING constants.
That, we do via iso-c-binding.def , whose last field is a standard
for the constant to be defined for the standard in question, which is
then checked.  I could try and invent a different method for this,
but I'd rather not.

So, OK for trunk? Other, better ideas?

Best regards

Thomasdiff --git a/gcc/fortran/intrinsic.cc b/gcc/fortran/intrinsic.cc
index 0a6be215825..c6fb0a6de45 100644
--- a/gcc/fortran/intrinsic.cc
+++ b/gcc/fortran/intrinsic.cc
@@ -2264,7 +2264,7 @@ add_functions (void)
   make_generic ("long", GFC_ISYM_LONG, GFC_STD_GNU);
 
   add_sym_2 ("uint", GFC_ISYM_UINT, CLASS_ELEMENTAL, ACTUAL_NO, BT_UNSIGNED,
-	 di, GFC_STD_GNU, gfc_check_uint, gfc_simplify_uint,
+	 di, GFC_STD_UNSIGNED, gfc_check_uint, gfc_simplify_uint,
 	 gfc_resolve_uint, a, BT_REAL, dr, REQUIRED, kind, BT_INTEGER, di,
 	 OPTIONAL);
 
@@ -2966,17 +2966,13 @@ add_functions (void)
 
   make_generic ("selected_int_kind", GFC_ISYM_SI_KIND, GFC_STD_F95);
 
-  if (flag_unsigned)
-{
-
-  add_sym_1 ("selected_unsigned_kind", GFC_ISYM_SU_KIND,
-		 CLASS_TRANSFORMATIONAL, ACTUAL_NO, BT_INTEGER, di,
-		 GFC_STD_GNU, gfc_check_selected_int_kind,
-		 gfc_simplify_selected_unsigned_kind, NULL, r, BT_INTEGER, di,
-		 REQUIRED);
+  add_sym_1 ("selected_unsigned_kind", GFC_ISYM_SU_KIND,
+	 CLASS_TRANSFORMATIONAL, ACTUAL_NO, BT_INTEGER, di,
+	 GFC_STD_UNSIGNED, gfc_check_selected_int_kind,
+	 gfc_simplify_selected_unsigned_kind, NULL, r, BT_INTEGER, di,
+	 REQUIRED);
 
   make_generic ("selected_unsigned_kind", GFC_ISYM_SU_KIND, GFC_STD_GNU);
-}
 
   add_sym_1 ("selected_logical_kind", GFC_ISYM_SL_KIND, CLASS_TRANSFORMATIONAL, ACTUAL_NO, BT_INTEGER, di,
 	 GFC_STD_F2023, /* it has the same requirements */ gfc_check_selected_int_kind,
@@ -4945,6 +4941,10 @@ gfc_check_intrinsic_standard (const gfc_intrinsic_sym* isym,
   symstd_msg = _("for backward compatibility");
   break;
 
+case GFC_STD_UNSIGNED:
+  symstd_msg = _("unsigned");
+  break;
+
 default:
   gfc_internal_error ("Invalid standard code on intrinsic %qs (%d)",
 			  isym->name, isym->standard);
diff --git a/gcc/fortran/libgfortran.h b/gcc/fortran/libgfortran.h
index 895629d6f80..773f2a0b049 100644
--- a/gcc/fortran/libgfortran.h
+++ b/gcc/fortran/libgfortran.h
@@ -23,6 +23,8 @@ along with GCC; see the file COPYING3.  If not see
Nevertheless, some features available in F2018 are prohibited in F2023.
Please remember to keep those definitions in sync with
gfortran.texi.  */
+#define GFC_STD_UNSIGNED	(1<<14) /* Not really a standard, but
+	   better for error handling.  */
 #define GFC_STD_F2023_DEL	(1<<13)	/* Prohibited in F2023.  */
 #define GFC_STD_F2023		(1<<12)	/* New in F2023.  */
 #define GFC_STD_F2018_DEL	(1<<11)	/* Deleted in F2018.  */
diff --git a/gcc/fortran/options.cc b/gcc/fortran/options.cc
index 6f2579ad9de..d998d0e6117 100644
--- a/gcc/fortran/options.cc
+++ b/gcc/fortran/options.cc
@@ -539,6 +539,10 @@ gfc_post_options (const char **pfilename)
   else if (gfc_option.allow_std & GFC_STD_F2003)
 lang_hooks.name = "GNU Fortran2003";
 
+  /* Set the unsigned "standard".  */
+  if (flag_unsigned)
+gfc_option.allow_std |= GFC_STD_UNSIGNED;
+
   return gfc_cpp_preprocess_only ();
 }
 


[PING^2] [PATCH] c: Diagnose declarations that are used only in their own initializer [PR115027]

2024-10-11 Thread Martin Uecker


I like to ping this patch.

Am Freitag, dem 09.08.2024 um 10:20 +0200 schrieb Martin Uecker:
> ok?
> 
> Am Samstag, dem 29.06.2024 um 20:30 +0200 schrieb Martin Uecker:
> > Probably not entirely fool-proof when using statement
> > expressions in initializers, but should be good enough.
> > 
> > 
> > Bootstrapped and regression tested on x86_64.
> > 
> > 
> > 
> > c: Diagnose declarations that are used only in their own initializer 
> > [PR115027]
> > 
> > Track the declaration that is currently being initialized and do not
> > mark it as read when it is used in its own initializer.  This then
> > allows it to be diagnosed as set-but-unused when it is not used
> > elsewhere.
> > 
> > PR c/115027
> > 
> > gcc/c/
> > * c-tree.h (in_decl_init): Declare variable.
> > * c-parser.cc (c_parser_initializer): Record decl being 
> > initialized.
> > * c-typeck.cc (in_decl_init): Define variable.
> > (mark_exp_read): Ignore decl currently being initialized.
> > 
> > gcc/testsuite/
> > * gcc.dg/pr115027.c: New test.
> > 
> > diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
> > index 8c4e697a4e1..46060665115 100644
> > --- a/gcc/c/c-parser.cc
> > +++ b/gcc/c/c-parser.cc
> > @@ -6126,11 +6126,14 @@ c_parser_type_name (c_parser *parser, bool 
> > alignas_ok)
> >  static struct c_expr
> >  c_parser_initializer (c_parser *parser, tree decl)
> >  {
> > +  struct c_expr ret;
> > +  tree save = in_decl_init;
> > +  in_decl_init = decl;
> > +
> >if (c_parser_next_token_is (parser, CPP_OPEN_BRACE))
> > -return c_parser_braced_init (parser, NULL_TREE, false, NULL, decl);
> > +ret = c_parser_braced_init (parser, NULL_TREE, false, NULL, decl);
> >else
> >  {
> > -  struct c_expr ret;
> >location_t loc = c_parser_peek_token (parser)->location;
> >ret = c_parser_expr_no_commas (parser, NULL);
> >if (decl != error_mark_node && C_DECL_VARIABLE_SIZE (decl))
> > @@ -6154,8 +6157,9 @@ c_parser_initializer (c_parser *parser, tree decl)
> >   || C_DECL_DECLARED_CONSTEXPR (COMPOUND_LITERAL_EXPR_DECL
> > (ret.value
> > ret = convert_lvalue_to_rvalue (loc, ret, true, true, true);
> > -  return ret;
> >  }
> > +in_decl_init = save;
> > +return ret;
> >  }
> >  
> >  /* The location of the last comma within the current initializer list,
> > diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h
> > index 15da875a029..8013963b06d 100644
> > --- a/gcc/c/c-tree.h
> > +++ b/gcc/c/c-tree.h
> > @@ -740,6 +740,8 @@ extern int in_typeof;
> >  extern bool c_in_omp_for;
> >  extern bool c_omp_array_section_p;
> >  
> > +extern tree in_decl_init;
> > +
> >  extern tree c_last_sizeof_arg;
> >  extern location_t c_last_sizeof_loc;
> >  
> > diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
> > index 455dc374b48..34279dc1d1a 100644
> > --- a/gcc/c/c-typeck.cc
> > +++ b/gcc/c/c-typeck.cc
> > @@ -73,6 +73,9 @@ int in_sizeof;
> >  /* The level of nesting inside "typeof".  */
> >  int in_typeof;
> >  
> > +/* When inside an initializer, this is set to the decl being initialized.  
> > */
> > +tree in_decl_init;
> > +
> >  /* True when parsing OpenMP loop expressions.  */
> >  bool c_in_omp_for;
> >  
> > @@ -2047,7 +2050,8 @@ mark_exp_read (tree exp)
> >  {
> >  case VAR_DECL:
> >  case PARM_DECL:
> > -  DECL_READ_P (exp) = 1;
> > +  if (exp != in_decl_init)
> > +   DECL_READ_P (exp) = 1;
> >break;
> >  case ARRAY_REF:
> >  case COMPONENT_REF:
> > diff --git a/gcc/testsuite/gcc.dg/pr115027.c 
> > b/gcc/testsuite/gcc.dg/pr115027.c
> > new file mode 100644
> > index 000..ac2699f8392
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/pr115027.c
> > @@ -0,0 +1,8 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Wunused-but-set-variable" } */
> > +
> > +void f(void)
> > +{
> > +   struct foo { void *p; };
> > +   struct foo g = { &g };  /* { dg-warning "set but not used" } */
> > +}
> > 
> 



Re: [PING^2] [PATCH] c: Diagnose declarations that are used only in their own initializer [PR115027]

2024-10-11 Thread Jakub Jelinek
On Fri, Oct 11, 2024 at 06:35:26PM +0200, Martin Uecker wrote:
> 
> I like to ping this patch.

I'm a little bit worried about cases like:

int *p;
struct S { int *s; };
int *bar (int *s);
int *baz (void);

void
foo (void)
{
  struct S a = { bar (&a.s) };
  struct S b = { (p = &a.s, baz ()) };
}

So, perhaps when restoring in_decl_init scan the initializer
and if it only contains references to the decl in some simple contexts
like address of it, address of its field and similar, have the new
behavior, otherwise if the address or address of its components or similar
is passed to a function, or escapes into another variable and the like
keep previous behavior (i.e. set DECL_READ_P (in_decl_init) = 1).

Jakub



Re: [PATCH] libcpp, v2: Add -Wtrailing-whitespace= warning

2024-10-11 Thread Joseph Myers
On Thu, 19 Sep 2024, Jakub Jelinek wrote:

> Here is a patch which currently allows blank (' ' '\t') and space (' ' '\t'
> '\f' '\v'), cntrl not yet added, not anything non-ASCII, but in theory could
> be added later (though, non-ASCII would be just for inside of comments,
> say non-breaking space etc. in the source is otherwise an error).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux.

OK with duplicates of the tests added using CR LF line endings (and 
comments in those duplicates saying they deliberately use CR LF line 
endings), I think this is a case where it's worth having such tests to 
verify that space / blank before CR LF is handled the same as before LF.

It will be good if we can move to using such options for building GCC's 
target libraries, and host code when supported by the host compiler.

-- 
Joseph S. Myers
josmy...@redhat.com



Re: [PATCH v2] aarch64: Add support for Ampere-1B (-mcpu=ampere1b) CPU

2024-10-11 Thread Philipp Tomsich
We just noticed that we didn't request to backport this one…
OK for backport?

On Thu, 30 Nov 2023 at 00:55, Philipp Tomsich 
wrote:

> Applied to master, thanks!
> Philipp.
>
> On Tue, 28 Nov 2023 at 12:57, Richard Sandiford
>  wrote:
> >
> > Philipp Tomsich  writes:
> > > On Tue, 28 Nov 2023 at 12:21, Richard Sandiford
> > >  wrote:
> > >>
> > >> Philipp Tomsich  writes:
> > >> > This patch adds initial support for Ampere-1B core.
> > >> >
> > >> > The Ampere-1B core implements ARMv8.7 with the following (compiler
> > >> > visible) extensions:
> > >> >  - CSSC (Common Short Sequence Compression instructions),
> > >> >  - MTE (Memory Tagging Extension)
> > >> >  - SM3/SM4
> > >> >
> > >> > gcc/ChangeLog:
> > >> >
> > >> >   * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add
> ampere-1b
> > >> >   * config/aarch64/aarch64-cost-tables.h: Add
> ampere1b_extra_costs
> > >> >   * config/aarch64/aarch64-tune.md: Regenerate
> > >> >   * config/aarch64/aarch64.cc: Include ampere1b tuning model
> > >> >   * doc/invoke.texi: Document -mcpu=ampere1b
> > >> >   * config/aarch64/tuning_models/ampere1b.h: New file.
> > >>
> > >> OK, thanks, but:
> > >>
> > >> >
> > >> > Signed-off-by: Philipp Tomsich 
> > >> > ---
> > >> >
> > >> > Changes in v2:
> > >> > - moved ampere1b model to a separated file
> > >> > - regenerated aarch64-tune.md after rebase
> > >> >
> > >> >  gcc/config/aarch64/aarch64-cores.def|   1 +
> > >> >  gcc/config/aarch64/aarch64-cost-tables.h| 107
> ++
> > >> >  gcc/config/aarch64/aarch64-tune.md  |   2 +-
> > >> >  gcc/config/aarch64/aarch64.cc   |   1 +
> > >> >  gcc/config/aarch64/tuning_models/ampere1b.h | 114
> 
> > >> >  gcc/doc/invoke.texi |   2 +-
> > >> >  6 files changed, 225 insertions(+), 2 deletions(-)
> > >> >  create mode 100644 gcc/config/aarch64/tuning_models/ampere1b.h
> > >> >
> > >> > diff --git a/gcc/config/aarch64/aarch64-cores.def
> b/gcc/config/aarch64/aarch64-cores.def
> > >> > index 16752b77f4b..ad896a80f1f 100644
> > >> > --- a/gcc/config/aarch64/aarch64-cores.def
> > >> > +++ b/gcc/config/aarch64/aarch64-cores.def
> > >> > @@ -74,6 +74,7 @@ AARCH64_CORE("thunderxt83",   thunderxt83,
>  thunderx,  V8A,  (CRC, CRYPTO), thu
> > >> >  /* Ampere Computing ('\xC0') cores. */
> > >> >  AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES,
> SHA3), ampere1, 0xC0, 0xac3, -1)
> > >> >  AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG,
> AES, SHA3, SM4, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
> > >> > +AARCH64_CORE("ampere1b", ampere1b, cortexa57, V8_7A, (F16, RNG,
> AES, SHA3, SM4, MEMTAG, CSSC), ampere1b, 0xC0, 0xac5, -1)
> > >> >  /* Do not swap around "emag" and "xgene1",
> > >> > this order is required to handle variant correctly. */
> > >> >  AARCH64_CORE("emag",emag,  xgene1,V8A,  (CRC,
> CRYPTO), emag, 0x50, 0x000, 3)
> > >> > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h
> b/gcc/config/aarch64/aarch64-cost-tables.h
> > >> > index 0cb638f3a13..4c8da7f119b 100644
> > >> > --- a/gcc/config/aarch64/aarch64-cost-tables.h
> > >> > +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> > >> > @@ -882,4 +882,111 @@ const struct cpu_cost_table
> ampere1a_extra_costs =
> > >> >}
> > >> >  };
> > >> >
> > >> > +const struct cpu_cost_table ampere1b_extra_costs =
> > >> > +{
> > >> > +  /* ALU */
> > >> > +  {
> > >> > +0, /* arith.  */
> > >> > +0, /* logical.  */
> > >> > +0, /* shift.  */
> > >> > +COSTS_N_INSNS (1), /* shift_reg.  */
> > >> > +0, /* arith_shift.  */
> > >> > +COSTS_N_INSNS (1), /* arith_shift_reg.  */
> > >> > +0, /* log_shift.  */
> > >> > +COSTS_N_INSNS (1), /* log_shift_reg.  */
> > >> > +0, /* extend.  */
> > >> > +COSTS_N_INSNS (1), /* extend_arith.  */
> > >> > +0, /* bfi.  */
> > >> > +0, /* bfx.  */
> > >> > +0, /* clz.  */
> > >> > +0, /* rev.  */
> > >> > +0, /* non_exec.  */
> > >> > +true   /* non_exec_costs_exec.  */
> > >> > +  },
> > >> > +  {
> > >> > +/* MULT SImode */
> > >> > +{
> > >> > +  COSTS_N_INSNS (2),   /* simple.  */
> > >> > +  COSTS_N_INSNS (2),   /* flag_setting.  */
> > >> > +  COSTS_N_INSNS (2),   /* extend.  */
> > >> > +  COSTS_N_INSNS (3),   /* add.  */
> > >> > +  COSTS_N_INSNS (3),   /* extend_add.  */
> > >> > +  COSTS_N_INSNS (12)   /* idiv.  */
> > >> > +},
> > >> > +/* MULT DImode */
> > >> > +{
> > >> > +  COSTS_N_INSNS (2),   /* simple.  */
> > >> > +  0,   /* flag_setting (N/A).  */
> > >> > +  COSTS_N_INSNS (2),   /* extend.  */
> > >> > +  COSTS_N_INSNS (3),   /* add.  */
> > >> > +  COSTS_N

Re: [PATCH v7] Provide new GCC builtin __builtin_counted_by_ref [PR116016]

2024-10-11 Thread Qing Zhao
Thanks for the comments.

Will fix all these issues and send out the 8th version soon.

Qing

> On Oct 11, 2024, at 13:16, Joseph Myers  wrote:
> 
> On Fri, 27 Sep 2024, Qing Zhao wrote:
> 
>> +if (TREE_CODE (TREE_TYPE (ref)) != ARRAY_TYPE)
>> +  {
>> + error_at (loc, "the argument must be an array"
>> +   "%<__builtin_counted_by_ref%>");
> 
> This diagnostic is missing a space before %<__builtin_counted_by_ref%>.  
> It's also ungrammatical; something better would be
> 
> "the argument to %<__builtin_counted_by_ref%> must be an array"
> 
> or similar.
> 
>> +@defbuiltin{@var{type} __builtin_counted_by_ref (@var{ptr})}
>> +The built-in function @code{__builtin_counted_by_ref} checks whether the 
>> array
>> +object pointed by the pointer @var{ptr} has another object associated with 
>> it
>> +that represents the number of elements in the array object through the
>> +@code{counted_by} attribute (i.e. the counted-by object). If so, returns a
>> +pointer to the corresponding counted-by object.
>> +If such counted-by object does not exist, returns a NULL pointer.
> 
> This should be "null pointer" (describing the value returned rather than 
> referring to the macro NULL which would be @code{NULL} in the manual).
> 
>> +The argument @var{ptr} must be a pointer to an array.
>> +The @var{type} of the returned value must be a pointer type pointing to the
> 
> "is a pointer type" rather than "must be"; this is describing the type 
> returned by a built-in function, not a requirement on a type passed by the 
> user.
> 
>> +corresponding type of the counted-by object or a VOID pointer type in case
>> +of a NULL pointer being returned.
> 
> "null pointer", and "void" not "VOID".
> 
>> +returns a void NULL pointer.
> 
> Likewise, "null pointer", maybe in the form "null pointer to @code{void}".
> 
> -- 
> Joseph S. Myers
> josmy...@redhat.com
> 



Re: [PATCH][aarch64][libstdc++] Use shufflevector instead of shuffle in opt_random.h

2024-10-11 Thread Jonathan Wakely
On Fri, 11 Oct 2024 at 19:52, Christophe Lyon
 wrote:
>
> On Fri, 11 Oct 2024 at 17:52, Jonathan Wakely  wrote:
> >
> > On Wed, 9 Oct 2024 at 10:41, Ricardo Jesus  wrote:
> > >
> > > This patch modifies the implementation of the vectorized Mersenne
> > > Twister random number generator to use __builtin_shufflevector instead
> > > of __builtin_shuffle. This makes it (almost) compatible with Clang.
> > >
> > > To make the implementation fully compatible with Clang, Clang will need
> > > to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> > > currently it does not. This looks like an oversight in Clang and so will
> > > be addressed separately.
> > >
> > > I see no codegen change with this patch.
> >
> > I'm not qualified to review this myself, but I'd at least like to see
> > the CI checks passing:
> > https://patchwork.sourceware.org/project/gcc/patch/c911a45e-5924-4a4b-9b6b-bb3af0cc7...@nvidia.com/
> > Apparently the patch couldn't be applied.
> >
> > Please configure your email client (thunderbird?) to not munge the
> > patch, or attach it rather than sending inline. Or just use
> > git-send-email :-)
> >
> Hi!
>
> The problem is not with how the patch was sent: patchwork managed to
> see it as a patch, and the CI tried to apply it.
> The problem is that for some reason, git was not able to apply the
> patch to our current baseline.
> Unfortunately, we do not go as far as calling 'git am
> --show-current-patch=diff' or something else to provide more info in
> our CI logs, so we can only guess that something went wrong. Maybe
> your patch is based against a too old revision of GCC?

No, that file hasn't changed anywhere near the patch. The problem is
that the patch was munged by thunderbird, adding line breaks where
they corrupt the patch:


Applying: Use shufflevector instead of shuffle in opt_random.h
error: git diff header lacks filename information when removing 1
leading pathname component (line 6)
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

I fixed that manually, but it still fails:

Applying: Use shufflevector instead of shuffle in opt_random.h
error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:35
error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
does not apply
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

That's because of an incorrect number of space characters on the
unchanged context lines around the +/- diffs. I fixed that manually,
and failed at the next chunk:

Applying: Use shufflevector instead of shuffle in opt_random.h
error: patch failed: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h:52
error: libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h: patch
does not apply
Patch failed at 0001 Use shufflevector instead of shuffle in opt_random.h

So the problem is how the patch was sent.

>
> Thanks,
>
> Christophe
>
>
> >
> > >
> > > Bootstrapped and tested on aarch64-none-linux-gnu.
> > >
> > > Signed-off-by: Ricardo Jesus 
> > >
> > > 2024-09-05  Ricardo Jesus  
> > >
> > > * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> > > of __builtin_shuffle with __builtin_shufflevector.
> > > (__aarch64_lsl_128): Move shift amount to a template parameter.
> > > (__aarch64_lsr_128): Move shift amount to a template parameter.
> > > (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> > > and __aarch64_lsr_128.
> > > ---
> > >   .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++
> > >   1 file changed, 16 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > index 7f756d1572f..7eb816abcd0 100644
> > > --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> > > @@ -35,13 +35,13 @@
> > >   #ifdef __ARM_NEON
> > >
> > >   #ifdef __ARM_BIG_ENDIAN
> > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> > > -{16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> > > +16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> > > +24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> > >   #else
> > > -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> > > -{_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> > > +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> > > +_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> > > +_C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> > >   #endif
> > >
> > >   #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> > > @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> > > 

Re: [PATCH v7] c++: Fix overeager Woverloaded-virtual with conversion operators [PR109918]

2024-10-11 Thread Jason Merrill

On 10/11/24 7:02 AM, Simon Martin wrote:

Hi Jason,

On 11 Oct 2024, at 0:35, Jason Merrill wrote:


On 10/7/24 3:35 PM, Simon Martin wrote:

On 7 Oct 2024, at 18:58, Jason Merrill wrote:

On 10/7/24 11:27 AM, Simon Martin wrote:



/* Now give a warning for all base functions without overriders,
   as they are hidden.  */
for (tree base_fndecl : base_fndecls)
+ {
+   if (!base_fndecl || overriden_base_fndecls.contains
(base_fndecl))
+ continue;
+   tree *hider = hidden_base_fndecls.get (base_fndecl);
+   if (hider)


How about looping over hidden_base_fndecls instead of base_fndecls?



Unfortunately it does not work because a given base method can be
hidden
by one overload and overriden by another, in which case we don’t
want
to warn (see for example AA:foo(int) in Woverloaded-virt7.C). So we
need
to take both collections into account.


Yes, you'd still need to check overridden_base_fndecls.contains, but
that doesn't seem any different iterating over hidden_base_fndecls
instead of base_fndecls.

Sure, and I guess iterating over hidden_base_fndecls is more coherent

with what the warning is about. Changed in the attached updated patch,
successfully tested on x86_64-pc-linux-gnu. OK?


OK, thanks.

Jason



[PATCH 1/4] RISC-V: Add testcases for form 2 of vector signed SAT_SUB

2024-10-11 Thread pan2 . li
From: Pan Li 

Form 2:
  #define DEF_VEC_SAT_S_SUB_FMT_2(T, UT, MIN, MAX) \
  void __attribute__((noinline))   \
  vec_sat_s_sub_##T##_fmt_2 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T minus = (UT)x - (UT)y;   \
out[i] = (x ^ y) >= 0 || (minus ^ x) >= 0  \
  ? minus : x < 0 ? MIN : MAX; \
  }\
  }

DEF_VEC_SAT_S_SUB_FMT_2(int8_t, uint8_t, INT8_MIN, INT8_MAX)

The below test are passed for this patch.
* The rv64gcv fully regression test.

It is test only patch and obvious up to a point, will commit it
directly if no comments in next 48H.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vec_sat_arith.h: Add test helper macros.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i16.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i32.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i64.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i8.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i16.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i32.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i64.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i8.c: New test.

Signed-off-by: Pan Li 
---
 .../rvv/autovec/binop/vec_sat_s_sub-2-i16.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-2-i32.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-2-i64.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-2-i8.c|  9 
 .../autovec/binop/vec_sat_s_sub-run-2-i16.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-2-i32.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-2-i64.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-2-i8.c| 17 ++
 .../riscv/rvv/autovec/vec_sat_arith.h | 22 +++
 9 files changed, 126 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-2-i8.c

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i16.c
new file mode 100644
index 000..dec0359c5ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i16.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_2(int16_t, uint16_t, INT16_MIN, INT16_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i32.c
new file mode 100644
index 000..72b2d6778cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i32.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_2(int32_t, uint32_t, INT32_MIN, INT32_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-2-i64.c
new file mode 100644
index 000..3ca44589e42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_

[PATCH 2/4] Match: Support form 3 for vector signed integer SAT_SUB

2024-10-11 Thread pan2 . li
From: Pan Li 

This patch adds support for form 3 of the vector signed
integer SAT_SUB, as in the example below:

Form 3:
  #define DEF_VEC_SAT_S_SUB_FMT_3(T, UT, MIN, MAX) \
  void __attribute__((noinline))   \
  vec_sat_s_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T minus;   \
bool overflow = __builtin_sub_overflow (x, y, &minus); \
out[i] = overflow ? x < 0 ? MIN : MAX : minus; \
  }\
  }

Before this patch:
  25   │   if (limit_11(D) != 0)
  26   │ goto ; [89.00%]
  27   │   else
  28   │ goto ; [11.00%]
  29   │ ;;succ:   3
  30   │ ;;8
  31   │
  32   │ ;;   basic block 3, loop depth 0
  33   │ ;;pred:   2
  34   │   _13 = (unsigned long) limit_11(D);
  35   │ ;;succ:   4
  36   │
  37   │ ;;   basic block 4, loop depth 1
  38   │ ;;pred:   3
  39   │ ;;7
  40   │   # ivtmp.7_34 = PHI <0(3), ivtmp.7_30(7)>
  41   │   _26 = op_1_12(D) + ivtmp.7_34;
  42   │   x_29 = MEM[(int8_t *)_26];
  43   │   _1 = op_2_14(D) + ivtmp.7_34;
  44   │   y_24 = MEM[(int8_t *)_1];
  45   │   _9 = .SUB_OVERFLOW (x_29, y_24);
  46   │   _7 = IMAGPART_EXPR <_9>;
  47   │   if (_7 != 0)
  48   │ goto ; [50.00%]
  49   │   else
  50   │ goto ; [50.00%]
  51   │ ;;succ:   6
  52   │ ;;5
  53   │
  54   │ ;;   basic block 5, loop depth 1
  55   │ ;;pred:   4
  56   │   _42 = REALPART_EXPR <_9>;
  57   │   _2 = out_17(D) + ivtmp.7_34;
  58   │   MEM[(int8_t *)_2] = _42;
  59   │   ivtmp.7_27 = ivtmp.7_34 + 1;
  60   │   if (_13 != ivtmp.7_27)
  61   │ goto ; [89.00%]
  62   │   else
  63   │ goto ; [11.00%]
  64   │ ;;succ:   7
  65   │ ;;8
  66   │
  67   │ ;;   basic block 6, loop depth 1
  68   │ ;;pred:   4
  69   │   _38 = x_29 < 0;
  70   │   _39 = (signed char) _38;
  71   │   _40 = -_39;
  72   │   _41 = _40 ^ 127;
  73   │   _33 = out_17(D) + ivtmp.7_34;
  74   │   MEM[(int8_t *)_33] = _41;
  75   │   ivtmp.7_25 = ivtmp.7_34 + 1;
  76   │   if (_13 != ivtmp.7_25)
  77   │ goto ; [89.00%]
  78   │   else
  79   │ goto ; [11.00%]

After this patch:
  77   │   _94 = .SELECT_VL (ivtmp_92, POLY_INT_CST [16, 16]);
  78   │   vect_x_13.9_81 = .MASK_LEN_LOAD (vectp_op_1.7_79, 8B, { -1, ... }, 
_94, 0);
  79   │   vect_y_15.12_85 = .MASK_LEN_LOAD (vectp_op_2.10_83, 8B, { -1, ... }, 
_94, 0);
  80   │   vect_patt_49.13_86 = .SAT_SUB (vect_x_13.9_81, vect_y_15.12_85);
  81   │   .MASK_LEN_STORE (vectp_out.14_88, 8B, { -1, ... }, _94, 0, 
vect_patt_49.13_86);
  82   │   vectp_op_1.7_80 = vectp_op_1.7_79 + _94;
  83   │   vectp_op_2.10_84 = vectp_op_2.10_83 + _94;
  84   │   vectp_out.14_89 = vectp_out.14_88 + _94;
  85   │   ivtmp_93 = ivtmp_92 - _94;

The below test suites passed for this patch.
* The rv64gcv fully regression test.
* The x86 bootstrap test.
* The x86 fully regression test.

gcc/ChangeLog:

* match.pd: Add matching pattern for vector signed SAT_SUB form 3.

Signed-off-by: Pan Li 
---
 gcc/match.pd | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index f2b5f3af9ef..2ab76f9d055 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3417,6 +3417,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
@2)
  (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type
 
+/* Signed saturation sub, case 5:
+   Z = .SUB_OVERFLOW (X, Y)
+   SAT_S_SUB = IMAGPART_EXPR (Z) != 0 ? (-(T)(X < 0) ^ MAX) : minus;  */
+(match (signed_integer_sat_sub @0 @1)
+ (cond^ (ne (imagpart (IFN_SUB_OVERFLOW:c@2 @0 @1)) integer_zerop)
+   (bit_xor:c (nop_convert?
+   (negate (nop_convert? (convert (lt @0 integer_zerop)
+  max_value)
+   (realpart @2))
+ (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
+  && types_match (type, @0, @1
+
 /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
 (match (unsigned_integer_sat_trunc @0)
-- 
2.43.0



[PATCH 4/4] RISC-V: Add testcases for form 4 of vector signed SAT_SUB

2024-10-11 Thread pan2 . li
From: Pan Li 

Form 4:
  #define DEF_VEC_SAT_S_SUB_FMT_4(T, UT, MIN, MAX) \
  void __attribute__((noinline))   \
  vec_sat_s_sub_##T##_fmt_4 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T minus;   \
bool overflow = __builtin_sub_overflow (x, y, &minus); \
out[i] = !overflow ? minus : x < 0 ? MIN : MAX;\
  }\
  }

The below tests passed for this patch.
* The rv64gcv full regression test.

It is a test-only patch and obvious up to a point; I will commit it
directly if there are no comments in the next 48 hours.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vec_sat_arith.h: Add test helper macros.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i16.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i32.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i64.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i8.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i16.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i32.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i64.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i8.c: New test.

Signed-off-by: Pan Li 
---
 .../rvv/autovec/binop/vec_sat_s_sub-4-i16.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-4-i32.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-4-i64.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-4-i8.c|  9 
 .../autovec/binop/vec_sat_s_sub-run-4-i16.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-4-i32.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-4-i64.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-4-i8.c| 17 ++
 .../riscv/rvv/autovec/vec_sat_arith.h | 22 +++
 9 files changed, 126 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-4-i8.c

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i16.c
new file mode 100644
index 000..4497f0c1f83
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i16.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_4(int16_t, uint16_t, INT16_MIN, INT16_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i32.c
new file mode 100644
index 000..9f06e6a7650
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i32.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_4(int32_t, uint32_t, INT32_MIN, INT32_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i64.c
new file mode 100644
index 000..e806fd06c00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-4-i64.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-

[PATCH 3/4] RISC-V: Add testcases for form 3 of vector signed SAT_SUB

2024-10-11 Thread pan2 . li
From: Pan Li 

Form 3:
  #define DEF_VEC_SAT_S_SUB_FMT_3(T, UT, MIN, MAX) \
  void __attribute__((noinline))   \
  vec_sat_s_sub_##T##_fmt_3 (T *out, T *op_1, T *op_2, unsigned limit) \
  {\
unsigned i;\
for (i = 0; i < limit; i++)\
  {\
T x = op_1[i]; \
T y = op_2[i]; \
T minus;   \
bool overflow = __builtin_sub_overflow (x, y, &minus); \
out[i] = overflow ? x < 0 ? MIN : MAX : minus; \
  }\
  }

The below tests passed for this patch.
* The rv64gcv full regression test.

It is a test-only patch and obvious up to a point; I will commit it
directly if there are no comments in the next 48 hours.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vec_sat_arith.h: Add test helper macros.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i16.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i32.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i64.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i8.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i16.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i32.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i64.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i8.c: New test.

Signed-off-by: Pan Li 
---
 .../rvv/autovec/binop/vec_sat_s_sub-3-i16.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-3-i32.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-3-i64.c   |  9 
 .../rvv/autovec/binop/vec_sat_s_sub-3-i8.c|  9 
 .../autovec/binop/vec_sat_s_sub-run-3-i16.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-3-i32.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-3-i64.c   | 17 ++
 .../autovec/binop/vec_sat_s_sub-run-3-i8.c| 17 ++
 .../riscv/rvv/autovec/vec_sat_arith.h | 22 +++
 9 files changed, 126 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-run-3-i8.c

diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i16.c
new file mode 100644
index 000..c10dc0903c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i16.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_3(int16_t, uint16_t, INT16_MIN, INT16_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i32.c
new file mode 100644
index 000..d1352ed56e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i32.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
-fdump-rtl-expand-details" } */
+
+#include "../vec_sat_arith.h"
+
+DEF_VEC_SAT_S_SUB_FMT_3(int32_t, uint32_t, INT32_MIN, INT32_MAX)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 2 "expand" } } */
+/* { dg-final { scan-assembler-times {vssub\.vv} 1 } } */
diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i64.c
new file mode 100644
index 000..b86887d332b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/vec_sat_s_sub-3-i64.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-

  1   2   >