[gcc r15-645] AVR: target/115065 - Tweak __clzhi2.

2024-05-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:988838da722dea09bd81ee9d49800a6f24980372

commit r15-645-g988838da722dea09bd81ee9d49800a6f24980372
Author: Wolfgang Hospital 
Date:   Sat May 18 15:02:51 2024 +0200

AVR: target/115065 - Tweak __clzhi2.

The libgcc implementation of __clzhi2 can be tweaked by
one cycle in some situations by re-arranging the instructions.
It also reduces the WCET by 1 cycle.

libgcc/
PR target/115065
* config/avr/lib1funcs.S (__clzhi2): Tweak.

Diff:
---
 libgcc/config/avr/lib1funcs.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
index 04a4eb01ab43..d48b04747da8 100644
--- a/libgcc/config/avr/lib1funcs.S
+++ b/libgcc/config/avr/lib1funcs.S
@@ -2921,11 +2921,9 @@ DEFUN __clzhi2
 clr  r26
 tst  r25
 brne 1f
-subi r26, -8
 or   r25, r24
-brne 1f
-ldi  r24, 16
-ret
+breq 0f
+subi r26, -8
 1:  cpi  r25, 16
 brsh 3f
 subi r26, -3
@@ -2936,6 +2934,8 @@ DEFUN __clzhi2
 mov  r24, r26
 clr  r25
 ret
+0:  ldi  r24, 16
+ret
 ENDF __clzhi2
 #endif /* defined (L_clzhi2) */


[gcc r14-10217] AVR: target/115065 - Tweak __clzhi2.

2024-05-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:3b88dade7ff8a07fd0843ac1281e095cfd94453e

commit r14-10217-g3b88dade7ff8a07fd0843ac1281e095cfd94453e
Author: Wolfgang Hospital 
Date:   Sat May 18 15:02:51 2024 +0200

AVR: target/115065 - Tweak __clzhi2.

The libgcc implementation of __clzhi2 can be tweaked by
one cycle in some situations by re-arranging the instructions.
It also reduces the WCET by 1 cycle.

libgcc/
PR target/115065
* config/avr/lib1funcs.S (__clzhi2): Tweak.

(cherry picked from commit 988838da722dea09bd81ee9d49800a6f24980372)

Diff:
---
 libgcc/config/avr/lib1funcs.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
index a0854139a0a3..1d04fed72644 100644
--- a/libgcc/config/avr/lib1funcs.S
+++ b/libgcc/config/avr/lib1funcs.S
@@ -2921,11 +2921,9 @@ DEFUN __clzhi2
 clr  r26
 tst  r25
 brne 1f
-subi r26, -8
 or   r25, r24
-brne 1f
-ldi  r24, 16
-ret
+breq 0f
+subi r26, -8
 1:  cpi  r25, 16
 brsh 3f
 subi r26, -3
@@ -2936,6 +2934,8 @@ DEFUN __clzhi2
 mov  r24, r26
 clr  r25
 ret
+0:  ldi  r24, 16
+ret
 ENDF __clzhi2
 #endif /* defined (L_clzhi2) */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add Zvfbfwma extension to the -march= option

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:67195fbc4deac8659d8f65ab922416ac451ae5bb

commit 67195fbc4deac8659d8f65ab922416ac451ae5bb
Author: Xiao Zeng 
Date:   Wed May 15 10:03:40 2024 +0800

RISC-V: Add Zvfbfwma extension to the -march= option

This patch would like to add new sub extension (aka Zvfbfwma) to the
-march= option. It introduces a new data type BF16.

1 In spec: "Zvfbfwma requires the Zvfbfmin extension and the Zfbfmin 
extension."
  1.1 In EmbeddedProcessor: Zvfbfwma -> Zvfbfmin -> Zve32f
  1.2 In Application Processor: Zvfbfwma -> Zvfbfmin -> V
  1.3 In both scenarios, there are: Zvfbfwma -> Zfbfmin

2 Zvfbfmin's information is in:



3 Zfbfmin's formation is in:



4 Depending on different usage scenarios, the Zvfbfwma extension may
depend on 'V' or 'Zve32f'. This patch only implements dependencies in
scenario of Embedded Processor. This is consistent with the processing
strategy in Zvfbfmin. In scenario of Application Processor, it is
necessary to explicitly indicate the dependent 'V' extension.

5 You can locate more information about Zvfbfwma from below spec doc:



gcc/ChangeLog:

* common/config/riscv/riscv-common.cc:
(riscv_implied_info): Add zvfbfwma item.
(riscv_ext_version_table): Ditto.
(riscv_ext_flag_table): Ditto.
* config/riscv/riscv.opt:
(MASK_ZVFBFWMA): New macro.
(TARGET_ZVFBFWMA): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-37.c: New test.
* gcc.target/riscv/arch-38.c: New test.
* gcc.target/riscv/predef-36.c: New test.
* gcc.target/riscv/predef-37.c: New test.

(cherry picked from commit 38dd4e26e07c6be7cf4d169141ee4f3a03f3a09d)

Diff:
---
 gcc/common/config/riscv/riscv-common.cc|  5 
 gcc/config/riscv/riscv.opt |  2 ++
 gcc/testsuite/gcc.target/riscv/arch-37.c   |  5 
 gcc/testsuite/gcc.target/riscv/arch-38.c   |  5 
 gcc/testsuite/gcc.target/riscv/predef-36.c | 48 ++
 gcc/testsuite/gcc.target/riscv/predef-37.c | 48 ++
 6 files changed, 113 insertions(+)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index fb76017ffbc0..88204393fde0 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -162,6 +162,8 @@ static const riscv_implied_info_t riscv_implied_info[] =
   {"zfa", "f"},
 
   {"zvfbfmin", "zve32f"},
+  {"zvfbfwma", "zvfbfmin"},
+  {"zvfbfwma", "zfbfmin"},
   {"zvfhmin", "zve32f"},
   {"zvfh", "zve32f"},
   {"zvfh", "zfhmin"},
@@ -336,6 +338,7 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
   {"zfh",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"zfhmin",ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfbfmin",  ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zvfbfwma",  ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfhmin",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"zvfh",  ISA_SPEC_CLASS_NONE, 1, 0},
 
@@ -1667,6 +1670,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
   {"zve64f",   &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_32},
   {"zve64d",   &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_64},
   {"zvfbfmin", &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_BF_16},
+  {"zvfbfwma", &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_BF_16},
   {"zvfhmin",  &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_16},
   {"zvfh", &gcc_options::x_riscv_vector_elen_flags, 
MASK_VECTOR_ELEN_FP_16},
 
@@ -1704,6 +1708,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
   {"zfhmin",&gcc_options::x_riscv_zf_subext, MASK_ZFHMIN},
   {"zfh",   &gcc_options::x_riscv_zf_subext, MASK_ZFH},
   {"zvfbfmin",  &gcc_options::x_riscv_zf_subext, MASK_ZVFBFMIN},
+  {"zvfbfwma",  &gcc_options::x_riscv_zf_subext, MASK_ZVFBFWMA},
   {"zvfhmin",   &gcc_options::x_riscv_zf_subext, MASK_ZVFHMIN},
   {"zvfh",  &gcc_options::x_riscv_zf_subext, MASK_ZVFH},
 
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 1252834aec5b..d209ac896fde 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -401,6 +401,8 @@ Mask(ZFH) Var(riscv_zf_subext)
 
 Mask(ZVFBFMIN) Var(riscv_zf_subext)
 
+Mask(ZVFBFWMA) Var(riscv_zf_subext)
+
 Mask(ZVFHMIN) Var(riscv_zf_subext)
 
 Mask(ZVFH)Var(riscv_zf_subext)
diff --git a/gcc/testsuite/gcc.target/riscv/arch-37.c 
b/gcc/testsuite/gcc.target/riscv/arch-37.c
new file mode 100644
index ..5b19a73c5567
--- 

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: testsuite: Drop march-string in cmpmemsi/cpymemsi tests

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:faf2f9ed73969d838026027566473bde14db748b

commit faf2f9ed73969d838026027566473bde14db748b
Author: Christoph Müllner 
Date:   Thu May 16 09:53:47 2024 +0200

RISC-V: testsuite: Drop march-string in cmpmemsi/cpymemsi tests

The tests cmpmemsi-1.c and cpymemsi-1.c are execution ("dg-do run")
tests, which does not have any restrictions for the enabled extensions.
Further, no other listed options are required.
Let's drop the options, so that the test can also be executed on
non-f and non-d targets.  However, we need to set options to the
defaults without '-ansi', because the included test file uses the
'asm' keyword, which is not part of ANSI C.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/cmpmemsi-1.c: Drop options.
* gcc.target/riscv/cpymemsi-1.c: Likewise.

Signed-off-by: Christoph Müllner 
(cherry picked from commit b8b82bb05c10544da05cd0d3d39e6bc3763a8d9f)

Diff:
---
 gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c | 3 +--
 gcc/testsuite/gcc.target/riscv/cpymemsi-1.c | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c 
b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
index d7e0bc474073..698f27d89fbf 100644
--- a/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cmpmemsi-1.c
@@ -1,6 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-march=rv32gc_zbb -save-temps -g0 -fno-lto" { target { rv32 } 
} } */
-/* { dg-options "-march=rv64gc_zbb -save-temps -g0 -fno-lto" { target { rv64 } 
} } */
+/* { dg-options "-pedantic-errors" } */
 /* { dg-timeout-factor 2 } */
 
 #include "../../gcc.dg/memcmp-1.c"
diff --git a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c 
b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
index 983b564ccaf7..30e9f119bedc 100644
--- a/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
+++ b/gcc/testsuite/gcc.target/riscv/cpymemsi-1.c
@@ -1,7 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-march=rv32gc -save-temps -g0 -fno-lto" { target { rv32 } } } 
*/
-/* { dg-options "-march=rv64gc -save-temps -g0 -fno-lto" { target { rv64 } } } 
*/
-/* { dg-additional-options "-DRUN_FRACTION=11" { target simulator } } */
+/* { dg-options "-pedantic-errors" } */
 /* { dg-timeout-factor 2 } */
 
 #include "../../gcc.dg/memcmp-1.c"


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:51b69c80a76ba767ed166e93a569a84dae445b23

commit 51b69c80a76ba767ed166e93a569a84dae445b23
Author: Pan Li 
Date:   Wed May 15 10:14:05 2024 +0800

Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

This patch would like to add the middle-end presentation for the
saturation add.  Aka set the result of add to the max when overflow.
It will take the pattern similar as below.

SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))

Take uint8_t as example, we will have:

* SAT_ADD (1, 254)   => 255.
* SAT_ADD (1, 255)   => 255.
* SAT_ADD (2, 255)   => 255.
* SAT_ADD (255, 255) => 255.

Given below example for the unsigned scalar integer uint64_t:

uint64_t sat_add_u64 (uint64_t x, uint64_t y)
{
  return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
}

Before this patch:
uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
{
  long unsigned int _1;
  _Bool _2;
  long unsigned int _3;
  long unsigned int _4;
  uint64_t _7;
  long unsigned int _10;
  __complex__ long unsigned int _11;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
  _1 = REALPART_EXPR <_11>;
  _10 = IMAGPART_EXPR <_11>;
  _2 = _10 != 0;
  _3 = (long unsigned int) _2;
  _4 = -_3;
  _7 = _1 | _4;
  return _7;
;;succ:   EXIT

}

After this patch:
uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
{
  uint64_t _7;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
  return _7;
;;succ:   EXIT
}

The below tests are passed for this patch:
1. The riscv fully regression tests.
3. The x86 bootstrap tests.
4. The x86 fully regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD
to the return true switch case(s).
* internal-fn.def (SAT_ADD):  Add new signed optab SAT_ADD.
* match.pd: Add unsigned SAT_ADD match(es).
* optabs.def (OPTAB_NL): Remove fixed-point limitation for
us/ssadd.
* tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New
extern func decl generated in match.pd match.
(match_saturation_arith): New func impl to match the saturation 
arith.
(math_opts_dom_walker::after_dom_children): Try match saturation
arith when IOR expr.

Signed-off-by: Pan Li 
(cherry picked from commit 52b0536710ff3f3ace72ab00ce9ef6c630cd1183)

Diff:
---
 gcc/internal-fn.cc|  1 +
 gcc/internal-fn.def   |  2 ++
 gcc/match.pd  | 51 +++
 gcc/optabs.def|  4 ++--
 gcc/tree-ssa-math-opts.cc | 32 +
 5 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 0a7053c2286c..73045ca8c8c1 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn)
 case IFN_UBSAN_CHECK_MUL:
 case IFN_ADD_OVERFLOW:
 case IFN_MUL_OVERFLOW:
+case IFN_SAT_ADD:
 case IFN_VEC_WIDEN_PLUS:
 case IFN_VEC_WIDEN_PLUS_LO:
 case IFN_VEC_WIDEN_PLUS_HI:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 848bb9dbff3f..25badbb86e56 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | 
ECF_NOTHROW, first,
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
  smulhrs, umulhrs, binary)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index d401e7503e62..aa1e2875c604 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3043,6 +3043,57 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
|| POINTER_TYPE_P (itype))
   && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype))
 
+/* Unsigned Saturation Add */
+(match (usadd_left_part_1 @0 @1)
+ (plus:c @0 @1)
+ (if (INTEGRAL_TYPE_P (type)
+  && TYPE_UNSIGNED (TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@1)
+
+(match (usadd_left_part_2 @0 @1)
+ (realpart (IFN_ADD_OVERFLOW:c @0 @1))
+ (if (INTEGRAL_TYPE_P (type)
+  && TYPE_UNSIGNED (TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@0))
+  && types_match (type, TREE_TYPE (@1)
+
+(match (usadd_right_part_1 @0 @1)
+ (negate (convert (lt (plus:c @0 @1) @0)))
+ (if (INTEGRAL_TYPE_P (typ

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Vect: Support new IFN SAT_ADD for unsigned vector int

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:674362d73e964815cdb700edd9fedbfc34c24c21

commit 674362d73e964815cdb700edd9fedbfc34c24c21
Author: Pan Li 
Date:   Wed May 15 10:14:06 2024 +0800

Vect: Support new IFN SAT_ADD for unsigned vector int

For vectorize, we leverage the existing vect pattern recog to find
the pattern similar to scalar and let the vectorizer to perform
the rest part for standard name usadd3 in vector mode.
The riscv vector backend have insn "Vector Single-Width Saturating
Add and Subtract" which can be leveraged when expand the usadd3
in vector mode.  For example:

void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i]));
}

Before this patch:
void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  ...
  _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]);
  ivtmp_58 = _80 * 8;
  vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0);
  vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0);
  vect__7.11_66 = vect__4.7_61 + vect__6.10_65;
  mask__8.12_67 = vect__4.7_61 > vect__7.11_66;
  vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615,
... }, vect__7.11_66);
  .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, 
vect__12.15_72);
  vectp_x.5_60 = vectp_x.5_59 + ivtmp_58;
  vectp_y.8_64 = vectp_y.8_63 + ivtmp_58;
  vectp_out.16_75 = vectp_out.16_74 + ivtmp_58;
  ivtmp_79 = ivtmp_78 - _80;
  ...
}

After this patch:
void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  ...
  _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]);
  ivtmp_46 = _62 * 8;
  vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0);
  vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0);
  vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53);
  .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, 
vect__12.11_54);
  ...
}

The below test suites are passed for this patch.
* The riscv fully regression tests.
* The x86 bootstrap tests.
* The x86 fully regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* tree-vect-patterns.cc (gimple_unsigned_integer_sat_add): New
func decl generated by match.pd match.
(vect_recog_sat_add_pattern): New func impl to recog the pattern
for unsigned SAT_ADD.

Signed-off-by: Pan Li 
(cherry picked from commit d4dee347b3fe1982bab26485ff31cd039c9df010)

Diff:
---
 gcc/tree-vect-patterns.cc | 52 +++
 1 file changed, 52 insertions(+)

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 87c2acff386d..6fd2373644f4 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4487,6 +4487,57 @@ vect_recog_mult_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
+extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
+
+/*
+ * Try to detect saturation add pattern (SAT_ADD), aka below gimple:
+ *   _7 = _4 + _6;
+ *   _8 = _4 > _7;
+ *   _9 = (long unsigned int) _8;
+ *   _10 = -_9;
+ *   _12 = _7 | _10;
+ *
+ * And then simplied to
+ *   _12 = .SAT_ADD (_4, _6);
+ */
+
+static gimple *
+vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+   tree *type_out)
+{
+  gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+  if (!is_gimple_assign (last_stmt))
+return NULL;
+
+  tree res_ops[2];
+  tree lhs = gimple_assign_lhs (last_stmt);
+
+  if (gimple_unsigned_integer_sat_add (lhs, res_ops, NULL))
+{
+  tree itype = TREE_TYPE (res_ops[0]);
+  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
+
+  if (vtype != NULL_TREE
+   && direct_internal_fn_supported_p (IFN_SAT_ADD, vtype,
+  OPTIMIZE_FOR_BOTH))
+   {
+ *type_out = vtype;
+ gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, res_ops[0],
+   res_ops[1]);
+
+ gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL));
+ gimple_call_set_nothrow (call, /* nothrow_p */ false);
+ gimple_set_location (call, gimple_location (last_stmt));
+
+ vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt);
+ return call;
+   }
+}
+
+  return NULL;
+}
+
 /* Detect a signed division by a constant that wouldn't be
otherwise vectorized:
 
@@ -6987,6 +7038,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
   { vect_recog_divmod_pattern, "divmod" },
   { v

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] Vect: Support loop len in vectorizable early exit

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:4ec3a6b6022c1853cfd5866dea0324a4002413b2

commit 4ec3a6b6022c1853cfd5866dea0324a4002413b2
Author: Pan Li 
Date:   Thu May 16 09:58:13 2024 +0800

Vect: Support loop len in vectorizable early exit

This patch adds early break auto-vectorization support for target which
use length on partial vectorization.  Consider this following example:

unsigned vect_a[802];
unsigned vect_b[802];

void test (unsigned x, int n)
{
  for (int i = 0; i < n; i++)
  {
vect_b[i] = x + i;

if (vect_a[i] > x)
  break;

vect_a[i] = x;
  }
}

We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
And then the IR of RVV looks like below:

  ...
  _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
  _55 = (int) _87;
  ...
  mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
  vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
{0, ... }, _87, 0);
  if (vec_len_mask_72 != { 0, ... })
goto ; [5.50%]
  else
goto ; [94.50%]

The below tests are passed for this patch:
1. The riscv fully regression tests.
2. The x86 bootstrap tests.
3. The x86 fully regression tests.

gcc/ChangeLog:

* tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
the loop len mask.
* tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
vect_gen_loop_len_mask for 1 or more stmt(s).
* tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
for vect_gen_loop_len_mask.

Signed-off-by: Pan Li 
(cherry picked from commit 57f8a2f67c1536be23231808ab00613ab69193ed)

Diff:
---
 gcc/tree-vect-loop.cc  | 27 +++
 gcc/tree-vect-stmts.cc | 17 +++--
 gcc/tree-vectorizer.h  |  4 
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 29c03c246d45..6ff3ca09dc6a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11394,6 +11394,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
gimple_stmt_iterator *gsi,
   return loop_len;
 }
 
+/* Generate the tree for the loop len mask and return it.  Given the lens,
+   nvectors, vectype, index and factor to gen the len mask as below.
+
+   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
+*/
+tree
+vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+   gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
+   unsigned int nvectors, tree vectype, tree stmt,
+   unsigned int index, unsigned int factor)
+{
+  tree all_one_mask = build_all_ones_cst (vectype);
+  tree all_zero_mask = build_zero_cst (vectype);
+  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, 
index,
+   factor);
+  tree bias = build_int_cst (intQI_type_node,
+LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
+  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
+  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
+   all_one_mask, all_zero_mask, len,
+   bias);
+  gimple_call_set_lhs (call, len_mask);
+  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
+
+  return len_mask;
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF.
If FLAT is true, the loop we started with had unrealistically flat
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f8d8636b139a..d592dff73e33 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12893,7 +12893,9 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info 
stmt_info,
 ncopies = vect_get_num_copies (loop_vinfo, vectype);
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* Now build the new conditional.  Pattern gimple_conds get dropped during
  codegen so we must replace the original insn.  */
@@ -12957,12 +12959,11 @@ vectorizable_early_exit (vec_info *vinfo, 
stmt_vec_info stmt_info,
{
  if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
  OPTIMIZE_FOR_SPEED))
-   return false;
+   vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
  else
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
}
 
-
   return true;
 }
 
@@ -13015,6 +13016,15 @@ vectorizable_early_exit (vec_info *vinfo, 
stmt_vec_info stmt_info,

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Implement vectorizable early exit with vcond_mask_len

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:b1aab03aed7f3d8c9b104b5f596e7e9853b8d5e6

commit b1aab03aed7f3d8c9b104b5f596e7e9853b8d5e6
Author: Pan Li 
Date:   Thu May 16 10:02:40 2024 +0800

RISC-V: Implement vectorizable early exit with vcond_mask_len

After we support the loop lens for the vectorizable,  we would like to
implement the feature for the RISC-V target.  Given below example:

unsigned vect_a[1923];
unsigned vect_b[1923];

void test (unsigned limit, int n)
{
  for (int i = 0; i < n; i++)
{
  vect_b[i] = limit + i;

  if (vect_a[i] > limit)
{
  ret = vect_b[i];
  return ret;
}

  vect_a[i] = limit;
}
}

Before this patch:
  ...
.L8:
  swa3,0(a5)
  addiw a0,a0,1
  addi  a4,a4,4
  addi  a5,a5,4
  beq   a1,a0,.L2
.L4:
  swa0,0(a4)
  lwa2,0(a5)
  bleu  a2,a3,.L8
  ret

After this patch:
  ...
.L5:
  vsetvli   a5,a3,e8,mf4,ta,ma
  vmv1r.v   v4,v2
  vsetvli   t4,zero,e32,m1,ta,ma
  vmv.v.x   v1,a5
  vadd.vv   v2,v2,v1
  vsetvli   zero,a5,e32,m1,ta,ma
  vadd.vv   v5,v4,v3
  slli  a6,a5,2
  vle32.v   v1,0(t1)
  vmsltu.vv v1,v3,v1
  vcpop.m   t4,v1
  beq   t4,zero,.L4
  vmv.x.s   a4,v4
.L3:
  ...

The below tests are passed for this patch:
1. The riscv fully regression tests.

gcc/ChangeLog:

* 
config/riscv/autovec-opt.md(*vcond_mask_len_popcount_):
New pattern of vcond_mask_len_popcount for vector bool mode.
* config/riscv/autovec.md (vcond_mask_len_): New pattern of
vcond_mask_len for vector bool mode.
(cbranch4): New pattern for vector bool mode.
* config/riscv/vector-iterators.md: Add new unspec 
UNSPEC_SELECT_MASK.
* config/riscv/vector.md (@pred_popcount): Add VLS 
mode
to popcount pattern.
(@pred_popcount): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/early-break-1.c: New test.
* gcc.target/riscv/rvv/autovec/early-break-2.c: New test.

Signed-off-by: Pan Li 
(cherry picked from commit 6c1de786e53a11150feb16ba990d0d6c6fd910db)

Diff:
---
 gcc/config/riscv/autovec-opt.md| 33 
 gcc/config/riscv/autovec.md| 61 ++
 gcc/config/riscv/vector-iterators.md   |  1 +
 gcc/config/riscv/vector.md | 18 +++
 .../gcc.target/riscv/rvv/autovec/early-break-1.c   | 34 
 .../gcc.target/riscv/rvv/autovec/early-break-2.c   | 37 +
 6 files changed, 175 insertions(+), 9 deletions(-)

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 645dc53d8680..04f85d8e4553 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1436,3 +1436,36 @@
 DONE;
   }
   [(set_attr "type" "vmalu")])
+
+;; Optimization pattern for early break auto-vectorization
+;; vcond_mask_len (mask, ones, zeros, len, bias) + vlmax popcount
+;; -> non vlmax popcount (mask, len)
+(define_insn_and_split "*vcond_mask_len_popcount_"
+  [(set (match_operand:P 0 "register_operand")
+(popcount:P
+ (unspec:VB_VLS [
+  (unspec:VB_VLS [
+   (match_operand:VB_VLS 1 "register_operand")
+   (match_operand:VB_VLS 2 "const_1_operand")
+   (match_operand:VB_VLS 3 "const_0_operand")
+   (match_operand 4 "autovec_length_operand")
+   (match_operand 5 "const_0_operand")] UNSPEC_SELECT_MASK)
+  (match_operand 6 "autovec_length_operand")
+  (const_int 1)
+  (reg:SI VL_REGNUM)
+  (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)))]
+  "TARGET_VECTOR
+   && can_create_pseudo_p ()
+   && riscv_vector::get_vector_mode (Pmode, GET_MODE_NUNITS 
(mode)).exists ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+riscv_vector::emit_nonvlmax_insn (
+   code_for_pred_popcount (mode, Pmode),
+   riscv_vector::CPOP_OP,
+   operands, operands[4]);
+DONE;
+  }
+  [(set_attr "type" "vector")]
+)
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index aa1ae0fe075b..1ee3c8052fb4 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2612,3 +2612,64 @@
 DONE;
   }
 )
+
+;; =
+;; == Early break auto-vectorization patterns
+;; =
+
+;; vcond_mask_len (mask, 1s, 0s, len, bias)
+;; => mask[i] = mask[i] && i < len ? 1 : 0
+(define_insn_and_split "vcond_mask_len_"
+  [(set (match_operand:VB 0 "register_operand")
+(unspec: VB [
+ (match_operand:VB 1 "register_operand")
+ (match_operand:VB 2 "const_1_operand")
+ (match_operand:VB 3 "const_0_operand")
+ (match_operand 4 "autov

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Enable vectorizable early exit testsuite

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:c1ad575242ff3dee66f2775412b1c65efbc2269b

commit c1ad575242ff3dee66f2775412b1c65efbc2269b
Author: Pan Li 
Date:   Thu May 16 10:04:10 2024 +0800

RISC-V: Enable vectorizable early exit testsuite

After we supported vectorizable early exit in RISC-V,  we would like to
enable the gcc vect test for vectorizable early test.

The vect-early-break_124-pr114403.c failed to vectorize for now.
Because that the __builtin_memcpy with 8 bytes failed to folded into
int64 assignment during ccp1.  We will improve that first and mark
this as xfail for RISC-V.

The below tests are passed for this patch:
1. The riscv fully regression tests.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/slp-mask-store-1.c: Add pragma novector as it will
have 2 times LOOP VECTORIZED in RISC-V.
* gcc.dg/vect/vect-early-break_124-pr114403.c: Xfail for the
riscv backend.
* lib/target-supports.exp: Add RISC-V backend.

Signed-off-by: Pan Li 
(cherry picked from commit 556e777298dac8574533935000c57335c5232921)

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c  | 2 ++
 gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c | 2 +-
 gcc/testsuite/lib/target-supports.exp | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c 
b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
index fdd9032da98a..2f80bf89e5e6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-mask-store-1.c
@@ -28,6 +28,8 @@ main ()
 
   if (__builtin_memcmp (x, res, sizeof (x)) != 0)
 abort ();
+
+#pragma GCC novector
   for (int i = 0; i < 32; ++i)
 if (flag[i] != 0 && flag[i] != 1)
   abort ();
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 51abf245ccb5..101ae1e0eaa1 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -2,7 +2,7 @@
 /* { dg-require-effective-target vect_early_break_hw } */
 /* { dg-require-effective-target vect_long_long } */
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { xfail riscv*-*-* } } 
} */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 3a55b2a4159c..6c828b73ded3 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4105,6 +4105,7 @@ proc check_effective_target_vect_early_break { } {
|| [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
|| [istarget amdgcn-*-*]
+   || [check_effective_target_riscv_v]
}}]
 }
 
@@ -4120,6 +4121,7 @@ proc check_effective_target_vect_early_break_hw { } {
|| [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
|| [istarget amdgcn-*-*]
+   || [check_effective_target_riscv_v_ok]
}}]
 }


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Cleanup some temporally files [NFC]

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:586e678cd18c8d7a72e5f785094d911a098092ff

commit 586e678cd18c8d7a72e5f785094d911a098092ff
Author: Pan Li 
Date:   Fri May 17 07:45:19 2024 +0800

RISC-V: Cleanup some temporally files [NFC]

Just notice some temporally files under gcc/config/riscv,
deleted as useless.

* Empty file j.
* Vim swap file.

gcc/ChangeLog:

* config/riscv/.riscv.cc.swo: Removed.
* config/riscv/j: Removed.

Signed-off-by: Pan Li 
(cherry picked from commit d477d683d5c6db90c80d348c795709ae6444ba7a)

Diff:
---
 gcc/config/riscv/.riscv.cc.swo | Bin 417792 -> 0 bytes
 gcc/config/riscv/j |   0
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/gcc/config/riscv/.riscv.cc.swo b/gcc/config/riscv/.riscv.cc.swo
deleted file mode 100644
index 77ed37353bee..
Binary files a/gcc/config/riscv/.riscv.cc.swo and /dev/null differ
diff --git a/gcc/config/riscv/j b/gcc/config/riscv/j
deleted file mode 100644
index e69de29bb2d1..


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] internal-fn: Do not force vcond_mask operands to reg.

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:17dfc9744f4995d3161eeba104bd86391005769b

commit 17dfc9744f4995d3161eeba104bd86391005769b
Author: Robin Dapp 
Date:   Fri May 10 12:44:44 2024 +0200

internal-fn: Do not force vcond_mask operands to reg.

In order to directly use constants this patch removes force_regs
in the vcond_mask expander.

gcc/ChangeLog:

PR middle-end/113474

* internal-fn.cc (expand_vec_cond_mask_optab_fn):  Remove
force_regs.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr113474.c: New test.

(cherry picked from commit 7ca35f2e430081d6ec91e910002f92d9713350fa)

Diff:
---
 gcc/internal-fn.cc|  3 ---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113474.c | 13 +
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 73045ca8c8c1..9c09026793fa 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3165,9 +3165,6 @@ expand_vec_cond_mask_optab_fn (internal_fn, gcall *stmt, 
convert_optab optab)
   rtx_op1 = expand_normal (op1);
   rtx_op2 = expand_normal (op2);
 
-  mask = force_reg (mask_mode, mask);
-  rtx_op1 = force_reg (mode, rtx_op1);
-
   rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   create_output_operand (&ops[0], target, mode);
   create_input_operand (&ops[1], rtx_op1, mode);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113474.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113474.c
new file mode 100644
index ..0364bf9f5e38
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113474.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target riscv_v } }  */
+/* { dg-additional-options "-std=c99" }  */
+
+void
+foo (int n, int **a)
+{
+  int b;
+  for (b = 0; b < n; b++)
+for (long e = 8; e > 0; e--)
+  a[b][e] = a[b][e] == 15;
+}
+
+/* { dg-final { scan-assembler "vmerge.vim" } }  */


[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Add initial cost handling for segment loads/stores.

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:d6cb9a0d984a6c9ea0b548178a5cf79629be073b

commit d6cb9a0d984a6c9ea0b548178a5cf79629be073b
Author: Robin Dapp 
Date:   Mon Feb 26 13:09:15 2024 +0100

RISC-V: Add initial cost handling for segment loads/stores.

This patch makes segment loads and stores more expensive.  It adds
segment_permute_2 as well as 3 to 8 cost fields to the common vector
costs and adds handling to adjust_stmt_cost.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_permute cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_permute_[2-8] to 1.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: Adjust test.

(cherry picked from commit e0b9c8ad7098fb08a25a61fe17d4274dd73e5145)

Diff:
---
 gcc/config/riscv/riscv-protos.h|   9 ++
 gcc/config/riscv/riscv-vector-costs.cc | 163 +++--
 gcc/config/riscv/riscv.cc  |  14 ++
 .../gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c   |   4 +-
 4 files changed, 146 insertions(+), 44 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 565ead1382a7..004ceb1031b8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -222,6 +222,15 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;
 
+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_3;
+  const int segment_permute_4;
+  const int segment_permute_5;
+  const int segment_permute_6;
+  const int segment_permute_7;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;
 
diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
b/gcc/config/riscv/riscv-vector-costs.cc
index 4582b0db4250..0a88e142a934 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1052,6 +1052,25 @@ costs::better_main_loop_than_p (const vector_costs 
*uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }
 
+/* Returns the group size i.e. the number of vectors to be loaded by a
+   segmented load/store instruction.  Return 0 if it is no segmented
+   load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+  if (stmt_info
+  && (kind == vector_load || kind == vector_store)
+  && STMT_VINFO_DATA_REF (stmt_info))
+{
+  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+   return DR_GROUP_SIZE (stmt_info);
+}
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1076,55 +1095,115 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, 
loop_vec_info loop,
 case vector_load:
 case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
-as opposed to scalar loads and stores.
-If the address depends on a variable we need an additional
-add/sub for each load/store in the worst case.  */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ /* Segment loads and stores.  When the group size is > 1
+the vectorizer will add a vector load/store statement for
+each vector in the group.  Here we additionally add permute
+costs for each.  */
+ /* TODO: Indexed and ordered/unordered cost.  */
+ int group_size = segment_loadstore_group_size (kind, stmt_info);
+ if (group_size > 1)
+   {
+ switch (group_size)
+   {
+   case 2:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+   stmt_cost += costs->vla->segment_permute_2;
+ else
+   stmt_cost += costs->vls->segment_permute_2;
+ break;
+   case 3:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+   stmt_cost += costs->vla->segment_permute_3;
+ else
+   s

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Implement IFN SAT_ADD for both the scalar and vector

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:db2b829f4d45c6f14724148d1f8b2066290b3371

commit db2b829f4d45c6f14724148d1f8b2066290b3371
Author: Pan Li 
Date:   Fri May 17 18:49:46 2024 +0800

RISC-V: Implement IFN SAT_ADD for both the scalar and vector

The patch implement the SAT_ADD in the riscv backend as the
sample for both the scalar and vector.  Given below vector
as example:

void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i]));
}

Before this patch:
vec_sat_add_u64:
  ...
  vsetvli a5,a3,e64,m1,ta,ma
  vle64.v v0,0(a1)
  vle64.v v1,0(a2)
  sllia4,a5,3
  sub a3,a3,a5
  add a1,a1,a4
  add a2,a2,a4
  vadd.vv v1,v0,v1
  vmsgtu.vv   v0,v0,v1
  vmerge.vim  v1,v1,-1,v0
  vse64.v v1,0(a0)
  ...

After this patch:
vec_sat_add_u64:
  ...
  vsetvli a5,a3,e64,m1,ta,ma
  vle64.v v1,0(a1)
  vle64.v v2,0(a2)
  sllia4,a5,3
  sub a3,a3,a5
  add a1,a1,a4
  add a2,a2,a4
  vsaddu.vv   v1,v1,v2  <=  Vector Single-Width Saturating Add
  vse64.v v1,0(a0)
  ...

The below test suites are passed for this patch.
* The riscv fully regression tests.
* The aarch64 fully regression tests.
* The x86 bootstrap tests.
* The x86 fully regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* config/riscv/autovec.md (usadd3): New pattern expand for
the unsigned SAT_ADD in vector mode.
* config/riscv/riscv-protos.h (riscv_expand_usadd): New func decl
to expand usadd3 pattern.
(expand_vec_usadd): Ditto but for vector.
* config/riscv/riscv-v.cc (emit_vec_saddu): New func impl to emit
the vsadd insn.
(expand_vec_usadd): New func impl to expand usadd3 for vector.
* config/riscv/riscv.cc (riscv_expand_usadd): New func impl to
expand usadd3 for scalar.
* config/riscv/riscv.md (usadd3): New pattern expand for
the unsigned SAT_ADD in scalar mode.
* config/riscv/vector.md: Allow VLS mode for vsaddu.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/binop/vec_sat_binary.h: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-1.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-2.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-3.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-4.c: New test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-1.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-2.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-3.c: New 
test.
* gcc.target/riscv/rvv/autovec/binop/vec_sat_u_add-run-4.c: New 
test.
* gcc.target/riscv/sat_arith.h: New test.
* gcc.target/riscv/sat_u_add-1.c: New test.
* gcc.target/riscv/sat_u_add-2.c: New test.
* gcc.target/riscv/sat_u_add-3.c: New test.
* gcc.target/riscv/sat_u_add-4.c: New test.
* gcc.target/riscv/sat_u_add-run-1.c: New test.
* gcc.target/riscv/sat_u_add-run-2.c: New test.
* gcc.target/riscv/sat_u_add-run-3.c: New test.
* gcc.target/riscv/sat_u_add-run-4.c: New test.
* gcc.target/riscv/scalar_sat_binary.h: New test.

Signed-off-by: Pan Li 
(cherry picked from commit 34ed2b4593fa98b613632d0dde30b6ba3e7ecad9)

Diff:
---
 gcc/config/riscv/autovec.md| 17 +
 gcc/config/riscv/riscv-protos.h|  2 +
 gcc/config/riscv/riscv-v.cc| 19 ++
 gcc/config/riscv/riscv.cc  | 55 
 gcc/config/riscv/riscv.md  | 11 
 gcc/config/riscv/vector.md | 12 ++--
 .../riscv/rvv/autovec/binop/vec_sat_binary.h   | 33 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-1.c  | 19 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-2.c  | 20 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-3.c  | 20 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-4.c  | 20 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-run-1.c  | 75 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-run-2.c  | 75 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-run-3.c  | 75 ++
 .../riscv/rvv/autovec/binop/vec_sat_u_add-run-4.c  | 75 ++
 gcc/testsuite/gcc.target/riscv/sat_arith.h | 31 +
 gcc/testsuite/gcc.target/riscv/sat_u_add-1.c   | 19 ++
 gcc/testsuite/gcc.targe

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Modify _Bfloat16 to __bf16

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:af9118f721e8d586049ff4a60ff7bc5507478344

commit af9118f721e8d586049ff4a60ff7bc5507478344
Author: Xiao Zeng 
Date:   Fri May 17 13:48:21 2024 +0800

RISC-V: Modify _Bfloat16 to __bf16

According to the description in:
,
the type representation symbol of BF16 has been corrected.

Kito Cheng pointed out relevant information in the email:


gcc/ChangeLog:

* config/riscv/riscv-builtins.cc (riscv_init_builtin_types):
Modify _Bfloat16 to __bf16.
* config/riscv/riscv.cc (riscv_mangle_type): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/_Bfloat16-nanboxing.c: Move to...
* gcc.target/riscv/__bf16-nanboxing.c: ...here.
* gcc.target/riscv/bf16_arithmetic.c: Modify _Bfloat16 to __bf16.
* gcc.target/riscv/bf16_call.c: Ditto.
* gcc.target/riscv/bf16_comparison.c: Ditto.
* gcc.target/riscv/bf16_float_libcall_convert.c: Ditto.
* gcc.target/riscv/bf16_integer_libcall_convert.c: Ditto.

(cherry picked from commit 6da1d6efde2282e6582c00d1631e7457975ad998)

Diff:
---
 gcc/config/riscv/riscv-builtins.cc   |  6 +++---
 gcc/config/riscv/riscv.cc|  2 +-
 .../riscv/{_Bfloat16-nanboxing.c => __bf16-nanboxing.c}  | 12 ++--
 gcc/testsuite/gcc.target/riscv/bf16_arithmetic.c |  6 +++---
 gcc/testsuite/gcc.target/riscv/bf16_call.c   |  4 ++--
 gcc/testsuite/gcc.target/riscv/bf16_comparison.c |  6 +++---
 gcc/testsuite/gcc.target/riscv/bf16_float_libcall_convert.c  |  2 +-
 .../gcc.target/riscv/bf16_integer_libcall_convert.c  |  2 +-
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/gcc/config/riscv/riscv-builtins.cc 
b/gcc/config/riscv/riscv-builtins.cc
index 4c08834288ac..dc54e1a59b52 100644
--- a/gcc/config/riscv/riscv-builtins.cc
+++ b/gcc/config/riscv/riscv-builtins.cc
@@ -275,7 +275,7 @@ riscv_init_builtin_types (void)
 lang_hooks.types.register_builtin_type (riscv_float16_type_node,
"_Float16");
 
-  /* Provide the _Bfloat16 type and bfloat16_type_node if needed.  */
+  /* Provide the __bf16 type and bfloat16_type_node if needed.  */
   if (!bfloat16_type_node)
 {
   riscv_bfloat16_type_node = make_node (REAL_TYPE);
@@ -286,9 +286,9 @@ riscv_init_builtin_types (void)
   else
 riscv_bfloat16_type_node = bfloat16_type_node;
 
-  if (!maybe_get_identifier ("_Bfloat16"))
+  if (!maybe_get_identifier ("__bf16"))
 lang_hooks.types.register_builtin_type (riscv_bfloat16_type_node,
-   "_Bfloat16");
+   "__bf16");
 }
 
 /* Implement TARGET_INIT_BUILTINS.  */
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 9ac2be87acd2..2be04ec6bc5e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -10276,7 +10276,7 @@ riscv_asan_shadow_offset (void)
 static const char *
 riscv_mangle_type (const_tree type)
 {
-  /* Half-precision float, _Float16 is "DF16_" and _Bfloat16 is "DF16b".  */
+  /* Half-precision float, _Float16 is "DF16_" and __bf16 is "DF16b".  */
   if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
 {
   if (TYPE_MODE (type) == HFmode)
diff --git a/gcc/testsuite/gcc.target/riscv/_Bfloat16-nanboxing.c 
b/gcc/testsuite/gcc.target/riscv/__bf16-nanboxing.c
similarity index 83%
rename from gcc/testsuite/gcc.target/riscv/_Bfloat16-nanboxing.c
rename to gcc/testsuite/gcc.target/riscv/__bf16-nanboxing.c
index 11a73d222345..a9a586c98b9c 100644
--- a/gcc/testsuite/gcc.target/riscv/_Bfloat16-nanboxing.c
+++ b/gcc/testsuite/gcc.target/riscv/__bf16-nanboxing.c
@@ -1,14 +1,14 @@
 /* { dg-do compile } */
 /* { dg-options "-march=rv64ifd -mabi=lp64d -mcmodel=medlow -O" } */
 
-_Bfloat16 gvar = 9.87654;
+__bf16 gvar = 9.87654;
 union U
 {
   unsigned short i16;
-  _Bfloat16 f16;
+  __bf16 f16;
 };
 
-_Bfloat16
+__bf16
 test1 (unsigned short input)
 {
   union U tmp;
@@ -16,19 +16,19 @@ test1 (unsigned short input)
   return tmp.f16;
 }
 
-_Bfloat16
+__bf16
 test2 ()
 {
   return 1.234f;
 }
 
-_Bfloat16
+__bf16
 test3 ()
 {
   return gvar;
 }
 
-_Bfloat16
+__bf16
 test ()
 {
   return 0.0f;
diff --git a/gcc/testsuite/gcc.target/riscv/bf16_arithmetic.c 
b/gcc/testsuite/gcc.target/riscv/bf16_arithmetic.c
index 9e4850512600..190cc1d574a6 100644
--- a/gcc/testsuite/gcc.target/riscv/bf16_arithmetic.c
+++ b/gcc/testsuite/gcc.target/riscv/bf16_arithmetic.c
@@ -5,9 +5,9 @@
 /* 1) bf -> sf  (call  __extendbfsf2)  */
 /* 2) sf1 [+|-|*|/] sf2 (call  __[add|sub|mul|div]sf3)  */
 /* 3) sf -> bf  (call  __truncsfbf2)  */
-extern _Bfloat16 bf;
-extern _Bfloat16 bf1;
-ext

[gcc(refs/vendors/riscv/heads/gcc-14-with-riscv-opts)] RISC-V: Fix "Nan-box the result of movbf on soft-bf16"

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:a5445260bd42d74aabe6c11d6207d113aafe2c8c

commit a5445260bd42d74aabe6c11d6207d113aafe2c8c
Author: Xiao Zeng 
Date:   Wed May 15 16:23:16 2024 +0800

RISC-V: Fix "Nan-box the result of movbf on soft-bf16"

1 According to unpriv-isa spec:


  1.1 "FMV.H.X moves the half-precision value encoded in IEEE 754-2008
  standard encoding from the lower 16 bits of integer register rs1
  to the floating-point register rd, NaN-boxing the result."
  1.2 "FMV.W.X moves the single-precision value encoded in IEEE 754-2008
  standard encoding from the lower 32 bits of integer register rs1
  to the floating-point register rd. The bits are not modified in the
  transfer, and in particular, the payloads of non-canonical NaNs are 
preserved."

2 When (!TARGET_ZFHMIN == true && TARGET_HARD_FLOAT == true), instruction 
needs
to be added to complete the Nan-box, as done in
"RISC-V: Nan-box the result of movhf on soft-fp16":



3 Consider the "RISC-V: Nan-box the result of movbf on soft-bf16" in:


It ignores that both hf16 and bf16 are 16bits floating-point.

4 zfbfmin -> zfhmin in:



gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_legitimize_move): Optimize movbf
with Nan-boxing value.
* config/riscv/riscv.md (*movhf_softfloat_boxing): Expand movbf
with Nan-boxing value.
(*mov_softfloat_boxing): Ditto.
with Nan-boxing value.
(*movbf_softfloat_boxing): Delete abandon pattern.

(cherry picked from commit 7422e050f33dd9ee7dcd5a72c80b4e11d61995ce)

Diff:
---
 gcc/config/riscv/riscv.cc | 15 ++-
 gcc/config/riscv/riscv.md | 19 +--
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 2be04ec6bc5e..7a34b4be873f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3192,13 +3192,12 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx 
src)
  (set (reg:SI/DI mask) (const_int -65536)
  (set (reg:SI/DI temp) (zero_extend:SI/DI (subreg:HI (reg:HF/BF src) 0)))
  (set (reg:SI/DI temp) (ior:SI/DI (reg:SI/DI mask) (reg:SI/DI temp)))
- (set (reg:HF/BF dest) (unspec:HF/BF[ (reg:SI/DI temp) ]
-   UNSPEC_FMV_SFP16_X/UNSPEC_FMV_SBF16_X))
- */
+ (set (reg:HF/BF dest) (unspec:HF/BF[ (reg:SI/DI temp) ] 
UNSPEC_FMV_FP16_X))
+  */
 
   if (TARGET_HARD_FLOAT
-  && ((!TARGET_ZFHMIN && mode == HFmode)
- || (!TARGET_ZFBFMIN && mode == BFmode))
+  && !TARGET_ZFHMIN
+  && (mode == HFmode || mode == BFmode)
   && REG_P (dest) && FP_REG_P (REGNO (dest))
   && REG_P (src) && !FP_REG_P (REGNO (src))
   && can_create_pseudo_p ())
@@ -3213,10 +3212,8 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx 
src)
   else
emit_insn (gen_iordi3 (temp, mask, temp));
 
-  riscv_emit_move (dest,
-  gen_rtx_UNSPEC (mode, gen_rtvec (1, temp),
-  mode == HFmode ? UNSPEC_FMV_SFP16_X
- : UNSPEC_FMV_SBF16_X));
+  riscv_emit_move (dest, gen_rtx_UNSPEC (mode, gen_rtvec (1, temp),
+UNSPEC_FMV_FP16_X));
 
   return true;
 }
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 04f54cedad94..ff4557c1325f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -87,8 +87,7 @@
   UNSPEC_STRLEN
 
   ;; Workaround for HFmode and BFmode without hardware extension
-  UNSPEC_FMV_SFP16_X
-  UNSPEC_FMV_SBF16_X
+  UNSPEC_FMV_FP16_X
 
   ;; XTheadFmv moves
   UNSPEC_XTHEADFMV
@@ -1959,23 +1958,15 @@
(set_attr "type" "fmove,move,load,store,mtc,mfc")
(set_attr "mode" "")])
 
-(define_insn "*movhf_softfloat_boxing"
-  [(set (match_operand:HF 0 "register_operand""=f")
-(unspec:HF [(match_operand:X 1 "register_operand" " r")] 
UNSPEC_FMV_SFP16_X))]
+(define_insn "*mov_softfloat_boxing"
+  [(set (match_operand:HFBF 0 "register_operand"   "=f")
+   (unspec:HFBF [(match_operand:X 1 "register_operand" " r")]
+UNSPEC_FMV_FP16_X))]
   "!TARGET_ZFHMIN"
   "fmv.w.x\t%0,%1"
   [(set_attr "type" "fmove")
(set_attr "mode" "SF")])
 
-(define_insn "*movbf_softfloat_boxing"
-  [(set (match_operand:BF 0 "register_operand"   "=f")
-   (unspec:BF [(match_operand:X 1 "register_operand" " r")]
-UNSPEC_FMV_SBF16_X))]
-  "!TARGET_ZFBFMIN"
-

[gcc r15-646] [to-be-committed, RISC-V] Improve some shift-add sequences

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:3c9c52a1c0fa7af22f769a2116b28a0b7ea18129

commit r15-646-g3c9c52a1c0fa7af22f769a2116b28a0b7ea18129
Author: Jeff Law 
Date:   Sat May 18 15:08:07 2024 -0600

[to-be-committed,RISC-V] Improve some shift-add sequences

So this is a minor fix/improvement for shift-add sequences.  This was
supposed to help xz in a minor way IIRC.

Combine may present us with (x + C2') << C1 which was canonicalized from
(x << C1) + C2.

Depending on the precise values of C2 and C2' one form may be better
than the other.  We can (somewhat awkwardly) use riscv_const_insns to
test for which sequence would be preferred.

Tested on Ventana's CI system as well as my own.  Waiting on CI results
from Rivos's tester before moving forward.

Jeff
gcc/
* config/riscv/riscv.md: Add new patterns to allow selection
between (x << C1) + C2 vs (x + C2') << C1 depending on the
cost C2 vs C2'.

gcc/testsuite

* gcc.target/riscv/shift-add-1.c: New test.

Diff:
---
 gcc/config/riscv/riscv.md| 56 
 gcc/testsuite/gcc.target/riscv/shift-add-1.c | 21 +++
 2 files changed, 77 insertions(+)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index ff4557c1325f..78c16adee980 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4162,6 +4162,62 @@
   }
 )
 
+;; These are forms of (x << C1) + C2, potentially canonicalized from
+;; ((x + C2') << C1.  Depending on the cost to load C2 vs C2' we may
+;; want to go ahead and recognize this form as C2 may be cheaper to
+;; synthesize than C2'.
+;;
+;; It might be better to refactor riscv_const_insns a bit so that we
+;; can have an API that passes integer values around rather than
+;; constructing a lot of garbage RTL.
+;;
+;; The mvconst_internal pattern in effect requires this pattern to
+;; also be a define_insn_and_split due to insn count costing when
+;; splitting in combine.
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (plus:DI (ashift:DI (match_operand:DI 1 "register_operand" "r")
+   (match_operand 2 "const_int_operand" "n"))
+(match_operand 3 "const_int_operand" "n")))
+   (clobber (match_scratch:DI 4 "=&r"))]
+  "(TARGET_64BIT
+&& riscv_const_insns (operands[3])
+&& ((riscv_const_insns (operands[3])
+< riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL 
(operands[2]
+   || riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL 
(operands[2]))) == 0))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 0) (plus:DI (match_dup 0) (match_dup 4)))]
+  ""
+  [(set_attr "type" "arith")])
+
+(define_insn_and_split ""
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (sign_extend:DI (plus:SI (ashift:SI
+  (match_operand:SI 1 "register_operand" "r")
+  (match_operand 2 "const_int_operand" "n"))
+(match_operand 3 "const_int_operand" "n"
+   (clobber (match_scratch:DI 4 "=&r"))]
+  "(TARGET_64BIT
+&& riscv_const_insns (operands[3])
+&& ((riscv_const_insns (operands[3])
+< riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL 
(operands[2]
+   || riscv_const_insns (GEN_INT (INTVAL (operands[3]) >> INTVAL 
(operands[2]))) == 0))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (ashift:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 0) (sign_extend:DI (plus:SI (match_dup 5) (match_dup 6]
+  "{
+ operands[1] = gen_lowpart (DImode, operands[1]);
+ operands[5] = gen_lowpart (SImode, operands[0]);
+ operands[6] = gen_lowpart (SImode, operands[4]);
+   }"
+  [(set_attr "type" "arith")])
+
+
 (include "bitmanip.md")
 (include "crypto.md")
 (include "sync.md")
diff --git a/gcc/testsuite/gcc.target/riscv/shift-add-1.c 
b/gcc/testsuite/gcc.target/riscv/shift-add-1.c
new file mode 100644
index ..d98875c32716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shift-add-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs -mabi=lp64" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+
+int composeFromSurrogate(const unsigned short high) {
+
+return  ((high - 0xD800) << 10) ;
+}
+
+
+long composeFromSurrogate_2(const unsigned long high) {
+
+return  ((high - 0xD800) << 10) ;
+}
+
+
+/* { dg-final { scan-assembler-times "\tli\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tslli\t" 2 } } */
+/* { dg-final { scan-assembler-times "\taddw\t" 1 } } */
+/* { dg-final { scan-assembler-times "\tadd\t" 1 } } */
+


[gcc r13-8777] [committed] Fix RISC-V missing stack tie

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:162c441c9462d073c53dde87258898795bf28a5c

commit r13-8777-g162c441c9462d073c53dde87258898795bf28a5c
Author: Jeff Law 
Date:   Thu Mar 21 20:41:59 2024 -0600

[committed] Fix RISC-V missing stack tie

As some of you know, Raphael has been working on stack-clash support for the
RISC-V port.  A little while ago Florian reached out to us with an issue 
where
glibc was failing its smoke test due to referencing an unallocated stack 
slot.

Without diving into the code in detail I (incorrectly) concluded it was a
problem with the fallback of using Ada's stack-check paths due to not having
stack-clash support.

Once enough stack-clash bits were ready I had Raphael review the code 
generated
for Florian's test and we concluded the the original case from Florian was 
just
wrong irrespective of stack clash/stack check.  While Raphael's stack-clash
work will indirectly fix Florian's case, it really should also work without
stack-clash.

In particular this code was called out by valgrind:

> 0003cb5e :
> __GI___realpath():
>3cb5e:   81010113addisp,sp,-2032
>3cb62:   7d313423sd  s3,1992(sp)
>3cb66:   79fdlui s3,0xf
>3cb68:   7e813023sd  s0,2016(sp)
>3cb6c:   7c913c23sd  s1,2008(sp)
>3cb70:   7f010413addis0,sp,2032
>3cb74:   35098793addia5,s3,848 # 
f350 <__libc_initial+0xffe8946a>
>3cb78:   74fdlui s1,0xf
>3cb7a:   008789b3add s3,a5,s0
>3cb7e:   f9048793addia5,s1,-112 # 
ef90 <__libc_initial+0xffe890aa>
>3cb82:   008784b3add s1,a5,s0
>3cb86:   77fdlui a5,0xf
>3cb88:   7d413023sd  s4,1984(sp)
>3cb8c:   7b513c23sd  s5,1976(sp)
>3cb90:   7e113423sd  ra,2024(sp)
>3cb94:   7d213823sd  s2,2000(sp)
>3cb98:   7b613823sd  s6,1968(sp)
>3cb9c:   7b713423sd  s7,1960(sp)
>3cba0:   7b813023sd  s8,1952(sp)
>3cba4:   79913c23sd  s9,1944(sp)
>3cba8:   79a13823sd  s10,1936(sp)
>3cbac:   79b13423sd  s11,1928(sp)
>3cbb0:   34878793addia5,a5,840 # 
f348 <__libc_initial+0xffe89462>
>3cbb4:   4713li  a4,1024
>3cbb8:   00132a17auipc   s4,0x132
>3cbbc:   ae0a3a03ld  s4,-1312(s4) # 16e698 
<__stack_chk_guard>
>3cbc0:   01098893addia7,s3,16
>3cbc4:   42098693addia3,s3,1056
>3cbc8:   b8040a93addis5,s0,-1152
>3cbcc:   97a2add a5,a5,s0
>3cbce:   000a3603ld  a2,0(s4)
>3cbd2:   f8c43423sd  a2,-120(s0)
>3cbd6:   4601li  a2,0
>3cbd8:   3d14b023sd  a7,960(s1)
>3cbdc:   3ce4b423sd  a4,968(s1)
>3cbe0:   7cd4b823sd  a3,2000(s1)
>3cbe4:   7ce4bc23sd  a4,2008(s1)
>3cbe8:   b7543823sd  s5,-1168(s0)
>3cbec:   b6e43c23sd  a4,-1160(s0)
>3cbf0:   e38csd  a1,0(a5)
>3cbf2:   b0010113addisp,sp,-1280
In particular note the store at 0x3cbd8.  That's hitting (s1 + 960). If you
chase the values around, you'll find it's a bit more than 1k into 
unallocated
stack space.  It's also worth noting the final stack adjustment at 0x3cbf2.

While I haven't reproduced Florian's code exactly, I was able to get 
reasonably
close and verify my suspicion that everything was fine before sched2 and
incorrect after sched2.  It was also obvious at that point what had gone 
wrong
-- we were missing a stack tie after the final stack pointer adjustment.

This patch adds the missing stack tie.

While not technically a regression, I shudder at the thought of chasing one 
of
these issues down again in the wild.  Been there, done that.

Regression tested on rv64gc.  Verified the scheduler no longer mucked up
realpath by hand.  Pushing to the trunk.

gcc/
* config/riscv/riscv.cc (riscv_expand_prologue): Add missing stack
 

[gcc r15-647] RISC-V: Implement -m{,no}fence-tso

2024-05-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:a6114c2a691112f9cf5b072c21685d2e43c76d81

commit r15-647-ga6114c2a691112f9cf5b072c21685d2e43c76d81
Author: Palmer Dabbelt 
Date:   Sat May 18 15:15:09 2024 -0600

RISC-V: Implement -m{,no}fence-tso

Some processors from T-Head don't implement the `fence.tso` instruction
natively and instead trap to firmware.  This breaks some users who
haven't yet updated the firmware and one could imagine it breaking users
who are trying to build firmware if they're using the C memory model.

So just add an option to disable emitting it, in a similar fashion to
how we allow users to forbid other instructions.

Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1070959
---
I've just smoke tested this one, but

void func(void) { __atomic_thread_fence(__ATOMIC_ACQ_REL); }

generates `fence.tso` without the argument and `fence rw,rw` with
`-mno-fence-tso`, so it seems to be at least mostly there.  I figured
I'd just send it up for comments before putting together the DG bits:
it's kind of a pain to carry around these workarounds for unimplemented
instructions, but it's in HW so there's not much we can do about that.

gcc/ChangeLog:

* config/riscv/riscv.opt: Add -mno-fence-tso.
* config/riscv/sync-rvwmo.md (mem_thread_fence_rvwmo): Respect
-mno-fence-tso.
* doc/invoke.texi (RISC-V): Document -mno-fence-tso.

Diff:
---
 gcc/config/riscv/riscv.opt | 4 
 gcc/config/riscv/sync-rvwmo.md | 2 +-
 gcc/doc/invoke.texi| 8 
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index d209ac896fde..87f583320168 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -624,3 +624,7 @@ Enum(tls_type) String(desc) Value(TLS_DESCRIPTORS)
 mtls-dialect=
 Target RejectNegative Joined Enum(tls_type) Var(riscv_tls_dialect) 
Init(TLS_TRADITIONAL) Save
 Specify TLS dialect.
+
+mfence-tso
+Target Var(TARGET_FENCE_TSO) Init(1)
+Specifies whether the fence.tso instruction should be used.
diff --git a/gcc/config/riscv/sync-rvwmo.md b/gcc/config/riscv/sync-rvwmo.md
index d4fd26069f74..e639a1e23924 100644
--- a/gcc/config/riscv/sync-rvwmo.md
+++ b/gcc/config/riscv/sync-rvwmo.md
@@ -33,7 +33,7 @@
 if (model == MEMMODEL_SEQ_CST)
return "fence\trw,rw";
 else if (model == MEMMODEL_ACQ_REL)
-   return "fence.tso";
+   return TARGET_FENCE_TSO ? "fence.tso" : "fence\trw,rw";
 else if (model == MEMMODEL_ACQUIRE)
return "fence\tr,rw";
 else if (model == MEMMODEL_RELEASE)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b9408ecc9188..70e8004a71b2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1244,6 +1244,7 @@ See RS/6000 and PowerPC Options.
 -mplt  -mno-plt
 -mabi=@var{ABI-string}
 -mfdiv  -mno-fdiv
+-mfence-tso  -mno-fence-tso
 -mdiv  -mno-div
 -misa-spec=@var{ISA-spec-string}
 -march=@var{ISA-string}
@@ -30436,6 +30437,13 @@ Do or don't use hardware floating-point divide and 
square root instructions.
 This requires the F or D extensions for floating-point registers.  The default
 is to use them if the specified architecture has these instructions.
 
+@opindex mfence-tso
+@item -mfence-tso
+@itemx -mno-fence-tso
+Do or don't use the @samp{fence.tso} instruction, which is unimplemented on
+some processors (including those from T-Head).  If the @samp{fence.tso}
+instruction is not availiable then a stronger fence will be used instead.
+
 @opindex mdiv
 @item -mdiv
 @itemx -mno-div