[gcc r15-2981] forwprop: Also dce from added statements from gimple_simplify

2024-08-18 Thread Andrew Pinski via Gcc-cvs
https://gcc.gnu.org/g:3ae8794665ee7cbefee755d4b4be8d1ecb8c2a81

commit r15-2981-g3ae8794665ee7cbefee755d4b4be8d1ecb8c2a81
Author: Andrew Pinski 
Date:   Sat Aug 17 12:14:54 2024 -0700

forwprop: Also dce from added statements from gimple_simplify

This extends r14-3982-g9ea74d235c7e78 to also include the newly added
statements, since some of them might be dead too (due to the way match
and simplify works).
This was noticed while working on a new match and simplify pattern
where a newly added statement was not being used.

Bootstrapped and tested on x86_64-linux-gnu with no regressions.

gcc/ChangeLog:

* gimple-fold.cc (mark_lhs_in_seq_for_dce): New function.
(replace_stmt_with_simplification): Call mark_lhs_in_seq_for_dce
right before inserting the sequence.
(fold_stmt_1): Add dce_worklist argument, update call to
replace_stmt_with_simplification.
(fold_stmt): Add dce_worklist argument, update call to fold_stmt_1.
(fold_stmt_inplace): Update call to fold_stmt_1.
* gimple-fold.h (fold_stmt): Add bitmap argument.
* tree-ssa-forwprop.cc (pass_forwprop::execute): Update call to 
fold_stmt.

Signed-off-by: Andrew Pinski 

Diff:
---
 gcc/gimple-fold.cc   | 43 ---
 gcc/gimple-fold.h|  4 ++--
 gcc/tree-ssa-forwprop.cc |  2 +-
 3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index 18d7a6b176d..0bec35d06f6 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -5914,6 +5914,24 @@ has_use_on_stmt (tree name, gimple *stmt)
   return false;
 }
 
+/* Add the lhs of each statement of SEQ to DCE_WORKLIST. */
+
+static void
+mark_lhs_in_seq_for_dce (bitmap dce_worklist, gimple_seq seq)
+{
+  if (!dce_worklist)
+return;
+
+  for (gimple_stmt_iterator i = gsi_start (seq);
+   !gsi_end_p (i); gsi_next (&i))
+{
+  gimple *stmt = gsi_stmt (i);
+  tree name = gimple_get_lhs (stmt);
+  if (name && TREE_CODE (name) == SSA_NAME)
+   bitmap_set_bit (dce_worklist, SSA_NAME_VERSION (name));
+}
+}
+
 /* Worker for fold_stmt_1 dispatch to pattern based folding with
gimple_simplify.
 
@@ -5924,7 +5942,8 @@ has_use_on_stmt (tree name, gimple *stmt)
 static bool
 replace_stmt_with_simplification (gimple_stmt_iterator *gsi,
  gimple_match_op *res_op,
- gimple_seq *seq, bool inplace)
+ gimple_seq *seq, bool inplace,
+ bitmap dce_worklist)
 {
   gimple *stmt = gsi_stmt (*gsi);
   tree *ops = res_op->ops;
@@ -5992,6 +6011,8 @@ replace_stmt_with_simplification (gimple_stmt_iterator 
*gsi,
  print_gimple_stmt (dump_file, gsi_stmt (*gsi),
 0, TDF_SLIM);
}
+  // Mark the lhs of the new statements maybe for dce
+  mark_lhs_in_seq_for_dce (dce_worklist, *seq);
   gsi_insert_seq_before (gsi, *seq, GSI_SAME_STMT);
   return true;
 }
@@ -6015,6 +6036,8 @@ replace_stmt_with_simplification (gimple_stmt_iterator 
*gsi,
  print_gimple_stmt (dump_file, gsi_stmt (*gsi),
 0, TDF_SLIM);
}
+ // Mark the lhs of the new statements maybe for dce
+ mark_lhs_in_seq_for_dce (dce_worklist, *seq);
  gsi_insert_seq_before (gsi, *seq, GSI_SAME_STMT);
  return true;
}
@@ -6032,6 +6055,8 @@ replace_stmt_with_simplification (gimple_stmt_iterator 
*gsi,
print_gimple_seq (dump_file, *seq, 0, TDF_SLIM);
  print_gimple_stmt (dump_file, gsi_stmt (*gsi), 0, TDF_SLIM);
}
+  // Mark the lhs of the new statements maybe for dce
+  mark_lhs_in_seq_for_dce (dce_worklist, *seq);
   gsi_insert_seq_before (gsi, *seq, GSI_SAME_STMT);
   return true;
 }
@@ -6047,6 +6072,8 @@ replace_stmt_with_simplification (gimple_stmt_iterator 
*gsi,
  fprintf (dump_file, "gimple_simplified to ");
  print_gimple_seq (dump_file, *seq, 0, TDF_SLIM);
}
+ // Mark the lhs of the new statements maybe for dce
+ mark_lhs_in_seq_for_dce (dce_worklist, *seq);
  gsi_replace_with_seq_vops (gsi, *seq);
  return true;
}
@@ -6214,7 +6241,8 @@ maybe_canonicalize_mem_ref_addr (tree *t, bool is_debug = 
false)
distinguishes both cases.  */
 
 static bool
-fold_stmt_1 (gimple_stmt_iterator *gsi, bool inplace, tree (*valueize) (tree))
+fold_stmt_1 (gimple_stmt_iterator *gsi, bool inplace, tree (*valueize) (tree),
+bitmap dce_worklist = nullptr)
 {
   bool changed = false;
   gimple *stmt = gsi_stmt (*gsi);
@@ -6382,7 +6410,8 @@ fold_stmt_1 (gimple_stmt_iterator *gsi, bool inplace, 
tree (*valueize) (tree))
   if (gimple_simplify (stmt, &res_op, inplace ? NULL : &seq,
 

[gcc r15-2982] AVR: target/116407 - Fix linker error "relocation truncated to fit".

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:dfb2e8caa85d1059a0ab8ed4f19568c04c9f13a4

commit r15-2982-gdfb2e8caa85d1059a0ab8ed4f19568c04c9f13a4
Author: Georg-Johann Lay 
Date:   Sun Aug 18 15:00:55 2024 +0200

AVR: target/116407 - Fix linker error "relocation truncated to fit".

Some text peepholes output extra instructions prior to a branch
instruction, and that increases the jump offset of backward branches.

PR target/116407
gcc/
* config/avr/avr-protos.h (avr_jump_mode): Add an int argument.
* config/avr/avr.cc (avr_jump_mode): Add an int argument to increase
the computed jump offset of backwards branches.
* config/avr/avr.md (*dec-and-branchhi!=-1, *dec-and-branchsi!=-1):
Increase the jump offset used by avr_jump_mode() as needed.
gcc/testsuite/
* gcc.target/avr/torture/pr116407-2.c: New test.
* gcc.target/avr/torture/pr116407-4.c: New test.

Diff:
---
 gcc/config/avr/avr-protos.h   |  2 +-
 gcc/config/avr/avr.cc | 11 +---
 gcc/config/avr/avr.md |  8 +++---
 gcc/testsuite/gcc.target/avr/torture/pr116407-2.c | 34 +++
 gcc/testsuite/gcc.target/avr/torture/pr116407-4.c | 34 +++
 5 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 7b666f17718..34298b976a7 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -115,7 +115,7 @@ extern const char* avr_out_reload_inpsi (rtx*, rtx, int*);
 extern const char* avr_out_lpm (rtx_insn *, rtx*, int*);
 extern void avr_notice_update_cc (rtx body, rtx_insn *insn);
 extern int reg_unused_after (rtx_insn *insn, rtx reg);
-extern int avr_jump_mode (rtx x, rtx_insn *insn);
+extern int avr_jump_mode (rtx x, rtx_insn *insn, int = 0);
 extern int test_hard_reg_class (enum reg_class rclass, rtx x);
 extern int jump_over_one_insn_p (rtx_insn *insn, rtx dest);
 
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index 8c19bcb34a6..c520b98a178 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -4133,19 +4133,22 @@ avr_use_by_pieces_infrastructure_p (unsigned 
HOST_WIDE_INT size,
 /* Choose mode for jump insn:
1 - relative jump in range -63 <= x <= 62 ;
2 - relative jump in range -2046 <= x <= 2045 ;
-   3 - absolute jump (only for ATmega[16]03).  */
+   3 - absolute jump (only when we have JMP / CALL).
+
+   When jumping backwards, assume the jump offset is EXTRA words
+   bigger than inferred from insn addresses.  */
 
 int
-avr_jump_mode (rtx x, rtx_insn *insn)
+avr_jump_mode (rtx x, rtx_insn *insn, int extra)
 {
   int dest_addr = INSN_ADDRESSES (INSN_UID (GET_CODE (x) == LABEL_REF
? XEXP (x, 0) : x));
   int cur_addr = INSN_ADDRESSES (INSN_UID (insn));
   int jump_distance = cur_addr - dest_addr;
 
-  if (IN_RANGE (jump_distance, -63, 62))
+  if (IN_RANGE (jump_distance, -63, 62 - extra))
 return 1;
-  else if (IN_RANGE (jump_distance, -2046, 2045))
+  else if (IN_RANGE (jump_distance, -2046, 2045 - extra))
 return 2;
   else if (AVR_HAVE_JMP_CALL)
 return 3;
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 28841e40db1..8c4819a901f 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -7605,7 +7605,7 @@
"sbc %C0,__zero_reg__" CR_TAB
"sbc %D0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 3 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7642,7 +7642,7 @@
   output_asm_insn ("subi %A0,1" CR_TAB
"sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7681,7 +7681,7 @@
   output_asm_insn ("subi %A0,1" CR_TAB
"sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7718,7 +7718,7 @@
  "sub %A0,%3" CR_TAB
  "sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRIN

[gcc r14-10599] AVR: target/116407 - Fix linker error "relocation truncated to fit".

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:919c42bf4487a8f0cc584a3ca53784c381f19dfa

commit r14-10599-g919c42bf4487a8f0cc584a3ca53784c381f19dfa
Author: Georg-Johann Lay 
Date:   Sun Aug 18 15:00:55 2024 +0200

AVR: target/116407 - Fix linker error "relocation truncated to fit".

Some text peepholes output extra instructions prior to a branch
instruction, and that increases the jump offset of backward branches.

PR target/116407
gcc/
* config/avr/avr-protos.h (avr_jump_mode): Add an int argument.
* config/avr/avr.cc (avr_jump_mode): Add an int argument to increase
the computed jump offset of backwards branches.
* config/avr/avr.md (*dec-and-branchhi!=-1, *dec-and-branchsi!=-1):
Increase the jump offset used by avr_jump_mode() as needed.
gcc/testsuite/
* gcc.target/avr/torture/pr116407-2.c: New test.
* gcc.target/avr/torture/pr116407-4.c: New test.

(cherry picked from commit dfb2e8caa85d1059a0ab8ed4f19568c04c9f13a4)

Diff:
---
 gcc/config/avr/avr-protos.h   |  2 +-
 gcc/config/avr/avr.cc | 11 +---
 gcc/config/avr/avr.md |  8 +++---
 gcc/testsuite/gcc.target/avr/torture/pr116407-2.c | 34 +++
 gcc/testsuite/gcc.target/avr/torture/pr116407-4.c | 34 +++
 5 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index dc23cfbf461..d918e2fce31 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -112,7 +112,7 @@ extern const char* avr_out_reload_inpsi (rtx*, rtx, int*);
 extern const char* avr_out_lpm (rtx_insn *, rtx*, int*);
 extern void avr_notice_update_cc (rtx body, rtx_insn *insn);
 extern int reg_unused_after (rtx_insn *insn, rtx reg);
-extern int avr_jump_mode (rtx x, rtx_insn *insn);
+extern int avr_jump_mode (rtx x, rtx_insn *insn, int = 0);
 extern int test_hard_reg_class (enum reg_class rclass, rtx x);
 extern int jump_over_one_insn_p (rtx_insn *insn, rtx dest);
 
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index 3c1a0bf707f..c6f29fd48f9 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -3989,19 +3989,22 @@ avr_use_by_pieces_infrastructure_p (unsigned 
HOST_WIDE_INT size,
 /* Choose mode for jump insn:
1 - relative jump in range -63 <= x <= 62 ;
2 - relative jump in range -2046 <= x <= 2045 ;
-   3 - absolute jump (only for ATmega[16]03).  */
+   3 - absolute jump (only when we have JMP / CALL).
+
+   When jumping backwards, assume the jump offset is EXTRA words
+   bigger than inferred from insn addresses.  */
 
 int
-avr_jump_mode (rtx x, rtx_insn *insn)
+avr_jump_mode (rtx x, rtx_insn *insn, int extra)
 {
   int dest_addr = INSN_ADDRESSES (INSN_UID (GET_CODE (x) == LABEL_REF
? XEXP (x, 0) : x));
   int cur_addr = INSN_ADDRESSES (INSN_UID (insn));
   int jump_distance = cur_addr - dest_addr;
 
-  if (IN_RANGE (jump_distance, -63, 62))
+  if (IN_RANGE (jump_distance, -63, 62 - extra))
 return 1;
-  else if (IN_RANGE (jump_distance, -2046, 2045))
+  else if (IN_RANGE (jump_distance, -2046, 2045 - extra))
 return 2;
   else if (AVR_HAVE_JMP_CALL)
 return 3;
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 55de11079c3..9aaefcfcd64 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -7771,7 +7771,7 @@
"sbc %C0,__zero_reg__" CR_TAB
"sbc %D0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 3 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7808,7 +7808,7 @@
   output_asm_insn ("subi %A0,1" CR_TAB
"sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7847,7 +7847,7 @@
   output_asm_insn ("subi %A0,1" CR_TAB
"sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
@@ -7884,7 +7884,7 @@
  "sub %A0,%3" CR_TAB
  "sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn);
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
 const char *op = ((EQ == 

[gcc r13-8980] AVR: target/116407 - Fix linker error "relocation truncated to fit".

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:58c8882ff5ba249ef4cfc99f3c3f8ada8810f75a

commit r13-8980-g58c8882ff5ba249ef4cfc99f3c3f8ada8810f75a
Author: Georg-Johann Lay 
Date:   Sun Aug 18 15:00:55 2024 +0200

AVR: target/116407 - Fix linker error "relocation truncated to fit".

Some text peepholes output extra instructions prior to a branch
instruction, and that increases the jump offset of backward branches.

PR target/116407
gcc/
* config/avr/avr-protos.h (avr_jump_mode): Add an int argument.
* config/avr/avr.cc (avr_jump_mode): Add an int argument to increase
the computed jump offset of backwards branches.
* config/avr/avr.md (*dec-and-branchhi!=-1, *dec-and-branchsi!=-1):
Increase the jump offset used by avr_jump_mode() as needed.
gcc/testsuite/
* gcc.target/avr/torture/pr116407-2.c: New test.
* gcc.target/avr/torture/pr116407-4.c: New test.

(cherry picked from commit dfb2e8caa85d1059a0ab8ed4f19568c04c9f13a4)

Diff:
---
 gcc/config/avr/avr-protos.h   |  2 +-
 gcc/config/avr/avr.cc | 11 +---
 gcc/config/avr/avr.md | 24 ++--
 gcc/testsuite/gcc.target/avr/torture/pr116407-2.c | 34 +++
 gcc/testsuite/gcc.target/avr/torture/pr116407-4.c | 34 +++
 5 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index cf45a8c4499..27659bc8e15 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -108,7 +108,7 @@ extern const char* avr_out_lpm (rtx_insn *, rtx*, int*);
 extern void avr_notice_update_cc (rtx body, rtx_insn *insn);
 extern int reg_unused_after (rtx_insn *insn, rtx reg);
 extern int _reg_unused_after (rtx_insn *insn, rtx reg);
-extern int avr_jump_mode (rtx x, rtx_insn *insn);
+extern int avr_jump_mode (rtx x, rtx_insn *insn, int = 0);
 extern int test_hard_reg_class (enum reg_class rclass, rtx x);
 extern int jump_over_one_insn_p (rtx_insn *insn, rtx dest);
 
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index 676ab2405db..f5e53aba708 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -3388,19 +3388,22 @@ avr_use_by_pieces_infrastructure_p (unsigned 
HOST_WIDE_INT size,
 /* Choose mode for jump insn:
1 - relative jump in range -63 <= x <= 62 ;
2 - relative jump in range -2046 <= x <= 2045 ;
-   3 - absolute jump (only for ATmega[16]03).  */
+   3 - absolute jump (only when we have JMP / CALL).
+
+   When jumping backwards, assume the jump offset is EXTRA words
+   bigger than inferred from insn addresses.  */
 
 int
-avr_jump_mode (rtx x, rtx_insn *insn)
+avr_jump_mode (rtx x, rtx_insn *insn, int extra)
 {
   int dest_addr = INSN_ADDRESSES (INSN_UID (GET_CODE (x) == LABEL_REF
 ? XEXP (x, 0) : x));
   int cur_addr = INSN_ADDRESSES (INSN_UID (insn));
   int jump_distance = cur_addr - dest_addr;
 
-  if (IN_RANGE (jump_distance, -63, 62))
+  if (IN_RANGE (jump_distance, -63, 62 - extra))
 return 1;
-  else if (IN_RANGE (jump_distance, -2046, 2045))
+  else if (IN_RANGE (jump_distance, -2046, 2045 - extra))
 return 2;
   else if (AVR_HAVE_JMP_CALL)
 return 3;
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index c8b5ccaf098..9bd6b9119ec 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -7625,8 +7625,6 @@
   (pc)))]
   "dead_or_set_regno_p (insn, REG_CC)"
   {
-const char *op;
-int jump_mode;
 if (avr_adiw_reg_p (operands[0]))
   output_asm_insn ("sbiw %0,1" CR_TAB
"sbc %C0,__zero_reg__" CR_TAB
@@ -7637,8 +7635,8 @@
"sbc %C0,__zero_reg__" CR_TAB
"sbc %D0,__zero_reg__", operands);
 
-jump_mode = avr_jump_mode (operands[2], insn);
-op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
+int jump_mode = avr_jump_mode (operands[2], insn, 3 - avr_adiw_reg_p 
(operands[0]));
+const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
 switch (jump_mode)
@@ -7668,16 +7666,14 @@
   (pc)))]
   "dead_or_set_regno_p (insn, REG_CC)"
   {
-const char *op;
-int jump_mode;
 if (avr_adiw_reg_p (operands[0]))
   output_asm_insn ("sbiw %0,1", operands);
 else
   output_asm_insn ("subi %A0,1" CR_TAB
"sbc %B0,__zero_reg__", operands);
 
-jump_mode = avr_jump_mode (operands[2], insn);
-op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
+int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
+const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);
 
 switch (jump_mode)
@@ -7709,16 +7705,14 @@
  

[gcc r15-2983] AVR: ad target/116407 - Fix linker error "relocation truncated to fit".

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:22acd3c4d18dbd4d4d39d7770145fe3ec36073f6

commit r15-2983-g22acd3c4d18dbd4d4d39d7770145fe3ec36073f6
Author: Georg-Johann Lay 
Date:   Sun Aug 18 17:07:15 2024 +0200

AVR: ad target/116407 - Fix linker error "relocation truncated to fit".

PR target/116407
gcc/
* config/avr/avr.md (*dec-and-branchhi!=-1.l.clobber):
Increase the additional jump offset to 2 words.

Diff:
---
 gcc/config/avr/avr.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 8c4819a901f..57f4a08c58c 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -7718,7 +7718,7 @@
  "sub %A0,%3" CR_TAB
  "sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
+int jump_mode = avr_jump_mode (operands[2], insn, 2);
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);


[gcc r15-2984] AVR: Tweak 16-bit addition with const that didn't get a LD_REGS register.

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:1ed1dd521b8104dd76d010aaf78f130b1832864e

commit r15-2984-g1ed1dd521b8104dd76d010aaf78f130b1832864e
Author: Georg-Johann Lay 
Date:   Sun Aug 18 18:26:16 2024 +0200

AVR: Tweak 16-bit addition with const that didn't get a LD_REGS register.

The 16-bit additions like addhi3 have two forms: One with a scratch:QI
and one without, where the latter is required because reload cannot
deal with a scratch when spill code pops a 16-bit addition.

Passes like combine and fwprop1 may come up with the non-scratch version,
which is sub-optimal in the case when the addition is performed in a
NO_LD_REGS register because the operands will be spilled to LD_REGS.
Having a scratch:QI at disposal can lead to better code with less spills.

gcc/
* config/avr/avr.md (*add3_split) [!reload_completed]:
Add a scratch:QI to 16-bit additions with constant.

Diff:
---
 gcc/config/avr/avr.md | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 57f4a08c58c..c10709ecef0 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -1724,12 +1724,28 @@
(match_operand:ALL2 2 "nonmemory_or_const_operand" "r,s,IJ 
YIJ,n Ynn")))]
   ""
   "#"
-  "&& reload_completed"
+  "&& 1"
   [(parallel [(set (match_dup 0)
(plus:ALL2 (match_dup 1)
   (match_dup 2)))
   (clobber (reg:CC REG_CC))])]
-  ""
+  {
+// Passes like combine and fwprop1 may remove the scratch from an
+// addhi3 insn.  Add the scratch again because having a QImode
+// scratch reg available is better than spilling the operands in
+// the case when we don't get a d-regs register.
+if (! reload_completed
+&& const_operand (operands[2], mode)
+&& ! stack_register_operand (operands[0], HImode)
+&& ! stack_register_operand (operands[1], HImode))
+  {
+emit (gen_add3_clobber (operands[0], operands[1], operands[2]));
+DONE;
+  }
+
+if (! reload_completed)
+  FAIL;
+  }
   [(set_attr "isa" "*,*,adiw,*")])
 
 ;; "*addhi3"


[gcc r14-10600] AVR: ad target/116407 - Fix linker error "relocation truncated to fit".

2024-08-18 Thread Georg-Johann Lay via Gcc-cvs
https://gcc.gnu.org/g:c7257486630c419f0d140d558eaa20d183439811

commit r14-10600-gc7257486630c419f0d140d558eaa20d183439811
Author: Georg-Johann Lay 
Date:   Sun Aug 18 17:07:15 2024 +0200

AVR: ad target/116407 - Fix linker error "relocation truncated to fit".

PR target/116407
gcc/
* config/avr/avr.md (*dec-and-branchhi!=-1.l.clobber):
Increase the additional jump offset to 2 words.

(cherry picked from commit 22acd3c4d18dbd4d4d39d7770145fe3ec36073f6)

Diff:
---
 gcc/config/avr/avr.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index 9aaefcfcd64..b7273fa19f6 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -7884,7 +7884,7 @@
  "sub %A0,%3" CR_TAB
  "sbc %B0,__zero_reg__", operands);
 
-int jump_mode = avr_jump_mode (operands[2], insn, 1 - avr_adiw_reg_p 
(operands[0]));
+int jump_mode = avr_jump_mode (operands[2], insn, 2);
 const char *op = ((EQ == ) ^ (jump_mode == 1)) ? "brcc" : "brcs";
 operands[1] = gen_rtx_CONST_STRING (VOIDmode, op);


[gcc r15-2985] libgfortran: implement fpu-macppc for Darwin, support IEEE arithmetic

2024-08-18 Thread François-Xavier Coudert via Gcc-cvs
https://gcc.gnu.org/g:1cfe4a4d0d4447b364815d5e5c889deb2e533669

commit r15-2985-g1cfe4a4d0d4447b364815d5e5c889deb2e533669
Author: Sergey Fedorov 
Date:   Sun Aug 18 18:52:51 2024 +0200

libgfortran: implement fpu-macppc for Darwin, support IEEE arithmetic

This allows building and using the IEEE modules on Darwin PowerPC.

libgfortran/ChangeLog:

* config/fpu-macppc.h (new file): initial support for 
powerpc-darwin.
* configure.host: enable ieee_support for powerpc-darwin case,
set fpu_host='fpu-macppc'.

Signed-off-by: Sergey Fedorov 

Diff:
---
 libgfortran/config/fpu-macppc.h | 413 
 libgfortran/configure.host  |   9 +
 2 files changed, 422 insertions(+)

diff --git a/libgfortran/config/fpu-macppc.h b/libgfortran/config/fpu-macppc.h
new file mode 100644
index 000..cbffa01e0bb
--- /dev/null
+++ b/libgfortran/config/fpu-macppc.h
@@ -0,0 +1,413 @@
+/* FPU-related code for PowerPC.
+   Copyright (C) 2023-2024 Free Software Foundation, Inc.
+   Contributed by Sergey Fedorov 
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+the GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version 3.1,
+as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+/* While some of the definitions and functions used here are available
+   with Apple libm, we on purpose avoid pulling it in: to avoid potential
+   conflicts and not self-impose unnecessary constraints. */
+
+/* FP exception flags */
+#define FE_INEXACT  0x0200
+#define FE_DIVBYZERO0x0400
+#define FE_UNDERFLOW0x0800
+#define FE_OVERFLOW 0x1000
+#define FE_INVALID  0x2000
+
+#define FE_ALL_EXCEPT   0x3E00
+#define FE_NO_EXCEPT0xC1FF
+
+/* Extra invalid flags */
+#define FE_INVALID_SNAN 0x0100
+#define FE_INVALID_ISI  0x0080
+#define FE_INVALID_IDI  0x0040
+#define FE_INVALID_ZDZ  0x0020
+#define FE_INVALID_IMZ  0x0010
+#define FE_INVALID_XVC  0x0008
+#define FE_INVALID_SOFT 0x0400
+#define FE_INVALID_SQRT 0x0200
+#define FE_INVALID_CVI  0x0100
+
+/* Rounding modes */
+#define FE_TONEAREST0x
+#define FE_TOWARDZERO   0x0001
+#define FE_UPWARD   0x0002
+#define FE_DOWNWARD 0x0003
+
+/* There is no consistency re what is to be included in all_invalid.
+   Apple libm has 0x01f80300, Openlibm/FreeBSD/NetBSD has 0x21f80700
+   and OpenBSD has 0x01f80700. In particular, FE_INVALID_SOFT and
+   FE_INVALID are contested. */
+#define FE_ALL_INVALID  0x01F80300
+#define FE_NO_INVALID   0xFE07FCFF
+
+#define FE_ALL_FLAGS0xFFF80300
+#define FE_NO_FLAGS 0x0007FCFF
+
+#define FE_ALL_RND  0x0003
+#define FE_NO_RND   0xFFFC
+
+/* Floating-point exception summary (FX) bit. */
+#define FE_SET_FX   0x8000
+#define FE_CLR_FX   0x7FFF
+
+/* Some implementations use FE_INVALID_SOFT here. */
+#define SET_INVALID 0x0100
+
+#define FE_EXCEPT_SHIFT 22
+#define EXCEPT_MASK FE_ALL_EXCEPT >> FE_EXCEPT_SHIFT
+
+typedef unsigned intfenv_t;
+typedef unsigned intfexcept_t;
+
+/* default environment object */
+extern const fenv_t _FE_DFL_ENV;
+/* pointer to default environment */
+#define FE_DFL_ENV  &_FE_DFL_ENV
+
+typedef union {
+struct {
+unsigned int hi;
+fenv_t   lo;
+} i;
+double   d;
+} hexdouble;
+
+#define HEXDOUBLE(hi, lo) {{ hi, lo }}
+
+
+/* Check we can actually store the FPU state in the allocated size. */
+_Static_assert (sizeof(fenv_t) <= (size_t) GFC_FPE_STATE_BUFFER_SIZE,
+   "GFC_FPE_STATE_BUFFER_SIZE is too small");
+
+
+static inline int fegetexcept(void)
+{
+hexdouble fe;
+
+fe.d = __builtin_mffs();
+return ((fe.i.lo & EXCEPT_MASK) << FE_EXCEPT_SHIFT);
+}
+
+static inline int feclearexcept(int excepts)
+{
+hexdouble fe;
+
+if (excepts & FE_INVALID)
+excepts |= FE_ALL_INVALID;
+fe.d = __builtin_mffs();
+fe.i.lo &= ~excepts;
+if ((fe.i.lo & FE_ALL_EXCEPT) == 0)
+fe.i.lo &= FE_CLR_FX;
+__builtin_mtfsf(0xFF, fe.d);
+return 0;
+}
+
+/* It is not possible to set VX bit directly. */
+static inl

[gcc r15-2986] PHIOPT: move factor_out_conditional_operation over to use gimple_match_op

2024-08-18 Thread Andrew Pinski via Gcc-cvs
https://gcc.gnu.org/g:cd2f394418be0cc15d05c97ed72567f2f5e15172

commit r15-2986-gcd2f394418be0cc15d05c97ed72567f2f5e15172
Author: Andrew Pinski 
Date:   Sat Apr 20 00:13:12 2024 -0700

PHIOPT: move factor_out_conditional_operation over to use gimple_match_op

To start handling expressions with more than one operand, converting
over to gimple_match_op is needed.
An added side effect is that factor_out_conditional_operation can now
support builtin/internal calls that have one operand without any extra
code.

Note on the changed testcases:
* pr87007-5.c: the test checks that partial register stalls are avoided
for the sqrt and that the register is zeroed only once before the
branch; phiopt would now merge the sqrts, so disable phiopt.

Bootstrapped and tested on x86_64-linux-gnu with no regressions.

gcc/ChangeLog:

* gimple-match-exports.cc 
(gimple_match_op::operands_occurs_in_abnormal_phi):
New function.
* gimple-match.h (gimple_match_op): Add 
operands_occurs_in_abnormal_phi.
* tree-ssa-phiopt.cc (factor_out_conditional_operation): Use 
gimple_match_op
instead of manually extracting from/creating the gimple.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr87007-5.c: Disable phi-opt.

Signed-off-by: Andrew Pinski 

Diff:
---
 gcc/gimple-match-exports.cc   | 14 +++
 gcc/gimple-match.h|  2 +
 gcc/testsuite/gcc.target/i386/pr87007-5.c |  5 ++-
 gcc/tree-ssa-phiopt.cc| 66 ++-
 4 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/gcc/gimple-match-exports.cc b/gcc/gimple-match-exports.cc
index aacf3ff0414..15d54b7d843 100644
--- a/gcc/gimple-match-exports.cc
+++ b/gcc/gimple-match-exports.cc
@@ -126,6 +126,20 @@ gimple_match_op::resimplify (gimple_seq *seq, tree 
(*valueize)(tree))
 }
 }
 
+/* Returns true if any of the operands of THIS occurs
+   in abnormal phis. */
+bool
+gimple_match_op::operands_occurs_in_abnormal_phi() const
+{
+  for (unsigned int i = 0; i < num_ops; i++)
+{
+   if (TREE_CODE (ops[i]) == SSA_NAME
+  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (ops[i]))
+   return true;
+}
+  return false;
+}
+
 /* Return whether T is a constant that we'll dispatch to fold to
evaluate fully constant expressions.  */
 
diff --git a/gcc/gimple-match.h b/gcc/gimple-match.h
index d710fcbace2..8edff578ba9 100644
--- a/gcc/gimple-match.h
+++ b/gcc/gimple-match.h
@@ -136,6 +136,8 @@ public:
 
   /* The operands to CODE.  Only the first NUM_OPS entries are meaningful.  */
   tree ops[MAX_NUM_OPS];
+
+  bool operands_occurs_in_abnormal_phi() const;
 };
 
 inline
diff --git a/gcc/testsuite/gcc.target/i386/pr87007-5.c 
b/gcc/testsuite/gcc.target/i386/pr87007-5.c
index 8f2dc947f6c..c696827df12 100644
--- a/gcc/testsuite/gcc.target/i386/pr87007-5.c
+++ b/gcc/testsuite/gcc.target/i386/pr87007-5.c
@@ -1,8 +1,11 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse -fno-tree-vectorize 
-fdump-tree-cddce3-details -fdump-tree-lsplit-optimized" } */
+/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse -fno-tree-vectorize 
-fdump-tree-cddce3-details -fdump-tree-lsplit-optimized -fno-ssa-phiopt" } */
 /* Load of d2/d3 is hoisted out, the loop is split, store of d1 and sqrt
are sunk out of the loop and the loop is elided.  One vsqrtsd with
memory operand needs a xor to avoid partial dependence.  */
+/* Phi-OPT needs to be disabled otherwise, sqrt calls are merged which is 
better
+   but we are testing to make sure the partial register stall for SSE is still 
avoided
+   for sqrts.  */
 
 #include
 
diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index aa414f6..2d4aba5b087 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -220,13 +220,12 @@ static gphi *
 factor_out_conditional_operation (edge e0, edge e1, gphi *phi,
   tree arg0, tree arg1, gimple *cond_stmt)
 {
-  gimple *arg0_def_stmt = NULL, *arg1_def_stmt = NULL, *new_stmt;
-  tree new_arg0 = NULL_TREE, new_arg1 = NULL_TREE;
+  gimple *arg0_def_stmt = NULL, *arg1_def_stmt = NULL;
   tree temp, result;
   gphi *newphi;
   gimple_stmt_iterator gsi, gsi_for_def;
   location_t locus = gimple_location (phi);
-  enum tree_code op_code;
+  gimple_match_op arg0_op, arg1_op;
 
   /* Handle only PHI statements with two arguments.  TODO: If all
  other arguments to PHI are INTEGER_CST or if their defining
@@ -250,31 +249,31 @@ factor_out_conditional_operation (edge e0, edge e1, gphi 
*phi,
   /* Check if arg0 is an SSA_NAME and the stmt which defines arg0 is
  an unary operation.  */
   arg0_def_stmt = SSA_NAME_DEF_STMT (arg0);
-  if (!is_gimple_assign (arg0_def_stmt)
-  || (gimple_assign_rhs_class (arg0_def_stmt) != GIMPLE_UNARY_RHS

[gcc r15-2987] doc: Tweak gm2 mailing list address

2024-08-18 Thread Gerald Pfeifer via Gcc-cvs
https://gcc.gnu.org/g:b9ac01d8b9be76d852df5e684d62382e0720fc5e

commit r15-2987-gb9ac01d8b9be76d852df5e684d62382e0720fc5e
Author: Gerald Pfeifer 
Date:   Sat Aug 17 20:57:39 2024 +0200

doc: Tweak gm2 mailing list address

gcc:
* doc/gm2.texi (Contributing): Tweak gm2 mailing list address.

Diff:
---
 gcc/doc/gm2.texi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/doc/gm2.texi b/gcc/doc/gm2.texi
index b2e4aa2e9c0..1db07c91cf2 100644
--- a/gcc/doc/gm2.texi
+++ b/gcc/doc/gm2.texi
@@ -3060,7 +3060,7 @@ Please do and please read the GNU Emacs info under
 
 You might consider joining the GM2 Mailing list before you start
 coding.  The mailing list may be subscribed via a web interface
-@uref{http://lists.nongnu.org/mailman/listinfo/gm2} or via email
+@uref{https://lists.nongnu.org/mailman/listinfo/gm2} or via email
 @email{gm2-subscribe@@nongnu.org}.
 
 Many thanks and enjoy your coding!


[gcc r15-2988] libstdc++: Remove note from the GCC 4.0.1 days

2024-08-18 Thread Gerald Pfeifer via Gcc-cvs
https://gcc.gnu.org/g:fc41263061808a8539ba7ee1f607920a54e7b265

commit r15-2988-gfc41263061808a8539ba7ee1f607920a54e7b265
Author: Gerald Pfeifer 
Date:   Sun Aug 18 21:18:42 2024 +0200

libstdc++: Remove note from the GCC 4.0.1 days

libstdc++-v3:
* doc/xml/manual/prerequisites.xml: Remove note from the
GCC 4.0.1 days.
* doc/html/manual/setup.html: Regenerate.

Diff:
---
 libstdc++-v3/doc/html/manual/setup.html   | 5 +
 libstdc++-v3/doc/xml/manual/prerequisites.xml | 5 +
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/libstdc++-v3/doc/html/manual/setup.html 
b/libstdc++-v3/doc/html/manual/setup.html
index 78d2a00c50a..d8c5ff65cff 100644
--- a/libstdc++-v3/doc/html/manual/setup.html
+++ b/libstdc++-v3/doc/html/manual/setup.html
@@ -29,10 +29,7 @@
the tools you will need if you wish to modify the source.
 
Additional data is given here only where it applies to libstdc++.
-  As of GCC 4.0.1 the minimum version of binutils required to build
-  libstdc++ is 2.15.90.0.1.1.
-  Older releases of libstdc++ do not require such a recent version,
-  but to take full advantage of useful space-saving features and
+  To take full advantage of useful space-saving features and
   bug-fixes you should use a recent binutils whenever possible.
   The configure process will automatically detect and use these
   features if the underlying support is present.
diff --git a/libstdc++-v3/doc/xml/manual/prerequisites.xml 
b/libstdc++-v3/doc/xml/manual/prerequisites.xml
index a3c6e732a77..0efe63bcd46 100644
--- a/libstdc++-v3/doc/xml/manual/prerequisites.xml
+++ b/libstdc++-v3/doc/xml/manual/prerequisites.xml
@@ -25,10 +25,7 @@
Additional data is given here only where it applies to libstdc++.
   
 
-   As of GCC 4.0.1 the minimum version of binutils required to build
-  libstdc++ is 2.15.90.0.1.1.
-  Older releases of libstdc++ do not require such a recent version,
-  but to take full advantage of useful space-saving features and
+   To take full advantage of useful space-saving features and
   bug-fixes you should use a recent binutils whenever possible.
   The configure process will automatically detect and use these
   features if the underlying support is present.


[gcc r15-2989] [PR rtl-optimization/115876] Avoid ubsan in ext-dce.cc

2024-08-18 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:f10d2ee95356b9de6c44d701c4dfa8fb088714d2

commit r15-2989-gf10d2ee95356b9de6c44d701c4dfa8fb088714d2
Author: Jeff Law 
Date:   Sun Aug 18 16:55:52 2024 -0600

[PR rtl-optimization/115876] Avoid ubsan in ext-dce.cc

This fixes two general ubsan issues in ext-dce, both related to use-side
processing of modes > DImode.

In ext_dce_process_uses we can be presented with something like this as a use:
(subreg:SI (reg:TF) 12)

That will result in an out of range shift for a HOST_WIDE_INT object.  Where
this happens it is safe to just break from the SET context and process the
subjects.  This will ultimately result in seeing (reg:TF) and we'll mark all
bit groups as live.

In carry_backpropagate we can be presented with a TImode shift (for example)
and the shift count can be > 63 for such a shift.  This naturally trips ubsan
as well, as we're operating on 64 bit objects.

We can just return mmask in this case noting that every bit group is live.

The combination of these two fixes eliminates all the reported ubsan issues 
in
ext-dce seen in a bootstrap and regression test on x86.

While I was in there I went ahead and fixed the various hardcoded 63/64 
values
to be HOST_BITS_PER_WIDE_INT based.

Bootstrapped and regression tested on x86 with no regressions.  Also built 
with
ubsan enabled and verified the build logs and testsuite logs don't call out 
any
issues in ext-dce anymore.

Pushing to the trunk.

PR rtl-optimization/115876
gcc
* ext-dce.cc (ext_dce_process_sets): Replace hardcoded 63/64 
instances
with HOST_BITS_PER_WIDE_INT based values.
(carry_backpropagate): Handle modes with more bits than
HOST_BITS_PER_WIDE_INT gracefully, avoiding undefined behavior.
(ext_dce_process_uses): Handle subreg offsets which would result
in ubsan shifts gracefully, avoiding undefined behavior.

Diff:
---
 gcc/ext-dce.cc | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 017e2de000d..eee9208f0d6 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -207,7 +207,7 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap 
live_tmp)
 wider than DImode.  */
  scalar_int_mode outer_mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &outer_mode)
- || GET_MODE_BITSIZE (outer_mode) > 64)
+ || GET_MODE_BITSIZE (outer_mode) > HOST_BITS_PER_WIDE_INT)
{
  /* Skip the subrtxs of this destination.  There is
 little value in iterating into the subobjects, so
@@ -239,7 +239,7 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap 
live_tmp)
 that case.  Remember, we can not just continue to process
 the inner RTXs due to the STRICT_LOW_PART.  */
  if (!is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (x)), 
&outer_mode)
- || GET_MODE_BITSIZE (outer_mode) > 64)
+ || GET_MODE_BITSIZE (outer_mode) > HOST_BITS_PER_WIDE_INT)
{
  /* Skip the subrtxs of the STRICT_LOW_PART.  We can't
 process them because it'll set objects as no longer
@@ -293,7 +293,7 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap 
live_tmp)
 the top of the loop which just complicates the flow even
 more.  */
  if (!is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (x)), 
&outer_mode)
- || GET_MODE_BITSIZE (outer_mode) > 64)
+ || GET_MODE_BITSIZE (outer_mode) > HOST_BITS_PER_WIDE_INT)
{
  skipped_dest = true;
  iter.skip_subrtxes ();
@@ -329,7 +329,7 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap 
live_tmp)
}
 
  /* BIT >= 64 indicates something went horribly wrong.  */
- gcc_assert (bit <= 63);
+ gcc_assert (bit <= HOST_BITS_PER_WIDE_INT - 1);
 
  /* Now handle the actual object that was changed.  */
  if (REG_P (x))
@@ -483,6 +483,17 @@ carry_backpropagate (unsigned HOST_WIDE_INT mask, enum 
rtx_code code, rtx x)
 
   enum machine_mode mode = GET_MODE_INNER (GET_MODE (x));
   unsigned HOST_WIDE_INT mmask = GET_MODE_MASK (mode);
+
+  /* While we don't try to optimize operations on types larger
+ than 64 bits, we do want to make sure not to invoke undefined
+ behavior when presented with such operations during use
+ processing.  The safe thing to do is to just return mmask
+ for that scenario indicating every possible chunk is life.  */
+  scalar_int_mode smode;
+  if (!is_a <scalar_int_mode> (mode, &smode)
+  || GET_MODE_BITSIZE (smode) > HOST_BITS_PER_WIDE_INT)
+return mmask;
+
   switch (code)
 {
 case PLUS:
@@ -733,8 +744,17 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj,

[gcc r15-2991] AVX10.2 ymm rounding: Support vadd{s, d, h} and vcmp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:e22e3af1954469c40b139b7cfa8e7708592f4bfd

commit r15-2991-ge22e3af1954469c40b139b7cfa8e7708592f4bfd
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:51 2024 +0800

AVX10.2 ymm rounding: Support vadd{s,d,h} and vcmp{s,d,h} intrins

gcc/ChangeLog:

* config.gcc: Add avx10_2roundingintrin.h.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DF_V4DF_V4DF_UQI_INT, 
V8SF_FTYPE_V8SF_V8SF_V8SF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UHI_INT, 
UQI_FTYPE_V4DF_V4DF_INT_UQI_INT,
UHI_FTYPE_V16HF_V16HF_INT_UHI_INT, UQI_FTYPE_V8SF_V8SF_INT_UQI_INT.
* config/i386/immintrin.h: Include avx10_2roundingintrin.h.
* config/i386/sse.md: Change subst_attr name due to renaming.
* config/i386/subst.md:
(): Add condition check for avx10.2
rounding control 256bit intrins and renamed to ...
(): ...this.
(round_saeonly_mode512bit_condition): Add condition check for
avx10.2 rounding control 256 bit intrins and renamed to ...
(round_saeonly_mode_condition): ...this.
* config/i386/avx10_2roundingintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add -mavx10.2 and new builtin test.
* gcc.target/i386/avx-2.c: Ditto.
* gcc.target/i386/sse-13.c: Add new tests.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: New test.

Diff:
---
 gcc/config.gcc |   2 +-
 gcc/config/i386/avx10_2roundingintrin.h| 337 +
 gcc/config/i386/i386-builtin-types.def |   8 +
 gcc/config/i386/i386-builtin.def   |   8 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/immintrin.h|   2 +
 gcc/config/i386/sse.md | 100 +++---
 gcc/config/i386/subst.md   |  32 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +-
 gcc/testsuite/gcc.target/i386/avx-2.c  |   2 +-
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  64 
 gcc/testsuite/gcc.target/i386/sse-13.c |   8 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  17 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   8 +
 15 files changed, 558 insertions(+), 63 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a36dd1bcbc6..2c0f4518638 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -452,7 +452,7 @@ i[34567]86-*-* | x86_64-*-*)
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h
-  usermsrintrin.h"
+  usermsrintrin.h avx10_2roundingintrin.h"
;;
 ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
new file mode 100644
index 000..5698ed05c1d
--- /dev/null
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -0,0 +1,337 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx10_2roundingintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+#define _AVX10_2ROUNDINGINTRIN_H_INCLUDED
+
+#ifndef __AVX10_2_256__
+#pragma GCC push_options
+#pragma GCC target("avx10.2-256")
+#define __DISABLE_AVX10_2_256__
+#endif /* __AVX10_2_256__ */
+
+#ifdef  __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inl

[gcc r15-2992] AVX10.2 ymm rounding: Support vcvtdq2p{s, h} and vcvtpd2p{s, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:85e874d19548f0dcb9a3f14f9e4b1e3411c88c4b

commit r15-2992-g85e874d19548f0dcb9a3f14f9e4b1e3411c88c4b
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:53 2024 +0800

AVX10.2 ymm rounding: Support vcvtdq2p{s,h} and vcvtpd2p{s,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SI_V8SF_UQI_INT, V4SF_FTYPE_V4DF_V4SF_UQI_INT,
V8HF_FTYPE_V8SI_V8HF_UQI_INT, V8HF_FTYPE_V4DF_V8HF_UQI_INT.
* config/i386/sse.md:

(avx512fp16_vcvt2ph_):
Add condition check.
(avx512fp16_vcvtpd2ph_v4df_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask): Change name to
avx512fp16_vcvt2ph__mask_1
and extend pattern to generate 256bit insns.
(avx_cvtpd2ps256): Change name to
avx_cvtpd2ps256 and extend pattern to
generate 256bit insns.
* config/i386/subst.md (round_applied): New condition.
(round_suff): New iterator.
(round_mode_condition): Add V32HI check for 512bit.
(round_saeonly_mode_condition): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 210 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  32 +++-
 gcc/config/i386/subst.md   |   4 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  44 -
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 322 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 5698ed05c1d..09285c1ffcd 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -216,6 +216,138 @@ _mm256_mask_cmp_round_ps_mask (__mmask8 __U, __m256 __A, 
__m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi32_ph (__m128h __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi32_ph (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtdq2ph256_mask_round ((__v8si) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi32_ps (__m256i __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask_round ((__v8si) __A,
+(__v8sf)
+_mm256_undefined_ps (),
+  

[gcc r15-2993] AVX10.2 ymm rounding: Support vcvtpd2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:508ac49e1a94c28346642bff512d0ed5f4f58b64

commit r15-2993-g508ac49e1a94c28346642bff512d0ed5f4f58b64
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:55 2024 +0800

AVX10.2 ymm rounding: Support vcvtpd2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: Add new intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DI_FTYPE_V4DF_V4DI_UQI_INT, V4SI_FTYPE_V4DF_V4SI_UQI_INT.
* config/i386/sse.md:
(avx_cvtpd2dq256): Change name to
avx_cvtpd2dq256 and extend pattern to
generate 256bit insns.
(fixuns_notrunc2):
Add round_mode_condition.
* config/i386/subst.md (round_pd2udqsuff): New iterator.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/sse-14.c: Add new macro test.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 218 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  13 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  33 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 303 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 09285c1ffcd..3e5e9f3ba0e 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -348,6 +348,144 @@ _mm256_maskz_cvt_roundpd_ps (__mmask8 __U, __m256d __A, 
const int __R)
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi32 (__m256d __A, const int __R)
+{
+  return
+(__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+(__v4si)
+_mm_undefined_si128 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask_round ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundpd_epi64 (__m256d __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask_round ((__v4df) __A,
+ (__v4di) __W,
+ 

[gcc r15-2994] AVX10.2 ymm rounding: Support vcvtph2p{s, d, sx} and vcvtph2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6f2eac53b6026836f3222961c32312e02c2c7dbc

commit r15-2994-g6f2eac53b6026836f3222961c32312e02c2c7dbc
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:56 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2p{s,d,sx} and vcvtph2{,u}{dq,qq} 
intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8HF_V8SF_UQI_INT, V8SI_FTYPE_V8HF_V8SI_UQI_INT,
V4DF_FTYPE_V8HF_V4DF_UQI_INT, V4DI_FTYPE_V8HF_V4DI_UQI_INT.
* config/i386/sse.md:
(avx512fp16_float_extend_ph2):
Add condition check.
(avx512fp16_vcvtph2_
):
Ditto.
(avx512fp16_float_extend_ph2): Extend round 
saeonly.
(vcvtph2ps256): Ditto.
* config/i386/subst.md
(round_saeonly_applied): New condition.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 384 +
 gcc/config/i386/i386-builtin-types.def |   4 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   4 +
 gcc/config/i386/sse.md |  19 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  57 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  20 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 529 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 3e5e9f3ba0e..29966f5e1bf 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -486,6 +486,246 @@ _mm256_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2dq256_mask_round ((__v8hf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_pd (__m128h __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_vcvtph2pd256_mask_round ((__v8hf) __A,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_pd (__m256d __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (_

[gcc r15-2995] AVX10.2 ymm rounding: Support vcvtph2{, u}w and vcvtps2p{d, hx} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b70bb94aca7bc10a54f744d793c32c51f91ce195

commit r15-2995-gb70bb94aca7bc10a54f744d793c32c51f91ce195
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:57 2024 +0800

AVX10.2 ymm rounding: Support vcvtph2{,u}w and vcvtps2p{d,hx} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HI_FTYPE_V16HF_V16HI_UHI_INT, V4DF_FTYPE_V4SF_V4DF_UQI_INT
V8HF_FTYPE_V8SF_V8HF_UQI_INT.
* config/i386/sse.md
(avx512fp16_vcvt2ph_):
Add round condition check.
* config/i386/subst.md (round_mode_condition): Add V16HI check for
256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 220 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  36 
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 304 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 29966f5e1bf..bc3f92a7d1a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -726,6 +726,143 @@ _mm256_maskz_cvt_roundph_epu64 (__mmask8 __U, __m128h 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epu16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epu16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+  (__v16hi) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundph_epu16 (__mmask16 __U, __m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2uw256_mask_round ((__v16hf) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundph_epi16 (__m256h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+(__v16hi)
+_mm256_undefined_si256 (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundph_epi16 (__m256i __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask_round ((__v16hf) __A,
+ (__v16hi) __W,
+   

[gcc r15-2996] AVX10.2 ymm rounding: Support vcvtps2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0f5a42d41b46b746c6f77374d76a3b918a1e2b57

commit r15-2996-g0f5a42d41b46b746c6f77374d76a3b918a1e2b57
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:58 2024 +0800

AVX10.2 ymm rounding: Support vcvtps2{,u}{dq,qq} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SI_FTYPE_V8SF_V8SI_UQI_INT, V4DI_FTYPE_V4SF_V4DI_UQI_INT.
* config/i386/sse.md
(_fix_notrunc):
Extend to round.

(_fixuns_notrunc):
Add round condition check.
* config/i386/subst.md (round_constraint4): New.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-1.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 226 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   4 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   4 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-1.c |  32 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   4 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   4 +
 12 files changed, 308 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index bc3f92a7d1a..fca10a6b586 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -863,6 +863,146 @@ _mm256_maskz_cvtx_roundps_ph (__mmask8 __U, __m256 __A, 
const int __R)
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvtps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+(__v4di)
+_mm256_setzero_si256 (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+  const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask_round ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U,
+   

[gcc r15-2997] AVX10.2 ymm rounding: Support vcvtqq2p{s, d, h} and vcvttpd2{, u}{dq, qq} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6e231f8504874828b23bbe89f3ef4086dcc15a44

commit r15-2997-g6e231f8504874828b23bbe89f3ef4086dcc15a44
Author: Hu, Lin1 
Date:   Mon Aug 19 10:08:59 2024 +0800

AVX10.2 ymm rounding: Support vcvtqq2p{s,d,h} and vcvttpd2{,u}{dq,qq} 
intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V4DF_FTYPE_V4DI_V4DF_UQI_INT, V4SF_FTYPE_V4DI_V4SF_UQI_INT,
V8HF_FTYPE_V4DI_V8HF_UQI_INT.
* config/i386/sse.md:
(avx512fp16_vcvtqq2ph_v4di_mask_round): New expand.
(*avx512fp16_vcvt2ph__mask):
Extend round control and add "_1" suffix.

(float2):
Add condition check.

(float2):
Ditto.

(float2):
Limit suffix output.
(unspec_fix_truncv4dfv4si2): Extend round control.
(unspec_fixuns_truncv4dfv4si2): Ditto.
* config/i386/subst.md (round_qq2pssuff): New iterator.
(round_saeonly_suff): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 390 +
 gcc/config/i386/i386-builtin-types.def |   3 +
 gcc/config/i386/i386-builtin.def   |   7 +
 gcc/config/i386/i386-expand.cc |   3 +
 gcc/config/i386/sse.md |  43 ++-
 gcc/config/i386/subst.md   |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   7 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  72 
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  21 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   7 +
 12 files changed, 569 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index fca10a6b586..25efd9d7b96 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1003,6 +1003,244 @@ _mm256_maskz_cvt_roundps_epu64 (__mmask8 __U, __m128 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_pd (__m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi64_pd (__m256d __W, __mmask8 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi64_pd (__mmask8 __U, __m256i __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask_round ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi64_ph (__m256i __A, const int __R)
+{
+  return (__m128h) __builtin_ia32_vcvtqq2ph256_mask_round ((__v4di) __A,
+  (__v8hf)
+  _mm_setzero_ph (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m128h
+__attr

[gcc r15-2998] AVX10.2 ymm rounding: Support vcvttph2{, u}{dq, qq, w} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:493c5096050523ebc05e5fa21612683a996b97a7

commit r15-2998-g493c5096050523ebc05e5fa21612683a996b97a7
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:00 2024 +0800

AVX10.2 ymm rounding: Support vcvttph2{,u}{dq,qq,w} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md 
(avx512fp16_fix_trunc2):
Extend round control for 256bit.
(unspec_avx512fp16_fix_trunc2):
Ditto.

(avx512fp16_fix_trunc2):
Add condition check.
* config/i386/subst.md
(round_saeonly_mode_condition): Add V16HI check for 256bit.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 335 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |  10 +-
 gcc/config/i386/subst.md   |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  46 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 10 files changed, 447 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 25efd9d7b96..45a04e5a7a8 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1241,6 +1241,216 @@ _mm256_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m256d 
__A, const int __R)
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi32 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi32 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+   (__v8si) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundph_epi32 (__mmask8 __U, __m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2dq256_mask_round ((__v8hf) __A,
+  (__v8si)
+  _mm256_setzero_si256 (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundph_epi64 (__m128h __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+  (__v4di)
+  _mm256_setzero_si256 (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundph_epi64 (__m256i __W, __mmask8 __U, __m128h __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_vcvttph2qq256_mask_round ((__v8hf) __A,
+   (__v4di) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundph_epi64 (__mmask8 __U, __m

[gcc r15-3000] AVX10.2 ymm rounding: Support vcvt{, u}w2ph and vdivp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:3d1b5530ea1d23e26dc5ab70aa4a2e7b9dc19b50

commit r15-3000-g3d1b5530ea1d23e26dc5ab70aa4a2e7b9dc19b50
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:03 2024 +0800

AVX10.2 ymm rounding: Support vcvt{,u}w2ph and vdivp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HI_V16HF_UHI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 286 +
 gcc/config/i386/i386-builtin-types.def |   1 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   1 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  58 +
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 396 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 384facb424c..15ea46b5983 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1757,6 +1757,183 @@ _mm256_maskz_cvt_roundepu64_ps (__mmask8 __U, __m256i 
__A, const int __R)
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepu16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepu16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepu16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtuw2ph256_mask_round ((__v16hi) __A,
+  (__v16hf)
+  _mm256_setzero_ph (),
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvt_roundepi16_ph (__m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvt_roundepi16_ph (__m256h __W, __mmask16 __U, __m256i __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvt_roundepi16_ph (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_vcvtw2ph256_mask_round ((__v16hi) __A,
+  

[gcc r15-2999] AVX10.2 ymm rounding: Support vcvttps2{, u}{dq, qq} and vcvtu{dq, qq}2p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:b2754227139512adecb6fda067632b587ff4a017

commit r15-2999-gb2754227139512adecb6fda067632b587ff4a017
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:01 2024 +0800

AVX10.2 ymm rounding: Support vcvttps2{,u}{dq,qq} and vcvtu{dq,qq}2p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md
(unspec_fix_truncv8sfv8si2): Extend rounding control.
(fixuns_trunc2):
Ditto.

(floatuns2):
Add condition check.

(fix_trunc2):
Remove round_saeonly_name.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-2.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 492 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |  27 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-2.c |  75 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  26 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  27 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 670 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 45a04e5a7a8..384facb424c 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1451,6 +1451,312 @@ _mm256_maskz_cvtt_roundph_epi16 (__mmask16 __U, __m256h 
__A, const int __R)
  (__mmask16) __U,
  __R);
 }
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi32 (__m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi32 (__m256i __W, __mmask8 __U, __m256 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+  (__v8si) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi32 (__mmask8 __U, __m256 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2dq256_mask_round ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtt_roundps_epi64 (__m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtt_roundps_epi64 (__m256i __W, __mmask8 __U, __m128 __A,
+   const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+  (__v4di) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m128 __A, const int __R)
+{
+  return
+(__m256i) __builtin_ia32_cvttps2qq256_mask_round ((__v4sf) __A,
+  

[gcc r15-3001] AVX10.2 ymm rounding: Support vfc{madd, mul}cph, vfixupimmp{s, d} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:95980b292b24110d3f1dffb81926df23c61b4fe7

commit r15-3001-g95980b292b24110d3f1dffb81926df23c61b4fe7
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:04 2024 +0800

AVX10.2 ymm rounding: Support vfc{madd,mul}cph, vfixupimmp{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V16HF_FTYPE_V16HF_V16HF_INT, V16HF_FTYPE_V16HF_V16HF_V16HF_INT,
V16HF_FTYPE_V16HF_V16HF_V16HF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI_INT,
V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI_INT.
* config/i386/sse.md:
(_fixupimm):
Add condition check.
(_fixupimm_mask): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 247 +
 gcc/config/i386/i386-builtin-types.def |   5 +
 gcc/config/i386/i386-builtin.def   |  10 +
 gcc/config/i386/i386-expand.cc |   5 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  10 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  49 
 gcc/testsuite/gcc.target/i386/sse-13.c |  10 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  10 +
 11 files changed, 374 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 15ea46b5983..d5ea6bc57da 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -1934,6 +1934,164 @@ _mm256_maskz_div_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+  __mmask8 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_mask3_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_round_pch (__mmask8 __U, __m256h __A, __m256h __B,
+  __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_maskz_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf) __D,
+__U,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_round_pch (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_vfcmulcph256_round ((__v16hf) __A,
+  

[gcc r15-3002] AVX10.2 ymm rounding: Support vfmadd{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0683ca355a87fd36a2e7ae1721199204ceff4c4c

commit r15-3002-g0683ca355a87fd36a2e7ae1721199204ceff4c4c
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:05 2024 +0800

AVX10.2 ymm rounding: Support vfmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmadd__mask3): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: New test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 176 +
 gcc/config/i386/i386-builtin.def   |   9 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 ++
 9 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index d5ea6bc57da..9015095144e 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2092,6 +2092,146 @@ _mm256_maskz_fixupimm_round_ps (__mmask8 __U, __m256 
__A, __m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmadd

[gcc r15-3003] AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:cfbc94eaf167ae7aecd21ee6054556e1cf9d7143

commit r15-3003-gcfbc94eaf167ae7aecd21ee6054556e1cf9d7143
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:06 2024 +0800

AVX10.2 ymm rounding: Support vfmaddcph and vfmaddsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmaddsub__mask): Add condition check.
(_fmaddsub__mask3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 238 +
 gcc/config/i386/i386-builtin.def   |  13 ++
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  13 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 
 gcc/testsuite/gcc.target/i386/sse-13.c |  13 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  16 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  13 ++
 9 files changed, 366 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 9015095144e..95e42410a10 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2232,6 +2232,193 @@ _mm256_maskz_fmadd_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __D,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_round_pch (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_round_pch (__m256h __A, __m256h __B, __m256h __D,
+ __mmask16 __U, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_mask3_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_round_pch (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256_maskz_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df) __D,
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+  __m256d __D, const int __R)

[gcc r15-3005] AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:6f0aa7add1d9177f60016b32ca9ca8b16b173a56

commit r15-3005-g6f0aa7add1d9177f60016b32ca9ca8b16b173a56
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:09 2024 +0800

AVX10.2 ymm rounding: Support vfmulcph and vfnmadd{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 241 +
 gcc/config/i386/i386-builtin.def   |  11 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |  11 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +
 gcc/testsuite/gcc.target/i386/sse-13.c |  11 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  14 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  11 +
 8 files changed, 363 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 346a32c1a8a..3f833bffa54 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2697,6 +2697,185 @@ _mm256_maskz_fmsubadd_round_ps (__mmask8 __U, __m256 
__A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_round_pch (__m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_round ((__v16hf) __B,
+(__v16hf) __D,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_round_pch (__m256h __A, __mmask8 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf) __A,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_round_pch (__mmask8 __U, __m256h __B, __m256h __D,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask_round ((__v16hf) __B,
+ (__v16hf) __D,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+  

[gcc r15-3004] AVX10.2 ymm rounding: Support vfm{sub, subadd}{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:dd48acbe85ca55dd23ffafbb917ffe559d13b6a3

commit r15-3004-gdd48acbe85ca55dd23ffafbb917ffe559d13b6a3
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:08 2024 +0800

AVX10.2 ymm rounding: Support vfm{sub,subadd}{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fmsub__mask): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 350 +
 gcc/config/i386/i386-builtin.def   |  18 ++
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |  18 ++
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  62 
 gcc/testsuite/gcc.target/i386/sse-13.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-14.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  24 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |  18 ++
 9 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 95e42410a10..346a32c1a8a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2419,6 +2419,284 @@ _mm256_maskz_fmaddsub_round_ps (__mmask8 __U, __m256 
__A, __m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+   __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __D,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+__mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_maskz_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  (__mmask16) -1, __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+   __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfmsubph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __D,
+  (__mmask16) __U, __R);
+}
+
+extern __inline __m256h
+__attribute__

[gcc r15-3006] AVX10.2 ymm rounding: Support vfnmsub{132, 231, 213}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:0983d406ae2e84394b25248865f51c686b119a57

commit r15-3006-g0983d406ae2e84394b25248865f51c686b119a57
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:10 2024 +0800

AVX10.2 ymm rounding: Support vfnmsub{132,231,213}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(_fnmsub__mask3): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 181 +
 gcc/config/i386/i386-builtin.def   |   9 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   9 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  31 
 gcc/testsuite/gcc.target/i386/sse-13.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  12 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   9 +
 9 files changed, 273 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 3f833bffa54..afc1220fea4 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -2876,6 +2876,151 @@ _mm256_maskz_fnmadd_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_pd (__m256d __A, __mmask8 __U, __m256d __B,
+__m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __D,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_round_pd (__m256d __A, __m256d __B, __m256d __D,
+ __mmask8 __U, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __D, const int __R)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_maskz_round ((__v4df) __A,
+   (__v4df) __B,
+   (__v4df) __D,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_round_ph (__m256h __A, __m256h __B, __m256h __D, const int __R)
+{
+  return (__m256h)
+__builtin_ia32_vfnmsubph256_mask_round ((__v16hf) __A,
+   (__v16hf) __B,
+   (__v16hf) __D,
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_round_ph (__m256h __A, __mmask16 __U, __m256h __B,
+__m256h __D, const int __R)
+{
+  return

[gcc r15-3007] AVX10.2 ymm rounding: Support vgetexpp{s, d, h} and vgetmantp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:8d4f542935c09f40bb7fd8fd863cc8df80cc970e

commit r15-3007-g8d4f542935c09f40bb7fd8fd863cc8df80cc970e
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:11 2024 +0800

AVX10.2 ymm rounding: Support vgetexpp{s,d,h} and vgetmantp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin): Handle
V8SF_FTYPE_V8SF_V8SF_UQI_INT, V4DF_FTYPE_V4DF_V4DF_UQI_INT,
V16HF_FTYPE_V16HF_V16HF_UHI_INT, 
V16HF_FTYPE_V16HF_INT_V16HF_UHI_INT,
V4DF_FTYPE_V4DF_INT_V4DF_UQI_INT, V8SF_FTYPE_V8SF_INT_V8SF_UQI_INT.
* config/i386/sse.md:
(<avx512>_getexp<mode><mask_name><round_saeonly_name>):
Add condition check.
(<avx512>_getmant<mode><mask_name><round_saeonly_name>):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 341 +
 gcc/config/i386/i386-builtin-types.def |   6 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/i386-expand.cc |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  59 
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 11 files changed, 474 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index afc1220fea4..07729a6cc04 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3021,6 +3021,217 @@ _mm256_maskz_fnmsub_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_pd (__m256d __A, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask_round ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __R)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask_round ((__v16hf) __A,
+ (__v16hf) __W,
+ (__mmask16) __

[gcc r15-3008] AVX10.2 ymm rounding: Support v{max, min}p{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:cc8a7596477e9d6ac972aadabbb2fd88baa1abf4

commit r15-3008-gcc8a7596477e9d6ac972aadabbb2fd88baa1abf4
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:13 2024 +0800

AVX10.2 ymm rounding: Support v{max,min}p{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 360 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 470 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index 07729a6cc04..a5712f5230a 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3232,6 +3232,228 @@ _mm256_maskz_getmant_round_ps (__mmask8 __U, __m256 __A,
  _mm256_setzero_ps (),
  __U, __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_round_ph (__mmask16 __U, __m256h __A, __m256h __B,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_maxph256_mask_round ((__v16hf) __A,
+

[gcc r15-3009] AVX10.2 ymm rounding: Support vmulp{s, d, h} and vrangep{s, d} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:90cc5b0c4609a9fb3257d2cce7b7abc896c6faab

commit r15-3009-g90cc5b0c4609a9fb3257d2cce7b7abc896c6faab
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:14 2024 +0800

AVX10.2 ymm rounding: Support vmulp{s,d,h} and vrangep{s,d} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin-types.def: Add new DEF_FUNCTION_TYPE.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-expand.cc (ix86_expand_round_builtin):
Handle V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI_INT,
V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 313 +
 gcc/config/i386/i386-builtin-types.def |   2 +
 gcc/config/i386/i386-builtin.def   |   5 +
 gcc/config/i386/i386-expand.cc |   2 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   5 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  43 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   5 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   5 +
 10 files changed, 410 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index a5712f5230a..ac0914415c9 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3454,6 +3454,198 @@ _mm256_maskz_min_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
  (__mmask8) __U,
  __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_undefined_pd (),
+  (__mmask8) -1,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df) __W,
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask_round ((__v4df) __A,
+  (__v4df) __B,
+  (__v4df)
+  _mm256_setzero_pd (),
+  (__mmask8) __U,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf)
+  _mm256_undefined_ph (),
+  (__mmask16) -1,
+  __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ __m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_mulph256_mask_round ((__v16hf) __A,
+  (__v16hf) __B,
+  (__v16hf) __W,
+  (__mmask16)

[gcc r15-3010] AVX10.2 ymm rounding: Support vreducep{s, d, h} and vrndscalep{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:9afa5081212e1fc3cb2c4efc9b4f421eecf68810

commit r15-3010-g9afa5081212e1fc3cb2c4efc9b4f421eecf68810
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:18 2024 +0800

AVX10.2 ymm rounding: Support vreducep{s,d,h} and vrndscalep{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/sse.md:
(<mask_codefor>reducep<mode><mask_name><round_saeonly_name>):
Add condition check.
(<avx512>_rndscale<mode><mask_name><round_saeonly_name>): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 367 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/config/i386/sse.md |   4 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   6 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-22.c |  18 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 9 files changed, 479 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index ac0914415c9..d6b8e2695de 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3646,6 +3646,233 @@ _mm256_maskz_range_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
(__mmask8) __U,
__R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_pd (__m256d __A, const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+const int __C, const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_round_pd (__mmask8 __U, __m256d __A, const int __C,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask_round ((__v4df) __A,
+ __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_round_ph (__m256h __A, const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+const int __C, const int __R)
+{
+  return (__m256h) __builtin_ia32_reduceph256_mask_round ((__v16hf) __A,
+ __C,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h

[gcc r15-3011] AVX10.2 ymm rounding: Support vscalefp{s,d,h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:1f86cf06c7897f6ab467443b5fe8789cc95fe0c4

commit r15-3011-g1f86cf06c7897f6ab467443b5fe8789cc95fe0c4
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:19 2024 +0800

AVX10.2 ymm rounding: Support vscalefp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def: Add new builtins.
* config/i386/sse.md:
(<avx512>_scalef<mode><mask_name><round_name>): Add condition check.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 182 +
 gcc/config/i386/i386-builtin.def   |   3 +
 gcc/config/i386/sse.md |   2 +-
 gcc/testsuite/gcc.target/i386/avx-1.c  |   3 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  25 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   3 +
 gcc/testsuite/gcc.target/i386/sse-14.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-22.c |   9 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   3 +
 9 files changed, 238 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index d6b8e2695de..f35f2337858 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3873,6 +3873,119 @@ _mm256_maskz_roundscale_round_ps (__mmask8 __U, __m256 
__A, const int __C,
   (__mmask8) __U,
   __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_pd (__m256d __A, __m256d __B, const int __R)
+{
+  return
+(__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+(__v4df) __B,
+(__v4df)
+_mm256_undefined_pd (),
+(__mmask8) -1,
+__R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+__m256d __B, const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_round_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ const int __R)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask_round ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_round_ph (__m256h __A, __m256h __B, const int __R)
+{
+  return
+(__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+(__v16hf) __B,
+(__v16hf)
+_mm256_undefined_ph (),
+(__mmask16) -1,
+__R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+__m256h __B, const int __R)
+{
+  return (__m256h) __builtin_ia32_scalefph256_mask_round ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artifi

[gcc r15-3012] AVX10.2 ymm rounding: Support vsqrtp{s, d, h} and vsubp{s, d, h} intrins

2024-08-18 Thread Haochen Jiang via Gcc-cvs
https://gcc.gnu.org/g:7f62e7104ebc11c4570745972a023579922ef265

commit r15-3012-g7f62e7104ebc11c4570745972a023579922ef265
Author: Hu, Lin1 
Date:   Mon Aug 19 10:09:20 2024 +0800

AVX10.2 ymm rounding: Support vsqrtp{s,d,h} and vsubp{s,d,h} intrins

gcc/ChangeLog:

* config/i386/avx10_2roundingintrin.h: New intrins.
* config/i386/i386-builtin.def (BDESC): Add new builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-1.c: Add new builtin test.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add new macro test.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/avx10_2-rounding-3.c: Add test.

Diff:
---
 gcc/config/i386/avx10_2roundingintrin.h| 339 +
 gcc/config/i386/i386-builtin.def   |   6 +
 gcc/testsuite/gcc.target/i386/avx-1.c  |   6 +
 gcc/testsuite/gcc.target/i386/avx10_2-rounding-3.c |  50 +++
 gcc/testsuite/gcc.target/i386/sse-13.c |   7 +
 gcc/testsuite/gcc.target/i386/sse-14.c |  18 ++
 gcc/testsuite/gcc.target/i386/sse-22.c |  15 +
 gcc/testsuite/gcc.target/i386/sse-23.c |   6 +
 8 files changed, 447 insertions(+)

diff --git a/gcc/config/i386/avx10_2roundingintrin.h 
b/gcc/config/i386/avx10_2roundingintrin.h
index f35f2337858..c7146e37ec9 100644
--- a/gcc/config/i386/avx10_2roundingintrin.h
+++ b/gcc/config/i386/avx10_2roundingintrin.h
@@ -3986,6 +3986,216 @@ _mm256_maskz_scalef_round_ps (__mmask8 __U, __m256 __A, 
__m256 __B,
 (__mmask8) __U,
 __R);
 }
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_pd (__m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_undefined_pd (),
+   (__mmask8) -1,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_pd (__m256d __W, __mmask8 __U, __m256d __A,
+  const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df) __W,
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_pd (__mmask8 __U, __m256d __A, const int __R)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask_round ((__v4df) __A,
+   (__v4df)
+   _mm256_setzero_pd (),
+   (__mmask8) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ph (__m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_undefined_ph (),
+   (__mmask16) -1,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_round_ph (__m256h __W, __mmask16 __U, __m256h __A,
+  const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf) __W,
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_round_ph (__mmask16 __U, __m256h __A, const int __R)
+{
+  return (__m256h) __builtin_ia32_sqrtph256_mask_round ((__v16hf) __A,
+   (__v16hf)
+   _mm256_setzero_ph (),
+   (__mmask16) __U,
+   __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_round_ps (__m256 __A, const int __R)
+{
+  return (__m256) __builtin_ia3

[gcc r15-3013] aarch64: Implement 16-byte vector mode const0 store by TImode

2024-08-18 Thread HaoChen Gui via Gcc-cvs
https://gcc.gnu.org/g:8d6c6fbc5271dde433998c09407b30e2cf195420

commit r15-3013-g8d6c6fbc5271dde433998c09407b30e2cf195420
Author: Haochen Gui 
Date:   Mon Aug 19 10:35:47 2024 +0800

aarch64: Implement 16-byte vector mode const0 store by TImode

gcc/
* config/aarch64/aarch64-simd.md (mov<mode> for VSTRUCT_QD):
Expand 16-byte vector mode const0 store by TImode.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 475f19766c3..23c03a96371 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7809,7 +7809,16 @@
(match_operand:VSTRUCT_QD 1 "general_operand"))]
   "TARGET_FLOAT"
 {
-  if (can_create_pseudo_p ())
+  if (known_eq (GET_MODE_SIZE (<MODE>mode), 16)
+  && operands[1] == CONST0_RTX (<MODE>mode)
+  && MEM_P (operands[0])
+  && (can_create_pseudo_p ()
+ || memory_address_p (TImode, XEXP (operands[0], 0))))
+{
+  operands[0] = adjust_address (operands[0], TImode, 0);
+  operands[1] = CONST0_RTX (TImode);
+}
+  else if (can_create_pseudo_p ())
 {
   if (GET_CODE (operands[0]) != REG)
operands[1] = force_reg (<MODE>mode, operands[1]);