[gcc r15-1726] vect: Fix shift-by-induction for single-lane slp
https://gcc.gnu.org/g:1ff5f8f8a05dd57620a1e2abbf87bd511b113cce commit r15-1726-g1ff5f8f8a05dd57620a1e2abbf87bd511b113cce Author: Feng Xue Date: Wed Jun 26 22:02:53 2024 +0800 vect: Fix shift-by-induction for single-lane slp Allow shift-by-induction for slp node, when it is single lane, which is aligned with the original loop-based handling. 2024-06-26 Feng Xue gcc/ * tree-vect-stmts.cc (vectorizable_shift): Allow shift-by-induction for single-lane slp node. gcc/testsuite/ * gcc.dg/vect/vect-shift-6.c * gcc.dg/vect/vect-shift-7.c Diff: --- gcc/testsuite/gcc.dg/vect/vect-shift-6.c | 52 gcc/testsuite/gcc.dg/vect/vect-shift-7.c | 69 gcc/tree-vect-stmts.cc | 2 +- 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-shift-6.c b/gcc/testsuite/gcc.dg/vect/vect-shift-6.c new file mode 100644 index 000..277093bc7bb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-shift-6.c @@ -0,0 +1,52 @@ +/* { dg-require-effective-target vect_shift } */ +/* { dg-require-effective-target vect_int } */ + +#include +#include +#include "tree-vect.h" + +#define N 32 + +int32_t A[N]; +int32_t B[N]; + +#define FN(name) \ +__attribute__((noipa)) \ +void name(int32_t *a) \ +{ \ + for (int i = 0; i < N / 2; i++) \ +{ \ + a[2 * i + 0] <<= i; \ + a[2 * i + 1] <<= i; \ +} \ +} + + +FN(foo_vec) + +#pragma GCC push_options +#pragma GCC optimize ("O0") +FN(foo_novec) +#pragma GCC pop_options + +int main () +{ + int i; + + check_vect (); + +#pragma GCC novector + for (i = 0; i < N; i++) +A[i] = B[i] = -(i + 1); + + foo_vec(A); + foo_novec(B); + + /* check results: */ +#pragma GCC novector + for (i = 0; i < N; i++) +if (A[i] != B[i]) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-shift-7.c b/gcc/testsuite/gcc.dg/vect/vect-shift-7.c new file mode 100644 index 000..6de3f39a87f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-shift-7.c @@ -0,0 +1,69 @@ +/* { dg-require-effective-target vect_shift } */ +/* { dg-require-effective-target 
vect_int } */ +/* { dg-additional-options "--param max-completely-peel-times=6" } */ + +#include +#include +#include "tree-vect.h" + +#define N 16 +#define M 16 + +int32_t A[N]; +int32_t B[N]; + +#define FN(name) \ +__attribute__((noipa)) \ +void name(int32_t *a, int m) \ +{ \ + for (int i = 0; i < N / 2; i++) \ +{ \ + int s1 = i; \ + int s2 = s1 + 1; \ + int32_t r1 = 0; \ + int32_t r2 = 7; \ + int32_t t1 = m; \ + \ + for (int j = 0; j < M; j++) \ + { \ +r1 += t1 << s1;\ +r2 += t1 << s2;\ +t1++; \ +s1++; \ +s2++; \ + } \ + \ + a[2 * i + 0] = r1; \ + a[2 * i + 1] = r2; \ +} \ +} + + +FN(foo_vec) + +#pragma GCC push_options +#pragma GCC optimize ("O0") +FN(foo_novec) +#pragma GCC pop_options + +int main () +{ + int i; + + check_vect (); + +#pragma GCC novector + for (i = 0; i < N; i++) +A[i] = B[i] = 0; + + foo_vec(A, 0); + foo_novec(B, 0); + + /* check results: */ +#pragma GCC novector + for (i = 0; i < N; i++) +if (A[i] != B[i]) + abort (); + + return 0; +} diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 7b889f31645..aab3aa59962 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6175,7 +6175,7 @@ vectorizable_shift (vec_info *vinfo, if ((dt[1] == vect_internal_def || dt[1] == vect_induction_def || dt[1] == vect_nested_cycle) - && !slp_node) + && (!slp_node || SLP_TREE_LANES (slp_node) == 1)) scalar_shift_arg = false; else if (dt[1] == vect_constant_def || dt[1] == vect_external_def
[gcc r15-1727] vect: Determine input vectype for multiple lane-reducing operations
https://gcc.gnu.org/g:3aa004f1db327d5728a8fd0afcfed24e767f0499 commit r15-1727-g3aa004f1db327d5728a8fd0afcfed24e767f0499 Author: Feng Xue Date: Sun Jun 16 13:00:32 2024 +0800 vect: Determine input vectype for multiple lane-reducing operations The input vectype of reduction PHI statement must be determined before vect cost computation for the reduction. Since a lane-reducing operation has a different input vectype from a normal one, we need to traverse all reduction statements to find out the input vectype with the least lanes, and set that to the PHI statement. 2024-06-16 Feng Xue gcc/ * tree-vect-loop.cc (vectorizable_reduction): Determine input vectype during traversal of reduction statements. Diff: --- gcc/tree-vect-loop.cc | 79 --- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 6f32867f85a..3095ff5ab6b 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo, { stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); stmt_vec_info vdef = vect_stmt_to_vectorize (def); - if (STMT_VINFO_REDUC_IDX (vdef) == -1) + int reduc_idx = STMT_VINFO_REDUC_IDX (vdef); + + if (reduc_idx == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -7686,10 +7688,57 @@ vectorizable_reduction (loop_vec_info loop_vinfo, return false; } } - else if (!stmt_info) - /* First non-conversion stmt. */ - stmt_info = vdef; - reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)]; + else + { + /* First non-conversion stmt. */ + if (!stmt_info) + stmt_info = vdef; + + if (lane_reducing_op_p (op.code)) + { + enum vect_def_type dt; + tree vectype_op; + + /* The last operand of lane-reducing operation is for +reduction. 
*/ + gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1); + + if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op)) + return false; + + tree type_op = TREE_TYPE (op.ops[0]); + + if (!vectype_op) + { + vectype_op = get_vectype_for_scalar_type (loop_vinfo, + type_op); + if (!vectype_op) + return false; + } + + /* For lane-reducing operation vectorizable analysis needs the +reduction PHI information */ + STMT_VINFO_REDUC_DEF (def) = phi_info; + + /* Each lane-reducing operation has its own input vectype, while +reduction PHI will record the input vectype with the least +lanes. */ + STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op; + + /* To accommodate lane-reducing operations of mixed input +vectypes, choose input vectype with the least lanes for the +reduction PHI statement, which would result in the most +ncopies for vectorized reduction results. */ + if (!vectype_in + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op + vectype_in = vectype_op; + } + else + vectype_in = STMT_VINFO_VECTYPE (phi_info); + } + + reduc_def = op.ops[reduc_idx]; reduc_chain_length++; if (!stmt_info && slp_node) slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0]; @@ -7747,6 +7796,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo, tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; + gimple_match_op op; if (!gimple_extract_op (stmt_info->stmt, &op)) gcc_unreachable (); @@ -7831,16 +7882,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op.ops[i]), slp_op[i]); - /* To properly compute ncopies we are interested in the widest -non-reduction input type in case we're looking at a widening -accumulation that we later handle in vect_transform_reduction. 
*/ - if (lane_reducing - && vectype_op[i] - && (!vectype_in - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i])) - vectype_in = vectype_op[i]; - /* Record how the non-reduction-def value of COND
[gcc r15-1728] Harden SLP reduction support wrt STMT_VINFO_REDUC_IDX
https://gcc.gnu.org/g:b443d7122ee8013c5af127d3d183a03962967f57 commit r15-1728-gb443d7122ee8013c5af127d3d183a03962967f57 Author: Richard Biener Date: Thu Jun 27 11:36:07 2024 +0200 Harden SLP reduction support wrt STMT_VINFO_REDUC_IDX The following makes sure that for SLP reductions all lanes have the same STMT_VINFO_REDUC_IDX. Once we move that info and can adjust it we can implement swapping. It also makes the existing protection against operand swapping trigger for all stmts participating in a reduction, not just the final one marked as reduction-def. * tree-vect-slp.cc (vect_build_slp_tree_1): Compare STMT_VINFO_REDUC_IDX. (vect_build_slp_tree_2): Prevent operand swapping for all stmts participating in a reduction. Diff: --- gcc/tree-vect-slp.cc | 23 +-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index dd9017e5b3a..48e0f9d2705 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1072,6 +1072,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, stmt_vec_info first_load = NULL, prev_first_load = NULL; bool first_stmt_ldst_p = false, ldst_p = false; bool first_stmt_phi_p = false, phi_p = false; + int first_reduc_idx = -1; bool maybe_soft_fail = false; tree soft_fail_nunits_vectype = NULL_TREE; @@ -1204,6 +1205,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, first_stmt_code = rhs_code; first_stmt_ldst_p = ldst_p; first_stmt_phi_p = phi_p; + first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); /* Shift arguments should be equal in all the packed stmts for a vector shift with scalar shift operand. */ @@ -1267,6 +1269,24 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } else { + if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info) + /* For SLP reduction groups the index isn't necessarily +uniform but only that of the first stmt matters. 
*/ + && !(first_reduc_idx != -1 + && STMT_VINFO_REDUC_IDX (stmt_info) != -1 + && REDUC_GROUP_FIRST_ELEMENT (stmt_info))) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: different reduc_idx " + "%d instead of %d in %G", + STMT_VINFO_REDUC_IDX (stmt_info), + first_reduc_idx, stmt); + } + /* Mismatch. */ + continue; + } if (first_stmt_code != rhs_code && alt_stmt_code == ERROR_MARK) alt_stmt_code = rhs_code; @@ -2530,8 +2550,7 @@ out: && oprnds_info[1]->first_dt == vect_internal_def && is_gimple_assign (stmt_info->stmt) /* Swapping operands for reductions breaks assumptions later on. */ - && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def - && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) + && STMT_VINFO_REDUC_IDX (stmt_info) == -1) { /* See whether we can swap the matching or the non-matching stmt operands. */
[gcc r15-1729] tree-optimization/115701 - factor out maybe_duplicate_ssa_info_at_copy
https://gcc.gnu.org/g:b5c64b413fd5bc03a1a8ef86d005892071e42cbe commit r15-1729-gb5c64b413fd5bc03a1a8ef86d005892071e42cbe Author: Richard Biener Date: Sun Jun 30 11:28:11 2024 +0200 tree-optimization/115701 - factor out maybe_duplicate_ssa_info_at_copy The following factors out the code that preserves SSA info of the LHS of a SSA copy LHS = RHS when LHS is about to be eliminated to RHS. PR tree-optimization/115701 * tree-ssanames.h (maybe_duplicate_ssa_info_at_copy): Declare. * tree-ssanames.cc (maybe_duplicate_ssa_info_at_copy): New function, split out from ... * tree-ssa-copy.cc (fini_copy_prop): ... here. * tree-ssa-sccvn.cc (eliminate_dom_walker::eliminate_stmt): ... and here. Diff: --- gcc/tree-ssa-copy.cc | 32 ++-- gcc/tree-ssa-sccvn.cc | 21 ++--- gcc/tree-ssanames.cc | 28 gcc/tree-ssanames.h | 3 ++- 4 files changed, 34 insertions(+), 50 deletions(-) diff --git a/gcc/tree-ssa-copy.cc b/gcc/tree-ssa-copy.cc index bb88472304c..9c9ec47adca 100644 --- a/gcc/tree-ssa-copy.cc +++ b/gcc/tree-ssa-copy.cc @@ -527,38 +527,10 @@ fini_copy_prop (void) || copy_of[i].value == var) continue; - /* In theory the points-to solution of all members of the - copy chain is their intersection. For now we do not bother -to compute this but only make sure we do not lose points-to -information completely by setting the points-to solution -of the representative to the first solution we find if -it doesn't have one already. */ + /* Duplicate points-to and range info appropriately. 
*/ if (copy_of[i].value != var && TREE_CODE (copy_of[i].value) == SSA_NAME) - { - basic_block copy_of_bb - = gimple_bb (SSA_NAME_DEF_STMT (copy_of[i].value)); - basic_block var_bb = gimple_bb (SSA_NAME_DEF_STMT (var)); - if (POINTER_TYPE_P (TREE_TYPE (var)) - && SSA_NAME_PTR_INFO (var) - && !SSA_NAME_PTR_INFO (copy_of[i].value)) - { - duplicate_ssa_name_ptr_info (copy_of[i].value, - SSA_NAME_PTR_INFO (var)); - /* Points-to information is cfg insensitive, -but [E]VRP might record context sensitive alignment -info, non-nullness, etc. So reset context sensitive -info if the two SSA_NAMEs aren't defined in the same -basic block. */ - if (var_bb != copy_of_bb) - reset_flow_sensitive_info (copy_of[i].value); - } - else if (!POINTER_TYPE_P (TREE_TYPE (var)) - && SSA_NAME_RANGE_INFO (var) - && !SSA_NAME_RANGE_INFO (copy_of[i].value) - && var_bb == copy_of_bb) - duplicate_ssa_name_range_info (copy_of[i].value, var); - } + maybe_duplicate_ssa_info_at_copy (var, copy_of[i].value); } class copy_folder copy_folder; diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc index fbbfa557833..dc377fa16ce 100644 --- a/gcc/tree-ssa-sccvn.cc +++ b/gcc/tree-ssa-sccvn.cc @@ -6886,27 +6886,10 @@ eliminate_dom_walker::eliminate_stmt (basic_block b, gimple_stmt_iterator *gsi) /* If this now constitutes a copy duplicate points-to and range info appropriately. This is especially -important for inserted code. See tree-ssa-copy.cc -for similar code. */ +important for inserted code. */ if (sprime && TREE_CODE (sprime) == SSA_NAME) - { - basic_block sprime_b = gimple_bb (SSA_NAME_DEF_STMT (sprime)); - if (POINTER_TYPE_P (TREE_TYPE (lhs)) - && SSA_NAME_PTR_INFO (lhs) - && ! SSA_NAME_PTR_INFO (sprime)) - { - duplicate_ssa_name_ptr_info (sprime, - SSA_NAME_PTR_INFO (lhs)); - if (b != sprime_b) - reset_flow_sensitive_info (sprime); - } - else if (INTEGRAL_TYPE_P (TREE_TYPE (lhs)) - && SSA_NAME_RANGE_INFO (lhs) - && ! 
SSA_NAME_RANGE_INFO (sprime) - && b == sprime_b) - duplicate_ssa_name_range_info (sprime, lhs); - } + maybe_duplicate_ssa_info_at_copy (lhs, sprime); /* Inhibit the use of an inserted PHI on a loop header when the address of the memory reference is a simple induction diff --git a/gcc/tree-ssanames.cc b/gcc/tree-ssanames.cc index 411ea848c49..bb9ed373f36 100644 --- a/gcc/tree-ssanames.cc +++ b/gcc/tree-ssanames.cc @@ -769,6 +769,34 @@ duplicate_ssa_name_range_info (tree name, tree src) } } +/* For a SSA copy DEST = SRC duplicate SSA info present on DEST to SRC + to preserve it in case DEST is eliminated to SRC. */ + +void +maybe_duplicate_ssa_info_at_co
[gcc r15-1730] tree-optimization/115701 - fix maybe_duplicate_ssa_info_at_copy
https://gcc.gnu.org/g:b77f17c5feec9614568bf2dee7f7d811465ee4a5 commit r15-1730-gb77f17c5feec9614568bf2dee7f7d811465ee4a5 Author: Richard Biener Date: Sun Jun 30 11:34:43 2024 +0200 tree-optimization/115701 - fix maybe_duplicate_ssa_info_at_copy The following restricts copying of points-to info from defs that might be in regions invoking UB and are never executed. PR tree-optimization/115701 * tree-ssanames.cc (maybe_duplicate_ssa_info_at_copy): Only copy info from within the same BB. * gcc.dg/torture/pr115701.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/torture/pr115701.c | 22 ++ gcc/tree-ssanames.cc| 22 -- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/gcc/testsuite/gcc.dg/torture/pr115701.c b/gcc/testsuite/gcc.dg/torture/pr115701.c new file mode 100644 index 000..9b7c34b23d7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr115701.c @@ -0,0 +1,22 @@ +/* { dg-do run } */ +/* IPA PTA disables local PTA recompute after IPA. */ +/* { dg-additional-options "-fipa-pta" } */ + +int a, c, d; +static int b; +int main() +{ + int *e = &a, **f = &e; + while (1) { +int **g, ***h = &f; +if (c) + *g = e; +else if (!b) + break; +*e = **g; +e = &d; + } + if (e != &a) +__builtin_abort(); + return 0; +} diff --git a/gcc/tree-ssanames.cc b/gcc/tree-ssanames.cc index bb9ed373f36..4f83fcbb517 100644 --- a/gcc/tree-ssanames.cc +++ b/gcc/tree-ssanames.cc @@ -775,25 +775,19 @@ duplicate_ssa_name_range_info (tree name, tree src) void maybe_duplicate_ssa_info_at_copy (tree dest, tree src) { + /* While points-to info is flow-insensitive we have to avoid copying + info from not executed regions invoking UB to dominating defs. */ + if (gimple_bb (SSA_NAME_DEF_STMT (src)) + != gimple_bb (SSA_NAME_DEF_STMT (dest))) +return; + if (POINTER_TYPE_P (TREE_TYPE (dest)) && SSA_NAME_PTR_INFO (dest) && ! 
SSA_NAME_PTR_INFO (src)) -{ - duplicate_ssa_name_ptr_info (src, SSA_NAME_PTR_INFO (dest)); - /* Points-to information is cfg insensitive, -but VRP might record context sensitive alignment -info, non-nullness, etc. So reset context sensitive -info if the two SSA_NAMEs aren't defined in the same -basic block. */ - if (gimple_bb (SSA_NAME_DEF_STMT (src)) - != gimple_bb (SSA_NAME_DEF_STMT (dest))) - reset_flow_sensitive_info (src); -} +duplicate_ssa_name_ptr_info (src, SSA_NAME_PTR_INFO (dest)); else if (INTEGRAL_TYPE_P (TREE_TYPE (dest)) && SSA_NAME_RANGE_INFO (dest) - && ! SSA_NAME_RANGE_INFO (src) - && (gimple_bb (SSA_NAME_DEF_STMT (src)) - == gimple_bb (SSA_NAME_DEF_STMT (dest + && ! SSA_NAME_RANGE_INFO (src)) duplicate_ssa_name_range_info (src, dest); }
[gcc r15-1731] hppa: Fix ICE caused by mismatched predicate and constraint in xmpyu patterns
https://gcc.gnu.org/g:30ad2fafa9ab2497cc12df62a3240cff6ef25d00 commit r15-1731-g30ad2fafa9ab2497cc12df62a3240cff6ef25d00 Author: John David Anglin Date: Sun Jun 30 09:48:21 2024 -0400 hppa: Fix ICE caused by mismatched predicate and constraint in xmpyu patterns 2024-06-30 John David Anglin gcc/ChangeLog: PR target/115691 * config/pa/pa.md: Remove incorrect xmpyu patterns. Diff: --- gcc/config/pa/pa.md | 18 -- 1 file changed, 18 deletions(-) diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md index b0f29a44bae..9e410f43052 100644 --- a/gcc/config/pa/pa.md +++ b/gcc/config/pa/pa.md @@ -5503,24 +5503,6 @@ [(set_attr "type" "fpmuldbl") (set_attr "length" "4")]) -(define_insn "" - [(set (match_operand:DI 0 "register_operand" "=f") - (mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand" "f")) -(match_operand:DI 2 "uint32_operand" "f")))] - "TARGET_PA_11 && ! TARGET_SOFT_FLOAT && ! TARGET_SOFT_MULT && !TARGET_64BIT" - "xmpyu %1,%R2,%0" - [(set_attr "type" "fpmuldbl") - (set_attr "length" "4")]) - -(define_insn "" - [(set (match_operand:DI 0 "register_operand" "=f") - (mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand" "f")) -(match_operand:DI 2 "uint32_operand" "f")))] - "TARGET_PA_11 && ! TARGET_SOFT_FLOAT && ! TARGET_SOFT_MULT && TARGET_64BIT" - "xmpyu %1,%2R,%0" - [(set_attr "type" "fpmuldbl") - (set_attr "length" "4")]) - (define_insn "" [(set (reg:SI 29) (mult:SI (reg:SI 26) (reg:SI 25))) (clobber (match_operand:SI 0 "register_operand" "=a"))
[gcc r14-10362] Fortran: fix for CHARACTER(len=*) dummies with bind(C) [PR115390]
https://gcc.gnu.org/g:b31e1900fa0cffabb0702962d01ba3fe917fdf69 commit r14-10362-gb31e1900fa0cffabb0702962d01ba3fe917fdf69 Author: Harald Anlauf Date: Tue Jun 18 21:57:19 2024 +0200 Fortran: fix for CHARACTER(len=*) dummies with bind(C) [PR115390] gcc/fortran/ChangeLog: PR fortran/115390 * trans-decl.cc (gfc_conv_cfi_to_gfc): Move derivation of type sizes for character via gfc_trans_vla_type_sizes to after character length has been set. gcc/testsuite/ChangeLog: PR fortran/115390 * gfortran.dg/bind_c_char_11.f90: New test. (cherry picked from commit 954f9011c4923b72f42cc6ca8460333e7c7aad98) Diff: --- gcc/fortran/trans-decl.cc| 4 +-- gcc/testsuite/gfortran.dg/bind_c_char_11.f90 | 45 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/trans-decl.cc b/gcc/fortran/trans-decl.cc index 301439baaf5..1a319b27449 100644 --- a/gcc/fortran/trans-decl.cc +++ b/gcc/fortran/trans-decl.cc @@ -7056,8 +7056,8 @@ gfc_conv_cfi_to_gfc (stmtblock_t *init, stmtblock_t *finally, if (sym->ts.type == BT_CHARACTER && !INTEGER_CST_P (sym->ts.u.cl->backend_decl)) { - gfc_conv_string_length (sym->ts.u.cl, NULL, init); - gfc_trans_vla_type_sizes (sym, init); + gfc_conv_string_length (sym->ts.u.cl, NULL, &block); + gfc_trans_vla_type_sizes (sym, &block); } /* gfc->data = cfi->base_addr - or for scalars: gfc = cfi->base_addr. diff --git a/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 new file mode 100644 index 000..5ed8e82853b --- /dev/null +++ b/gcc/testsuite/gfortran.dg/bind_c_char_11.f90 @@ -0,0 +1,45 @@ +! { dg-do compile } +! { dg-additional-options "-Wuninitialized" } +! +! 
PR fortran/115390 - fixes for CHARACTER(len=*) dummies with bind(C) + +module test + implicit none +contains + subroutine bar(s,t) bind(c) +character(*), intent(in) :: s,t +optional :: t +call foo(s,t) + end + subroutine bar1(s,t) bind(c) +character(*), intent(in) :: s(:),t(:) +optional :: t +call foo1(s,t) + end + subroutine bar4(s,t) bind(c) +character(len=*,kind=4), intent(in) :: s,t +optional:: t +call foo4(s,t) + end + subroutine bar5(s,t) bind(c) +character(len=*,kind=4), intent(in) :: s(:),t(:) +optional:: t +call foo5(s,t) + end + subroutine foo(s,t) +character(*), intent(in) :: s,t +optional :: t + end + subroutine foo1(s,t) +character(*), intent(in) :: s(:),t(:) +optional :: t + end + subroutine foo4(s,t) +character(len=*,kind=4), intent(in) :: s,t +optional:: t + end + subroutine foo5(s,t) +character(len=*,kind=4), intent(in) :: s(:),t(:) +optional:: t + end +end
[gcc r14-10363] Fortran: fix passing of optional dummy as actual to optional argument [PR55978]
https://gcc.gnu.org/g:9f147487de660f026e2fb1281e1a1800f58b3bdd commit r14-10363-g9f147487de660f026e2fb1281e1a1800f58b3bdd Author: Harald Anlauf Date: Sun Jun 23 22:36:43 2024 +0200 Fortran: fix passing of optional dummy as actual to optional argument [PR55978] gcc/fortran/ChangeLog: PR fortran/55978 * trans-array.cc (gfc_conv_array_parameter): Do not dereference data component of a missing allocatable dummy array argument for passing as actual to optional dummy. Harden logic of presence check for optional pointer dummy by using TRUTH_ANDIF_EXPR instead of TRUTH_AND_EXPR. gcc/testsuite/ChangeLog: PR fortran/55978 * gfortran.dg/optional_absent_12.f90: New test. (cherry picked from commit f02c70dafd384f0c44d7a0920f4a75a30e267045) Diff: --- gcc/fortran/trans-array.cc | 20 gcc/testsuite/gfortran.dg/optional_absent_12.f90 | 30 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index a15ff30e8f4..d4b16772de2 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -8673,6 +8673,10 @@ gfc_conv_array_parameter (gfc_se * se, gfc_expr * expr, bool g77, && (sym->backend_decl != parent)) this_array_result = false; + /* Passing an optional dummy argument as actual to an optional dummy? */ + bool pass_optional; + pass_optional = fsym && fsym->attr.optional && sym && sym->attr.optional; + /* Passing address of the array if it is not pointer or assumed-shape. 
*/ if (full_array_var && g77 && !this_array_result && sym->ts.type != BT_DERIVED && sym->ts.type != BT_CLASS) @@ -8710,6 +8714,14 @@ gfc_conv_array_parameter (gfc_se * se, gfc_expr * expr, bool g77, if (size) array_parameter_size (&se->pre, tmp, expr, size); se->expr = gfc_conv_array_data (tmp); + if (pass_optional) + { + tree cond = gfc_conv_expr_present (sym); + se->expr = build3_loc (input_location, COND_EXPR, +TREE_TYPE (se->expr), cond, se->expr, +fold_convert (TREE_TYPE (se->expr), + null_pointer_node)); + } return; } } @@ -8959,8 +8971,8 @@ gfc_conv_array_parameter (gfc_se * se, gfc_expr * expr, bool g77, tmp = fold_build2_loc (input_location, NE_EXPR, logical_type_node, fold_convert (TREE_TYPE (tmp), ptr), tmp); - if (fsym && fsym->attr.optional && sym && sym->attr.optional) - tmp = fold_build2_loc (input_location, TRUTH_AND_EXPR, + if (pass_optional) + tmp = fold_build2_loc (input_location, TRUTH_ANDIF_EXPR, logical_type_node, gfc_conv_expr_present (sym), tmp); @@ -8994,8 +9006,8 @@ gfc_conv_array_parameter (gfc_se * se, gfc_expr * expr, bool g77, tmp = fold_build2_loc (input_location, NE_EXPR, logical_type_node, fold_convert (TREE_TYPE (tmp), ptr), tmp); - if (fsym && fsym->attr.optional && sym && sym->attr.optional) - tmp = fold_build2_loc (input_location, TRUTH_AND_EXPR, + if (pass_optional) + tmp = fold_build2_loc (input_location, TRUTH_ANDIF_EXPR, logical_type_node, gfc_conv_expr_present (sym), tmp); diff --git a/gcc/testsuite/gfortran.dg/optional_absent_12.f90 b/gcc/testsuite/gfortran.dg/optional_absent_12.f90 new file mode 100644 index 000..1e61d91fb6d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/optional_absent_12.f90 @@ -0,0 +1,30 @@ +! { dg-do run } +! { dg-additional-options "-fcheck=array-temps" } +! +! PR fortran/55978 - comment#19 +! +! 
Test passing of (missing) optional dummy to optional array argument + +program test + implicit none + integer, pointer :: p(:) => null() + call one (p) + call one (null()) + call one () + call three () +contains + subroutine one (y) +integer, pointer, optional, intent(in) :: y(:) +call two (y) + end subroutine one + + subroutine three (z) +integer, allocatable, optional, intent(in) :: z(:) +call two (z) + end subroutine three + + subroutine two (x) +integer, optional, intent(in) :: x(*) +if (present (x)) stop 1 + end subroutine two +end
[gcc r14-10364] Fortran: fix ALLOCATE with SOURCE of deferred character length [PR114019]
https://gcc.gnu.org/g:603b344c07aa55f8292446e8fd28f5da9a983a21 commit r14-10364-g603b344c07aa55f8292446e8fd28f5da9a983a21 Author: Harald Anlauf Date: Fri Jun 28 21:44:06 2024 +0200 Fortran: fix ALLOCATE with SOURCE of deferred character length [PR114019] gcc/fortran/ChangeLog: PR fortran/114019 * trans-stmt.cc (gfc_trans_allocate): Fix handling of case of scalar character expression being used for SOURCE. gcc/testsuite/ChangeLog: PR fortran/114019 * gfortran.dg/allocate_with_source_33.f90: New test. (cherry picked from commit 7682d115402743090f20aca63a3b5e6c205dedff) Diff: --- gcc/fortran/trans-stmt.cc | 5 +- .../gfortran.dg/allocate_with_source_33.f90| 69 ++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/gcc/fortran/trans-stmt.cc b/gcc/fortran/trans-stmt.cc index 87dd833872a..1fd75c6a37c 100644 --- a/gcc/fortran/trans-stmt.cc +++ b/gcc/fortran/trans-stmt.cc @@ -6464,7 +6464,10 @@ gfc_trans_allocate (gfc_code * code, gfc_omp_namelist *omp_allocate) else if (se.expr != NULL_TREE && temp_var_needed) { tree var, desc; - tmp = GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (se.expr)) || is_coarray ? + tmp = (GFC_DESCRIPTOR_TYPE_P (TREE_TYPE (se.expr)) +|| is_coarray +|| (code->expr3->ts.type == BT_CHARACTER +&& code->expr3->rank == 0)) ? se.expr : build_fold_indirect_ref_loc (input_location, se.expr); diff --git a/gcc/testsuite/gfortran.dg/allocate_with_source_33.f90 b/gcc/testsuite/gfortran.dg/allocate_with_source_33.f90 new file mode 100644 index 000..43a03625950 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/allocate_with_source_33.f90 @@ -0,0 +1,69 @@ +! { dg-do compile } +! { dg-options "-O0" } +! +! 
PR fortran/114019 - allocation with source of deferred character length + +subroutine s + implicit none + character(1) :: w = "4" + character(*), parameter :: str = "123" + character(5), pointer :: chr_pointer1 + character(:), pointer :: chr_pointer2 + character(:), pointer :: chr_ptr_arr(:) + character(5), allocatable :: chr_alloc1 + character(:), allocatable :: chr_alloc2 + character(:), allocatable :: chr_all_arr(:) + allocate (chr_pointer1, source=w// str//w) + allocate (chr_pointer2, source=w// str//w) + allocate (chr_ptr_arr, source=w//[str//w]) + allocate (chr_alloc1, source=w// str//w) + allocate (chr_alloc2, source=w// str//w) + allocate (chr_all_arr, source=w//[str//w]) + allocate (chr_pointer2, source=str) + allocate (chr_pointer2, source=w) + allocate (chr_alloc2, source=str) + allocate (chr_alloc2, source=w) + allocate (chr_pointer1, mold =w// str//w) + allocate (chr_pointer2, mold =w// str//w) + allocate (chr_ptr_arr, mold =w//[str//w]) + allocate (chr_alloc1, mold =w// str//w) + allocate (chr_alloc2, mold =w// str//w) + allocate (chr_all_arr, mold =w//[str//w]) + allocate (chr_pointer2, mold =str) + allocate (chr_pointer2, mold =w) + allocate (chr_alloc2, mold =str) + allocate (chr_alloc2, mold =w) +end + +subroutine s2 + implicit none + integer, parameter :: ck=4 + character(kind=ck,len=1) :: w = ck_"4" + character(kind=ck,len=*), parameter :: str = ck_"123" + character(kind=ck,len=5), pointer :: chr_pointer1 + character(kind=ck,len=:), pointer :: chr_pointer2 + character(kind=ck,len=:), pointer :: chr_ptr_arr(:) + character(kind=ck,len=5), allocatable :: chr_alloc1 + character(kind=ck,len=:), allocatable :: chr_alloc2 + character(kind=ck,len=:), allocatable :: chr_all_arr(:) + allocate (chr_pointer1, source=w// str//w) + allocate (chr_pointer2, source=w// str//w) + allocate (chr_ptr_arr, source=w//[str//w]) + allocate (chr_alloc1, source=w// str//w) + allocate (chr_alloc2, source=w// str//w) + allocate (chr_all_arr, source=w//[str//w]) + allocate 
(chr_pointer2, source=str) + allocate (chr_pointer2, source=w) + allocate (chr_alloc2, source=str) + allocate (chr_alloc2, source=w) + allocate (chr_pointer1, mold =w// str//w) + allocate (chr_pointer2, mold =w// str//w) + allocate (chr_ptr_arr, mold =w//[str//w]) + allocate (chr_alloc1, mold =w// str//w) + allocate (chr_alloc2, mold =w// str//w) + allocate (chr_all_arr, mold =w//[str//w]) + allocate (chr_pointer2, mold =str) + allocate (chr_pointer2, mold =w) + allocate (chr_alloc2, mold =str) + allocate (chr_alloc2, mold =w) +end
[gcc r15-1733] Define mask as extern instead of uninitialized local variables.
https://gcc.gnu.org/g:5e1a9f4ccff390ae79a9b9d0d39b325f2b4ea925 commit r15-1733-g5e1a9f4ccff390ae79a9b9d0d39b325f2b4ea925 Author: liuhongt Date: Wed Jun 26 11:17:46 2024 +0800 Define mask as extern instead of uninitialized local variables. The testcases are supposed to scan for vpopcnt{b,w,d,q} operations with k mask, but mask is defined as uninitialized local variable which will be set as 0 at rtl expand phase. And it's further simplified off by late_combine which caused scan assembly failure. Move the definition of mask outside to make the testcases more stable. gcc/testsuite/ChangeLog: PR target/115610 * gcc.target/i386/avx512bitalg-vpopcntb.c: Define mask as extern instead of uninitialized local variables. * gcc.target/i386/avx512bitalg-vpopcntbvl.c: Ditto. * gcc.target/i386/avx512bitalg-vpopcntw.c: Ditto. * gcc.target/i386/avx512bitalg-vpopcntwvl.c: Ditto. * gcc.target/i386/avx512vpopcntdq-vpopcntd.c: Ditto. * gcc.target/i386/avx512vpopcntdq-vpopcntq.c: Ditto. Diff: --- gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c| 3 +-- gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c | 4 ++-- gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c| 2 +- gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c | 4 ++-- gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c | 5 +++-- gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c index 44b82c0519d..66d24107c26 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c @@ -7,10 +7,9 @@ #include extern __m512i z, z1; - +extern __mmask16 msk; int foo () { - __mmask16 msk; __m512i c = _mm512_popcnt_epi8 (z); asm volatile ("" : "+v" (c)); c = _mm512_mask_popcnt_epi8 (z1, msk, z); diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c index 8c2dfaba9c6..8ab05653f7c 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c @@ -11,11 +11,11 @@ extern __m256i y, y_1; extern __m128i x, x_1; +extern __mmask32 msk32; +extern __mmask16 msk16; int foo () { - __mmask32 msk32; - __mmask16 msk16; __m256i c256 = _mm256_popcnt_epi8 (y); asm volatile ("" : "+v" (c256)); c256 = _mm256_mask_popcnt_epi8 (y_1, msk32, y); diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c index 2ef8589f6c1..c741bf48a51 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c @@ -7,10 +7,10 @@ #include extern __m512i z, z1; +extern __mmask16 msk; int foo () { - __mmask16 msk; __m512i c = _mm512_popcnt_epi16 (z); asm volatile ("" : "+v" (c)); c = _mm512_mask_popcnt_epi16 (z1, msk, z); diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c index c976461b12e..79bb3c31e85 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c +++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c @@ -11,11 +11,11 @@ extern __m256i y, y_1; extern __m128i x, x_1; +extern __mmask16 msk16; +extern __mmask8 msk8; int foo () { - __mmask16 msk16; - __mmask8 msk8; __m256i c256 = _mm256_popcnt_epi16 (y); asm volatile ("" : "+v" (c256)); c256 = _mm256_mask_popcnt_epi16 (y_1, msk16, y); diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c index b4d82f97032..776a4753d8e 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c @@ -15,11 +15,12 @@ extern __m128i x, x_1; extern __m256i y, y_1; extern __m512i z, z_1; +extern __mmask16 msk; +extern __mmask8 
msk8; + int foo () { - __mmask16 msk; - __mmask8 msk8; __m128i a = _mm_popcnt_epi32 (x); asm volatile ("" : "+v" (a)); a = _mm_mask_popcnt_epi32 (x_1, msk8, x); diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c index e87d6c999b6..c6314ac5deb 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c +++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c @@ -15,10 +15,10 @@ extern __m128i x, x_1; extern __m256i y, y_1; extern __m512i z, z_1; +extern __mmask8 msk; int foo () { - __mmask8 msk; __m128i a = _mm_popcnt_epi64 (x); asm volatile ("" : "+v" (a)); a = _mm_mask_popcnt_epi64 (x_1, msk, x);
[gcc r15-1734] Extend lshifrtsi3_1_zext to ?k alternative.
https://gcc.gnu.org/g:8e1fa107a63b2e160b6bf69de4fe163dd3cebd80 commit r15-1734-g8e1fa107a63b2e160b6bf69de4fe163dd3cebd80 Author: liuhongt Date: Wed Jun 26 13:07:31 2024 +0800 Extend lshifrtsi3_1_zext to ?k alternative. late_combine will combine lshift + zero into *lshifrtsi3_1_zext which cause extra mov between gpr and kmask, add ?k to the pattern. gcc/ChangeLog: PR target/115610 * config/i386/i386.md (<*insnsi3_zext): Add alternative ?k, enable it only for lshiftrt and under avx512bw. * config/i386/sse.md (*klshrsi3_1_zext): New define_insn, and add corresponding define_split after it. Diff: --- gcc/config/i386/i386.md | 19 +-- gcc/config/i386/sse.md | 28 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b6ccb1e798d..59a889da304 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -16836,10 +16836,10 @@ (set_attr "mode" "SI")]) (define_insn "*si3_1_zext" - [(set (match_operand:DI 0 "register_operand" "=r,r,r") + [(set (match_operand:DI 0 "register_operand" "=r,r,r,?k") (zero_extend:DI - (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,rm") - (match_operand:QI 2 "nonmemory_operand" "cI,r,cI" + (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,rm,k") + (match_operand:QI 2 "nonmemory_operand" "cI,r,cI,I" (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && ix86_binary_operator_ok (, SImode, operands, TARGET_APX_NDD)" @@ -16850,6 +16850,8 @@ case TYPE_ISHIFTX: return "#"; +case TYPE_MSKLOG: + return "#"; default: if (operands[2] == const1_rtx && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)) @@ -16860,8 +16862,8 @@ : "{l}\t{%2, %k0|%k0, %2}"; } } - [(set_attr "isa" "*,bmi2,apx_ndd") - (set_attr "type" "ishift,ishiftx,ishift") + [(set_attr "isa" "*,bmi2,apx_ndd,avx512bw") + (set_attr "type" "ishift,ishiftx,ishift,msklog") (set (attr "length_immediate") (if_then_else (and (match_operand 2 "const1_operand") @@ -16869,7 +16871,12 @@ (match_test 
"optimize_function_for_size_p (cfun)"))) (const_string "0") (const_string "*"))) - (set_attr "mode" "SI")]) + (set_attr "mode" "SI") + (set (attr "enabled") + (if_then_else + (eq_attr "alternative" "3") + (symbol_ref " == LSHIFTRT && TARGET_AVX512BW") + (const_string "*")))]) ;; Convert shift to the shiftx pattern to avoid flags dependency. (define_split diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a94ec3c441f..3db4f374b9b 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2183,6 +2183,34 @@ (match_dup 2))) (unspec [(const_int 0)] UNSPEC_MASKOP)])]) +(define_insn "*klshrsi3_1_zext" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (lshiftrt:SI (match_operand:SI 1 "register_operand" "k") + (match_operand 2 "const_0_to_31_operand" "I" + (unspec [(const_int 0)] UNSPEC_MASKOP)] + "TARGET_AVX512BW" + "kshiftrd\t{%2, %1, %0|%0, %1, %2}" +[(set_attr "type" "msklog") + (set_attr "prefix" "vex") + (set_attr "mode" "SI")]) + +(define_split + [(set (match_operand:DI 0 "mask_reg_operand") + (zero_extend:DI + (lshiftrt:SI + (match_operand:SI 1 "mask_reg_operand") + (match_operand 2 "const_0_to_31_operand" +(clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW && reload_completed" + [(parallel + [(set (match_dup 0) + (zero_extend:DI +(lshiftrt:SI + (match_dup 1) + (match_dup 2 + (unspec [(const_int 0)] UNSPEC_MASKOP)])]) + (define_insn "ktest" [(set (reg:CC FLAGS_REG) (unspec:CC
[gcc r15-1735] Enable flate-combine.
https://gcc.gnu.org/g:e62ea4fb8ffcab06ddd02f26db91b29b7270743f commit r15-1735-ge62ea4fb8ffcab06ddd02f26db91b29b7270743f Author: liuhongt Date: Wed Jun 26 13:52:24 2024 +0800 Enable flate-combine. Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, also define target_insn_cost to prevent post_reload pass_late_combine from reverting the optimization done in pass_rpad. Adjust testcases since pass_late_combine generates better code but breaks the assembly scans, i.e. under a 32-bit target, gcc used to generate a broadcast from the stack and then do the real operation. After flate_combine, they're combined into embedded broadcast operations. gcc/ChangeLog: * config/i386/i386-features.cc (ix86_rpad_gate): New function. * config/i386/i386-options.cc (ix86_override_options_after_change): Don't disable flate_combine. * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad after pre_reload pass_late_combine. * config/i386/i386-protos.h (ix86_rpad_gate): New declaration. * config/i386/i386.cc (ix86_insn_cost): New function. (TARGET_INSN_COST): Define. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust testcase. * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/pr91333.c: Ditto. * gcc.target/i386/vect-strided-4.c: Ditto. 
Diff: --- gcc/config/i386/i386-features.cc | 16 +++- gcc/config/i386/i386-options.cc| 4 gcc/config/i386/i386-passes.def| 4 ++-- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.cc| 18 ++ .../gcc.target/i386/avx512f-broadcast-pr87767-1.c | 4 ++-- .../gcc.target/i386/avx512f-broadcast-pr87767-5.c | 1 - gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c | 2 +- gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512vl-broadcast-pr87767-1.c | 4 ++-- .../gcc.target/i386/avx512vl-broadcast-pr87767-5.c | 2 -- gcc/testsuite/gcc.target/i386/pr91333.c| 2 +- gcc/testsuite/gcc.target/i386/vect-strided-4.c | 2 +- 15 files changed, 42 insertions(+), 24 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 607d1991460..fc224ed06b0 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } +bool +ix86_rpad_gate () +{ + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); +} + /* At entry of the nearest common dominator for basic blocks with conversions/rcp/sqrt/rsqrt/round, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -3232,11 +3242,7 @@ public: /* opt_pass methods: */ bool gate (function *) final override { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); + return ix86_rpad_gate (); } unsigned int execute (function *) final override diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 9c12d498928..1ef2c71a7a2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1944,10 +1944,6 @@ 
ix86_override_options_after_change (void) flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; } - /* Late combine tends to undo some of the effects of STV and RPAD, - by combining instructions back to their original form. */ - if (!OPTION_SET_P (flag_late_combine_instructions)) -flag_late_combine_instructions = 0; } /* Clear stack slot assignments remembered from previous functions. diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 7d96766f7b9..2d29f65da88 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -25,11 +25,11 @@ along with GCC; see the file COPYING3. If not see */ INSERT_P
[gcc r15-1736] Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] UNSPEC_BLENDV)
https://gcc.gnu.org/g:2e2dfa0095c3326a0a5fc2ff175918b42eeb044f commit r15-1736-g2e2dfa0095c3326a0a5fc2ff175918b42eeb044f Author: liuhongt Date: Mon Jun 17 17:16:46 2024 +0800 Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] UNSPEC_BLENDV) These define_insn_and_split are needed after vcond{,u,eq} is obsolete. gcc/ChangeLog: PR target/115517 * config/i386/sse.md (*_blendv_gt): New define_insn_and_split. (*_blendv_gtint): Ditto. (*_blendv_not_gtint): Ditto. (*_pblendvb_gt): Ditto. (*_pblendvb_gt_subreg_not): Ditto. Diff: --- gcc/config/i386/sse.md | 130 + 1 file changed, 130 insertions(+) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3db4f374b9b..423f13d3982 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -23079,6 +23079,32 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "")]) +(define_insn_and_split "*_blendv_gt" + [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "vector_operand" "Yrja,*xja,xjm") + (match_operand:VF_128_256 2 "register_operand" "0,0,x") + (gt:VF_128_256 +(match_operand: 3 "register_operand" "Yz,Yz,x") +(match_operand: 4 "vector_all_ones_operand"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:VF_128_256 +[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))] + "operands[3] = gen_lowpart (mode, operands[3]);" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "addr" "gpr16") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "")]) + (define_mode_attr ssefltmodesuffix [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps") (V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")]) @@ -23118,6 +23144,38 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" 
"")]) +(define_insn_and_split "*_blendv_gtint" + [(set (match_operand: 0 "register_operand" "=Yr,*x,x") + (unspec: + [(match_operand: 1 "vector_operand" "Yrja,*xja,xjm") + (match_operand: 2 "register_operand" "0,0,x") + (subreg: +(gt:VI48_AVX + (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") + (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec: +[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))] +{ + operands[0] = gen_lowpart (mode, operands[0]); + operands[1] = gen_lowpart (mode, operands[1]); + operands[2] = gen_lowpart (mode, operands[2]); + operands[3] = gen_lowpart (mode, operands[3]); +} + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "addr" "gpr16") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "")]) + ;; PR target/100738: Transform vpcmpeqd + vpxor + vblendvps to vblendvps for inverted mask; (define_insn_and_split "*_blendv_not_ltint" [(set (match_operand: 0 "register_operand") @@ -23145,6 +23203,32 @@ operands[3] = gen_lowpart (mode, operands[3]); }) +(define_insn_and_split "*_blendv_not_gtint" + [(set (match_operand: 0 "register_operand") + (unspec: + [(match_operand: 1 "vector_operand") + (match_operand: 2 "register_operand") + (subreg: +(gt:VI48_AVX + (subreg:VI48_AVX + (not: +(match_operand: 3 "register_operand")) 0) + (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec: +[(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] +{ + operands[0] = gen_lowpart (mode, operands[0]); + operands[2] = gen_lowpart (mode, operands[2]); + operands[1] = force_reg (mode, + gen_lowpart (mode, operands[1])); + operands[3] = 
gen_lowpart (mode, operands[3]); +}) + (define_insn "_dp" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 @@ -23299,6 +23383,30 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "")]) +(define_insn_and_split "*_pblendvb_gt" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") + (unspe
[gcc r15-1737] Lower AVX512 kmask comparison back to AVX2 comparison when op_{true, false} is vector -1/0.
https://gcc.gnu.org/g:b06a108f0fbffe12493b527224f6e4131a72beac commit r15-1737-gb06a108f0fbffe12493b527224f6e4131a72beac Author: liuhongt Date: Tue Jun 18 14:03:42 2024 +0800 Lower AVX512 kmask comparison back to AVX2 comparison when op_{true,false} is vector -1/0. gcc/ChangeLog PR target/115517 * config/i386/sse.md (*_cvtmask2_not): New pre_reload splitter. (*_cvtmask2_not): Ditto. (*avx2_pcmp3_6): Ditto. (*avx2_pcmp3_7): Ditto. Diff: --- gcc/config/i386/sse.md | 97 ++ 1 file changed, 97 insertions(+) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 423f13d3982..3d790af3a2c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10008,6 +10008,24 @@ [(set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn_and_split "*_cvtmask2_not" + [(set (match_operand:VI12_AVX512VL 0 "register_operand") + (vec_merge:VI12_AVX512VL + (match_operand:VI12_AVX512VL 2 "const0_operand") + (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512BW && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 4) + (not: (match_dup 1))) + (set (match_dup 0) + (vec_merge:VI12_AVX512VL + (match_dup 3) + (match_dup 2) + (match_dup 4)))] + "operands[4] = gen_reg_rtx (mode);") + (define_expand "_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL @@ -10046,6 +10064,24 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn_and_split "*_cvtmask2_not" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "const0_operand") + (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 4) + (not: (match_dup 1))) + (set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 3) + (match_dup 2) + (match_dup 4)))] + "operands[4] = gen_reg_rtx (mode);") + (define_insn 
"*_cvtmask2_pternlog_false_dep" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL @@ -17738,6 +17774,67 @@ std::swap (operands[1], operands[2]); }) +(define_int_attr pcmp_usmin + [(UNSPEC_PCMP "smin") (UNSPEC_UNSIGNED_PCMP "umin")]) + +(define_insn_and_split "*avx2_pcmp3_6" + [(set (match_operand:VI_128_256 0 "register_operand") + (vec_merge:VI_128_256 + (match_operand:VI_128_256 1 "vector_all_ones_operand") + (match_operand:VI_128_256 2 "const0_operand") + (unspec: + [(match_operand:VI_128_256 3 "nonimmediate_operand") +(match_operand:VI_128_256 4 "nonimmediate_operand") +(match_operand:SI 5 "const_0_to_7_operand")] +UNSPEC_PCMP_ITER)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + && (INTVAL (operands[5]) == 2 || INTVAL (operands[5]) == 5)" + "#" + "&& 1" + [(const_int 0)] +{ + rtx dst_min = gen_reg_rtx (mode); + + if (MEM_P (operands[3]) && MEM_P (operands[4])) +operands[3] = force_reg (mode, operands[3]); + emit_insn (gen_3 (dst_min, operands[3], operands[4])); + rtx eq_op = INTVAL (operands[5]) == 2 ? operands[3] : operands[4]; + emit_move_insn (operands[0], gen_rtx_EQ (mode, eq_op, dst_min)); + DONE; +}) + +(define_insn_and_split "*avx2_pcmp3_7" + [(set (match_operand:VI_128_256 0 "register_operand") + (vec_merge:VI_128_256 + (match_operand:VI_128_256 1 "const0_operand") + (match_operand:VI_128_256 2 "vector_all_ones_operand") + (unspec: + [(match_operand:VI_128_256 3 "nonimmediate_operand") +(match_operand:VI_128_256 4 "nonimmediate_operand") +(match_operand:SI 5 "const_0_to_7_operand")] +UNSPEC_PCMP_ITER)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* NE is commutative. */ + && (INTVAL (operands[5]) == 4 + /* LE, 3 must be register. */ + || INTVAL (operands[5]) == 2 + /* NLT aka GE, 4 must be register and we swap operands. 
*/ + || INTVAL (operands[5]) == 5)" + "#" + "&& 1" + [(const_int 0)] +{ + if (INTVAL (operands[5]) == 5) +std::swap (operands[3], operands[4]); + + if (MEM_P (operands[3])) +operands[3] = force_reg (mode, operands[3]); + enum rtx_code code = INTVAL (operands[5]) != 4 ? GT : EQ; + emit_move_insn (operands[0], gen_rtx_fmt_ee (code, mode, + operands[3], operands[4])); + DONE; +}) + (define_expand "_eq3" [(set (match_operand: 0 "register_operand") (unspec:
[gcc r15-1739] Add more splitter for mskmov with avx512 comparison.
https://gcc.gnu.org/g:3cb204046c0db899750aee9480af4f1953a40ac3 commit r15-1739-g3cb204046c0db899750aee9480af4f1953a40ac3 Author: liuhongt Date: Wed Jun 19 13:12:00 2024 +0800 Add more splitter for mskmov with avx512 comparison. gcc/ChangeLog: PR target/115517 * config/i386/sse.md (*_movmsk_lt_avx512): New define_insn_and_split. (*_movmsk_ext_lt_avx512): Ditto. (*_pmovmskb_lt_avx512): Ditto. (*_pmovmskb_zext_lt_avx512): Ditto. (*sse2_pmovmskb_ext_lt_avx512): Ditto. (*pmovsk_kmask_v16qi_avx512): Ditto. (*pmovsk_mask_v32qi_avx512): Ditto. (*pmovsk_mask_cmp__avx512): Ditto. (*pmovsk_ptest__avx512): Ditto. Diff: --- gcc/config/i386/sse.md | 232 - 1 file changed, 209 insertions(+), 23 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 694b4b8f07c..3ffa1881c83 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10071,24 +10071,6 @@ [(set_attr "prefix" "evex") (set_attr "mode" "")]) -(define_insn_and_split "*_cvtmask2_not" - [(set (match_operand:VI12_AVX512VL 0 "register_operand") - (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 2 "const0_operand") - (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand") - (match_operand: 1 "register_operand")))] - "TARGET_AVX512BW && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 4) - (not: (match_dup 1))) - (set (match_dup 0) - (vec_merge:VI12_AVX512VL - (match_dup 3) - (match_dup 2) - (match_dup 4)))] - "operands[4] = gen_reg_rtx (mode);") - (define_expand "_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL @@ -10128,10 +10110,10 @@ (set_attr "mode" "")]) (define_insn_and_split "*_cvtmask2_not" - [(set (match_operand:VI48_AVX512VL 0 "register_operand") - (vec_merge:VI48_AVX512VL - (match_operand:VI48_AVX512VL 2 "const0_operand") - (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand") + [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") + (vec_merge:VI1248_AVX512VLBW + (match_operand:VI1248_AVX512VLBW 2 
"const0_operand") + (match_operand:VI1248_AVX512VLBW 3 "vector_all_ones_operand") (match_operand: 1 "register_operand")))] "TARGET_AVX512F && ix86_pre_reload_split ()" "#" @@ -10139,7 +10121,7 @@ [(set (match_dup 4) (not: (match_dup 1))) (set (match_dup 0) - (vec_merge:VI48_AVX512VL + (vec_merge:VI1248_AVX512VLBW (match_dup 3) (match_dup 2) (match_dup 4)))] @@ -21816,6 +21798,30 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) +(define_insn_and_split "*_movmsk_lt_avx512" + [(set (match_operand:SI 0 "register_operand" "=r,jr") + (unspec:SI + [(subreg:VF_128_256 + (vec_merge: +(match_operand: 3 "vector_all_ones_operand") +(match_operand: 4 "const0_operand") +(unspec: + [(match_operand: 1 "register_operand" "x,x") + (match_operand: 2 "const0_operand") + (const_int 1)] + UNSPEC_PCMP)) 0)] + UNSPEC_MOVMSK))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))] + "operands[1] = gen_lowpart (mode, operands[1]);" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "")]) + (define_insn_and_split "*_movmsk_ext_lt" [(set (match_operand:DI 0 "register_operand" "=r,jr") (any_extend:DI @@ -21835,6 +21841,31 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) +(define_insn_and_split "*_movmsk_ext_lt_avx512" + [(set (match_operand:DI 0 "register_operand" "=r,jr") + (any_extend:DI + (unspec:SI + [(subreg:VF_128_256 + (vec_merge: + (match_operand: 3 "vector_all_ones_operand") + (match_operand: 4 "const0_operand") + (unspec: + [(match_operand: 1 "register_operand" "x,x") +(match_operand: 2 "const0_operand") +(const_int 1)] + UNSPEC_PCMP)) 0)] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] + "operands[1] = gen_lowpart (mode, operands[1]);" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + 
(set_attr "mode" "")]) + (define_insn_and_split "*_movmsk_shift" [(set (match_operand:SI 0 "register_operand" "=r,jr") (unspec:SI @@ -22024,6 +22055,34 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode"
[gcc r15-1740] Adjust testcase for the regressed testcases after obsolete of vcond{, u, eq}.
https://gcc.gnu.org/g:e94e6ee495d95f29355bbc017214228a5e367638 commit r15-1740-ge94e6ee495d95f29355bbc017214228a5e367638 Author: liuhongt Date: Wed Jun 19 16:05:58 2024 +0800 Adjust testcase for the regressed testcases after obsolete of vcond{,u,eq}. > Richard suggests that we implement the "obvious" transforms like > inversion in the middle-end but if for example unsigned compares > are not supported the us_minus + eq + negative trick isn't on > that list. > > The main reason to restrict vec_cmp would be to avoid > a <= b ? c : d going with an unsupported vec_cmp but instead > do a > b ? d : c - the alternative is trying to fix this > on the RTL side via combine. I understand the non-native Yes, I have a patch which can fix most regressions via pattern match in combine. Still there is a situation that is difficult to deal with, mainly the optimization without sse4.1. Because pblendvb/blendvps/blendvpd only exist under sse4.1, without sse4.1 it takes 3 instructions (pand,pandn,por) to simulate the vcond_mask, and combine matches up to 4 instructions, which makes it currently impossible to use combine to recover those optimizations in the vcond{,u,eq}, e.g. min/max. In the case of sse 4.1 and above, there is basically no regression anymore. 
the regression testcases w/o sse4.1 FAIL: g++.target/i386/pr100637-1b.C -std=gnu++14 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr100637-1b.C -std=gnu++17 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr100637-1b.C -std=gnu++20 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr100637-1b.C -std=gnu++98 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr100637-1w.C -std=gnu++14 scan-assembler-times pcmpeqw 2 FAIL: g++.target/i386/pr100637-1w.C -std=gnu++17 scan-assembler-times pcmpeqw 2 FAIL: g++.target/i386/pr100637-1w.C -std=gnu++20 scan-assembler-times pcmpeqw 2 FAIL: g++.target/i386/pr100637-1w.C -std=gnu++98 scan-assembler-times pcmpeqw 2 FAIL: g++.target/i386/pr103861-1.C -std=gnu++14 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr103861-1.C -std=gnu++17 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr103861-1.C -std=gnu++20 scan-assembler-times pcmpeqb 2 FAIL: g++.target/i386/pr103861-1.C -std=gnu++98 scan-assembler-times pcmpeqb 2 FAIL: gcc.target/i386/pr88540.c scan-assembler minpd gcc/testsuite/ChangeLog: PR target/115517 * g++.target/i386/pr100637-1b.C: Add xfail and -mno-sse4.1. * g++.target/i386/pr100637-1w.C: Ditto. * g++.target/i386/pr103861-1.C: Ditto. * gcc.target/i386/pr88540.c: Ditto. * gcc.target/i386/pr103941-2.c: Add -mno-avx512f. * g++.target/i386/sse4_1-pr100637-1b.C: New test. * g++.target/i386/sse4_1-pr100637-1w.C: New test. * g++.target/i386/sse4_1-pr103861-1.C: New test. * gcc.target/i386/sse4_1-pr88540.c: New test. 
Diff: --- gcc/testsuite/g++.target/i386/pr100637-1b.C| 4 ++-- gcc/testsuite/g++.target/i386/pr100637-1w.C| 4 ++-- gcc/testsuite/g++.target/i386/pr103861-1.C | 4 ++-- gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C | 17 + gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C | 17 + gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C | 17 + gcc/testsuite/gcc.target/i386/pr103941-2.c | 2 +- gcc/testsuite/gcc.target/i386/pr88540.c| 4 ++-- gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c | 10 ++ 9 files changed, 70 insertions(+), 9 deletions(-) diff --git a/gcc/testsuite/g++.target/i386/pr100637-1b.C b/gcc/testsuite/g++.target/i386/pr100637-1b.C index 35b5df7c9dd..dccb8f5e712 100644 --- a/gcc/testsuite/g++.target/i386/pr100637-1b.C +++ b/gcc/testsuite/g++.target/i386/pr100637-1b.C @@ -1,6 +1,6 @@ /* PR target/100637 */ /* { dg-do compile } */ -/* { dg-options "-O2 -msse2" } */ +/* { dg-options "-O2 -msse2 -mno-sse4.1" } */ typedef unsigned char __attribute__((__vector_size__ (4))) __v4qu; typedef char __attribute__((__vector_size__ (4))) __v4qi; @@ -13,5 +13,5 @@ __v4qu us (__v4qi a, __v4qi b) { return (a > b) ? au : bu; } __v4qi su (__v4qu a, __v4qu b) { return (a > b) ? as : bs; } __v4qi ss (__v4qi a, __v4qi b) { return (a > b) ? as : bs; } -/* { dg-final { scan-assembler-times "pcmpeqb" 2 } } */ +/* { dg-final { scan-assembler-times "pcmpeqb" 2 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times "pcmpgtb" 2 } } */ diff --git a/gcc/testsuite/g++.target/i386/pr100637-1w.C b/gcc/testsuite/g++.target/i386/pr100637-1w.C index a3ed06fddee..a0aab62db33 100644 --- a/gcc/testsuite/g++.target/i386/pr100637-1w.C +++ b/gcc/testsuite/g++.target/i386/pr100637-1w.C @@ -1,6 +1,6 @@ /* PR target/100637 */ /* { dg-do compile } */ -/* { dg-opti
[gcc r15-1738] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.
https://gcc.gnu.org/g:09737d9605521df9232d9990006c44955064f44e commit r15-1738-g09737d9605521df9232d9990006c44955064f44e Author: liuhongt Date: Tue Jun 18 15:52:02 2024 +0800 Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}. These versions of the min/max patterns implement exactly the operations min = (op1 < op2 ? op1 : op2) max = (!(op1 < op2) ? op1 : op2) gcc/ChangeLog: PR target/115517 * config/i386/sse.md (*minmax3_1): New pre_reload define_insn_and_split. (*minmax3_2): Ditto. Diff: --- gcc/config/i386/sse.md | 63 ++ 1 file changed, 63 insertions(+) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3d790af3a2c..694b4b8f07c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3096,6 +3096,69 @@ (set_attr "prefix" "") (set_attr "mode" "")]) +(define_insn_and_split "*minmax3_1" + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (match_operand:VFH 1 "nonimmediate_operand") + (match_operand:VFH 2 "nonimmediate_operand") + (unspec: + [(match_operand:VFH 3 "nonimmediate_operand") +(match_operand:VFH 4 "nonimmediate_operand") +(match_operand:SI 5 "const_0_to_31_operand")] +UNSPEC_PCMP)))] + "TARGET_SSE && ix86_pre_reload_split () + && ((rtx_equal_p (operands[1], operands[3]) + && rtx_equal_p (operands[2], operands[4])) + || (rtx_equal_p (operands[1], operands[4]) + && rtx_equal_p (operands[2], operands[3]))) + && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)" + "#" + "&& 1" + [(const_int 0)] + { + int u = UNSPEC_IEEE_MIN; + if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4])) + || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], operands[3]))) + u = UNSPEC_IEEE_MAX; + + if (MEM_P (operands[1])) + operands[1] = force_reg (mode, operands[1]); + rtvec v = gen_rtvec (2, operands[1], operands[2]); + rtx tmp = gen_rtx_UNSPEC (mode, v, u); + emit_move_insn (operands[0], tmp); + DONE; + }) + +(define_insn_and_split "*minmax3_2" + [(set (match_operand:VF_128_256 0 "register_operand") + 
(unspec:VF_128_256 + [(match_operand:VF_128_256 1 "nonimmediate_operand") + (match_operand:VF_128_256 2 "nonimmediate_operand") + (lt:VF_128_256 +(match_operand:VF_128_256 3 "nonimmediate_operand") +(match_operand:VF_128_256 4 "nonimmediate_operand"))] +UNSPEC_BLENDV))] + "TARGET_SSE && ix86_pre_reload_split () + && ((rtx_equal_p (operands[1], operands[3]) + && rtx_equal_p (operands[2], operands[4])) + || (rtx_equal_p (operands[1], operands[4]) + && rtx_equal_p (operands[2], operands[3])))" + "#" + "&& 1" + [(const_int 0)] + { + int u = UNSPEC_IEEE_MIN; + if (rtx_equal_p (operands[1], operands[3])) + u = UNSPEC_IEEE_MAX; + + if (MEM_P (operands[2])) + force_reg (mode, operands[2]); + rtvec v = gen_rtvec (2, operands[2], operands[1]); + rtx tmp = gen_rtx_UNSPEC (mode, v, u); + emit_move_insn (operands[0], tmp); + DONE; + }) + ;; These versions of the min/max patterns implement exactly the operations ;; min = (op1 < op2 ? op1 : op2) ;; max = (!(op1 < op2) ? op1 : op2)
[gcc r15-1741] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.
https://gcc.gnu.org/g:2ccdd0f22312a14ac64bf944fdc4f8e7532eb0eb commit r15-1741-g2ccdd0f22312a14ac64bf944fdc4f8e7532eb0eb Author: liuhongt Date: Thu Jun 20 12:41:13 2024 +0800 Optimize a < 0 ? -1 : 0 to (signed)a >> 31. Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 and x < 0 ? 1 : 0 into (unsigned) x >> 31. Add define_insn_and_split for the optimization did in ix86_expand_int_vcond. gcc/ChangeLog: PR target/115517 * config/i386/sse.md ("*ashr3_1"): New define_insn_and_split. (*avx512_ashr3_1): Ditto. (*avx2_lshr3_1): Ditto. (*avx2_lshr3_2): Ditto and add 2 combine splitter after it. * config/i386/mmx.md (mmxscalarsize): New mode attribute. (*mmw_ashr3_1): New define_insn_and_split. ("mmx_3): Add a combine spiltter after it. (*mmx_ashrv2hi3_1): New define_insn_and_plit, also add a combine splitter after it. gcc/testsuite/ChangeLog: * gcc.target/i386/pr111023-2.c: Adjust testcase. * gcc.target/i386/vect-div-1.c: Ditto. Diff: --- gcc/config/i386/mmx.md | 52 +++ gcc/config/i386/sse.md | 83 ++ gcc/testsuite/gcc.target/i386/pr111023-2.c | 4 +- gcc/testsuite/gcc.target/i386/vect-div-1.c | 2 +- 4 files changed, 138 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 359dc90628d..fca28df99a1 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -147,6 +147,14 @@ (V4HI "hi") (V2HI "hi") (V8QI "qi")]) +(define_mode_attr mmxscalarsize + [(V1DI "64") + (V2SI "32") (V2SF "32") + (V4HF "16") (V4BF "16") + (V2HF "16") (V2BF "16") + (V4HI "16") (V2HI "16") + (V8QI "8")]) + (define_mode_attr Yv_Yw [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")]) @@ -3620,6 +3628,17 @@ (const_string "0"))) (set_attr "mode" "DI,TI,TI")]) +(define_insn_and_split "*mmx_ashr3_1" + [(set (match_operand:MMXMODE24 0 "register_operand") + (lt:MMXMODE24 + (match_operand:MMXMODE24 1 "register_operand") + (match_operand:MMXMODE24 2 "const0_operand")))] + "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set 
(match_dup 0) (ashiftrt:MMXMODE24 (match_dup 1) (match_dup 3)))] + "operands[3] = gen_int_mode ( - 1, DImode);") + (define_expand "ashr3" [(set (match_operand:MMXMODE24 0 "register_operand") (ashiftrt:MMXMODE24 @@ -3646,6 +3665,17 @@ (const_string "0"))) (set_attr "mode" "DI,TI,TI")]) +(define_split + [(set (match_operand:MMXMODE248 0 "register_operand") + (and:MMXMODE248 + (lt:MMXMODE248 + (match_operand:MMXMODE248 1 "register_operand") + (match_operand:MMXMODE248 2 "const0_operand")) + (match_operand:MMXMODE248 3 "const1_operand")))] + "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()" + [(set (match_dup 0) (lshiftrt:MMXMODE248 (match_dup 1) (match_dup 4)))] + "operands[4] = gen_int_mode ( - 1, DImode);") + (define_expand "3" [(set (match_operand:MMXMODE24 0 "register_operand") (any_lshift:MMXMODE24 @@ -3687,6 +3717,28 @@ (const_string "0"))) (set_attr "mode" "TI")]) +(define_insn_and_split "*mmx_ashrv2hi3_1" + [(set (match_operand:V2HI 0 "register_operand") + (lt:V2HI + (match_operand:V2HI 1 "register_operand") + (match_operand:V2HI 2 "const0_operand")))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) (ashiftrt:V2HI (match_dup 1) (match_dup 3)))] + "operands[3] = gen_int_mode (15, DImode);") + +(define_split + [(set (match_operand:V2HI 0 "register_operand") + (and:V2HI + (lt:V2HI + (match_operand:V2HI 1 "register_operand") + (match_operand:V2HI 2 "const0_operand")) + (match_operand:V2HI 3 "const1_operand")))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + [(set (match_dup 0) (lshiftrt:V2HI (match_dup 1) (match_dup 4)))] + "operands[4] = gen_int_mode (15, DImode);") + (define_expand "v8qi3" [(set (match_operand:V8QI 0 "register_operand") (any_shift:V8QI (match_operand:V8QI 1 "register_operand") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 3ffa1881c83..1169e93453e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16923,6 +16923,17 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "")]) 
+(define_insn_and_split "*ashr3_1" + [(set (match_operand:VI24_AVX2 0 "register_operand") + (lt:VI24_AVX2 + (match_operand:VI24_AVX2 1 "register_operand") + (match_operand:VI24_AVX2 2 "const0_operand")))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) (ashiftrt:VI24_AVX2 (match_dup 1) (match_dup 3)))] + "operands[3] = gen_int_mode ( - 1, DImode);") + (define_
[gcc r15-1742] Remove vcond{,u,eq} expanders since they will be obsolete.
https://gcc.gnu.org/g:55f80c690c5fa59836646565a9dee2a3f68374a0 commit r15-1742-g55f80c690c5fa59836646565a9dee2a3f68374a0 Author: liuhongt Date: Mon Jun 24 09:19:01 2024 +0800 Remove vcond{,u,eq} expanders since they will be obsolete. gcc/ChangeLog: PR target/115517 * config/i386/mmx.md (vcondv2sf): Removed. (vcond): Ditto. (vcond): Ditto. (vcondu): Ditto. (vcondu): Ditto. * config/i386/sse.md (vcond): Ditto. (vcond): Ditto. (vcond): Ditto. (vcond): Ditto. (vcond): Ditto. (vcond): Ditto. (vcond): Ditto. (vcondv2di): Ditto. (vcondu): Ditto. (vcondu): Ditto. (vcondu): Ditto. (vconduv2di): Ditto. (vcondeqv2di): Ditto. Diff: --- gcc/config/i386/mmx.md | 97 -- gcc/config/i386/sse.md | 213 - 2 files changed, 310 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index fca28df99a1..94d3a6e5692 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -1180,39 +1180,6 @@ DONE; }) -(define_expand "vcondv2sf" - [(set (match_operand:V2FI 0 "register_operand") - (if_then_else:V2FI - (match_operator 3 "" - [(match_operand:V2SF 4 "nonimmediate_operand") -(match_operand:V2SF 5 "nonimmediate_operand")]) - (match_operand:V2FI 1 "general_operand") - (match_operand:V2FI 2 "general_operand")))] - "TARGET_MMX_WITH_SSE && ix86_partial_vec_fp_math" -{ - rtx ops[6]; - ops[5] = gen_reg_rtx (V4SFmode); - ops[4] = gen_reg_rtx (V4SFmode); - ops[3] = gen_rtx_fmt_ee (GET_CODE (operands[3]), VOIDmode, ops[4], ops[5]); - ops[2] = lowpart_subreg (mode, - force_reg (mode, operands[2]), - mode); - ops[1] = lowpart_subreg (mode, - force_reg (mode, operands[1]), - mode); - ops[0] = gen_reg_rtx (mode); - - emit_insn (gen_movq_v2sf_to_sse (ops[5], operands[5])); - emit_insn (gen_movq_v2sf_to_sse (ops[4], operands[4])); - - bool ok = ix86_expand_fp_vcond (ops); - gcc_assert (ok); - - emit_move_insn (operands[0], lowpart_subreg (mode, ops[0], - mode)); - DONE; -}) - (define_insn "@sse4_1_insertps_" [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v") (unspec:V2FI @@ 
-4041,70 +4008,6 @@ DONE; }) -(define_expand "vcond" - [(set (match_operand:MMXMODE124 0 "register_operand") - (if_then_else:MMXMODE124 - (match_operator 3 "" - [(match_operand:MMXMODEI 4 "register_operand") -(match_operand:MMXMODEI 5 "register_operand")]) - (match_operand:MMXMODE124 1) - (match_operand:MMXMODE124 2)))] - "TARGET_MMX_WITH_SSE - && (GET_MODE_NUNITS (mode) - == GET_MODE_NUNITS (mode))" -{ - bool ok = ix86_expand_int_vcond (operands); - gcc_assert (ok); - DONE; -}) - -(define_expand "vcond" - [(set (match_operand:VI_16_32 0 "register_operand") - (if_then_else:VI_16_32 - (match_operator 3 "" - [(match_operand:VI_16_32 4 "register_operand") -(match_operand:VI_16_32 5 "register_operand")]) - (match_operand:VI_16_32 1) - (match_operand:VI_16_32 2)))] - "TARGET_SSE2" -{ - bool ok = ix86_expand_int_vcond (operands); - gcc_assert (ok); - DONE; -}) - -(define_expand "vcondu" - [(set (match_operand:MMXMODE124 0 "register_operand") - (if_then_else:MMXMODE124 - (match_operator 3 "" - [(match_operand:MMXMODEI 4 "register_operand") -(match_operand:MMXMODEI 5 "register_operand")]) - (match_operand:MMXMODE124 1) - (match_operand:MMXMODE124 2)))] - "TARGET_MMX_WITH_SSE - && (GET_MODE_NUNITS (mode) - == GET_MODE_NUNITS (mode))" -{ - bool ok = ix86_expand_int_vcond (operands); - gcc_assert (ok); - DONE; -}) - -(define_expand "vcondu" - [(set (match_operand:VI_16_32 0 "register_operand") - (if_then_else:VI_16_32 - (match_operator 3 "" - [(match_operand:VI_16_32 4 "register_operand") -(match_operand:VI_16_32 5 "register_operand")]) - (match_operand:VI_16_32 1) - (match_operand:VI_16_32 2)))] - "TARGET_SSE2" -{ - bool ok = ix86_expand_int_vcond (operands); - gcc_assert (ok); - DONE; -}) - (define_expand "vcond_mask_" [(set (match_operand:MMXMODE124 0 "register_operand") (vec_merge:MMXMODE124 diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1169e93453e..d71b0f2567e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4848,72 +4848,6 @@ 
DONE; }) -(define_expand "vcond" - [(set (match_operand:V_512 0 "register_operand") - (if_then_else:V_512 - (match_operator 3 "" - [(match_operand:VF_512 4 "
[gcc r15-1743] tree-optimization/115694 - ICE with complex store rewrite
https://gcc.gnu.org/g:543a5b9da964f821b9e723ed9c93d6cdca464d47 commit r15-1743-g543a5b9da964f821b9e723ed9c93d6cdca464d47 Author: Richard Biener Date: Sun Jun 30 13:07:14 2024 +0200 tree-optimization/115694 - ICE with complex store rewrite The following adds a missing check when forwprop attempts to rewrite a complex store. PR tree-optimization/115694 * tree-ssa-forwprop.cc (pass_forwprop::execute): Check the store is complex before rewriting it. * g++.dg/torture/pr115694.C: New testcase. Diff: --- gcc/testsuite/g++.dg/torture/pr115694.C | 13 + gcc/tree-ssa-forwprop.cc | 2 ++ 2 files changed, 15 insertions(+) diff --git a/gcc/testsuite/g++.dg/torture/pr115694.C b/gcc/testsuite/g++.dg/torture/pr115694.C new file mode 100644 index 000..bbce47decf8 --- /dev/null +++ b/gcc/testsuite/g++.dg/torture/pr115694.C @@ -0,0 +1,13 @@ +// { dg-do compile } + +_Complex a; +typedef struct { + double a[2]; +} b; +void c(b); +void d() +{ + _Complex b1 = a; + b t = __builtin_bit_cast (b, b1); + c(t); +} diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc index 05d42ccd3c6..abf71f0d3a0 100644 --- a/gcc/tree-ssa-forwprop.cc +++ b/gcc/tree-ssa-forwprop.cc @@ -3762,6 +3762,8 @@ pass_forwprop::execute (function *fun) && gimple_store_p (use_stmt) && !gimple_has_volatile_ops (use_stmt) && is_gimple_assign (use_stmt) + && (TREE_CODE (TREE_TYPE (gimple_assign_lhs (use_stmt))) + == COMPLEX_TYPE) && (TREE_CODE (gimple_assign_lhs (use_stmt)) != TARGET_MEM_REF)) {