[gcc r15-2881] Initial support for AVX10.2
https://gcc.gnu.org/g:4bcb480d103b36c389daaf711f0f25d74379adb6 commit r15-2881-g4bcb480d103b36c389daaf711f0f25d74379adb6 Author: Haochen Jiang Date: Mon Aug 12 15:30:07 2024 +0800 Initial support for AVX10.2 gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Handle avx10.2. * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AVX10_2_256_SET): New. (OPTION_MASK_ISA2_AVX10_2_512_SET): Ditto. (OPTION_MASK_ISA2_AVX10_1_256_UNSET): Add OPTION_MASK_ISA2_AVX10_2_256_UNSET. (OPTION_MASK_ISA2_AVX10_1_512_UNSET): Add OPTION_MASK_ISA2_AVX10_2_512_UNSET. (OPTION_MASK_ISA2_AVX10_2_256_UNSET): New. (OPTION_MASK_ISA2_AVX10_2_512_UNSET): Ditto. (ix86_handle_option): Handle avx10.2-256 and avx10.2-512. * common/config/i386/i386-cpuinfo.h (enum processor_features): Add FEATURE_AVX10_2_256 and FEATURE_AVX10_2_512. * common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY for avx10.2-256 and avx10.2-512. * config/i386/i386-c.cc (ix86_target_macros_internal): Define __AVX10_2_256__ and __AVX10_2_512__. * config/i386/i386-isa.def (AVX10_2): Add DEF_PTA(AVX10_2_256) and DEF_PTA(AVX10_2_512). * config/i386/i386-options.cc (isa2_opts): Add -mavx10.2-256 and -mavx10.2-512. (ix86_valid_target_attribute_inner_p): Handle avx10.2-256 and avx10.2-512. * config/i386/i386.opt: Add option -mavx10.2, -mavx10.2-256 and -mavx10.2-512. * config/i386/i386.opt.urls: Regenerated. * doc/extend.texi: Document avx10.2, avx10.2-256 and avx10.2-512. * doc/invoke.texi: Document -mavx10.2, -mavx10.2-256 and -mavx10.2-512. * doc/sourcebuild.texi: Document target avx10.2, avx10.2-256, avx10.2-512. gcc/testsuite/ChangeLog: * g++.dg/other/i386-2.C: Ditto. * g++.dg/other/i386-3.C: Ditto. * gcc.target/i386/sse-12.c: Ditto. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-14.c: Ditto. * gcc.target/i386/sse-22.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. 
Diff: --- gcc/common/config/i386/cpuinfo.h | 6 + gcc/common/config/i386/i386-common.cc | 43 -- gcc/common/config/i386/i386-cpuinfo.h | 2 ++ gcc/common/config/i386/i386-isas.h | 3 +++ gcc/config/i386/i386-c.cc | 4 gcc/config/i386/i386-isa.def | 2 ++ gcc/config/i386/i386-options.cc| 7 +- gcc/config/i386/i386.opt | 15 gcc/config/i386/i386.opt.urls | 9 +++ gcc/doc/extend.texi| 15 gcc/doc/invoke.texi| 17 +++--- gcc/doc/sourcebuild.texi | 9 +++ gcc/testsuite/g++.dg/other/i386-2.C| 9 --- gcc/testsuite/g++.dg/other/i386-3.C| 9 --- gcc/testsuite/gcc.target/i386/sse-12.c | 2 +- gcc/testsuite/gcc.target/i386/sse-13.c | 2 +- gcc/testsuite/gcc.target/i386/sse-14.c | 2 +- gcc/testsuite/gcc.target/i386/sse-22.c | 4 ++-- gcc/testsuite/gcc.target/i386/sse-23.c | 2 +- 19 files changed, 140 insertions(+), 22 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 2ae77d335d24..2ae383eb6ab5 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -1006,6 +1006,9 @@ get_available_features (struct __processor_model *cpu_model, if (ebx & bit_AVX10_256) switch (version) { + case 2: + set_feature (FEATURE_AVX10_2_256); + /* Fall through. */ case 1: set_feature (FEATURE_AVX10_1_256); break; @@ -1016,6 +1019,9 @@ get_available_features (struct __processor_model *cpu_model, if (ebx & bit_AVX10_512) switch (version) { + case 2: + set_feature (FEATURE_AVX10_2_512); + /* Fall through. */ case 1: set_feature (FEATURE_AVX10_1_512); break; diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index e38b1b22ffb1..fb744319b05e 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -122,6 +122,11 @@ along with GCC; see the file COPYING3. 
If not see #define OPTION_MASK_ISA2_AVX10_1_256_SET OPTION_MASK_ISA2_AVX10_1_256 #define OPTION_MASK_ISA2_AVX10_1_512_SET \ (OPTION_MASK_ISA2_AVX10_1_256_SET | OPTION_MASK_ISA2_AVX10_1_512) +#define OPTION_MASK_ISA2_AVX10_2_256_SET \ + (OPTION_MASK_ISA2_AVX10_1_256_SET | OPTION_MASK_ISA2_AVX10_2_256) +#define OPTION_MASK_ISA2_AVX10_2_512_SET \ + (OPTION_MASK_ISA2_AVX10_1_512_SET | OPTION_MASK_ISA
[gcc r15-2882] Fortran: Fix coarray in associate not linking [PR85510]
https://gcc.gnu.org/g:8d8db21eb726b785782f4a41ad85a0d4be63068a commit r15-2882-g8d8db21eb726b785782f4a41ad85a0d4be63068a Author: Andre Vehreschild Date: Mon Jul 22 15:31:37 2024 +0200 Fortran: Fix coarray in associate not linking [PR85510] PR fortran/85510 gcc/fortran/ChangeLog: * resolve.cc (resolve_variable): Mark the variable as host associated only, when it is not in an associate block. * trans-decl.cc (generate_coarray_init): Remove incorrect unused flag on parameter. gcc/testsuite/ChangeLog: * gfortran.dg/coarray/pr85510.f90: New test. Diff: --- gcc/fortran/resolve.cc| 10 ++ gcc/fortran/trans-decl.cc | 2 +- gcc/testsuite/gfortran.dg/coarray/pr85510.f90 | 19 +++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/gcc/fortran/resolve.cc b/gcc/fortran/resolve.cc index eb3085a05ca2..8e88aac2fe0e 100644 --- a/gcc/fortran/resolve.cc +++ b/gcc/fortran/resolve.cc @@ -6114,10 +6114,12 @@ resolve_variable (gfc_expr *e) /* If a symbol has been host_associated mark it. This is used latter, to identify if aliasing is possible via host association. */ if (sym->attr.flavor == FL_VARIABLE - && gfc_current_ns->parent - && (gfc_current_ns->parent == sym->ns - || (gfc_current_ns->parent->parent - && gfc_current_ns->parent->parent == sym->ns))) + && (!sym->ns->code || sym->ns->code->op != EXEC_BLOCK + || !sym->ns->code->ext.block.assoc) + && gfc_current_ns->parent + && (gfc_current_ns->parent == sym->ns + || (gfc_current_ns->parent->parent + && gfc_current_ns->parent->parent == sym->ns))) sym->attr.host_assoc = 1; if (gfc_current_ns->proc_name diff --git a/gcc/fortran/trans-decl.cc b/gcc/fortran/trans-decl.cc index ca6a515a1800..6692ac7ef4c3 100644 --- a/gcc/fortran/trans-decl.cc +++ b/gcc/fortran/trans-decl.cc @@ -5950,7 +5950,7 @@ generate_coarray_sym_init (gfc_symbol *sym) coarrays. 
*/ static void -generate_coarray_init (gfc_namespace * ns __attribute((unused))) +generate_coarray_init (gfc_namespace *ns) { tree fndecl, tmp, decl, save_fn_decl; diff --git a/gcc/testsuite/gfortran.dg/coarray/pr85510.f90 b/gcc/testsuite/gfortran.dg/coarray/pr85510.f90 new file mode 100644 index ..c6777cad6ed1 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/coarray/pr85510.f90 @@ -0,0 +1,19 @@ +!{ dg-do run } + +! Contributed by Damian Rouson +! Check that PR fortran/85510 links. + +module foo +contains + subroutine bar() +integer, save :: i[*] = 1 +associate(n=>1) + if (i[1] /= 1) stop 1 +end associate + end subroutine +end module + +use foo +call bar() +end +
[gcc r15-2883] aarch64: Emit ADD X, Y, Y instead of SHL X, Y, #1 for Advanced SIMD
https://gcc.gnu.org/g:fcc766c82cf8e0473ba54f1660c8282a7ce3231c commit r15-2883-gfcc766c82cf8e0473ba54f1660c8282a7ce3231c Author: Kyrylo Tkachov Date: Mon Aug 5 11:29:44 2024 -0700 aarch64: Emit ADD X, Y, Y instead of SHL X, Y, #1 for Advanced SIMD On many cores, including Neoverse V2 the throughput of vector ADD instructions is higher than vector shifts like SHL. We can lean on that to emit code like: add v0.4s, v0.4s, v0.4s instead of: shl v0.4s, v0.4s, 1 LLVM already does this trick. In RTL the code gets canonicalised from (plus x x) to (ashift x 1) so I opted to instead do this at the final assembly printing stage, similar to how we emit CMLT instead of SSHR elsewhere in the backend. I'd like to also do this for SVE shifts, but those will have to be separate patches. Signed-off-by: Kyrylo Tkachov gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_simd_imm_shl): Rewrite to new syntax. Add =w,w,vs1 alternative. * config/aarch64/constraints.md (vs1): New constraint. gcc/testsuite/ChangeLog: * gcc.target/aarch64/advsimd_shl_add.c: New test. 
Diff: --- gcc/config/aarch64/aarch64-simd.md | 12 ++-- gcc/config/aarch64/constraints.md | 6 ++ gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c | 64 ++ 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index cc612ec2ca0e..475f19766c38 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1352,12 +1352,14 @@ ) (define_insn "aarch64_simd_imm_shl" - [(set (match_operand:VDQ_I 0 "register_operand" "=w") - (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") - (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm" "Dl")))] + [(set (match_operand:VDQ_I 0 "register_operand") + (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))] "TARGET_SIMD" - "shl\t%0., %1., %2" - [(set_attr "type" "neon_shift_imm")] + {@ [ cons: =0, 1, 2 ; attrs: type ] + [ w , w, vs1 ; neon_add ] add\t%0., %1., %1. + [ w , w, Dl ; neon_shift_imm ] shl\t%0., %1., %2 + } ) (define_insn "aarch64_simd_reg_sshl" diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index a2878f580d90..f491e4bd6a06 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -667,6 +667,12 @@ SMAX and SMIN operations." (match_operand 0 "aarch64_sve_vsm_immediate")) +(define_constraint "vs1" + "@internal + A constraint that matches a vector of immediate one." 
+ (and (match_code "const,const_vector") + (match_test "op == CONST1_RTX (GET_MODE (op))"))) + (define_constraint "vsA" "@internal A constraint that matches an immediate operand valid for SVE FADD diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c new file mode 100644 index ..a161f89a3acc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd_shl_add.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-additional-options "--save-temps -O1" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +typedef __INT64_TYPE__ __attribute__ ((vector_size (16))) v2di; +typedef int __attribute__ ((vector_size (16))) v4si; +typedef short __attribute__ ((vector_size (16))) v8hi; +typedef char __attribute__ ((vector_size (16))) v16qi; +typedef short __attribute__ ((vector_size (8))) v4hi; +typedef char __attribute__ ((vector_size (8))) v8qi; + +#define FUNC(S) \ +S \ +foo_##S (S a) \ +{ return a << 1; } + +/* +** foo_v2di: +** addv0.2d, v0.2d, v0.2d +** ret +*/ + +FUNC (v2di) + +/* +** foo_v4si: +** addv0.4s, v0.4s, v0.4s +** ret +*/ + +FUNC (v4si) + +/* +** foo_v8hi: +** addv0.8h, v0.8h, v0.8h +** ret +*/ + +FUNC (v8hi) + +/* +** foo_v16qi: +** addv0.16b, v0.16b, v0.16b +** ret +*/ + +FUNC (v16qi) + +/* +** foo_v4hi: +** addv0.4h, v0.4h, v0.4h +** ret +*/ + +FUNC (v4hi) + +/* +** foo_v8qi: +** addv0.8b, v0.8b, v0.8b +** ret +*/ + +FUNC (v8qi) +
[gcc r15-2884] Use splay-tree-utils.h in tree-ssa-sccvn [PR30920]
https://gcc.gnu.org/g:9ab8681db6c7736357a8713afec7c7b09080cba9 commit r15-2884-g9ab8681db6c7736357a8713afec7c7b09080cba9 Author: Richard Sandiford Date: Mon Aug 12 10:52:29 2024 +0100 Use splay-tree-utils.h in tree-ssa-sccvn [PR30920] This patch is an attempt to gauge opinion on one way of fixing PR30920. The PR points out that the libiberty splay tree implementation does not implement the algorithm described by Sleator and Tarjan and has unclear complexity bounds. (It's also somewhat dangerous in that splay_tree_min and splay_tree_max walk the tree without splaying, meaning that they are fully linear in the worst case, rather than amortised logarithmic.) These properties have been carried over to typed-splay-tree.h. We could fix those problems directly in the existing implementations, and probably should for libiberty. But when I added rtl-ssa, I also added a third(!) splay tree implementation: splay-tree-utils.h. In response to Jeff's understandable unease about having three implementations, I was supposed to go back during the next stage 1 and reduce it to no more than two. I never did that. :-( splay-tree-utils.h is so called because rtl-ssa uses splay trees in structures that are relatively small and very size-sensitive. I therefore wanted to be able to embed the splay tree links directly in the structures, rather than pay the penalty of using separate nodes with one-way or two-way links between them. There were also operations for which it was convenient to treat the splay tree root as an explicitly managed cursor, rather than treating the tree as a pure ADT. The interface is therefore a bit more low-level than for the other implementations. I wondered whether the same trade-offs might apply to users of the libiberty splay trees. The first one I looked at in detail was SCC value numbering, which seemed like it would benefit from using splay-tree-utils.h directly. The patch does that. It also adds a couple of new helper routines to splay-tree-utils.h. 
I don't expect this approach to be the right one for every use of splay trees. E.g. splay tree used for omp gimplification would certainly need separate nodes. gcc/ PR other/30920 * splay-tree-utils.h (rooted_splay_tree::insert_relative) (rooted_splay_tree::lookup_le): New functions. (rooted_splay_tree::remove_root_and_splay_next): Likewise. * splay-tree-utils.tcc (rooted_splay_tree::insert_relative): New function, extracted from... (rooted_splay_tree::insert): ...here. (rooted_splay_tree::lookup_le): New function. (rooted_splay_tree::remove_root_and_splay_next): Likewise. * tree-ssa-sccvn.cc (pd_range::m_children): New member variable. (vn_walk_cb_data::vn_walk_cb_data): Initialize first_range. (vn_walk_cb_data::known_ranges): Use a default_splay_tree. (vn_walk_cb_data::~vn_walk_cb_data): Remove freeing of known_ranges. (pd_range_compare, pd_range_alloc, pd_range_dealloc): Delete. (vn_walk_cb_data::push_partial_def): Rewrite splay tree operations to use splay-tree-utils.h. * rtl-ssa/accesses.cc (function_info::add_use): Use insert_relative. Diff: --- gcc/rtl-ssa/accesses.cc | 8 ++-- gcc/splay-tree-utils.h | 29 + gcc/splay-tree-utils.tcc | 69 +++--- gcc/tree-ssa-sccvn.cc| 106 --- 4 files changed, 131 insertions(+), 81 deletions(-) diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc index 5e9077545a81..ef99759871aa 100644 --- a/gcc/rtl-ssa/accesses.cc +++ b/gcc/rtl-ssa/accesses.cc @@ -1232,16 +1232,16 @@ function_info::add_use (use_info *use) need_use_splay_tree (def); int comparison = lookup_use (def->m_use_tree, insn); gcc_checking_assert (comparison != 0); - splay_tree_node *neighbor = def->m_use_tree.root (); + use_info *neighbor = def->m_use_tree.root ()->value (); // If USE comes before NEIGHBOR, insert USE to NEIGHBOR's left, // otherwise insert USE to NEIGHBOR's right. 
auto *use_node = allocate> (use); - def->m_use_tree.insert_child (neighbor, comparison > 0, use_node); + def->m_use_tree.insert_relative (comparison, use_node); if (comparison > 0) -insert_use_after (use, neighbor->value ()); +insert_use_after (use, neighbor); else -insert_use_before (use, neighbor->value ()); +insert_use_before (use, neighbor); } void diff --git a/gcc/splay-tree-utils.h b/gcc/splay-tree-utils.h index 8344808f6d19..9526e0ba3363 100644 --- a/gcc/splay-tree-utils.h +++ b/gcc/splay-tree-utils.h @@ -185,6 +185,21 @@ public: template bool insert (node_type new_node, Comparator compare); + // Insert NEW_NODE into the spl
[gcc r15-2885] Avoid cfg corruption when using sjlj exceptions where loops are present in the assign_params emitted
https://gcc.gnu.org/g:40b9a7beb79acbea569be3066768cfb62c0f7c31 commit r15-2885-g40b9a7beb79acbea569be3066768cfb62c0f7c31 Author: Joern Rennecke Date: Mon Aug 12 11:04:51 2024 +0100 Avoid cfg corruption when using sjlj exceptions where loops are present in the assign_params emitted code. 2024-08-06 Joern Rennecke gcc/ * except.cc (sjlj_emit_function_enter): Set fn_begin_outside_block again if encountering a jump instruction. Diff: --- gcc/except.cc | 4 1 file changed, 4 insertions(+) diff --git a/gcc/except.cc b/gcc/except.cc index b5886e97be9c..5bb5edbd8065 100644 --- a/gcc/except.cc +++ b/gcc/except.cc @@ -1228,6 +1228,10 @@ sjlj_emit_function_enter (rtx_code_label *dispatch_label) else if (NOTE_INSN_BASIC_BLOCK_P (fn_begin)) fn_begin_outside_block = false; } +/* assign_params can indirectly call emit_block_move_via_loop, e.g. + for g++.dg/torture/pr85627.C for 16-bit targets. */ +else if (JUMP_P (fn_begin)) + fn_begin_outside_block = true; #ifdef DONT_USE_BUILTIN_SETJMP if (dispatch_label)
[gcc r15-2886] This fixes problems with tests that exceed a data type or the maximum stack frame size on 16 bit tar
https://gcc.gnu.org/g:46bd63d872ffb9733c4fff51033447f26ce56576 commit r15-2886-g46bd63d872ffb9733c4fff51033447f26ce56576 Author: Joern Rennecke Date: Mon Aug 12 11:13:24 2024 +0100 This fixes problems with tests that exceed a data type or the maximum stack frame size on 16 bit targets. Note: GCC has a limitation that a stack frame cannot exceed half the address space. For two tests the decision to modify or skip them seems not so clear-cut; I choose to modify gcc.dg/pr47893.c to use types that fit the numbers, as that seemed to have little impact on the test, and skip gcc.dg/pr115646.c for 16 bit, as layout of structs with bitfields members can have quite subtle rules. gcc/testsuite/ * gcc.dg/pr107523.c: Make sure variables can fit numbers. * gcc.dg/pr47893.c: Add dg-require-effective-target size20plus clause. * c-c++-common/torture/builtin-clear-padding-2.c: dg-require-effective-target size20plus. * gcc.dg/pr115646.c: dg-require-effective-target int32plus. * c-c++-common/analyzer/coreutils-sum-pr108666.c: For c++, expect a warning about exceeding maximum object size if not size20plus. * gcc.dg/torture/inline-mem-cpy-1.c: Like the included file, dg-require-effective-target ptr32plus. * gcc.dg/torture/inline-mem-cmp-1.c: Likewise. 
Diff: --- gcc/testsuite/c-c++-common/analyzer/coreutils-sum-pr108666.c | 2 +- gcc/testsuite/c-c++-common/torture/builtin-clear-padding-2.c | 1 + gcc/testsuite/gcc.dg/pr107523.c | 10 ++ gcc/testsuite/gcc.dg/pr115646.c | 1 + gcc/testsuite/gcc.dg/pr47893.c | 3 +++ gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c | 1 + gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c | 1 + 7 files changed, 14 insertions(+), 5 deletions(-) diff --git a/gcc/testsuite/c-c++-common/analyzer/coreutils-sum-pr108666.c b/gcc/testsuite/c-c++-common/analyzer/coreutils-sum-pr108666.c index dadd27eaf410..c41b61dd081d 100644 --- a/gcc/testsuite/c-c++-common/analyzer/coreutils-sum-pr108666.c +++ b/gcc/testsuite/c-c++-common/analyzer/coreutils-sum-pr108666.c @@ -35,7 +35,7 @@ bsd_sum_stream(FILE* stream, void* resstream, uintmax_t* length) int checksum = 0; uintmax_t total_bytes = 0; static const size_t buffer_length = 32768; - uint8_t* buffer = (uint8_t *) malloc(buffer_length); + uint8_t* buffer = (uint8_t *) malloc(buffer_length); /* { dg-warning "argument 1 value '32768' exceeds maximum object size 32767" "" { target { c++ && { ! 
size20plus } } } } */ if (!buffer) return -1; diff --git a/gcc/testsuite/c-c++-common/torture/builtin-clear-padding-2.c b/gcc/testsuite/c-c++-common/torture/builtin-clear-padding-2.c index 099f202ebc75..d91b01640dc4 100644 --- a/gcc/testsuite/c-c++-common/torture/builtin-clear-padding-2.c +++ b/gcc/testsuite/c-c++-common/torture/builtin-clear-padding-2.c @@ -1,5 +1,6 @@ /* PR libstdc++/88101 */ /* { dg-do run } */ +/* { dg-require-effective-target size20plus } */ typedef int T __attribute__((aligned (16384))); struct S { char a; short b; long double c; T d; T e; long long f; }; diff --git a/gcc/testsuite/gcc.dg/pr107523.c b/gcc/testsuite/gcc.dg/pr107523.c index 1e5ed46c6362..d6afa131249d 100644 --- a/gcc/testsuite/gcc.dg/pr107523.c +++ b/gcc/testsuite/gcc.dg/pr107523.c @@ -1,10 +1,12 @@ /* { dg-do run } */ /* { dg-options "-O2 " } */ -int a, b = 1; -unsigned int c = 1; +typedef __typeof__(465984011) i32; +typedef __typeof__(465984011U) u32; +i32 a, b = 1; +u32 c = 1; int main() { - int d = 1, f; + i32 d = 1, f; if (b) d = 0; a = -1; @@ -14,7 +16,7 @@ int main() { f = b; b = f; if (f <= a) { -int g = -(a && 1), h = g - f && a, i = ~(c / f) && 1 % (a | h); +i32 g = -(a && 1), h = g - f && a, i = ~(c / f) && 1 % (a | h); if (c) { g = f; if (i || (g && (g > -465984012))) diff --git a/gcc/testsuite/gcc.dg/pr115646.c b/gcc/testsuite/gcc.dg/pr115646.c index 24bc1e45..7938a309513f 100644 --- a/gcc/testsuite/gcc.dg/pr115646.c +++ b/gcc/testsuite/gcc.dg/pr115646.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-O2" } */ +/* { dg-require-effective-target int32plus } */ extern double pow(double x, double y); diff --git a/gcc/testsuite/gcc.dg/pr47893.c b/gcc/testsuite/gcc.dg/pr47893.c index 7e1064d03aa9..cb71132665f4 100644 --- a/gcc/testsuite/gcc.dg/pr47893.c +++ b/gcc/testsuite/gcc.dg/pr47893.c @@ -3,6 +3,9 @@ /* { dg-options "-O2" } */ /* { dg-additional-options "-mtune=atom -fno-omit-frame-pointer -fno-strict-aliasing" { target { { i?86-*-* x86_64-*-* } && ia32 } 
} } */ /* { dg-skip-if "Too much RAM needed" { "avr-*-*" } } */ +/* About 50 KB code, 33 KB stack, too big for byte-addressed + von Neumann targets with 16 bit addresses. */ +/* { dg-require-effective-targ
[gcc r15-2887] 16-bit testsuite fixes - excessive code size
https://gcc.gnu.org/g:24df2ab33c5e805054006e7b4b94d4270d82074f commit r15-2887-g24df2ab33c5e805054006e7b4b94d4270d82074f Author: Joern Rennecke Date: Mon Aug 12 11:30:02 2024 +0100 16-bit testsuite fixes - excessive code size gcc/testsuite/ * gcc.c-torture/execute/20021120-1.c: Skip if not size20plus or -Os. * gcc.dg/fixed-point/convert-float-4.c: Require size20plus. * gcc.dg/torture/pr112282.c: Skip if -O0 unless size20plus. * g++.dg/lookup/pr21802.C: Require size20plus. Diff: --- gcc/testsuite/g++.dg/lookup/pr21802.C | 1 + gcc/testsuite/gcc.c-torture/execute/20021120-1.c | 2 ++ gcc/testsuite/gcc.dg/fixed-point/convert-float-4.c | 1 + gcc/testsuite/gcc.dg/torture/pr112282.c| 1 + 4 files changed, 5 insertions(+) diff --git a/gcc/testsuite/g++.dg/lookup/pr21802.C b/gcc/testsuite/g++.dg/lookup/pr21802.C index 18b2219166a4..0b1d63c3c4b5 100644 --- a/gcc/testsuite/g++.dg/lookup/pr21802.C +++ b/gcc/testsuite/g++.dg/lookup/pr21802.C @@ -1,5 +1,6 @@ // PR c++/21802 // { dg-do run } +// { dg-require-effective-target size20plus } #include struct X; diff --git a/gcc/testsuite/gcc.c-torture/execute/20021120-1.c b/gcc/testsuite/gcc.c-torture/execute/20021120-1.c index 68043cc949cb..013e0a4650ec 100644 --- a/gcc/testsuite/gcc.c-torture/execute/20021120-1.c +++ b/gcc/testsuite/gcc.c-torture/execute/20021120-1.c @@ -1,3 +1,5 @@ +/* { dg-skip-if "memory tight" { ! size20plus } { "*" } { "-Os" } } */ + void abort (void); void exit (int); diff --git a/gcc/testsuite/gcc.dg/fixed-point/convert-float-4.c b/gcc/testsuite/gcc.dg/fixed-point/convert-float-4.c index c25a55c07155..a9275342f08e 100644 --- a/gcc/testsuite/gcc.dg/fixed-point/convert-float-4.c +++ b/gcc/testsuite/gcc.dg/fixed-point/convert-float-4.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-options "-std=gnu99 -O0" } */ +/* { dg-require-effective-target size20plus } */ /* C99 6.3 Conversions. 
diff --git a/gcc/testsuite/gcc.dg/torture/pr112282.c b/gcc/testsuite/gcc.dg/torture/pr112282.c index cfe364f9a841..a016f6e230cb 100644 --- a/gcc/testsuite/gcc.dg/torture/pr112282.c +++ b/gcc/testsuite/gcc.dg/torture/pr112282.c @@ -1,4 +1,5 @@ /* { dg-do run } */ +/* { dg-skip-if "memory tight" { ! size20plus } { "-O0" } } */ #if __SIZEOF_INT__ < 4 #define Xint __INT32_TYPE__
[gcc r15-2888] AVR: target/85624 - Fix non-matching alignment in clrmem* insns.
https://gcc.gnu.org/g:68da681e614c2750f648dac2cd0b2595999ca5d9 commit r15-2888-g68da681e614c2750f648dac2cd0b2595999ca5d9 Author: Georg-Johann Lay Date: Mon Aug 12 14:39:24 2024 +0200 AVR: target/85624 - Fix non-matching alignment in clrmem* insns. The clrmem* patterns don't use the provided alignment information, hence the setmemhi expander can just pass down 0 as alignment to the clrmem* insns. PR target/85624 gcc/ * config/avr/avr.md (setmemhi): Set alignment to 0. gcc/testsuite/ * gcc.target/avr/torture/pr85624.c: New test. Diff: --- gcc/config/avr/avr.md | 2 ++ gcc/testsuite/gcc.target/avr/torture/pr85624.c | 7 +++ 2 files changed, 9 insertions(+) diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index 84dfe4c40ecf..359343e563d5 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -1355,6 +1355,8 @@ gen_int_mode (INTVAL (operands[1]), mode)); rtx addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); operands[0] = gen_rtx_MEM (BLKmode, addr0); +// Alignment is unused; just set it to 0. +operands[3] = const0_rtx; }) diff --git a/gcc/testsuite/gcc.target/avr/torture/pr85624.c b/gcc/testsuite/gcc.target/avr/torture/pr85624.c new file mode 100644 index ..b183d4558df7 --- /dev/null +++ b/gcc/testsuite/gcc.target/avr/torture/pr85624.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ + +int foo (void) +{ + volatile int arr[3] __attribute__((aligned(128))) = { 0 }; + return arr[2]; +}
[gcc r15-2889] ifcvt: handle sequences that clobber flags in noce_convert_multiple_sets
https://gcc.gnu.org/g:28b3812c9d81203ae3d6a5350d8f828f4e659e50 commit r15-2889-g28b3812c9d81203ae3d6a5350d8f828f4e659e50 Author: Manolis Tsamis Date: Fri Jun 30 13:06:42 2023 +0200 ifcvt: handle sequences that clobber flags in noce_convert_multiple_sets This is an extension of what was done in PR106590. Currently if a sequence generated in noce_convert_multiple_sets clobbers the condition rtx (cc_cmp or rev_cc_cmp) then only seq1 is used afterwards (sequences that emit the comparison itself). Since this applies only from the next iteration it assumes that the sequences generated (in particular seq2) doesn't clobber the condition rtx itself before using it in the if_then_else, which is only true in specific cases (currently only register/subregister moves are allowed). This patch changes this so it also tests if seq2 clobbers cc_cmp/rev_cc_cmp in the current iteration. It also checks whether the resulting sequence clobbers the condition attached to the jump. This makes it possible to include arithmetic operations in noce_convert_multiple_sets. It also makes the code that checks whether the condition is used outside of the if_then_else emitted more robust. gcc/ChangeLog: * ifcvt.cc (check_for_cc_cmp_clobbers): Use modified_in_p instead. (noce_convert_multiple_sets_1): Don't use seq2 if it clobbers cc_cmp. Punt if seq clobbers cond. Refactor the code that sets read_comparison. Diff: --- gcc/ifcvt.cc | 127 +-- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc index 58ed42673e5b..58c34aaf1ee4 100644 --- a/gcc/ifcvt.cc +++ b/gcc/ifcvt.cc @@ -3592,20 +3592,6 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) return true; } -/* Helper function for noce_convert_multiple_sets_1. If store to - DEST can affect P[0] or P[1], clear P[0]. Called via note_stores. 
*/ - -static void -check_for_cc_cmp_clobbers (rtx dest, const_rtx, void *p0) -{ - rtx *p = (rtx *) p0; - if (p[0] == NULL_RTX) -return; - if (reg_overlap_mentioned_p (dest, p[0]) - || (p[1] && reg_overlap_mentioned_p (dest, p[1]))) -p[0] = NULL_RTX; -} - /* This goes through all relevant insns of IF_INFO->then_bb and tries to create conditional moves. In case a simple move sufficis the insn should be listed in NEED_NO_CMOV. The rewired-src cases should be @@ -3731,36 +3717,71 @@ noce_convert_multiple_sets_1 (struct noce_if_info *if_info, creating an additional compare for each. If successful, costing is easier and this sequence is usually preferred. */ if (cc_cmp) - seq2 = try_emit_cmove_seq (if_info, temp, cond, - new_val, old_val, need_cmov, - &cost2, &temp_dest2, cc_cmp, rev_cc_cmp); + { + seq2 = try_emit_cmove_seq (if_info, temp, cond, +new_val, old_val, need_cmov, +&cost2, &temp_dest2, cc_cmp, rev_cc_cmp); + + /* The if_then_else in SEQ2 may be affected when cc_cmp/rev_cc_cmp is +clobbered. We can't safely use the sequence in this case. */ + for (rtx_insn *iter = seq2; iter; iter = NEXT_INSN (iter)) + if (modified_in_p (cc_cmp, iter) + || (rev_cc_cmp && modified_in_p (rev_cc_cmp, iter))) + { + seq2 = NULL; + break; + } + } /* The backend might have created a sequence that uses the -condition. Check this. */ +condition as a value. Check this. */ + + /* We cannot handle anything more complex than a reg or constant. 
*/ + if (!REG_P (XEXP (cond, 0)) && !CONSTANT_P (XEXP (cond, 0))) + read_comparison = true; + + if (!REG_P (XEXP (cond, 1)) && !CONSTANT_P (XEXP (cond, 1))) + read_comparison = true; + rtx_insn *walk = seq2; - while (walk) + int if_then_else_count = 0; + while (walk && !read_comparison) { - rtx set = single_set (walk); + rtx exprs_to_check[2]; + unsigned int exprs_count = 0; - if (!set || !SET_SRC (set)) + rtx set = single_set (walk); + if (set && XEXP (set, 1) + && GET_CODE (XEXP (set, 1)) == IF_THEN_ELSE) { - walk = NEXT_INSN (walk); - continue; + /* We assume that this is the cmove created by the backend that +naturally uses the condition. */ + exprs_to_check[exprs_count++] = XEXP (XEXP (set, 1), 1); + exprs_to_check[exprs_count++] = XEXP (XEXP (set, 1), 2); + if_then_else_count++; } + else if (NONDEBUG_INSN_P (walk)) + exprs_to_check[exprs_count++] = PATTERN (walk); - rtx src = SET_S
[gcc r15-2890] ifcvt: Allow more operations in multiple set if conversion
https://gcc.gnu.org/g:72c9b5f438f22cca493b4e2a8a2a31ff61bf1477 commit r15-2890-g72c9b5f438f22cca493b4e2a8a2a31ff61bf1477 Author: Manolis Tsamis Date: Fri Jun 30 14:05:15 2023 +0200 ifcvt: Allow more operations in multiple set if conversion Currently the operations allowed for if conversion of a basic block with multiple sets are few, namely REG, SUBREG and CONST_INT (as controlled by bb_ok_for_noce_convert_multiple_sets). This commit allows more operations (arithmetic, compare, etc) to participate in if conversion. The target's profitability hook and ifcvt's costing is expected to reject sequences that are unprofitable. This is especially useful for targets which provide a rich selection of conditional instructions (like aarch64 which has cinc, csneg, csinv, ccmp, ...) which are currently not used in basic blocks with more than a single set. For targets that have a rich selection of conditional instructions, like aarch64, we have seen an ~5x increase of profitable if conversions for multiple set blocks in SPEC CPU 2017 benchmarks. gcc/ChangeLog: * ifcvt.cc (try_emit_cmove_seq): Modify comments. (noce_convert_multiple_sets_1): Modify comments. (bb_ok_for_noce_convert_multiple_sets): Allow more operations. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ifcvt_multiple_sets_arithm.c: New test. Diff: --- gcc/ifcvt.cc | 34 -- .../aarch64/ifcvt_multiple_sets_arithm.c | 79 ++ 2 files changed, 92 insertions(+), 21 deletions(-) diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc index 58c34aaf1ee4..f496a46e600d 100644 --- a/gcc/ifcvt.cc +++ b/gcc/ifcvt.cc @@ -3432,13 +3432,13 @@ try_emit_cmove_seq (struct noce_if_info *if_info, rtx temp, /* We have something like: if (x > y) - { i = a; j = b; k = c; } + { i = EXPR_A; j = EXPR_B; k = EXPR_C; } Make it: - tmp_i = (x > y) ? a : i; - tmp_j = (x > y) ? b : j; - tmp_k = (x > y) ? c : k; + tmp_i = (x > y) ? EXPR_A : i; + tmp_j = (x > y) ? EXPR_B : j; + tmp_k = (x > y) ? 
EXPR_C : k; i = tmp_i; j = tmp_j; k = tmp_k; @@ -3858,11 +3858,10 @@ noce_convert_multiple_sets_1 (struct noce_if_info *if_info, -/* Return true iff basic block TEST_BB is comprised of only - (SET (REG) (REG)) insns suitable for conversion to a series - of conditional moves. Also check that we have more than one set - (other routines can handle a single set better than we would), and - fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets. While going +/* Return true iff basic block TEST_BB is suitable for conversion to a + series of conditional moves. Also check that we have more than one + set (other routines can handle a single set better than we would), + and fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets. While going through the insns store the sum of their potential costs in COST. */ static bool @@ -3888,20 +3887,13 @@ bb_ok_for_noce_convert_multiple_sets (basic_block test_bb, unsigned *cost) rtx dest = SET_DEST (set); rtx src = SET_SRC (set); - /* We can possibly relax this, but for now only handle REG to REG -(including subreg) moves. This avoids any issues that might come -from introducing loads/stores that might violate data-race-freedom -guarantees. */ - if (!REG_P (dest)) - return false; - - if (!((REG_P (src) || CONSTANT_P (src)) - || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) - && subreg_lowpart_p (src + /* Do not handle anything involving memory loads/stores since it might +violate data-race-freedom guarantees. */ + if (!REG_P (dest) || contains_mem_rtx_p (src)) return false; - /* Destination must be appropriate for a conditional write. */ - if (!noce_operand_ok (dest)) + /* Destination and source must be appropriate. */ + if (!noce_operand_ok (dest) || !noce_operand_ok (src)) return false; /* We must be able to conditionally move in this mode. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c b/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c new file mode 100644 index ..ba7f948aba57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ifcvt_multiple_sets_arithm.c @@ -0,0 +1,79 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-ce1" } */ + +void sink2(int, int); +void sink3(int, int, int); + +void cond1(int cond, int x, int y) +{ + if (cond) +{ + x = x << 4; + y = 1; +} + + sink2(x, y); +} + +void cond2(int cond, int x, int y) +{ + if (cond) +{ + x++; + y++; +} + + sink2(x, y); +} + +void cond3(int cond, int x1, int x2, int x3) +{ + if (cond) +{ + x1++; + x2++; + x3++; +} +
[gcc r15-2891] ifcvt: Handle multiple rewired regs and refactor noce_convert_multiple_sets
https://gcc.gnu.org/g:b219cbeda72d23b7ad6ff12cd159784b7ef00667 commit r15-2891-gb219cbeda72d23b7ad6ff12cd159784b7ef00667 Author: Manolis Tsamis Date: Tue Aug 29 11:47:39 2023 +0200 ifcvt: Handle multiple rewired regs and refactor noce_convert_multiple_sets The existing implementation of need_cmov_or_rewire and noce_convert_multiple_sets_1 assumes that sets are either REG or SUBREG. This commit enhances them so they can handle/rewire arbitrary set statements. To do that a new helper struct noce_multiple_sets_info is introduced which is used by noce_convert_multiple_sets and its helper functions. This results in cleaner function signatures, improved efficiency (a number of vecs and hash set/map are replaced with a single vec of struct) and simplicity. gcc/ChangeLog: * ifcvt.cc (need_cmov_or_rewire): Renamed init_noce_multiple_sets_info. (init_noce_multiple_sets_info): Initialize noce_multiple_sets_info. (noce_convert_multiple_sets_1): Use noce_multiple_sets_info and handle rewiring of multiple registers. (noce_convert_multiple_sets): Updated to use noce_multiple_sets_info. * ifcvt.h (struct noce_multiple_sets_info): Introduce new struct noce_multiple_sets_info to store info for noce_convert_multiple_sets. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ifcvt_multiple_sets_rewire.c: New test. 
Diff: --- gcc/ifcvt.cc | 243 + gcc/ifcvt.h| 16 ++ .../aarch64/ifcvt_multiple_sets_rewire.c | 20 ++ 3 files changed, 141 insertions(+), 138 deletions(-) diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc index f496a46e600d..3e25f30b67ee 100644 --- a/gcc/ifcvt.cc +++ b/gcc/ifcvt.cc @@ -98,14 +98,10 @@ static bool dead_or_predicable (basic_block, basic_block, basic_block, edge, bool); static void noce_emit_move_insn (rtx, rtx); static rtx_insn *block_has_only_trap (basic_block); -static void need_cmov_or_rewire (basic_block, hash_set *, -hash_map *); +static void init_noce_multiple_sets_info (basic_block, + auto_delete_vec &); static bool noce_convert_multiple_sets_1 (struct noce_if_info *, - hash_set *, - hash_map *, - auto_vec *, - auto_vec *, - auto_vec *, int *); + auto_delete_vec &, int *); /* Count the number of non-jump active insns in BB. */ @@ -3487,24 +3483,13 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) rtx x = XEXP (cond, 0); rtx y = XEXP (cond, 1); - /* The true targets for a conditional move. */ - auto_vec targets; - /* The temporaries introduced to allow us to not consider register - overlap. */ - auto_vec temporaries; - /* The insns we've emitted. 
*/ - auto_vec unmodified_insns; - - hash_set need_no_cmov; - hash_map rewired_src; - - need_cmov_or_rewire (then_bb, &need_no_cmov, &rewired_src); + auto_delete_vec insn_info; + init_noce_multiple_sets_info (then_bb, insn_info); int last_needs_comparison = -1; bool ok = noce_convert_multiple_sets_1 -(if_info, &need_no_cmov, &rewired_src, &targets, &temporaries, - &unmodified_insns, &last_needs_comparison); +(if_info, insn_info, &last_needs_comparison); if (!ok) return false; @@ -3519,8 +3504,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) end_sequence (); start_sequence (); ok = noce_convert_multiple_sets_1 - (if_info, &need_no_cmov, &rewired_src, &targets, &temporaries, -&unmodified_insns, &last_needs_comparison); + (if_info, insn_info, &last_needs_comparison); /* Actually we should not fail anymore if we reached here, but better still check. */ if (!ok) @@ -3529,12 +3513,12 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) /* We must have seen some sort of insn to insert, otherwise we were given an empty BB to convert, and we can't handle that. */ - gcc_assert (!unmodified_insns.is_empty ()); + gcc_assert (!insn_info.is_empty ()); /* Now fixup the assignments. */ - for (unsigned i = 0; i < targets.length (); i++) -if (targets[i] != temporaries[i]) - noce_emit_move_insn (targets[i], temporaries[i]); + for (unsigned i = 0; i < insn_info.length (); i++) +if (insn_info[i]->target != insn_info[i]->temporary) + noce_emit_move_insn (insn_info[i]->target, insn_info[i]->temporary); /* Actually emit the sequence if it isn't too expensive. */ rtx_insn *seq = get_insns (); @@ -3549,10 +3533,10 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) set_used_flags (insn); /* Mark all our temporari
[gcc r15-2892] borrowck: Avoid overloading issues on 32bit architectures
https://gcc.gnu.org/g:12028d7b97a89e6f160b43c70cbf660583039e9b commit r15-2892-g12028d7b97a89e6f160b43c70cbf660583039e9b Author: Arthur Cohen Date: Fri Aug 2 11:18:51 2024 +0200 borrowck: Avoid overloading issues on 32bit architectures On architectures where `size_t` is `unsigned int`, such as 32bit x86, we encounter an issue with `PlaceId` and `FreeRegion` being aliases to the same types. This poses an issue for overloading functions for these two types, such as `push_subset` in that case. This commit renames one of these `push_subset` functions to avoid the issue, but this should be fixed with a newtype pattern for these two types. gcc/rust/ChangeLog: * checks/errors/borrowck/rust-bir-fact-collector.h (points): Rename `push_subset(PlaceId, PlaceId)` to `push_subset_place(PlaceId, PlaceId)` Diff: --- gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h b/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h index bb8fedaf3db7..6601c981779f 100644 --- a/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h +++ b/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h @@ -334,7 +334,7 @@ protected: // Main collection entry points (for different categories). expr.get_rhs () - 1, current_bb, current_stmt); issue_read_move (expr.get_rhs ()); -push_subset (lhs, expr.get_rhs ()); +push_place_subset (lhs, expr.get_rhs ()); } void visit (const CallExpr &expr) override @@ -660,7 +660,7 @@ protected: // Subset helpers. } } - void push_subset (PlaceId lhs, PlaceId rhs) + void push_place_subset (PlaceId lhs, PlaceId rhs) { auto &lhs_place = place_db[lhs]; auto &rhs_place = place_db[rhs];
[gcc r15-2893] borrowck: Fix debug prints on 32-bits architectures
https://gcc.gnu.org/g:edc47d3ac95734b6076187d00feb6b49931ad1cc commit r15-2893-gedc47d3ac95734b6076187d00feb6b49931ad1cc Author: Arthur Cohen Date: Fri Aug 2 11:10:52 2024 +0200 borrowck: Fix debug prints on 32-bits architectures gcc/rust/ChangeLog: * checks/errors/borrowck/rust-bir-builder.h: Cast size_t values to unsigned long before printing. * checks/errors/borrowck/rust-bir-fact-collector.h: Likewise. Diff: --- gcc/rust/checks/errors/borrowck/rust-bir-builder.h| 3 ++- gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h | 6 -- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/gcc/rust/checks/errors/borrowck/rust-bir-builder.h b/gcc/rust/checks/errors/borrowck/rust-bir-builder.h index e9108703be1d..6b29eaeba5a8 100644 --- a/gcc/rust/checks/errors/borrowck/rust-bir-builder.h +++ b/gcc/rust/checks/errors/borrowck/rust-bir-builder.h @@ -91,7 +91,8 @@ private: ctx.fn_free_regions[bound.second.get_index ()]); auto last_bound = universal_region_bounds.back (); - rust_debug ("\t\t %ld: %ld", last_bound.first, last_bound.second); + rust_debug ("\t\t %lu: %lu", (unsigned long) last_bound.first, + (unsigned long) last_bound.second); } // TODO: handle type_region constraints diff --git a/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h b/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h index 6601c981779f..1cd6b4d480d8 100644 --- a/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h +++ b/gcc/rust/checks/errors/borrowck/rust-bir-fact-collector.h @@ -621,14 +621,16 @@ protected: // Generic BIR operations. protected: // Subset helpers. 
void push_subset (FreeRegion lhs, FreeRegion rhs) { -rust_debug ("\t\tpush_subset: '?%lu: '?%lu", lhs, rhs); +rust_debug ("\t\tpush_subset: '?%lu: '?%lu", (unsigned long) lhs, + (unsigned long) rhs); facts.subset_base.emplace_back (lhs, rhs, get_current_point_mid ()); } void push_subset_all (FreeRegion lhs, FreeRegion rhs) { -rust_debug ("\t\tpush_subset_all: '?%lu: '?%lu", lhs, rhs); +rust_debug ("\t\tpush_subset_all: '?%lu: '?%lu", (unsigned long) lhs, + (unsigned long) rhs); for (auto point : cfg_points_all) facts.subset_base.emplace_back (lhs, rhs, point);
[gcc r15-2894] [rtl-optimization/116244] Don't create bogus regs in alter_subreg
https://gcc.gnu.org/g:e9738e77674e23f600315ca1efed7d1c7944d0cc commit r15-2894-ge9738e77674e23f600315ca1efed7d1c7944d0cc Author: Jeff Law Date: Mon Aug 12 07:29:25 2024 -0600 [rtl-optimization/116244] Don't create bogus regs in alter_subreg So this is another nasty latent bug exposed by ext-dce. Similar to the prior m68k failure it's another problem with how we handle paradoxical subregs on big endian targets. In this instance when we remove the hard subregs we take something like: (subreg:DI (reg:SI 0) 0) And turn it into (reg:SI -1) Which is clearly wrong. (reg:SI 0) is correct. The transformation happens in alter_subreg, but I really wanted to fix this in subreg_regno since we could have similar problems in some of the other callers of subreg_regno. Unfortunately reload depends on the current behavior of subreg_regno; in the cases where the return value is an invalid register, the wrong half of a register pair, etc the resulting bogus value is detected by reload and triggers reloading of the inner object. So that's the new comment in subreg_regno. The second best place to fix is alter_subreg which is what this patch does. If presented with a paradoxical subreg, then the base register number should always be REGNO (SUBREG_REG (object)). It's just how paradoxicals are designed to work. I haven't tried to fix the other places that call subreg_regno. After being burned by reload, I'm more than a bit worried about unintended fallout. I must admit I'm surprised we haven't stumbled over this before and that it didn't fix any failures on the big endian embedded targets. Boostrapped & regression tested on x86_64, also went through all the embedded targets in my tester and bootstrapped on m68k & s390x to get some additional big endian testing. Pushing to the trunk. rtl-optimization/116244 gcc/ * rtlanal.cc (subreg_regno): Update comment. * final.cc (alter_subrg): Always use REGNO (SUBREG_REG ()) to get the base regsiter for paradoxical subregs. 
gcc/testsuite/ * g++.target/m68k/m68k.exp: New test driver. * g++.target/m68k/pr116244.C: New test. Diff: --- gcc/final.cc | 12 +- gcc/rtlanal.cc | 11 +- gcc/testsuite/g++.target/m68k/m68k.exp | 34 + gcc/testsuite/g++.target/m68k/pr116244.C | 226 +++ 4 files changed, 281 insertions(+), 2 deletions(-) diff --git a/gcc/final.cc b/gcc/final.cc index eb9e065d9f0a..0167b2f8602b 100644 --- a/gcc/final.cc +++ b/gcc/final.cc @@ -3123,7 +3123,17 @@ alter_subreg (rtx *xp, bool final_p) unsigned int regno; poly_int64 offset; - regno = subreg_regno (x); + /* A paradoxical should always be REGNO (y) + 0. Using subreg_regno +for something like (subreg:DI (reg:SI N) 0) on a WORDS_BIG_ENDIAN +target will return N-1 which is catastrophic for N == 0 and just +wrong for other cases. + +Fixing subreg_regno would be a better option, except that reload +depends on its current behavior. */ + if (paradoxical_subreg_p (x)) + regno = REGNO (y); + else + regno = subreg_regno (x); if (subreg_lowpart_p (x)) offset = byte_lowpart_offset (GET_MODE (x), GET_MODE (y)); else diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc index 4158a531bdd7..6f6e6544755d 100644 --- a/gcc/rtlanal.cc +++ b/gcc/rtlanal.cc @@ -4313,7 +4313,16 @@ lowpart_subreg_regno (unsigned int regno, machine_mode xmode, return simplify_subreg_regno (regno, xmode, offset, ymode); } -/* Return the final regno that a subreg expression refers to. */ +/* Return the final regno that a subreg expression refers to. + + Callers such as reload_inner_reg_of_subreg rely on this returning + the simplified hard reg, even if that result is not a valid regno for + the given mode. That triggers reloading the inner part of the + subreg. + + That inherently means other uses of this routine probably need + to be audited for their behavior when requested subreg can't + be expressed as a hard register after apply the offset. 
*/ unsigned int subreg_regno (const_rtx x) { diff --git a/gcc/testsuite/g++.target/m68k/m68k.exp b/gcc/testsuite/g++.target/m68k/m68k.exp new file mode 100644 index ..8f6416e9fdfc --- /dev/null +++ b/gcc/testsuite/g++.target/m68k/m68k.exp @@ -0,0 +1,34 @@ +# Copyright (C) 2019-2024 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version.
[gcc r15-2895] RISC-V: Fix missing abi arg in test
https://gcc.gnu.org/g:ef90a136da4c3e0b28997da25c30fdce1bcb115c commit r15-2895-gef90a136da4c3e0b28997da25c30fdce1bcb115c Author: Edwin Lu Date: Wed Aug 7 10:34:10 2024 -0700 RISC-V: Fix missing abi arg in test The following test was failing when building on 32 bit targets due to not overwriting the mabi arg. This resulted in dejagnu attempting to run the test with -mabi=ilp32d -march=rv64gcv_zvl256b gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/pr116202-run-1.c: Add mabi arg Signed-off-by: Edwin Lu Diff: --- gcc/testsuite/gcc.target/riscv/rvv/base/pr116202-run-1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr116202-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr116202-run-1.c index d150f20b5d93..02814183dbb9 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/pr116202-run-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr116202-run-1.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O3 -march=rv64gcv_zvl256b -fdump-rtl-expand-details" } */ +/* { dg-options "-O3 -march=rv64gcv_zvl256b -mabi=lp64d -fdump-rtl-expand-details" } */ int b[24]; _Bool c[24];
[gcc r15-2896] rs6000: ROP - Do not disable shrink-wrapping for leaf functions [PR114759]
https://gcc.gnu.org/g:0451bc503da9c858e9f1ddfb8faec367c2e032c8 commit r15-2896-g0451bc503da9c858e9f1ddfb8faec367c2e032c8 Author: Peter Bergner Date: Tue Jun 18 17:42:45 2024 -0500 rs6000: ROP - Do not disable shrink-wrapping for leaf functions [PR114759] Only disable shrink-wrapping when using -mrop-protect when we know we will be emitting the ROP-protect hash instructions (ie, non-leaf functions). 2024-06-17 Peter Bergner gcc/ PR target/114759 * config/rs6000/rs6000.cc (rs6000_override_options_after_change): Move the disabling of shrink-wrapping from here * config/rs6000/rs6000-logue.cc (rs6000_emit_prologue): ...to here. gcc/testsuite/ PR target/114759 * gcc.target/powerpc/pr114759-1.c: New test. Diff: --- gcc/config/rs6000/rs6000-logue.cc | 5 + gcc/config/rs6000/rs6000.cc | 4 gcc/testsuite/gcc.target/powerpc/pr114759-1.c | 16 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/gcc/config/rs6000/rs6000-logue.cc b/gcc/config/rs6000/rs6000-logue.cc index edc0d6c8f520..fdb6414f486f 100644 --- a/gcc/config/rs6000/rs6000-logue.cc +++ b/gcc/config/rs6000/rs6000-logue.cc @@ -3012,6 +3012,11 @@ rs6000_emit_prologue (void) && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl)) == NULL)); + /* If we are inserting ROP-protect hash instructions, disable shrink-wrap + until the bug where the hashst insn is emitted in the wrong location + is fixed. See PR101324 for details. */ + if (info->rop_hash_size) +flag_shrink_wrap = 0; frame_pointer_needed_indeed = frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 0bcc6a2d0ab6..f2bd9edea8a1 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -3431,10 +3431,6 @@ rs6000_override_options_after_change (void) else if (!OPTION_SET_P (flag_cunroll_grow_size)) flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; - /* If we are inserting ROP-protect instructions, disable shrink wrap. 
*/ - if (rs6000_rop_protect) -flag_shrink_wrap = 0; - /* One of the late-combine passes runs after register allocation and can match define_insn_and_splits that were previously used only before register allocation. Some of those define_insn_and_splits diff --git a/gcc/testsuite/gcc.target/powerpc/pr114759-1.c b/gcc/testsuite/gcc.target/powerpc/pr114759-1.c new file mode 100644 index ..579e08e920f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr114759-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mdejagnu-cpu=power10 -mrop-protect -fdump-rtl-pro_and_epilogue" } */ +/* { dg-require-effective-target rop_ok } Only enable on supported ABIs. */ + +/* Verify we still attempt shrink-wrapping when using -mrop-protect + and there are no function calls. */ + +long +foo (long arg) +{ + if (arg) +asm ("" ::: "r20"); + return 0; +} + +/* { dg-final { scan-rtl-dump-times "Performing shrink-wrapping" 1 "pro_and_epilogue" } } */
[gcc(refs/users/meissner/heads/work175-bugs)] Optimize vec_splats of vec_extract for V2DI/V2DF (PR target/99293)
https://gcc.gnu.org/g:bbd50ab9d9c6aac588b3e7a3889b2cf317cf18c6 commit bbd50ab9d9c6aac588b3e7a3889b2cf317cf18c6 Author: Michael Meissner Date: Mon Aug 12 17:37:25 2024 -0400 Optimize vec_splats of vec_extract for V2DI/V2DF (PR target/99293) This patch optimizes cases like: vector double v1, v2; /* ... */ v2 = vec_splats (vec_extract (v1, 0); /* or */ v2 = vec_splats (vec_extract (v1, 1); Previously: vector long long splat_dup_l_0 (vector long long v) { return __builtin_vec_splats (__builtin_vec_extract (v, 0)); } would generate: mfvsrld 9,34 mtvsrdd 34,9,9 blr With this patch, GCC generates: xxpermdi 34,34,34,3 blr I have tested this patch on the following systems and there was no degration. Can I check it into the trunk branch? * Power10, LE, --with-cpu=power10, IBM 128-bit long double * Power9, LE, --with-cpu=power9, IBM 128-bit long double * Power9, LE, --with-cpu=power9, IEEE 128-bit long double * Power9, LE, --with-cpu=power9, 64-bit default long double * Power9, BE, --with-cpu=power9, IBM 128-bit long double * Power8, BE, --with-cpu=power8, IBM 128-bit long double 2024-08-12 Michael Meissner gcc/ PR target/99293 * gcc/config/rs6000/vsx.md (vsx_splat_extract_): New combiner insn. gcc/testsuite/ PR target/108958 * gcc.target/powerpc/pr99293.c: New test. * gcc.target/powerpc/builtins-1.c: Update insn count. Diff: --- gcc/config/rs6000/altivec.md | 51 ++ gcc/config/rs6000/predicates.md| 63 + gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 + 3 files changed, 221 insertions(+) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index aa9d8fffc901..8a7926eb369a 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,6 +170,7 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB + UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2176,6 +2177,56 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) +;; Optimize V2DI shifts by constants. This relies on the shift instructions +;; only looking at the bits needed to do the shift. 
This means we can use +;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits +;; that the vector shift instructions will not use. +(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") +(V2DI "TARGET_P8_VECTOR")]) + +(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) +(define_code_attr vshift_attr [(ashift "ashift") +(ashiftrt "ashiftrt") +(lshiftrt "lshiftrt")]) + +(define_insn_and_split "*altivec___const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (vshift_code:VSHIFT_MODE +(match_operand:VSHIFT_MODE 1 "register_operand" "v") +(match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) + (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] + "((mode == V2DImode && TARGET_P8_VECTOR) +|| (mode == V4SImode && TARGET_P9_VECTOR))" + "#" + "&& 1" + [(set (match_dup 3) + (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) + (set (match_dup 0) + (vshift_code:VSHIFT_MODE (match_dup 1) +(match_dup 3)))] +{ + if (GET_CODE (operands[3]) == SCRATCH) +operands[3] = gen_reg_rtx (mode); + + operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR) +? 
CONST_VECTOR_ELT (operands[2], 0) +: XEXP (operands[2], 0)); +}) + +(define_insn "*altivec__shift_const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] + UNSPEC_VECTOR_SHIFT))] + "TARGET_P8_VECTOR" +{ + if (UINTVAL (operands[1]) <= 15) +return "vspltisw %0,%1"; + else if (TARGET_P9_VECTOR) +return "xxspltib %x0,%1"; + else +gcc_unreachable (); +}) + (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index d23ce9a77a3f..4fca37bc9345 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,6 +861,69 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) +;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element +;; is the same constant, and the constant can be used for a shift opera
[gcc(refs/users/meissner/heads/work175-bugs)] Update ChangeLog.*
https://gcc.gnu.org/g:762bfe72ee49bd65474216af729118bc94272d1a commit 762bfe72ee49bd65474216af729118bc94272d1a Author: Michael Meissner Date: Mon Aug 12 17:41:16 2024 -0400 Update ChangeLog.* Diff: --- gcc/ChangeLog.bugs | 56 ++ 1 file changed, 56 insertions(+) diff --git a/gcc/ChangeLog.bugs b/gcc/ChangeLog.bugs index e8b978dd3d0f..27759bef3284 100644 --- a/gcc/ChangeLog.bugs +++ b/gcc/ChangeLog.bugs @@ -1,3 +1,59 @@ + Branch work175-bugs, patch #310 + +Optimize vec_splats of vec_extract for V2DI/V2DF (PR target/99293) + +This patch optimizes cases like: + + vector double v1, v2; + /* ... */ + v2 = vec_splats (vec_extract (v1, 0); /* or */ + v2 = vec_splats (vec_extract (v1, 1); + +Previously: + + vector long long + splat_dup_l_0 (vector long long v) + { + return __builtin_vec_splats (__builtin_vec_extract (v, 0)); + } + +would generate: + +mfvsrld 9,34 +mtvsrdd 34,9,9 +blr + +With this patch, GCC generates: + +xxpermdi 34,34,34,3 + blr + + +I have tested this patch on the following systems and there was no degration. +Can I check it into the trunk branch? + +* Power10, LE, --with-cpu=power10, IBM 128-bit long double +* Power9, LE, --with-cpu=power9, IBM 128-bit long double +* Power9, LE, --with-cpu=power9, IEEE 128-bit long double +* Power9, LE, --with-cpu=power9, 64-bit default long double +* Power9, BE, --with-cpu=power9, IBM 128-bit long double +* Power8, BE, --with-cpu=power8, IBM 128-bit long double + +2024-08-12 Michael Meissner + +gcc/ + + PR target/99293 + * gcc/config/rs6000/vsx.md (vsx_splat_extract_): New combiner + insn. + +gcc/testsuite/ + + PR target/108958 + * gcc.target/powerpc/pr99293.c: New test. + * gcc.target/powerpc/builtins-1.c: Update insn count. + + Branch work175-bugs, patch #303 Do not add -mvsx when testing the float128 support.
[gcc(refs/users/meissner/heads/work175-bugs)] Revert changes
https://gcc.gnu.org/g:887a3787b425e7792aaa815e6198feaf8b43eb0d commit 887a3787b425e7792aaa815e6198feaf8b43eb0d Author: Michael Meissner Date: Mon Aug 12 17:53:35 2024 -0400 Revert changes Diff: --- gcc/config/rs6000/altivec.md | 51 -- gcc/config/rs6000/predicates.md| 63 - gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 - 3 files changed, 221 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 8a7926eb369a..aa9d8fffc901 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,7 +170,6 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB - UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2177,56 +2176,6 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) -;; Optimize V2DI shifts by constants. This relies on the shift instructions -;; only looking at the bits needed to do the shift. This means we can use -;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits -;; that the vector shift instructions will not use. -(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") -(V2DI "TARGET_P8_VECTOR")]) - -(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) -(define_code_attr vshift_attr [(ashift "ashift") -(ashiftrt "ashiftrt") -(lshiftrt "lshiftrt")]) - -(define_insn_and_split "*altivec___const" - [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") - (vshift_code:VSHIFT_MODE -(match_operand:VSHIFT_MODE 1 "register_operand" "v") -(match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) - (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] - "((mode == V2DImode && TARGET_P8_VECTOR) -|| (mode == V4SImode && TARGET_P9_VECTOR))" - "#" - "&& 1" - [(set (match_dup 3) - (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) - (set (match_dup 0) - (vshift_code:VSHIFT_MODE (match_dup 1) -(match_dup 3)))] -{ - if (GET_CODE (operands[3]) == SCRATCH) -operands[3] = gen_reg_rtx (mode); - - operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR) -? 
CONST_VECTOR_ELT (operands[2], 0) -: XEXP (operands[2], 0)); -}) - -(define_insn "*altivec__shift_const" - [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") - (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] - UNSPEC_VECTOR_SHIFT))] - "TARGET_P8_VECTOR" -{ - if (UINTVAL (operands[1]) <= 15) -return "vspltisw %0,%1"; - else if (TARGET_P9_VECTOR) -return "xxspltib %x0,%1"; - else -gcc_unreachable (); -}) - (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 4fca37bc9345..d23ce9a77a3f 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,69 +861,6 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) -;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element -;; is the same constant, and the constant can be used for a shift operation. -;; This is to prevent sub-optimal code, that needs to load up the constant and -;; then zero extend it 32 or 64-bit vectors or load up the constant from the -;; literal pool. -;; -;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by -;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction. -;; For V2DImode, we do this all of the time, since there is no convenient -;; instruction to load up a vector long long splatted constant. -;; -;; If we can use XXSPLTIB, then allow constants up to 63. If not, we restrict -;; the constant to 0..15 that can be loaded with VSPLTISW. V4SI shifts are -;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31. Values -;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't -;; need this optimization. 
-(define_predicate "vector_shift_constant" - (match_code "const_vector,vec_duplicate") -{ - unsigned HOST_WIDE_INT min_value; - - if (mode == V2DImode) -{ - min_value = 0; - if (!TARGET_P8_VECTOR) - return 0; -} - else if (mode == V4SImode) -{ - min_value = 16; - if (!TARGET_P9_VECTOR) - return 0; -} - else -return 0; - - unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15; - - if (GET_CODE (op) == CONST_VECTOR) -{ - unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0)); - unsigned nunits = GET_MODE_NUNITS (mode); - unsigned i; - - if (!IN_RANGE (first, min_value, max_value)) - return 0; - - for (i = 1; i < nunits;
[gcc(refs/users/meissner/heads/work175-bugs)] Add better support for shifting vectors with 64-bit elements
https://gcc.gnu.org/g:910aad6838939ac024abe5a4e24801d1e6da0eff commit 910aad6838939ac024abe5a4e24801d1e6da0eff Author: Michael Meissner Date: Mon Aug 12 17:55:09 2024 -0400 Add better support for shifting vectors with 64-bit elements This patch fixes PR target/89213 to allow better code to be generated to do constant shifts of V2DI/V2DF vectors. [gcc] 2024-08-12 Michael Meissner PR target/89213 * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec. (VSHIFT_MODE): New mode iterator. (vshift_code): New code iterator. (vshift_attr): New code attribute. (altivec___const): New pattern to optimize vector long long/int shifts by a constant. (altivec__shift_const): New helper insn to load up a constant used by the shift operation. * config/rs6000/predicates.md (vector_shift_constant): New predicate. [gcc/testsuite] 2024-08-12 Michael Meissner PR target/89213 * gcc.target/powerpc/pr89213.c: New test. Diff: --- gcc/config/rs6000/altivec.md | 51 ++ gcc/config/rs6000/predicates.md| 63 + gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 + 3 files changed, 221 insertions(+) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index aa9d8fffc901..8a7926eb369a 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,6 +170,7 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB + UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2176,6 +2177,56 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) +;; Optimize V2DI shifts by constants. This relies on the shift instructions +;; only looking at the bits needed to do the shift. This means we can use +;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits +;; that the vector shift instructions will not use. 
+(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") +(V2DI "TARGET_P8_VECTOR")]) + +(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) +(define_code_attr vshift_attr [(ashift "ashift") +(ashiftrt "ashiftrt") +(lshiftrt "lshiftrt")]) + +(define_insn_and_split "*altivec___const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (vshift_code:VSHIFT_MODE +(match_operand:VSHIFT_MODE 1 "register_operand" "v") +(match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) + (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] + "((mode == V2DImode && TARGET_P8_VECTOR) +|| (mode == V4SImode && TARGET_P9_VECTOR))" + "#" + "&& 1" + [(set (match_dup 3) + (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) + (set (match_dup 0) + (vshift_code:VSHIFT_MODE (match_dup 1) +(match_dup 3)))] +{ + if (GET_CODE (operands[3]) == SCRATCH) +operands[3] = gen_reg_rtx (mode); + + operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR) +? CONST_VECTOR_ELT (operands[2], 0) +: XEXP (operands[2], 0)); +}) + +(define_insn "*altivec__shift_const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] + UNSPEC_VECTOR_SHIFT))] + "TARGET_P8_VECTOR" +{ + if (UINTVAL (operands[1]) <= 15) +return "vspltisw %0,%1"; + else if (TARGET_P9_VECTOR) +return "xxspltib %x0,%1"; + else +gcc_unreachable (); +}) + (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index d23ce9a77a3f..4fca37bc9345 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,6 +861,69 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) +;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element +;; is the same constant, and the constant can be used for a shift operation. 
+;; This is to prevent sub-optimal code that needs to load up the constant and +;; then zero extend it to 32 or 64-bit vectors or load up the constant from the +;; literal pool. +;; +;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by +;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction. +;; For V2DImode, we do this all of the time, since there is no convenient +;; instruction to load up a vector long long splatted constant. +;; +;; If we can use XXSPLTIB, then allow constants up to 63. If not, we restrict +;; the constant to 0..15 that can be loaded with VSPLTISW. V4SI shi
[gcc(refs/users/meissner/heads/work175-bugs)] Update ChangeLog.*
https://gcc.gnu.org/g:eef83c37e0c3acaa44cf3cfeaa96df7876500c66 commit eef83c37e0c3acaa44cf3cfeaa96df7876500c66 Author: Michael Meissner Date: Mon Aug 12 17:56:17 2024 -0400 Update ChangeLog.* Diff: --- gcc/ChangeLog.bugs | 68 -- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/gcc/ChangeLog.bugs b/gcc/ChangeLog.bugs index 27759bef3284..2d6970807262 100644 --- a/gcc/ChangeLog.bugs +++ b/gcc/ChangeLog.bugs @@ -1,58 +1,30 @@ Branch work175-bugs, patch #310 -Optimize vec_splats of vec_extract for V2DI/V2DF (PR target/99293) +Add better support for shifting vectors with 64-bit elements -This patch optimizes cases like: - - vector double v1, v2; - /* ... */ - v2 = vec_splats (vec_extract (v1, 0); /* or */ - v2 = vec_splats (vec_extract (v1, 1); - -Previously: - - vector long long - splat_dup_l_0 (vector long long v) - { - return __builtin_vec_splats (__builtin_vec_extract (v, 0)); - } - -would generate: - -mfvsrld 9,34 -mtvsrdd 34,9,9 -blr - -With this patch, GCC generates: - -xxpermdi 34,34,34,3 - blr - - -I have tested this patch on the following systems and there was no degration. -Can I check it into the trunk branch? - -* Power10, LE, --with-cpu=power10, IBM 128-bit long double -* Power9, LE, --with-cpu=power9, IBM 128-bit long double -* Power9, LE, --with-cpu=power9, IEEE 128-bit long double -* Power9, LE, --with-cpu=power9, 64-bit default long double -* Power9, BE, --with-cpu=power9, IBM 128-bit long double -* Power8, BE, --with-cpu=power8, IBM 128-bit long double +This patch fixes PR target/89213 to allow better code to be generated to do +constant shifts of V2DI/V2DF vectors. +[gcc] 2024-08-12 Michael Meissner -gcc/ - - PR target/99293 - * gcc/config/rs6000/vsx.md (vsx_splat_extract_): New combiner - insn. - -gcc/testsuite/ - - PR target/108958 - * gcc.target/powerpc/pr99293.c: New test. - * gcc.target/powerpc/builtins-1.c: Update insn count. + PR target/89213 + * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec. 
+ (VSHIFT_MODE): New mode iterator. + (vshift_code): New code iterator. + (vshift_attr): New code attribute. + (altivec___const): New pattern to optimize + vector long long/int shifts by a constant. + (altivec__shift_const): New helper insn to load up a + constant used by the shift operation. + * config/rs6000/predicates.md (vector_shift_constant): New + predicate. + +[gcc/testsuite] +2024-08-12 Michael Meissner + PR target/89213 + * gcc.target/powerpc/pr89213.c: New test. Branch work175-bugs, patch #303
[gcc(refs/users/meissner/heads/work175-bugs)] Revert changes
https://gcc.gnu.org/g:d012fc1ec5c1ac230c920b242566999d1d343912 commit d012fc1ec5c1ac230c920b242566999d1d343912 Author: Michael Meissner Date: Tue Aug 13 01:08:10 2024 -0400 Revert changes Diff: --- gcc/config/rs6000/altivec.md | 51 -- gcc/config/rs6000/predicates.md| 63 - gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 - 3 files changed, 221 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 8a7926eb369a..aa9d8fffc901 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,7 +170,6 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB - UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2177,56 +2176,6 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) -;; Optimize V2DI shifts by constants. This relies on the shift instructions -;; only looking at the bits needed to do the shift. This means we can use -;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits -;; that the vector shift instructions will not use. -(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") -(V2DI "TARGET_P8_VECTOR")]) - -(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) -(define_code_attr vshift_attr [(ashift "ashift") -(ashiftrt "ashiftrt") -(lshiftrt "lshiftrt")]) - -(define_insn_and_split "*altivec___const" - [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") - (vshift_code:VSHIFT_MODE -(match_operand:VSHIFT_MODE 1 "register_operand" "v") -(match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) - (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] - "((mode == V2DImode && TARGET_P8_VECTOR) -|| (mode == V4SImode && TARGET_P9_VECTOR))" - "#" - "&& 1" - [(set (match_dup 3) - (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) - (set (match_dup 0) - (vshift_code:VSHIFT_MODE (match_dup 1) -(match_dup 3)))] -{ - if (GET_CODE (operands[3]) == SCRATCH) -operands[3] = gen_reg_rtx (mode); - - operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR) -? 
CONST_VECTOR_ELT (operands[2], 0) -: XEXP (operands[2], 0)); -}) - -(define_insn "*altivec__shift_const" - [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") - (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] - UNSPEC_VECTOR_SHIFT))] - "TARGET_P8_VECTOR" -{ - if (UINTVAL (operands[1]) <= 15) -return "vspltisw %0,%1"; - else if (TARGET_P9_VECTOR) -return "xxspltib %x0,%1"; - else -gcc_unreachable (); -}) - (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 4fca37bc9345..d23ce9a77a3f 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,69 +861,6 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) -;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element -;; is the same constant, and the constant can be used for a shift operation. -;; This is to prevent sub-optimal code, that needs to load up the constant and -;; then zero extend it 32 or 64-bit vectors or load up the constant from the -;; literal pool. -;; -;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by -;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction. -;; For V2DImode, we do this all of the time, since there is no convenient -;; instruction to load up a vector long long splatted constant. -;; -;; If we can use XXSPLTIB, then allow constants up to 63. If not, we restrict -;; the constant to 0..15 that can be loaded with VSPLTISW. V4SI shifts are -;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31. Values -;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't -;; need this optimization. 
-(define_predicate "vector_shift_constant" - (match_code "const_vector,vec_duplicate") -{ - unsigned HOST_WIDE_INT min_value; - - if (mode == V2DImode) -{ - min_value = 0; - if (!TARGET_P8_VECTOR) - return 0; -} - else if (mode == V4SImode) -{ - min_value = 16; - if (!TARGET_P9_VECTOR) - return 0; -} - else -return 0; - - unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15; - - if (GET_CODE (op) == CONST_VECTOR) -{ - unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0)); - unsigned nunits = GET_MODE_NUNITS (mode); - unsigned i; - - if (!IN_RANGE (first, min_value, max_value)) - return 0; - - for (i = 1; i < nunits;
[gcc(refs/users/meissner/heads/work175-bugs)] Add better support for shifting vectors with 64-bit elements
https://gcc.gnu.org/g:cfcc6bdc6eb259eb4b4b4c549f59d8b004a88293 commit cfcc6bdc6eb259eb4b4b4c549f59d8b004a88293 Author: Michael Meissner Date: Tue Aug 13 01:39:47 2024 -0400 Add better support for shifting vectors with 64-bit elements This patch fixes PR target/89213 to allow better code to be generated to do constant shifts of V2DI/V2DF vectors. Previously GCC would do constant shifts of vectors with 64-bit elements by using: XXSPLTIB 32,4 VEXTSB2D 0,0 VSRAD 2,2,0 I.e., the PowerPC does not have a VSPLTISD instruction to load -15..14 for the 64-bit shift count in one instruction. Instead, it would need to load a byte and then convert it to 64-bit. With this patch, GCC now realizes that the vector shift instructions will look at the bottom 6 bits for the shift count, and it can use either a VSPLTISW or XXSPLTIB instruction to load the shift count. [gcc] 2024-08-12 Michael Meissner PR target/89213 * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec. (VSHIFT_MODE): New mode iterator. (vshift_code): New code iterator. (vshift_attr): New code attribute. (altivec___const): New pattern to optimize vector long long/int shifts by a constant. (altivec__shift_const): New helper insn to load up a constant used by the shift operation. * config/rs6000/predicates.md (vector_shift_constant): New predicate. [gcc/testsuite] 2024-08-12 Michael Meissner PR target/89213 * gcc.target/powerpc/pr89213.c: New test. * gcc.target/powerpc/vec-rlmi-rlnm.c: Update instruction count. 
Diff: --- gcc/config/rs6000/altivec.md | 51 +++ gcc/config/rs6000/predicates.md | 63 ++ gcc/testsuite/gcc.target/powerpc/pr89213.c | 106 +++ gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c | 4 +- 4 files changed, 222 insertions(+), 2 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index aa9d8fffc901..8a7926eb369a 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,6 +170,7 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB + UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2176,6 +2177,56 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) +;; Optimize V2DI shifts by constants. This relies on the shift instructions +;; only looking at the bits needed to do the shift. This means we can use +;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits +;; that the vector shift instructions will not use. +(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") +(V2DI "TARGET_P8_VECTOR")]) + +(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) +(define_code_attr vshift_attr [(ashift "ashift") +(ashiftrt "ashiftrt") +(lshiftrt "lshiftrt")]) + +(define_insn_and_split "*altivec___const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (vshift_code:VSHIFT_MODE +(match_operand:VSHIFT_MODE 1 "register_operand" "v") +(match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) + (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] + "((mode == V2DImode && TARGET_P8_VECTOR) +|| (mode == V4SImode && TARGET_P9_VECTOR))" + "#" + "&& 1" + [(set (match_dup 3) + (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) + (set (match_dup 0) + (vshift_code:VSHIFT_MODE (match_dup 1) +(match_dup 3)))] +{ + if (GET_CODE (operands[3]) == SCRATCH) +operands[3] = gen_reg_rtx (mode); + + operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR) +? 
CONST_VECTOR_ELT (operands[2], 0) +: XEXP (operands[2], 0)); +}) + +(define_insn "*altivec__shift_const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] + UNSPEC_VECTOR_SHIFT))] + "TARGET_P8_VECTOR" +{ + if (UINTVAL (operands[1]) <= 15) +return "vspltisw %0,%1"; + else if (TARGET_P9_VECTOR) +return "xxspltib %x0,%1"; + else +gcc_unreachable (); +}) + (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index d23ce9a77a3f..4fca37bc9345 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,6 +861,69 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) +;; Return 1 if the operand is a V2
[gcc(refs/users/meissner/heads/work175-bugs)] Update ChangeLog.*
https://gcc.gnu.org/g:fb4520c60b9606aa98a005655e8c091e9031bbba commit fb4520c60b9606aa98a005655e8c091e9031bbba Author: Michael Meissner Date: Tue Aug 13 01:41:12 2024 -0400 Update ChangeLog.* Diff: --- gcc/ChangeLog.bugs | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog.bugs b/gcc/ChangeLog.bugs index 2d6970807262..64bb8399e95c 100644 --- a/gcc/ChangeLog.bugs +++ b/gcc/ChangeLog.bugs @@ -1,9 +1,22 @@ - Branch work175-bugs, patch #310 + Branch work175-bugs, patch #311 Add better support for shifting vectors with 64-bit elements This patch fixes PR target/89213 to allow better code to be generated to do -constant shifts of V2DI/V2DF vectors. +constant shifts of V2DI/V2DF vectors. Previously GCC would do constant shifts +of vectors with 64-bit elements by using: + + XXSPLTIB 32,4 + VEXTSB2D 0,0 + VSRAD 2,2,0 + +I.e., the PowerPC does not have a VSPLTISD instruction to load -15..14 for the +64-bit shift count in one instruction. Instead, it would need to load a byte +and then convert it to 64-bit. + +With this patch, GCC now realizes that the vector shift instructions will look +at the bottom 6 bits for the shift count, and it can use either a VSPLTISW or +XXSPLTIB instruction to load the shift count. [gcc] 2024-08-12 Michael Meissner @@ -25,6 +38,9 @@ constant shifts of V2DI/V2DF vectors. PR target/89213 * gcc.target/powerpc/pr89213.c: New test. + * gcc.target/powerpc/vec-rlmi-rlnm.c: Update instruction count. + + Branch work175-bugs, patch #310 was reverted Branch work175-bugs, patch #303