[gcc/aoliva/heads/testme] (2 commits) Dump aliases in -fcallgraph-info
The branch 'aoliva/heads/testme' was updated to point to: 17d9d479afd... Dump aliases in -fcallgraph-info It previously pointed to: 8152f1f5491... optimize initialization of small padded objects Diff: !!! WARNING: THE FOLLOWING COMMITS ARE NO LONGER ACCESSIBLE (LOST): --- 8152f1f... optimize initialization of small padded objects Summary of changes (added commits): --- 17d9d47... Dump aliases in -fcallgraph-info ebf9b1b... Optimize initialization of small padded objects
[gcc(refs/users/aoliva/heads/testme)] Optimize initialization of small padded objects
https://gcc.gnu.org/g:ebf9b1becc8cf76421f1741ac8084d139abd49db commit ebf9b1becc8cf76421f1741ac8084d139abd49db Author: Alexandre Oliva Date: Wed Aug 14 21:59:28 2024 -0300 Optimize initialization of small padded objects When small objects containing padding bits (or bytes) are fully initialized, we will often store them in registers, and setting bitfields and other small fields will attempt to preserve the uninitialized padding bits, which tends to be expensive. Zero-initializing registers, OTOH, tends to be cheap. So, if we're optimizing, zero-initialize such small padded objects even if that's not needed for correctness. We can't zero-initialize all such padding objects, though: if there's no padding whatsoever, and all fields are initialized with nonzero, the zero initialization would be flagged as dead. That's why we introduce machinery to detect whether objects have padding bits. I considered distinguishing between bitfields, units and larger padding elements, but I didn't pursue that distinction. Since the object's zero-initialization subsumes fields' zero-initialization, the empty string test in builtin-snprintf-6.c's test_assign_aggregate would regress without the addition of native_encode_constructor. for gcc/ChangeLog * expr.cc (categorize_ctor_elements_1): Change p_complete to int, to distinguish complete initialization in presence or absence of uninitialized padding bits. (categorize_ctor_elements): Likewise. Adjust all callers... * expr.h (categorize_ctor_elements): ... and declaration. (type_has_padding_at_level_p): New. * gimple-fold.cc (type_has_padding_at_level_p): New. * fold-const.cc (native_encode_constructor): New. (native_encode_expr): Call it. * gimplify.cc (gimplify_init_constructor): Clear small non-addressable non-volatile objects with padding or other uninitialized fields as an optimization. 
Diff: --- gcc/expr.cc| 20 +--- gcc/expr.h | 3 ++- gcc/fold-const.cc | 33 + gcc/gimple-fold.cc | 50 ++ gcc/gimplify.cc| 14 +- 5 files changed, 111 insertions(+), 9 deletions(-) diff --git a/gcc/expr.cc b/gcc/expr.cc index 2089c2b86a9..a701c67b348 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -7096,7 +7096,7 @@ count_type_elements (const_tree type, bool for_ctor_p) static bool categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, HOST_WIDE_INT *p_unique_nz_elts, - HOST_WIDE_INT *p_init_elts, bool *p_complete) + HOST_WIDE_INT *p_init_elts, int *p_complete) { unsigned HOST_WIDE_INT idx; HOST_WIDE_INT nz_elts, unique_nz_elts, init_elts, num_fields; @@ -7218,7 +7218,10 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, if (*p_complete && !complete_ctor_at_level_p (TREE_TYPE (ctor), num_fields, elt_type)) -*p_complete = false; +*p_complete = 0; + else if (*p_complete > 0 + && type_has_padding_at_level_p (TREE_TYPE (ctor))) +*p_complete = -1; *p_nz_elts += nz_elts; *p_unique_nz_elts += unique_nz_elts; @@ -7239,7 +7242,10 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, and place it in *P_ELT_COUNT. * whether the constructor is complete -- in the sense that every meaningful byte is explicitly given a value -- - and place it in *P_COMPLETE. + and place it in *P_COMPLETE: + - 0 if any field is missing + - 1 if all fields are initialized, and there's no padding + - -1 if all fields are initialized, but there's padding Return whether or not CTOR is a valid static constant initializer, the same as "initializer_constant_valid_p (CTOR, TREE_TYPE (CTOR)) != 0". 
*/ @@ -7247,12 +7253,12 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, bool categorize_ctor_elements (const_tree ctor, HOST_WIDE_INT *p_nz_elts, HOST_WIDE_INT *p_unique_nz_elts, - HOST_WIDE_INT *p_init_elts, bool *p_complete) + HOST_WIDE_INT *p_init_elts, int *p_complete) { *p_nz_elts = 0; *p_unique_nz_elts = 0; *p_init_elts = 0; - *p_complete = true; + *p_complete = 1; return categorize_ctor_elements_1 (ctor, p_nz_elts, p_unique_nz_elts, p_init_elts, p_complete); @@ -7313,7 +7319,7 @@ mostly_zeros_p (const_tree exp) if (TREE_CODE (exp) == CONSTRUCTOR) { HOST_WIDE_INT nz_elts, unz_elts, init_elts; - bool complete_p; + int complete_p; categorize_ctor_elements (exp, &nz_elts, &unz_elts, &init_
[gcc(refs/users/aoliva/heads/testme)] Dump aliases in -fcallgraph-info
https://gcc.gnu.org/g:17d9d479afd4de2939c2d507691394ff32983296 commit 17d9d479afd4de2939c2d507691394ff32983296 Author: Alexandre Oliva Date: Thu Aug 15 02:00:18 2024 -0300 Dump aliases in -fcallgraph-info Dump ICF-unified decls, thunks, aliases and whatnot along with their ultimate targets, with edges from the alias to the target. for gcc/ChangeLog * toplev.cc (dump_final_alias_vcg): New. (dump_final_node_vcg): Dump aliases along with node. Diff: --- gcc/toplev.cc | 37 + 1 file changed, 37 insertions(+) diff --git a/gcc/toplev.cc b/gcc/toplev.cc index eee4805b504..f308fb15108 100644 --- a/gcc/toplev.cc +++ b/gcc/toplev.cc @@ -914,6 +914,37 @@ dump_final_callee_vcg (FILE *f, location_t location, tree callee) fputs ("\" }\n", f); } +/* Callback for cgraph_node::call_for_symbol_thunks_and_aliases to dump to F_ a + node and an edge from ALIAS->DECL to CURRENT_FUNCTION_DECL. */ + +static bool +dump_final_alias_vcg (cgraph_node *alias, void *f_) +{ + FILE *f = (FILE *)f_; + + if (alias->decl == current_function_decl) +return false; + + dump_final_node_vcg_start (f, alias->decl); + fputs ("\" shape : triangle }\n", f); + + fputs ("edge: { sourcename: \"", f); + print_decl_identifier (f, alias->decl, PRINT_DECL_UNIQUE_NAME); + fputs ("\" targetname: \"", f); + print_decl_identifier (f, current_function_decl, PRINT_DECL_UNIQUE_NAME); + location_t location = DECL_SOURCE_LOCATION (alias->decl); + if (LOCATION_LOCUS (location) != UNKNOWN_LOCATION) +{ + expanded_location loc; + fputs ("\" label: \"", f); + loc = expand_location (location); + fprintf (f, "%s:%d:%d", loc.file, loc.line, loc.column); +} + fputs ("\" }\n", f); + + return false; +} + /* Dump final cgraph node in VCG format. 
*/ static void @@ -950,6 +981,12 @@ dump_final_node_vcg (FILE *f) dump_final_callee_vcg (f, c->location, c->decl); vec_free (cfun->su->callees); cfun->su->callees = NULL; + + cgraph_node *node = cgraph_node::get (current_function_decl); + if (!node) +return; + node->call_for_symbol_thunks_and_aliases (dump_final_alias_vcg, f, + true, false); } /* Output stack usage and callgraph info, as requested. */
[gcc r15-2929] aarch64: Rename svpext to svpext_lane [PR116371]
https://gcc.gnu.org/g:cc2d29e5f4434a3fd4e0dd93ea4f9857a0309201 commit r15-2929-gcc2d29e5f4434a3fd4e0dd93ea4f9857a0309201 Author: Richard Sandiford Date: Thu Aug 15 10:10:12 2024 +0100 aarch64: Rename svpext to svpext_lane [PR116371] When implementing the SME2 ACLE, I somehow missed off the _lane suffix on svpext. gcc/ PR target/116371 * config/aarch64/aarch64-sve-builtins-sve2.h (svpext): Rename to... (svpext_lane): ...this. * config/aarch64/aarch64-sve-builtins-sve2.cc (svpext_impl): Rename to... (svpext_lane_impl): ...this and update instantiation accordingly. * config/aarch64/aarch64-sve-builtins-sve2.def (svpext): Rename to... (svpext_lane): ...this. gcc/testsuite/ PR target/116371 * gcc.target/aarch64/sme2/acle-asm/pext_c16.c, gcc.target/aarch64/sme2/acle-asm/pext_c16_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_c32.c, gcc.target/aarch64/sme2/acle-asm/pext_c32_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_c64.c, gcc.target/aarch64/sme2/acle-asm/pext_c64_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_c8.c, gcc.target/aarch64/sme2/acle-asm/pext_c8_x2.c: Replace with... * gcc.target/aarch64/sme2/acle-asm/pext_lane_c16.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c16_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c32.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c32_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c64.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c64_x2.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c8.c, gcc.target/aarch64/sme2/acle-asm/pext_lane_c8_x2.c: ...these new tests, testing for svpext_lane instead of svpext. 
Diff: --- gcc/config/aarch64/aarch64-sve-builtins-sve2.cc| 4 +- gcc/config/aarch64/aarch64-sve-builtins-sve2.def | 2 +- gcc/config/aarch64/aarch64-sve-builtins-sve2.h | 2 +- .../gcc.target/aarch64/sme2/acle-asm/pext_c16.c| 50 .../gcc.target/aarch64/sme2/acle-asm/pext_c16_x2.c | 54 -- .../gcc.target/aarch64/sme2/acle-asm/pext_c32.c| 50 .../gcc.target/aarch64/sme2/acle-asm/pext_c32_x2.c | 54 -- .../gcc.target/aarch64/sme2/acle-asm/pext_c64.c| 50 .../gcc.target/aarch64/sme2/acle-asm/pext_c64_x2.c | 54 -- .../gcc.target/aarch64/sme2/acle-asm/pext_c8.c | 50 .../gcc.target/aarch64/sme2/acle-asm/pext_c8_x2.c | 54 -- .../aarch64/sme2/acle-asm/pext_lane_c16.c | 50 .../aarch64/sme2/acle-asm/pext_lane_c16_x2.c | 54 ++ .../aarch64/sme2/acle-asm/pext_lane_c32.c | 50 .../aarch64/sme2/acle-asm/pext_lane_c32_x2.c | 54 ++ .../aarch64/sme2/acle-asm/pext_lane_c64.c | 50 .../aarch64/sme2/acle-asm/pext_lane_c64_x2.c | 54 ++ .../aarch64/sme2/acle-asm/pext_lane_c8.c | 50 .../aarch64/sme2/acle-asm/pext_lane_c8_x2.c| 54 ++ 19 files changed, 420 insertions(+), 420 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc index dc591551682..146a5459930 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc @@ -221,7 +221,7 @@ public: } }; -class svpext_impl : public function_base +class svpext_lane_impl : public function_base { public: rtx @@ -619,7 +619,7 @@ FUNCTION (svmullt_lane, unspec_based_lane_function, (UNSPEC_SMULLT, UNSPEC_UMULLT, -1)) FUNCTION (svnbsl, CODE_FOR_MODE0 (aarch64_sve2_nbsl),) FUNCTION (svnmatch, svmatch_svnmatch_impl, (UNSPEC_NMATCH)) -FUNCTION (svpext, svpext_impl,) +FUNCTION (svpext_lane, svpext_lane_impl,) FUNCTION (svpmul, CODE_FOR_MODE0 (aarch64_sve2_pmul),) FUNCTION (svpmullb, unspec_based_function, (-1, UNSPEC_PMULLB, -1)) FUNCTION (svpmullb_pair, unspec_based_function, (-1, UNSPEC_PMULLB_PAIR, -1)) diff --git 
a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def index 4366925a971..4543402f836 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def @@ -263,7 +263,7 @@ DEF_SVE_FUNCTION_GS (svmax, binary_opt_single_n, all_arith, x24, none) DEF_SVE_FUNCTION_GS (svmaxnm, binary_opt_single_n, all_float, x24, none) DEF_SVE_FUNCTION_GS (svmin, binary_opt_single_n, all_arith, x24, none) DEF_SVE_FUNCTION_GS (svminnm, binary_opt_single_n, all_float, x24, none) -DEF_SVE_FUNCTION_GS (svpext, extract_
[gcc r15-2930] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.
https://gcc.gnu.org/g:f7e672da8fc3d416a6d07eb01f3be4400ef94fac commit r15-2930-gf7e672da8fc3d416a6d07eb01f3be4400ef94fac Author: liuhongt Date: Mon Aug 12 18:24:34 2024 +0800 Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload. It results in 2 failures for x86_64-pc-linux-gnu{\ -march=cascadelake}; gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1 For pr113560.c, now GCC generates mulx instead of mulq with -march=cascadelake, which should be optimal, so adjust testcase for that. For gcc.target/i386/extendditi2-1.c, RA happens to choose another register instead of rax and results in: movq %rdi, %rbp; movq %rdi, %rax; sarq $63, %rbp; movq %rbp, %rdx. The patch adds a new define_peephole2 for that. gcc/ChangeLog: PR target/116274 * config/i386/i386-expand.cc (ix86_expand_vector_move): Restrict special case TImode to 128-bit vector conversions via V2DI under ix86_pre_reload_split (). * config/i386/i386.cc (inline_secondary_memory_needed): Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload. * config/i386/i386.md (*extendsidi2_rex64): Add a define_peephole2 after it. gcc/testsuite/ChangeLog: * gcc.target/i386/pr116274.c: New test. * gcc.target/i386/pr113560.c: Scan either mulq or mulx. 
Diff: --- gcc/config/i386/i386-expand.cc | 2 +- gcc/config/i386/i386.cc | 18 -- gcc/config/i386/i386.md | 19 +++ gcc/testsuite/gcc.target/i386/pr113560.c | 2 +- gcc/testsuite/gcc.target/i386/pr116274.c | 12 5 files changed, 45 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index bdbc1423267..ed546eeed6b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) && SUBREG_P (op1) && GET_MODE (SUBREG_REG (op1)) == TImode && TARGET_64BIT && TARGET_SSE - && can_create_pseudo_p ()) + && ix86_pre_reload_split ()) { rtx tmp = gen_reg_rtx (V2DImode); rtx lo = gen_reg_rtx (DImode); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 0721e38ab2a..d06e2141e56 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2))) return true; + /* If the target says that inter-unit moves are more expensive +than moving through memory, then don't generate them. */ + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) + return true; + + /* With SSE4.1, *mov{ti,di}_internal supports moves between +SSE_REGS and GENERAL_REGS using pinsr{q,d} or pextr{q,d}. */ + if (TARGET_SSE4_1 + && (TARGET_64BIT ? mode == TImode : mode == DImode)) + return false; + int msize = GET_MODE_SIZE (mode); /* Between SSE and general, we have moves no larger than word size. */ @@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, if (msize < minsize) return true; - - /* If the target says that inter-unit moves are more expensive -than moving through memory, then don't generate them. 
*/ - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) - return true; } return false; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index d3ba2425f16..efbab2f25ec 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5041,6 +5041,25 @@ DONE; }) +(define_peephole2 + [(set (match_operand:DI 0 "general_reg_operand") + (match_operand:DI 1 "general_reg_operand")) + (parallel [(set (match_dup 0) + (ashiftrt:DI (match_dup 0) + (const_int 63))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_operand:DI 2 "general_reg_operand") (match_dup 1)) + (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))] + "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD) + && REGNO (operands[2]) == AX_REG + && REGNO (operands[3]) == DX_REG + && peep2_reg_dead_p (4, operands[0]) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[2], operands[0])" + [(set (match_dup 2) (match_dup 1)) + (parallel [(set (match_dup 3) (
[gcc r15-2931] LoongArch: Implement scalar isinf, isnormal, and isfinite via fclass
https://gcc.gnu.org/g:ee4a6343225b6e44b3d2b2c90c355c21f7ec6855 commit r15-2931-gee4a6343225b6e44b3d2b2c90c355c21f7ec6855 Author: Xi Ruoyao Date: Thu Jul 4 02:49:28 2024 +0800 LoongArch: Implement scalar isinf, isnormal, and isfinite via fclass Doing so can avoid loading FP constants from the memory. It also partially fixes PR 66262 as fclass does not signal on sNaN. gcc/ChangeLog: * config/loongarch/loongarch.md (extendsidi2): Add ("=r", "f") alternative and use movfr2gr.s for it. The spec clearly states movfr2gr.s sign extends the value to GRLEN. (fclass_): Make the result SImode instead of a floating mode. The fclass results are really not FP values. (FCLASS_MASK): New define_int_iterator. (fclass_optab): New define_int_attr. (): New define_expand template. gcc/testsuite/ChangeLog: * gcc.target/loongarch/fclass-compile.c: New test. * gcc.target/loongarch/fclass-run.c: New test. Diff: --- gcc/config/loongarch/loongarch.md | 53 +++--- .../gcc.target/loongarch/fclass-compile.c | 20 gcc/testsuite/gcc.target/loongarch/fclass-run.c| 53 ++ 3 files changed, 119 insertions(+), 7 deletions(-) diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index 73cdb38a406..f70ca85bfb3 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -1851,16 +1851,17 @@ ;; (define_insn "extendsidi2" - [(set (match_operand:DI 0 "register_operand" "=r,r,r,r") + [(set (match_operand:DI 0 "register_operand" "=r,r,r,r,r") (sign_extend:DI - (match_operand:SI 1 "nonimmediate_operand" "r,ZC,m,k")))] + (match_operand:SI 1 "nonimmediate_operand" "r,ZC,m,k,f")))] "TARGET_64BIT" "@ slli.w\t%0,%1,0 ldptr.w\t%0,%1 ld.w\t%0,%1 - ldx.w\t%0,%1" - [(set_attr "move_type" "sll0,load,load,load") + ldx.w\t%0,%1 + movfr2gr.s\t%0,%1" + [(set_attr "move_type" "sll0,load,load,load,mftg") (set_attr "mode" "DI")]) (define_insn "extend2" @@ -4110,14 +4111,52 @@ "movgr2fcsr\t$r%0,%1") (define_insn "fclass_" - [(set (match_operand:ANYF 0 "register_operand" 
"=f") - (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")] - UNSPEC_FCLASS))] + [(set (match_operand:SI 0 "register_operand" "=f") + (unspec:SI [(match_operand:ANYF 1 "register_operand" "f")] + UNSPEC_FCLASS))] "TARGET_HARD_FLOAT" "fclass.\t%0,%1" [(set_attr "type" "unknown") (set_attr "mode" "")]) +(define_int_iterator FCLASS_MASK [68 136 952]) +(define_int_attr fclass_optab + [(68 "isinf") + (136"isnormal") + (952"isfinite")]) + +(define_expand "2" + [(match_operand:SI 0 "register_operand" "=r") + (match_operand:ANYF 1 "register_operand" " f") + (const_int FCLASS_MASK)] + "TARGET_HARD_FLOAT" + { +rtx ft0 = gen_reg_rtx (SImode); +rtx t0 = gen_reg_rtx (word_mode); +rtx mask = GEN_INT (); + +emit_insn (gen_fclass_ (ft0, operands[1])); + +if (TARGET_64BIT) + emit_insn (gen_extend_insn (t0, ft0, DImode, SImode, 0)); +else + emit_move_insn (t0, ft0); + +emit_move_insn (t0, gen_rtx_AND (word_mode, t0, mask)); +emit_move_insn (t0, gen_rtx_NE (word_mode, t0, const0_rtx)); + +if (TARGET_64BIT) + { + t0 = lowpart_subreg (SImode, t0, DImode); + SUBREG_PROMOTED_VAR_P (t0) = 1; + SUBREG_PROMOTED_SET (t0, SRP_SIGNED); + } + +emit_move_insn (operands[0], t0); + +DONE; + }) + (define_insn "bytepick_w_" [(set (match_operand:SI 0 "register_operand" "=r") (ior:SI (lshiftrt:SI (match_operand:SI 1 "register_operand" "r") diff --git a/gcc/testsuite/gcc.target/loongarch/fclass-compile.c b/gcc/testsuite/gcc.target/loongarch/fclass-compile.c new file mode 100644 index 000..9c24d6e263c --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/fclass-compile.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=loongarch64 -mfpu=64 -mabi=lp64d" } */ +/* { dg-final { scan-assembler-times "fclass\\.s" 1 } } */ +/* { dg-final { scan-assembler-times "fclass\\.d" 1 } } */ + +__attribute__ ((noipa)) int +test_fclass_f (float f) +{ + return __builtin_isinf (f) +| __builtin_isnormal (f) << 1 +| __builtin_isfinite (f) << 2; +} + +__attribute__ ((noipa)) int +test_fclass_d 
(double d) +{ + return __builtin_isinf (d) +| __builtin_isnormal (d) << 1 +| __builtin_isfinite (d) << 2; +} diff --git a/gcc/testsuite/gcc.target/loongarch/fclass-run.c b/gcc/testsuite/gcc.target/loongarch/fclass-run.c new file mode 100644 index 000..e5585f9d557 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/fclass-run.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fsignal
[gcc] Created branch 'mikael/heads/inline_minmaxloc_without_dim_v11' in namespace 'refs/users'
The branch 'mikael/heads/inline_minmaxloc_without_dim_v11' was created in namespace 'refs/users' pointing to: 5a49705a0b9... fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Inline MINLOC/MAXLOC with no DIM and ARRAY of rank 1 [PR90608]
https://gcc.gnu.org/g:19e0b53cc6bab8653ac8313b56c5d41830a514d6 commit 19e0b53cc6bab8653ac8313b56c5d41830a514d6 Author: Mikael Morin Date: Wed Jul 31 10:09:53 2024 +0200 fortran: Inline MINLOC/MAXLOC with no DIM and ARRAY of rank 1 [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable inline code generation for the MINLOC and MAXLOC intrinsics, if the DIM argument is not present and ARRAY has rank 1. This case is similar to the case where the result is scalar (DIM present and rank 1 ARRAY), which already supports inline expansion of the intrinsic. Both cases return the same value, with the difference that the result is an array of size 1 if DIM is absent, whereas it's a scalar if DIM is present. So all there is to do for the new case to work is hook the inline expansion with the scalarizer. PR fortran/90608 gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Set the scalarization rank based on the MINLOC/MAXLOC rank if needed. Call the inline code generation and set up the scalarizer array descriptor info in the MINLOC and MAXLOC cases. * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Return the result array element if the scalarizer is set up and we are inside the loops. Restrict library function call dispatch to the case where inline expansion is not supported. Declare an array result if the expression isn't scalar. Initialize the array result single element and return the result variable if the expression isn't scalar. (walk_inline_intrinsic_minmaxloc): New function. (walk_inline_intrinsic_function): Add MINLOC and MAXLOC cases, dispatching to walk_inline_intrinsic_minmaxloc. (gfc_add_intrinsic_ss_code): Add MINLOC and MAXLOC cases. (gfc_inline_intrinsic_function_p): Return true if ARRAY has rank 1, regardless of DIM. 
Diff: --- gcc/fortran/trans-array.cc | 25 + gcc/fortran/trans-intrinsic.cc | 224 - 2 files changed, 181 insertions(+), 68 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index 9fb0b2b398d..46e2152d0f0 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4851,6 +4851,8 @@ gfc_conv_ss_startstride (gfc_loopinfo * loop) case GFC_ISYM_UBOUND: case GFC_ISYM_LCOBOUND: case GFC_ISYM_UCOBOUND: + case GFC_ISYM_MAXLOC: + case GFC_ISYM_MINLOC: case GFC_ISYM_SHAPE: case GFC_ISYM_THIS_IMAGE: loop->dimen = ss->dimen; @@ -4900,6 +4902,29 @@ done: case GFC_SS_INTRINSIC: switch (expr->value.function.isym->id) { + case GFC_ISYM_MINLOC: + case GFC_ISYM_MAXLOC: + { + gfc_se se; + gfc_init_se (&se, nullptr); + se.loop = loop; + se.ss = ss; + gfc_conv_intrinsic_function (&se, expr); + gfc_add_block_to_block (&outer_loop->pre, &se.pre); + gfc_add_block_to_block (&outer_loop->post, &se.post); + + info->descriptor = se.expr; + + info->data = gfc_conv_array_data (info->descriptor); + info->data = gfc_evaluate_now (info->data, &outer_loop->pre); + + info->offset = gfc_index_zero_node; + info->start[0] = gfc_index_zero_node; + info->end[0] = gfc_index_zero_node; + info->stride[0] = gfc_index_one_node; + continue; + } + /* Fall through to supply start and stride. */ case GFC_ISYM_LBOUND: case GFC_ISYM_UBOUND: diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index 2c8512060cc..9fcb57a9cc4 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5273,66 +5273,95 @@ strip_kind_from_actual (gfc_actual_arglist * actual) we need to handle. For performance reasons we sometimes create two loops instead of one, where the second one is much simpler. 
Examples for minloc intrinsic: - 1) Result is an array, a call is generated - 2) Array mask is used and NaNs need to be supported: - limit = Infinity; - pos = 0; - S = from; - while (S <= to) { - if (mask[S]) { - if (pos == 0) pos = S + (1 - from); - if (a[S] <= limit) { limit = a[S]; pos = S + (1 - from); goto lab1; } - } - S++; - } - goto lab2; - lab1:; - while (S <= to) { - if (mask[S]) if (a[S] < limit) { limit = a[S]; pos = S + (1 - from); } - S++; - } - lab2:; - 3) NaNs need to be supported, but it is known at compile time or cheaply - at r
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Remove MINLOC/MAXLOC frontend optimization
https://gcc.gnu.org/g:d0ebb72849e061dfcbe243073b82d19cbcd93454 commit d0ebb72849e061dfcbe243073b82d19cbcd93454 Author: Mikael Morin Date: Fri Aug 9 21:21:01 2024 +0200 fortran: Remove MINLOC/MAXLOC frontend optimization Remove the frontend pass rewriting calls of MINLOC/MAXLOC without DIM to calls with one-valued DIM enclosed in an array constructor. This transformation was circumventing the limitation of inline MINLOC/MAXLOC code generation to scalar cases only, allowing inline code to be generated if ARRAY had rank 1 and DIM was absent. As MINLOC/MAXLOC has gained support of inline code generation in that case, the limitation is no longer effective, and the transformation no longer necessary. gcc/fortran/ChangeLog: * frontend-passes.cc (optimize_minmaxloc): Remove. (optimize_expr): Remove dispatch to optimize_minmaxloc. Diff: --- gcc/fortran/frontend-passes.cc | 57 -- 1 file changed, 57 deletions(-) diff --git a/gcc/fortran/frontend-passes.cc b/gcc/fortran/frontend-passes.cc index 8e4c6310ba8..31d553e9844 100644 --- a/gcc/fortran/frontend-passes.cc +++ b/gcc/fortran/frontend-passes.cc @@ -36,7 +36,6 @@ static bool optimize_op (gfc_expr *); static bool optimize_comparison (gfc_expr *, gfc_intrinsic_op); static bool optimize_trim (gfc_expr *); static bool optimize_lexical_comparison (gfc_expr *); -static void optimize_minmaxloc (gfc_expr **); static bool is_empty_string (gfc_expr *e); static void doloop_warn (gfc_namespace *); static int do_intent (gfc_expr **); @@ -356,17 +355,6 @@ optimize_expr (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, if ((*e)->expr_type == EXPR_OP && optimize_op (*e)) gfc_simplify_expr (*e, 0); - if ((*e)->expr_type == EXPR_FUNCTION && (*e)->value.function.isym) -switch ((*e)->value.function.isym->id) - { - case GFC_ISYM_MINLOC: - case GFC_ISYM_MAXLOC: - optimize_minmaxloc (e); - break; - default: - break; - } - if (function_expr) count_arglist --; @@ -2262,51 +2250,6 @@ optimize_trim (gfc_expr *e) return true; } -/* Optimize 
minloc(b), where b is rank 1 array, into - (/ minloc(b, dim=1) /), and similarly for maxloc, - as the latter forms are expanded inline. */ - -static void -optimize_minmaxloc (gfc_expr **e) -{ - gfc_expr *fn = *e; - gfc_actual_arglist *a; - char *name, *p; - - if (fn->rank != 1 - || fn->value.function.actual == NULL - || fn->value.function.actual->expr == NULL - || fn->value.function.actual->expr->ts.type == BT_CHARACTER - || fn->value.function.actual->expr->rank != 1 - || gfc_inline_intrinsic_function_p (fn)) -return; - - *e = gfc_get_array_expr (fn->ts.type, fn->ts.kind, &fn->where); - (*e)->shape = fn->shape; - fn->rank = 0; - fn->shape = NULL; - gfc_constructor_append_expr (&(*e)->value.constructor, fn, &fn->where); - - name = XALLOCAVEC (char, strlen (fn->value.function.name) + 1); - strcpy (name, fn->value.function.name); - p = strstr (name, "loc0"); - p[3] = '1'; - fn->value.function.name = gfc_get_string ("%s", name); - if (fn->value.function.actual->next) -{ - a = fn->value.function.actual->next; - gcc_assert (a->expr == NULL); -} - else -{ - a = gfc_get_actual_arglist (); - fn->value.function.actual->next = a; -} - a->expr = gfc_get_constant_expr (BT_INTEGER, gfc_default_integer_kind, - &fn->where); - mpz_set_ui (a->expr->value.integer, 1); -} - /* Data package to hand down for DO loop checks in a contained procedure. */ typedef struct contained_info
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Outline array bound check generation code
https://gcc.gnu.org/g:87ac113216c4abd9270236f7653eddc3b7fca75c commit 87ac113216c4abd9270236f7653eddc3b7fca75c Author: Mikael Morin Date: Wed Jul 31 10:10:06 2024 +0200 fortran: Outline array bound check generation code The next patch will need reindenting of the array bound check generation code. This outlines it to its own function beforehand, reducing the churn in the next patch. Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Move array bound check generation code... (add_check_section_in_array_bounds): ... here as a new function. Diff: --- gcc/fortran/trans-array.cc | 297 ++--- 1 file changed, 143 insertions(+), 154 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index 46e2152d0f0..e578b676fcc 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4816,6 +4816,146 @@ gfc_conv_section_startstride (stmtblock_t * block, gfc_ss * ss, int dim) } +/* Generate in INNER the bounds checking code along the dimension DIM for + the array associated with SS_INFO. */ + +static void +add_check_section_in_array_bounds (stmtblock_t *inner, gfc_ss_info *ss_info, + int dim) +{ + gfc_expr *expr = ss_info->expr; + locus *expr_loc = &expr->where; + const char *expr_name = expr->symtree->name; + + gfc_array_info *info = &ss_info->data.array; + + bool check_upper; + if (dim == info->ref->u.ar.dimen - 1 + && info->ref->u.ar.as->type == AS_ASSUMED_SIZE) +check_upper = false; + else +check_upper = true; + + /* Zero stride is not allowed. 
*/ + tree tmp = fold_build2_loc (input_location, EQ_EXPR, logical_type_node, + info->stride[dim], gfc_index_zero_node); + char * msg = xasprintf ("Zero stride is not allowed, for dimension %d " + "of array '%s'", dim + 1, expr_name); + gfc_trans_runtime_check (true, false, tmp, inner, expr_loc, msg); + free (msg); + + tree desc = info->descriptor; + + /* This is the run-time equivalent of resolve.cc's + check_dimension. The logical is more readable there + than it is here, with all the trees. */ + tree lbound = gfc_conv_array_lbound (desc, dim); + tree end = info->end[dim]; + tree ubound = check_upper ? gfc_conv_array_ubound (desc, dim) : NULL_TREE; + + /* non_zerosized is true when the selected range is not + empty. */ + tree stride_pos = fold_build2_loc (input_location, GT_EXPR, logical_type_node, +info->stride[dim], gfc_index_zero_node); + tmp = fold_build2_loc (input_location, LE_EXPR, logical_type_node, +info->start[dim], end); + stride_pos = fold_build2_loc (input_location, TRUTH_AND_EXPR, + logical_type_node, stride_pos, tmp); + + tree stride_neg = fold_build2_loc (input_location, LT_EXPR, logical_type_node, +info->stride[dim], gfc_index_zero_node); + tmp = fold_build2_loc (input_location, GE_EXPR, logical_type_node, +info->start[dim], end); + stride_neg = fold_build2_loc (input_location, TRUTH_AND_EXPR, + logical_type_node, stride_neg, tmp); + tree non_zerosized = fold_build2_loc (input_location, TRUTH_OR_EXPR, + logical_type_node, stride_pos, + stride_neg); + + /* Check the start of the range against the lower and upper + bounds of the array, if the range is not empty. + If upper bound is present, include both bounds in the + error message. 
*/ + if (check_upper) +{ + tmp = fold_build2_loc (input_location, LT_EXPR, logical_type_node, +info->start[dim], lbound); + tmp = fold_build2_loc (input_location, TRUTH_AND_EXPR, logical_type_node, +non_zerosized, tmp); + tree tmp2 = fold_build2_loc (input_location, GT_EXPR, logical_type_node, + info->start[dim], ubound); + tmp2 = fold_build2_loc (input_location, TRUTH_AND_EXPR, logical_type_node, + non_zerosized, tmp2); + msg = xasprintf ("Index '%%ld' of dimension %d of array '%s' outside of " + "expected range (%%ld:%%ld)", dim + 1, expr_name); + gfc_trans_runtime_check (true, false, tmp, inner, expr_loc, msg, + fold_convert (long_integer_type_node, info->start[dim]), + fold_convert (long_integer_type_node, lbound), + fold_convert (long_integer_type_node, ubound)); + gfc_trans_runtime_check (true, false, tmp2, inner, expr_loc, msg, + fold_convert (long_integer_type_node, info->start[dim]), + fold_convert (long_integer_type_node, lbound), +
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Inline integral MINLOC/MAXLOC with no DIM and no MASK [PR90608]
https://gcc.gnu.org/g:757ad6b2ca026b460269dde422020e966cc1ff9e commit 757ad6b2ca026b460269dde422020e966cc1ff9e Author: Mikael Morin Date: Wed Jul 31 10:10:19 2024 +0200 fortran: Inline integral MINLOC/MAXLOC with no DIM and no MASK [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable generation of inline code for the MINLOC and MAXLOC intrinsics, if the ARRAY argument is of integral type and of any rank (only the rank 1 case was previously inlined), and neither DIM nor MASK arguments are present. This needs a few adjustments in gfc_conv_intrinsic_minmaxloc, mainly to replace the single variables POS and OFFSET, with collections of variables, one variable per dimension each. The restriction to integral ARRAY and absent MASK limits the scope of the change to the cases where we generate single loop inline code. The code generation for the second loop is only accessible with ARRAY of rank 1, so it can continue using a single variable. A later change will extend inlining to the double loop cases. There is some bounds checking code that was previously handled by the library, and that needed some changes in the scalarizer to avoid regressing. The bounds check code generation was already supported by the scalarizer, but it was only applying to array reference sections, checking both for array bound violation and for shape conformability between all the involved arrays. With this change, for MINLOC or MAXLOC, enable the conformability check between all the scalarized arrays, and disable the array bound violation check. PR fortran/90608 gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Set the MINLOC/MAXLOC result upper bound using the rank of the ARRAY argument. Adjust the error message for intrinsic result arrays. Only check array bounds for array references. Move bound check decision code... (bounds_check_needed): ... here as a new predicate. Allow bound check for MINLOC/MAXLOC intrinsic results. 
* trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Change the result array upper bound to the rank of ARRAY. Update the NONEMPTY variable to depend on the non-empty extent of every dimension. Use one variable per dimension instead of a single variable for the position and the offset. Update their declaration, initialization, and update to affect the variable of each dimension. Use the first variable only in areas only accessed with rank 1 ARRAY argument. Set every element of the result using its corresponding variable. (gfc_inline_intrinsic_function_p): Return true for integral ARRAY and absent DIM and MASK. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_4.f90: Additionally accept the error message emitted by the scalarizer. Diff: --- gcc/fortran/trans-array.cc| 68 +--- gcc/fortran/trans-intrinsic.cc| 150 +++--- gcc/testsuite/gfortran.dg/maxloc_bounds_4.f90 | 4 +- 3 files changed, 165 insertions(+), 57 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index e578b676fcc..1190bfa6c02 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4956,6 +4956,35 @@ add_check_section_in_array_bounds (stmtblock_t *inner, gfc_ss_info *ss_info, } +/* Tells whether we need to generate bounds checking code for the array + associated with SS. */ + +bool +bounds_check_needed (gfc_ss *ss) +{ + /* Catch allocatable lhs in f2003. */ + if (flag_realloc_lhs && ss->no_bounds_check) +return false; + + gfc_ss_info *ss_info = ss->info; + if (ss_info->type == GFC_SS_SECTION) +return true; + + if (!(ss_info->type == GFC_SS_INTRINSIC + && ss_info->expr + && ss_info->expr->expr_type == EXPR_FUNCTION)) +return false; + + gfc_intrinsic_sym *isym = ss_info->expr->value.function.isym; + if (!(isym + && (isym->id == GFC_ISYM_MAXLOC + || isym->id == GFC_ISYM_MINLOC))) +return false; + + return gfc_inline_intrinsic_function_p (ss_info->expr); +} + + /* Calculates the range start and stride for a SS chain. 
Also gets the descriptor and data pointer. The range of vector subscripts is the size of the vector. Array bounds are also checked. */ @@ -5057,10 +5086,17 @@ done: info->data = gfc_conv_array_data (info->descriptor); info->data = gfc_evaluate_now (info->data, &outer_loop->pre); - info->offset = gfc_index_zero_node; + gfc_expr *array = expr->value.function.actual->expr; + tree rank = build_int_cst (gfc_array_index_type, array->rank); + +
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Inline integral MINLOC/MAXLOC with no DIM and scalar MASK [PR90608]
https://gcc.gnu.org/g:6adbf59da3c7c656514fc5d2fd233439010ccb40 commit 6adbf59da3c7c656514fc5d2fd233439010ccb40 Author: Mikael Morin Date: Wed Jul 31 10:10:33 2024 +0200 fortran: Inline integral MINLOC/MAXLOC with no DIM and scalar MASK [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable the generation of inline code for MINLOC/MAXLOC when argument ARRAY is of integral type, DIM is not present, and MASK is present and is scalar (only absent MASK or rank 1 ARRAY were inlined before). Scalar masks are implemented with a wrapping condition around the code one would generate if MASK wasn't present, so they are easy to support once inline code without MASK is working. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Generate variable initialization for each dimension in the else branch of the toplevel condition. (gfc_inline_intrinsic_function_p): Return TRUE for scalar MASK. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_7.f90: Additionally accept the error message reported by the scalarizer. Diff: --- gcc/fortran/trans-intrinsic.cc| 13 - gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index b8a7faf5459..cd7a43f58fb 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5914,7 +5914,6 @@ gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op) /* For a scalar mask, enclose the loop in an if statement. */ if (maskexpr && maskss == NULL) { - gcc_assert (loop.dimen == 1); tree ifmask; gfc_init_se (&maskse, NULL); @@ -5929,7 +5928,8 @@ gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op) the pos variable the same way as above. 
*/ gfc_init_block (&elseblock); - gfc_add_modify (&elseblock, pos[0], gfc_index_zero_node); + for (int i = 0; i < loop.dimen; i++) + gfc_add_modify (&elseblock, pos[i], gfc_index_zero_node); elsetmp = gfc_finish_block (&elseblock); ifmask = conv_mask_condition (&maskse, maskexpr, optional_mask); tmp = build3_v (COND_EXPR, ifmask, tmp, elsetmp); @@ -11823,9 +11823,12 @@ gfc_inline_intrinsic_function_p (gfc_expr *expr) if (array->rank == 1) return true; - if (array->ts.type == BT_INTEGER - && dim == nullptr - && mask == nullptr) + if (array->ts.type != BT_INTEGER + || dim != nullptr) + return false; + + if (mask == nullptr + || mask->rank == 0) return true; return false; diff --git a/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 b/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 index 206a29b149d..3aa9d3dcebe 100644 --- a/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 +++ b/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 @@ -1,6 +1,6 @@ ! { dg-do run } ! { dg-options "-fbounds-check" } -! { dg-shouldfail "Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2" } +! { dg-shouldfail "Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2|Array bound mismatch for dimension 1 of array 'res' .3/2." } module tst contains subroutine foo(res) @@ -18,4 +18,4 @@ program main integer :: res(3) call foo(res) end program main -! { dg-output "Fortran runtime error: Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2" } +! { dg-output "Fortran runtime error: Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2|Array bound mismatch for dimension 1 of array 'res' .3/2." }
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Inline non-character MINLOC/MAXLOC with no DIM [PR90608]
https://gcc.gnu.org/g:d8af439bfeaabcb06f517505ef85fc21bf459165 commit d8af439bfeaabcb06f517505ef85fc21bf459165 Author: Mikael Morin Date: Wed Jul 31 10:10:49 2024 +0200 fortran: Inline non-character MINLOC/MAXLOC with no DIM [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable generation of inline MINLOC/MAXLOC code in the case where DIM is not present, and either ARRAY is of floating point type or MASK is an array. Those cases are the remaining bits to fully support inlining of non-CHARACTER MINLOC/MAXLOC without DIM. They are treated together because they generate similar code, the NANs for REAL types being handled a bit like a second level of masking. These are the cases for which we generate two sets of loops. This change affects the code generating the second loop, which was previously accessible only when ARRAY has rank 1. The single variable initialization and update are changed to apply to multiple variables, one per dimension. The code generated is as follows (if ARRAY has rank 2): for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... goto second_loop; } } } second_loop: for (idx21 in lower1..upper1) { for (idx22 in lower2..upper2) { ... } } This code leads to processing the first elements redundantly, both in the first set of loops and in the second one. The loop over idx22 could start from idx12 the first time it is run, but as it has to start from lower2 for the rest of the runs, this change uses the same bounds for both sets of loops for simplicity. In the rank 1 case, this makes the generated code worse compared to the inline code that was generated before. A later change will introduce conditionals to avoid the duplicate processing and restore the generated code in that case. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Initialize and update all the variables. Put the label and goto in the outermost scalarizer loop. 
Don't start the second loop where the first stopped. (gfc_inline_intrinsic_function_p): Also return TRUE for array MASK or for any REAL type. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_5.f90: Additionally accept error messages reported by the scalarizer. * gfortran.dg/maxloc_bounds_6.f90: Ditto. Diff: --- gcc/fortran/trans-intrinsic.cc| 127 +- gcc/testsuite/gfortran.dg/maxloc_bounds_5.f90 | 4 +- gcc/testsuite/gfortran.dg/maxloc_bounds_6.f90 | 4 +- 3 files changed, 87 insertions(+), 48 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index cd7a43f58fb..a92b733cf2f 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5358,12 +5358,55 @@ strip_kind_from_actual (gfc_actual_arglist * actual) } S++; } - B: ARRAY has rank 1, and DIM is absent. Use the same code as the scalar - case and wrap the result in an array. - C: ARRAY has rank > 1, NANs are not supported, and DIM and MASK are absent. - Generate code similar to the single loop scalar case, but using one - variable per dimension, for example if ARRAY has rank 2: - 4) NAN's aren't supported, no MASK: + B: Array result, non-CHARACTER type, DIM absent + Generate similar code as in the scalar case, using a collection of + variables (one per dimension) instead of a single variable as result. + Picking only cases 1) and 4) with ARRAY of rank 2, the generated code + becomes: + 1) Array mask is used and NaNs need to be supported: +limit = Infinity; +pos0 = 0; +pos1 = 0; +S1 = from1; +while (S1 <= to1) { + S0 = from0; + while (s0 <= to0 { +if (mask[S1][S0]) { + if (pos0 == 0) { +pos0 = S0 + (1 - from0); +pos1 = S1 + (1 - from1); + } + if (a[S1][S0] <= limit) { +limit = a[S1][S0]; +pos0 = S0 + (1 - from0); +pos1 = S1 + (1 - from1); +goto lab1; + } +} +S0++; + } + S1++; +} +goto lab2; +lab1:; +S1 = from1; +while (S1 <= to1) { + S0 = from0; + while (S0 <= to0) { +
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Continue MINLOC/MAXLOC second loop where the first stopped [PR90608]
https://gcc.gnu.org/g:f633006ba0fffe0ab2cc91928978e36d0fc8b14d commit f633006ba0fffe0ab2cc91928978e36d0fc8b14d Author: Mikael Morin Date: Wed Jul 31 10:11:02 2024 +0200 fortran: Continue MINLOC/MAXLOC second loop where the first stopped [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Continue the second set of loops where the first one stopped in the generated inline MINLOC/MAXLOC code in the cases where the generated code contains two sets of loops. This fixes a regression that was introduced when enabling the generation of inline MINLOC/MAXLOC code with ARRAY of rank greater than 1, no DIM argument, and either non-scalar MASK or floating-point ARRAY. In the cases where two sets of loops are generated as inline MINLOC/MAXLOC code, we previously generated code such as (for rank 2 ARRAY, so with two levels of nesting): for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... goto second_loop; } } } second_loop: for (idx21 in lower1..upper1) { for (idx22 in lower2..upper2) { ... } } which means we process the first elements twice, once in the first set of loops and once in the second one. This change avoids this duplicate processing by using a conditional as lower bound for the second set of loops, generating code like: second_loop_entry = false; for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... second_loop_entry = true; goto second_loop; } } } second_loop: for (idx21 in (second_loop_entry ? idx11 : lower1)..upper1) { for (idx22 in (second_loop_entry ? idx12 : lower2)..upper2) { ... second_loop_entry = false; } } It was expected that the compiler optimizations would be able to remove the state variable second_loop_entry. It is the case if ARRAY has rank 1 (so without loop nesting), the variable is removed and the loop bounds become unconditional, which restores previously generated code, fully fixing the regression. 
For larger rank, unfortunately, the state variable and conditional loop bounds remain, but those cases were previously using library calls, so it's not a regression. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Generate a set of index variables. Set them using the loop indexes before leaving the first set of loops. Generate a new loop entry predicate. Initialize it. Set it before leaving the first set of loops. Clear it in the body of the second set of loops. For the second set of loops, update each loop lower bound to use the corresponding index variable if the predicate variable is set. Diff: --- gcc/fortran/trans-intrinsic.cc | 33 +++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index a92b733cf2f..b03f7b1653e 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5368,6 +5368,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) pos0 = 0; pos1 = 0; S1 = from1; +second_loop_entry = false; while (S1 <= to1) { S0 = from0; while (s0 <= to0 { @@ -5380,6 +5381,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) limit = a[S1][S0]; pos0 = S0 + (1 - from0); pos1 = S1 + (1 - from1); +second_loop_entry = true; goto lab1; } } @@ -5389,9 +5391,9 @@ strip_kind_from_actual (gfc_actual_arglist * actual) } goto lab2; lab1:; -S1 = from1; +S1 = second_loop_entry ? S1 : from1; while (S1 <= to1) { - S0 = from0; + S0 = second_loop_entry ? S0 : from0; while (S0 <= to0) { if (mask[S1][S0]) if (a[S1][S0] < limit) { @@ -5399,6 +5401,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) pos0 = S + (1 - from0); pos1 =
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v11)] fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR90608]
https://gcc.gnu.org/g:5a49705a0b9d7db79effda1f3d29894a67b76df2 commit 5a49705a0b9d7db79effda1f3d29894a67b76df2 Author: Mikael Morin Date: Sat Aug 10 14:17:21 2024 +0200 fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR90608] Introduce the -finline-intrinsics flag to control from the command line whether to generate either inline code or calls to the functions from the library, for the MINLOC and MAXLOC intrinsics. The flag allows specifying inlining either independently for each intrinsic (either MINLOC or MAXLOC), or all together. For each intrinsic, a default value is set if none was set. The default value depends on the optimization: inlining is avoided if not optimizing or optimizing for size; otherwise inlining is preferred. There is no direct support for this behaviour provided by the .opt options framework. It is obtained by defining three different variants of the flag (finline-intrinsics, fno-inline-intrinsics, finline-intrinsics=) all using the same underlying option variable. Each enum value (corresponding to an intrinsic function) uses two identical bits, and the variable is initialized with alternated bits, so that we can tell whether the value was left initialized by checking whether the two bits have different values. PR fortran/90608 gcc/ChangeLog: * flag-types.h (enum gfc_inlineable_intrinsics): New type. gcc/fortran/ChangeLog: * invoke.texi (finline-intrinsics): Document new flag. * lang.opt (finline-intrinsics, finline-intrinsics=, fno-inline-intrinsics): New flags. * options.cc (gfc_post_options): If the option variable controlling the inlining of MAXLOC (respectively MINLOC) has not been set, set it or clear it depending on the optimization option variables. * trans-intrinsic.cc (gfc_inline_intrinsic_function_p): Return false if inlining for the intrinsic is disabled according to the option variable. gcc/testsuite/ChangeLog: * gfortran.dg/minmaxloc_18.f90: New test. * gfortran.dg/minmaxloc_18a.f90: New test. 
* gfortran.dg/minmaxloc_18b.f90: New test. * gfortran.dg/minmaxloc_18c.f90: New test. * gfortran.dg/minmaxloc_18d.f90: New test. Diff: --- gcc/flag-types.h| 30 ++ gcc/fortran/invoke.texi | 24 + gcc/fortran/lang.opt| 27 + gcc/fortran/options.cc | 19 +- gcc/fortran/trans-intrinsic.cc | 13 +- gcc/testsuite/gfortran.dg/minmaxloc_18.f90 | 772 gcc/testsuite/gfortran.dg/minmaxloc_18a.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18b.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18c.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18d.f90 | 10 + 10 files changed, 920 insertions(+), 5 deletions(-) diff --git a/gcc/flag-types.h b/gcc/flag-types.h index 1e497f0bb91..df56337f7e8 100644 --- a/gcc/flag-types.h +++ b/gcc/flag-types.h @@ -451,6 +451,36 @@ enum gfc_convert }; +/* gfortran -finline-intrinsics= values; + We use two identical bits for each value, and initialize with alternated + bits, so that we can check whether a value has been set by checking whether + the two bits have identical value. */ + +#define GFC_INL_INTR_VAL(idx) (3 << (2 * idx)) +#define GFC_INL_INTR_UNSET_VAL(val) (0x55555555 & (val)) + +enum gfc_inlineable_intrinsics +{ + GFC_FLAG_INLINE_INTRINSIC_NONE = 0, + GFC_FLAG_INLINE_INTRINSIC_MAXLOC = GFC_INL_INTR_VAL (0), + GFC_FLAG_INLINE_INTRINSIC_MINLOC = GFC_INL_INTR_VAL (1), + GFC_FLAG_INLINE_INTRINSIC_ALL = GFC_FLAG_INLINE_INTRINSIC_MAXLOC + | GFC_FLAG_INLINE_INTRINSIC_MINLOC, + + GFC_FLAG_INLINE_INTRINSIC_NONE_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_NONE), + GFC_FLAG_INLINE_INTRINSIC_MAXLOC_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_MAXLOC), + GFC_FLAG_INLINE_INTRINSIC_MINLOC_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_MINLOC), + GFC_FLAG_INLINE_INTRINSIC_ALL_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_ALL) +}; + +#undef GFC_INL_INTR_UNSET_VAL +#undef GFC_INL_INTR_VAL + + /* Inline String Operations functions. 
*/ enum ilsop_fn { diff --git a/gcc/fortran/invoke.texi b/gcc/fortran/invoke.texi index 6bc42afe2c4..53b6de1c92b 100644 --- a/gcc/fortran/invoke.texi +++ b/gcc/fortran/invoke.texi @@ -194,6 +194,7 @@ and warnings}. -finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero -finit-derived -finit-logical=@var{} -finit-real=@var{} +-finline-intrinsics[=<@var{minloc},@var{maxloc}>] -finline-matmul-limit=@var{n} -finline-arg-packing -fmax-array-constructor=@var{n} -fmax-stack-var-size=@v
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Disable frontend passes for inlinable MINLOC/MAXLOC [PR90608]
https://gcc.gnu.org/g:b0c71550b7cc410a781620c8b5d94de78e832bb7 commit b0c71550b7cc410a781620c8b5d94de78e832bb7 Author: Mikael Morin Date: Wed Jul 31 10:09:39 2024 +0200 fortran: Disable frontend passes for inlinable MINLOC/MAXLOC [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Disable rewriting of MINLOC/MAXLOC expressions for which inline code generation is supported. Update the gfc_inline_intrinsic_function_p predicate (already existing) for that, with the current state of MINLOC/MAXLOC inlining support, that is only the cases of a scalar result and non-CHARACTER argument for now. This change has no effect currently, as the MINLOC/MAXLOC front-end passes only change expressions of rank 1, but the inlining control predicate gfc_inline_intrinsic_function_p returns false for those. However, later changes will extend MINLOC/MAXLOC inline expansion support to array expressions and update the inlining control predicate, and this will become effective. PR fortran/90608 gcc/fortran/ChangeLog: * frontend-passes.cc (optimize_minmaxloc): Skip if we can generate inline code for the unmodified expression. * trans-intrinsic.cc (gfc_inline_intrinsic_function_p): Add MINLOC and MAXLOC cases. 
Diff: --- gcc/fortran/frontend-passes.cc | 3 ++- gcc/fortran/trans-intrinsic.cc | 23 +++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/gcc/fortran/frontend-passes.cc b/gcc/fortran/frontend-passes.cc index 3c06018fdbb..8e4c6310ba8 100644 --- a/gcc/fortran/frontend-passes.cc +++ b/gcc/fortran/frontend-passes.cc @@ -2277,7 +2277,8 @@ optimize_minmaxloc (gfc_expr **e) || fn->value.function.actual == NULL || fn->value.function.actual->expr == NULL || fn->value.function.actual->expr->ts.type == BT_CHARACTER - || fn->value.function.actual->expr->rank != 1) + || fn->value.function.actual->expr->rank != 1 + || gfc_inline_intrinsic_function_p (fn)) return; *e = gfc_get_array_expr (fn->ts.type, fn->ts.kind, &fn->where); diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index 84a378ef310..2c8512060cc 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -11652,6 +11652,29 @@ gfc_inline_intrinsic_function_p (gfc_expr *expr) case GFC_ISYM_TRANSPOSE: return true; +case GFC_ISYM_MINLOC: +case GFC_ISYM_MAXLOC: + { + /* Disable inline expansion if code size matters. */ + if (optimize_size) + return false; + + gfc_actual_arglist *array_arg = expr->value.function.actual; + gfc_actual_arglist *dim_arg = array_arg->next; + + gfc_expr *array = array_arg->expr; + gfc_expr *dim = dim_arg->expr; + + if (!(array->ts.type == BT_INTEGER + || array->ts.type == BT_REAL)) + return false; + + if (array->rank == 1 && dim != nullptr) + return true; + + return false; + } + default: return false; }
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Remove MINLOC/MAXLOC frontend optimization
https://gcc.gnu.org/g:a235d28920618fb6b3bf534ef3f7f847189e9075 commit a235d28920618fb6b3bf534ef3f7f847189e9075 Author: Mikael Morin Date: Fri Aug 9 21:21:01 2024 +0200 fortran: Remove MINLOC/MAXLOC frontend optimization Remove the frontend pass rewriting calls of MINLOC/MAXLOC without DIM to calls with one-valued DIM enclosed in an array constructor. This transformation was circumventing the limitation of inline MINLOC/MAXLOC code generation to scalar cases only, allowing inline code to be generated if ARRAY had rank 1 and DIM was absent. As MINLOC/MAXLOC has gained support of inline code generation in that case, the limitation is no longer effective, and the transformation no longer necessary. gcc/fortran/ChangeLog: * frontend-passes.cc (optimize_minmaxloc): Remove. (optimize_expr): Remove dispatch to optimize_minmaxloc. Diff: --- gcc/fortran/frontend-passes.cc | 57 -- 1 file changed, 57 deletions(-) diff --git a/gcc/fortran/frontend-passes.cc b/gcc/fortran/frontend-passes.cc index 8e4c6310ba8..31d553e9844 100644 --- a/gcc/fortran/frontend-passes.cc +++ b/gcc/fortran/frontend-passes.cc @@ -36,7 +36,6 @@ static bool optimize_op (gfc_expr *); static bool optimize_comparison (gfc_expr *, gfc_intrinsic_op); static bool optimize_trim (gfc_expr *); static bool optimize_lexical_comparison (gfc_expr *); -static void optimize_minmaxloc (gfc_expr **); static bool is_empty_string (gfc_expr *e); static void doloop_warn (gfc_namespace *); static int do_intent (gfc_expr **); @@ -356,17 +355,6 @@ optimize_expr (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, if ((*e)->expr_type == EXPR_OP && optimize_op (*e)) gfc_simplify_expr (*e, 0); - if ((*e)->expr_type == EXPR_FUNCTION && (*e)->value.function.isym) -switch ((*e)->value.function.isym->id) - { - case GFC_ISYM_MINLOC: - case GFC_ISYM_MAXLOC: - optimize_minmaxloc (e); - break; - default: - break; - } - if (function_expr) count_arglist --; @@ -2262,51 +2250,6 @@ optimize_trim (gfc_expr *e) return true; } -/* Optimize 
minloc(b), where b is rank 1 array, into - (/ minloc(b, dim=1) /), and similarly for maxloc, - as the latter forms are expanded inline. */ - -static void -optimize_minmaxloc (gfc_expr **e) -{ - gfc_expr *fn = *e; - gfc_actual_arglist *a; - char *name, *p; - - if (fn->rank != 1 - || fn->value.function.actual == NULL - || fn->value.function.actual->expr == NULL - || fn->value.function.actual->expr->ts.type == BT_CHARACTER - || fn->value.function.actual->expr->rank != 1 - || gfc_inline_intrinsic_function_p (fn)) -return; - - *e = gfc_get_array_expr (fn->ts.type, fn->ts.kind, &fn->where); - (*e)->shape = fn->shape; - fn->rank = 0; - fn->shape = NULL; - gfc_constructor_append_expr (&(*e)->value.constructor, fn, &fn->where); - - name = XALLOCAVEC (char, strlen (fn->value.function.name) + 1); - strcpy (name, fn->value.function.name); - p = strstr (name, "loc0"); - p[3] = '1'; - fn->value.function.name = gfc_get_string ("%s", name); - if (fn->value.function.actual->next) -{ - a = fn->value.function.actual->next; - gcc_assert (a->expr == NULL); -} - else -{ - a = gfc_get_actual_arglist (); - fn->value.function.actual->next = a; -} - a->expr = gfc_get_constant_expr (BT_INTEGER, gfc_default_integer_kind, - &fn->where); - mpz_set_ui (a->expr->value.integer, 1); -} - /* Data package to hand down for DO loop checks in a contained procedure. */ typedef struct contained_info
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Inline integral MINLOC/MAXLOC with no DIM and no MASK [PR90608]
https://gcc.gnu.org/g:e6b1ed0eb5af54bc36db5b259c99bb9a189f03aa commit e6b1ed0eb5af54bc36db5b259c99bb9a189f03aa Author: Mikael Morin Date: Wed Jul 31 10:10:19 2024 +0200 fortran: Inline integral MINLOC/MAXLOC with no DIM and no MASK [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable generation of inline code for the MINLOC and MAXLOC intrinsics, if the ARRAY argument is of integral type and of any rank (only the rank 1 case was previously inlined), and neither DIM nor MASK arguments are present. This needs a few adjustments in gfc_conv_intrinsic_minmaxloc, mainly to replace the single variables POS and OFFSET, with collections of variables, one variable per dimension each. The restriction to integral ARRAY and absent MASK limits the scope of the change to the cases where we generate single loop inline code. The code generation for the second loop is only accessible with ARRAY of rank 1, so it can continue using a single variable. A later change will extend inlining to the double loop cases. There is some bounds checking code that was previously handled by the library, and that needed some changes in the scalarizer to avoid regressing. The bounds check code generation was already supported by the scalarizer, but it was only applying to array reference sections, checking both for array bound violation and for shape conformability between all the involved arrays. With this change, for MINLOC or MAXLOC, enable the conformability check between all the scalarized arrays, and disable the array bound violation check. PR fortran/90608 gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Set the MINLOC/MAXLOC result upper bound using the rank of the ARRAY argument. Adjust the error message for intrinsic result arrays. Only check array bounds for array references. Move bound check decision code... (bounds_check_needed): ... here as a new predicate. Allow bound check for MINLOC/MAXLOC intrinsic results. 
* trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Change the result array upper bound to the rank of ARRAY. Update the NONEMPTY variable to depend on the non-empty extent of every dimension. Use one variable per dimension instead of a single variable for the position and the offset. Update their declaration, initialization, and update to affect the variable of each dimension. Use the first variable only in areas only accessed with rank 1 ARRAY argument. Set every element of the result using its corresponding variable. (gfc_inline_intrinsic_function_p): Return true for integral ARRAY and absent DIM and MASK. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_4.f90: Additionally accept the error message emitted by the scalarizer. Diff: --- gcc/fortran/trans-array.cc| 68 +--- gcc/fortran/trans-intrinsic.cc| 150 +++--- gcc/testsuite/gfortran.dg/maxloc_bounds_4.f90 | 4 +- 3 files changed, 165 insertions(+), 57 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index e578b676fcc..1190bfa6c02 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4956,6 +4956,35 @@ add_check_section_in_array_bounds (stmtblock_t *inner, gfc_ss_info *ss_info, } +/* Tells whether we need to generate bounds checking code for the array + associated with SS. */ + +bool +bounds_check_needed (gfc_ss *ss) +{ + /* Catch allocatable lhs in f2003. */ + if (flag_realloc_lhs && ss->no_bounds_check) +return false; + + gfc_ss_info *ss_info = ss->info; + if (ss_info->type == GFC_SS_SECTION) +return true; + + if (!(ss_info->type == GFC_SS_INTRINSIC + && ss_info->expr + && ss_info->expr->expr_type == EXPR_FUNCTION)) +return false; + + gfc_intrinsic_sym *isym = ss_info->expr->value.function.isym; + if (!(isym + && (isym->id == GFC_ISYM_MAXLOC + || isym->id == GFC_ISYM_MINLOC))) +return false; + + return gfc_inline_intrinsic_function_p (ss_info->expr); +} + + /* Calculates the range start and stride for a SS chain. 
Also gets the descriptor and data pointer. The range of vector subscripts is the size of the vector. Array bounds are also checked. */ @@ -5057,10 +5086,17 @@ done: info->data = gfc_conv_array_data (info->descriptor); info->data = gfc_evaluate_now (info->data, &outer_loop->pre); - info->offset = gfc_index_zero_node; + gfc_expr *array = expr->value.function.actual->expr; + tree rank = build_int_cst (gfc_array_index_type, array->rank); + +
[gcc] Created branch 'mikael/heads/inline_minmaxloc_without_dim_v12' in namespace 'refs/users'
The branch 'mikael/heads/inline_minmaxloc_without_dim_v12' was created in namespace 'refs/users' pointing to: f3273f220b8... fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Inline non-character MINLOC/MAXLOC with no DIM [PR90608]
https://gcc.gnu.org/g:51029b02a215d85dcb2aabb07f5b38201e1b944e commit 51029b02a215d85dcb2aabb07f5b38201e1b944e Author: Mikael Morin Date: Wed Jul 31 10:10:49 2024 +0200 fortran: Inline non-character MINLOC/MAXLOC with no DIM [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable generation of inline MINLOC/MAXLOC code in the case where DIM is not present, and either ARRAY is of floating point type or MASK is an array. Those cases are the remaining bits to fully support inlining of non-CHARACTER MINLOC/MAXLOC without DIM. They are treated together because they generate similar code, the NANs for REAL types being handled a bit like a second level of masking. These are the cases for which we generate two sets of loops. This change affects the code generating the second loop, which was previously accessible only when ARRAY has rank 1. The single variable initialization and update are changed to apply to multiple variables, one per dimension. The code generated is as follows (if ARRAY has rank 2): for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... goto second_loop; } } } second_loop: for (idx21 in lower1..upper1) { for (idx22 in lower2..upper2) { ... } } This code leads to processing the first elements redundantly, both in the first set of loops and in the second one. The loop over idx22 could start from idx12 the first time it is run, but as it has to start from lower2 for the rest of the runs, this change uses the same bounds for both sets of loops for simplicity. In the rank 1 case, this makes the generated code worse compared to the inline code that was generated before. A later change will introduce conditionals to avoid the duplicate processing and restore the generated code in that case. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Initialize and update all the variables. Put the label and goto in the outermost scalarizer loop. 
Don't start the second loop where the first stopped. (gfc_inline_intrinsic_function_p): Also return TRUE for array MASK or for any REAL type. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_5.f90: Additionally accept error messages reported by the scalarizer. * gfortran.dg/maxloc_bounds_6.f90: Ditto. Diff: --- gcc/fortran/trans-intrinsic.cc| 127 +- gcc/testsuite/gfortran.dg/maxloc_bounds_5.f90 | 4 +- gcc/testsuite/gfortran.dg/maxloc_bounds_6.f90 | 4 +- 3 files changed, 87 insertions(+), 48 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index cd7a43f58fb..a92b733cf2f 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5358,12 +5358,55 @@ strip_kind_from_actual (gfc_actual_arglist * actual) } S++; } - B: ARRAY has rank 1, and DIM is absent. Use the same code as the scalar - case and wrap the result in an array. - C: ARRAY has rank > 1, NANs are not supported, and DIM and MASK are absent. - Generate code similar to the single loop scalar case, but using one - variable per dimension, for example if ARRAY has rank 2: - 4) NAN's aren't supported, no MASK: + B: Array result, non-CHARACTER type, DIM absent + Generate similar code as in the scalar case, using a collection of + variables (one per dimension) instead of a single variable as result. + Picking only cases 1) and 4) with ARRAY of rank 2, the generated code + becomes: + 1) Array mask is used and NaNs need to be supported: +limit = Infinity; +pos0 = 0; +pos1 = 0; +S1 = from1; +while (S1 <= to1) { + S0 = from0; + while (s0 <= to0 { +if (mask[S1][S0]) { + if (pos0 == 0) { +pos0 = S0 + (1 - from0); +pos1 = S1 + (1 - from1); + } + if (a[S1][S0] <= limit) { +limit = a[S1][S0]; +pos0 = S0 + (1 - from0); +pos1 = S1 + (1 - from1); +goto lab1; + } +} +S0++; + } + S1++; +} +goto lab2; +lab1:; +S1 = from1; +while (S1 <= to1) { + S0 = from0; + while (S0 <= to0) { +
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Add tests covering inline MINLOC/MAXLOC without DIM [PR90608]
https://gcc.gnu.org/g:38dea32d891064ae298b747ffcc3652757473e5d commit 38dea32d891064ae298b747ffcc3652757473e5d Author: Mikael Morin Date: Wed Jul 31 10:09:25 2024 +0200 fortran: Add tests covering inline MINLOC/MAXLOC without DIM [PR90608] Tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Add the tests covering the various cases for which we are about to implement inline expansion of MINLOC and MAXLOC. Those are cases where the DIM argument is not present. PR fortran/90608 gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_7.f90: New test. * gfortran.dg/maxloc_with_mask_1.f90: New test. * gfortran.dg/minloc_8.f90: New test. * gfortran.dg/minloc_with_mask_1.f90: New test. Diff: --- gcc/testsuite/gfortran.dg/ieee/maxloc_nan_1.f90 | 44 +++ gcc/testsuite/gfortran.dg/ieee/minloc_nan_1.f90 | 44 +++ gcc/testsuite/gfortran.dg/maxloc_7.f90 | 208 + gcc/testsuite/gfortran.dg/maxloc_with_mask_1.f90 | 373 +++ gcc/testsuite/gfortran.dg/minloc_8.f90 | 208 + gcc/testsuite/gfortran.dg/minloc_with_mask_1.f90 | 372 ++ 6 files changed, 1249 insertions(+) diff --git a/gcc/testsuite/gfortran.dg/ieee/maxloc_nan_1.f90 b/gcc/testsuite/gfortran.dg/ieee/maxloc_nan_1.f90 new file mode 100644 index 000..329b54e8e1f --- /dev/null +++ b/gcc/testsuite/gfortran.dg/ieee/maxloc_nan_1.f90 @@ -0,0 +1,44 @@ +! { dg-do run } +! +! PR fortran/90608 +! Check the correct behaviour of the inline MAXLOC implementation, +! when ARRAY is filled with NANs. + +program p + implicit none + call check_without_mask + call check_with_mask +contains + subroutine check_without_mask() +use, intrinsic :: ieee_arithmetic +real, allocatable :: a(:,:,:) +real :: nan +integer, allocatable :: m(:) +if (.not. 
ieee_support_nan(nan)) return +nan = ieee_value(nan, ieee_quiet_nan) +allocate(a(3,3,3), source = nan) +m = maxloc(a) +if (size(m, dim=1) /= 3) stop 32 +if (any(m /= (/ 1, 1, 1 /))) stop 35 + end subroutine + subroutine check_with_mask() +use, intrinsic :: ieee_arithmetic +real, allocatable :: a(:,:,:) +logical, allocatable :: m(:,:,:) +real :: nan +integer, allocatable :: r(:) +if (.not. ieee_support_nan(nan)) return +nan = ieee_value(nan, ieee_quiet_nan) +allocate(a(3,3,3), source = nan) +allocate(m(3,3,3)) +m(:,:,:) = reshape((/ .false., .false., .true. , .true. , .false., & + .true. , .false., .false., .false., .true. , & + .true. , .false., .true. , .true. , .true. , & + .false., .false., .true. , .true. , .false., & + .false., .true. , .false., .false., .true. , & + .true. , .true. /), shape(m)) +r = maxloc(a, mask = m) +if (size(r, dim = 1) /= 3) stop 62 +if (any(r /= (/ 3, 1, 1 /))) stop 65 + end subroutine +end program p diff --git a/gcc/testsuite/gfortran.dg/ieee/minloc_nan_1.f90 b/gcc/testsuite/gfortran.dg/ieee/minloc_nan_1.f90 new file mode 100644 index 000..8f71b4c4398 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/ieee/minloc_nan_1.f90 @@ -0,0 +1,44 @@ +! { dg-do run } +! +! PR fortran/90608 +! Check the correct behaviour of the inline MINLOC implementation, +! when ARRAY is filled with NANs. + +program p + implicit none + call check_without_mask + call check_with_mask +contains + subroutine check_without_mask() +use, intrinsic :: ieee_arithmetic +real, allocatable :: a(:,:,:) +real :: nan +integer, allocatable :: m(:) +if (.not. ieee_support_nan(nan)) return +nan = ieee_value(nan, ieee_quiet_nan) +allocate(a(3,3,3), source = nan) +m = minloc(a) +if (size(m, dim=1) /= 3) stop 32 +if (any(m /= (/ 1, 1, 1 /))) stop 35 + end subroutine + subroutine check_with_mask() +use, intrinsic :: ieee_arithmetic +real, allocatable :: a(:,:,:) +logical, allocatable :: m(:,:,:) +real :: nan +integer, allocatable :: r(:) +if (.not. 
ieee_support_nan(nan)) return +nan = ieee_value(nan, ieee_quiet_nan) +allocate(a(3,3,3), source = nan) +allocate(m(3,3,3)) +m(:,:,:) = reshape((/ .false., .false., .true. , .true. , .false., & + .true. , .false., .false., .false., .true. , & + .true. , .false., .true. , .true. , .true. , & + .false., .false., .true. , .true. , .false., & + .false., .true. , .false., .false., .true. , & + .true. , .true. /), shape(m)) +r = minloc(a, mask = m) +if (size(r, dim = 1) /= 3) stop 62 +if (any(r /= (/ 3, 1, 1 /))) stop 65 + end subroutine +end program p diff --git a/gcc/testsuite/gfortran.dg/maxloc_7.f90 b/gcc/testsuite/gfortran.dg/maxloc_7.f90 new file mode 100644 inde
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Continue MINLOC/MAXLOC second loop where the first stopped [PR90608]
https://gcc.gnu.org/g:27a251a48e3ef3bfda84566a372a50fe9c08553e commit 27a251a48e3ef3bfda84566a372a50fe9c08553e Author: Mikael Morin Date: Wed Jul 31 10:11:02 2024 +0200 fortran: Continue MINLOC/MAXLOC second loop where the first stopped [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Continue the second set of loops where the first one stopped in the generated inline MINLOC/MAXLOC code in the cases where the generated code contains two sets of loops. This fixes a regression that was introduced when enabling the generation of inline MINLOC/MAXLOC code with ARRAY of rank greater than 1, no DIM argument, and either non-scalar MASK or floating- point ARRAY. In the cases where two sets of loops are generated as inline MINLOC/MAXLOC code, we previously generated code such as (for rank 2 ARRAY, so with two levels of nesting): for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... goto second_loop; } } } second_loop: for (idx21 in lower1..upper1) { for (idx22 in lower2..upper2) { ... } } which means we process the first elements twice, once in the first set of loops and once in the second one. This change avoids this duplicate processing by using a conditional as lower bound for the second set of loops, generating code like: second_loop_entry = false; for (idx11 in lower1..upper1) { for (idx12 in lower2..upper2) { ... if (...) { ... second_loop_entry = true; goto second_loop; } } } second_loop: for (idx21 in (second_loop_entry ? idx11 : lower1)..upper1) { for (idx22 in (second_loop_entry ? idx12 : lower2)..upper2) { ... second_loop_entry = false; } } It was expected that the compiler optimizations would be able to remove the state variable second_loop_entry. It is the case if ARRAY has rank 1 (so without loop nesting), the variable is removed and the loop bounds become unconditional, which restores previously generated code, fully fixing the regression. 
For larger rank, unfortunately, the state variable and conditional loop bounds remain, but those cases were previously using library calls, so it's not a regression. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Generate a set of index variables. Set them using the loop indexes before leaving the first set of loops. Generate a new loop entry predicate. Initialize it. Set it before leaving the first set of loops. Clear it in the body of the second set of loops. For the second set of loops, update each loop lower bound to use the corresponding index variable if the predicate variable is set. Diff: --- gcc/fortran/trans-intrinsic.cc | 33 +++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index a92b733cf2f..b03f7b1653e 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5368,6 +5368,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) pos0 = 0; pos1 = 0; S1 = from1; +second_loop_entry = false; while (S1 <= to1) { S0 = from0; while (s0 <= to0 { @@ -5380,6 +5381,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) limit = a[S1][S0]; pos0 = S0 + (1 - from0); pos1 = S1 + (1 - from1); +second_loop_entry = true; goto lab1; } } @@ -5389,9 +5391,9 @@ strip_kind_from_actual (gfc_actual_arglist * actual) } goto lab2; lab1:; -S1 = from1; +S1 = second_loop_entry ? S1 : from1; while (S1 <= to1) { - S0 = from0; + S0 = second_loop_entry ? S0 : from0; while (S0 <= to0) { if (mask[S1][S0]) if (a[S1][S0] < limit) { @@ -5399,6 +5401,7 @@ strip_kind_from_actual (gfc_actual_arglist * actual) pos0 = S + (1 - from0); pos1 =
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Inline MINLOC/MAXLOC with no DIM and ARRAY of rank 1 [PR90608]
https://gcc.gnu.org/g:3a4733aae028f9a2323e7b993be7399d1315ea34 commit 3a4733aae028f9a2323e7b993be7399d1315ea34 Author: Mikael Morin Date: Wed Jul 31 10:09:53 2024 +0200 fortran: Inline MINLOC/MAXLOC with no DIM and ARRAY of rank 1 [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable inline code generation for the MINLOC and MAXLOC intrinsic, if the DIM argument is not present and ARRAY has rank 1. This case is similar to the case where the result is scalar (DIM present and rank 1 ARRAY), which already supports inline expansion of the intrinsic. Both cases return the same value, with the difference that the result is an array of size 1 if DIM is absent, whereas it's a scalar if DIM is present. So all there is to do for the new case to work is hook the inline expansion with the scalarizer. PR fortran/90608 gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Set the scalarization rank based on the MINLOC/MAXLOC rank if needed. Call the inline code generation and setup the scalarizer array descriptor info in the MINLOC and MAXLOC cases. * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Return the result array element if the scalarizer is setup and we are inside the loops. Restrict library function call dispatch to the case where inline expansion is not supported. Declare an array result if the expression isn't scalar. Initialize the array result single element and return the result variable if the expression isn't scalar. (walk_inline_intrinsic_minmaxloc): New function. (walk_inline_intrinsic_function): Add MINLOC and MAXLOC cases, dispatching to walk_inline_intrinsic_minmaxloc. (gfc_add_intrinsic_ss_code): Add MINLOC and MAXLOC cases. (gfc_inline_intrinsic_function_p): Return true if ARRAY has rank 1, regardless of DIM. 
Diff: --- gcc/fortran/trans-array.cc | 25 + gcc/fortran/trans-intrinsic.cc | 224 - 2 files changed, 181 insertions(+), 68 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index 9fb0b2b398d..46e2152d0f0 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4851,6 +4851,8 @@ gfc_conv_ss_startstride (gfc_loopinfo * loop) case GFC_ISYM_UBOUND: case GFC_ISYM_LCOBOUND: case GFC_ISYM_UCOBOUND: + case GFC_ISYM_MAXLOC: + case GFC_ISYM_MINLOC: case GFC_ISYM_SHAPE: case GFC_ISYM_THIS_IMAGE: loop->dimen = ss->dimen; @@ -4900,6 +4902,29 @@ done: case GFC_SS_INTRINSIC: switch (expr->value.function.isym->id) { + case GFC_ISYM_MINLOC: + case GFC_ISYM_MAXLOC: + { + gfc_se se; + gfc_init_se (&se, nullptr); + se.loop = loop; + se.ss = ss; + gfc_conv_intrinsic_function (&se, expr); + gfc_add_block_to_block (&outer_loop->pre, &se.pre); + gfc_add_block_to_block (&outer_loop->post, &se.post); + + info->descriptor = se.expr; + + info->data = gfc_conv_array_data (info->descriptor); + info->data = gfc_evaluate_now (info->data, &outer_loop->pre); + + info->offset = gfc_index_zero_node; + info->start[0] = gfc_index_zero_node; + info->end[0] = gfc_index_zero_node; + info->stride[0] = gfc_index_one_node; + continue; + } + /* Fall through to supply start and stride. */ case GFC_ISYM_LBOUND: case GFC_ISYM_UBOUND: diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index 2c8512060cc..9fcb57a9cc4 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5273,66 +5273,95 @@ strip_kind_from_actual (gfc_actual_arglist * actual) we need to handle. For performance reasons we sometimes create two loops instead of one, where the second one is much simpler. 
Examples for minloc intrinsic: - 1) Result is an array, a call is generated - 2) Array mask is used and NaNs need to be supported: - limit = Infinity; - pos = 0; - S = from; - while (S <= to) { - if (mask[S]) { - if (pos == 0) pos = S + (1 - from); - if (a[S] <= limit) { limit = a[S]; pos = S + (1 - from); goto lab1; } - } - S++; - } - goto lab2; - lab1:; - while (S <= to) { - if (mask[S]) if (a[S] < limit) { limit = a[S]; pos = S + (1 - from); } - S++; - } - lab2:; - 3) NaNs need to be supported, but it is known at compile time or cheaply - at r
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR90608]
https://gcc.gnu.org/g:f3273f220b8d113011fbe15ce5fa03314fad18fa commit f3273f220b8d113011fbe15ce5fa03314fad18fa Author: Mikael Morin Date: Sat Aug 10 14:17:21 2024 +0200 fortran: Add -finline-intrinsics flag for MINLOC/MAXLOC [PR90608] Introduce the -finline-intrinsics flag to control from the command line whether to generate either inline code or calls to the functions from the library, for the MINLOC and MAXLOC intrinsics. The flag allows specifying inlining either independently for each intrinsic (either MINLOC or MAXLOC), or all together. For each intrinsic, a default value is set if none was set. The default value depends on the optimization: inlining is avoided if not optimizing or optimizing for size; otherwise inlining is preferred. There is no direct support for this behaviour provided by the .opt options framework. It is obtained by defining three different variants of the flag (finline-intrinsics, fno-inline-intrinsics, finline-intrinsics=) all using the same underlying option variable. Each enum value (corresponding to an intrinsic function) uses two identical bits, and the variable is initialized with alternated bits, so that we can tell whether the value was left at its initial value by checking whether the two bits have different values. PR fortran/90608 gcc/ChangeLog: * flag-types.h (enum gfc_inlineable_intrinsics): New type. gcc/fortran/ChangeLog: * invoke.texi (finline-intrinsics): Document new flag. * lang.opt (finline-intrinsics, finline-intrinsics=, fno-inline-intrinsics): New flags. * options.cc (gfc_post_options): If the option variable controlling the inlining of MAXLOC (respectively MINLOC) has not been set, set it or clear it depending on the optimization option variables. * trans-intrinsic.cc (gfc_inline_intrinsic_function_p): Return false if inlining for the intrinsic is disabled according to the option variable. gcc/testsuite/ChangeLog: * gfortran.dg/minmaxloc_18.f90: New test. * gfortran.dg/minmaxloc_18a.f90: New test. 
* gfortran.dg/minmaxloc_18b.f90: New test. * gfortran.dg/minmaxloc_18c.f90: New test. * gfortran.dg/minmaxloc_18d.f90: New test. Diff: --- gcc/flag-types.h| 30 ++ gcc/fortran/invoke.texi | 24 + gcc/fortran/lang.opt| 27 + gcc/fortran/options.cc | 19 +- gcc/fortran/trans-intrinsic.cc | 13 +- gcc/testsuite/gfortran.dg/minmaxloc_18.f90 | 772 gcc/testsuite/gfortran.dg/minmaxloc_18a.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18b.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18c.f90 | 10 + gcc/testsuite/gfortran.dg/minmaxloc_18d.f90 | 10 + 10 files changed, 920 insertions(+), 5 deletions(-) diff --git a/gcc/flag-types.h b/gcc/flag-types.h index 1e497f0bb91..df56337f7e8 100644 --- a/gcc/flag-types.h +++ b/gcc/flag-types.h @@ -451,6 +451,36 @@ enum gfc_convert }; +/* gfortran -finline-intrinsics= values; + We use two identical bits for each value, and initialize with alternated + bits, so that we can check whether a value has been set by checking whether + the two bits have identical value. */ + +#define GFC_INL_INTR_VAL(idx) (3 << (2 * idx)) +#define GFC_INL_INTR_UNSET_VAL(val) (0x55555555 & (val)) + +enum gfc_inlineable_intrinsics +{ + GFC_FLAG_INLINE_INTRINSIC_NONE = 0, + GFC_FLAG_INLINE_INTRINSIC_MAXLOC = GFC_INL_INTR_VAL (0), + GFC_FLAG_INLINE_INTRINSIC_MINLOC = GFC_INL_INTR_VAL (1), + GFC_FLAG_INLINE_INTRINSIC_ALL = GFC_FLAG_INLINE_INTRINSIC_MAXLOC + | GFC_FLAG_INLINE_INTRINSIC_MINLOC, + + GFC_FLAG_INLINE_INTRINSIC_NONE_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_NONE), + GFC_FLAG_INLINE_INTRINSIC_MAXLOC_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_MAXLOC), + GFC_FLAG_INLINE_INTRINSIC_MINLOC_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_MINLOC), + GFC_FLAG_INLINE_INTRINSIC_ALL_UNSET + = GFC_INL_INTR_UNSET_VAL (GFC_FLAG_INLINE_INTRINSIC_ALL) +}; + +#undef GFC_INL_INTR_UNSET_VAL +#undef GFC_INL_INTR_VAL + + /* Inline String Operations functions. 
*/ enum ilsop_fn { diff --git a/gcc/fortran/invoke.texi b/gcc/fortran/invoke.texi index 6bc42afe2c4..53b6de1c92b 100644 --- a/gcc/fortran/invoke.texi +++ b/gcc/fortran/invoke.texi @@ -194,6 +194,7 @@ and warnings}. -finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero -finit-derived -finit-logical=@var{} -finit-real=@var{} +-finline-intrinsics[=<@var{minloc},@var{maxloc}>] -finline-matmul-limit=@var{n} -finline-arg-packing -fmax-array-constructor=@var{n} -fmax-stack-var-size=@v
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Outline array bound check generation code
https://gcc.gnu.org/g:00fe4d9b01e30e9a78da5674cd49503c7ced546e commit 00fe4d9b01e30e9a78da5674cd49503c7ced546e Author: Mikael Morin Date: Wed Jul 31 10:10:06 2024 +0200 fortran: Outline array bound check generation code The next patch will need reindenting of the array bound check generation code. This outlines it to its own function beforehand, reducing the churn in the next patch. Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- gcc/fortran/ChangeLog: * trans-array.cc (gfc_conv_ss_startstride): Move array bound check generation code... (add_check_section_in_array_bounds): ... here as a new function. Diff: --- gcc/fortran/trans-array.cc | 297 ++--- 1 file changed, 143 insertions(+), 154 deletions(-) diff --git a/gcc/fortran/trans-array.cc b/gcc/fortran/trans-array.cc index 46e2152d0f0..e578b676fcc 100644 --- a/gcc/fortran/trans-array.cc +++ b/gcc/fortran/trans-array.cc @@ -4816,6 +4816,146 @@ gfc_conv_section_startstride (stmtblock_t * block, gfc_ss * ss, int dim) } +/* Generate in INNER the bounds checking code along the dimension DIM for + the array associated with SS_INFO. */ + +static void +add_check_section_in_array_bounds (stmtblock_t *inner, gfc_ss_info *ss_info, + int dim) +{ + gfc_expr *expr = ss_info->expr; + locus *expr_loc = &expr->where; + const char *expr_name = expr->symtree->name; + + gfc_array_info *info = &ss_info->data.array; + + bool check_upper; + if (dim == info->ref->u.ar.dimen - 1 + && info->ref->u.ar.as->type == AS_ASSUMED_SIZE) +check_upper = false; + else +check_upper = true; + + /* Zero stride is not allowed. 
*/ + tree tmp = fold_build2_loc (input_location, EQ_EXPR, logical_type_node, + info->stride[dim], gfc_index_zero_node); + char * msg = xasprintf ("Zero stride is not allowed, for dimension %d " + "of array '%s'", dim + 1, expr_name); + gfc_trans_runtime_check (true, false, tmp, inner, expr_loc, msg); + free (msg); + + tree desc = info->descriptor; + + /* This is the run-time equivalent of resolve.cc's + check_dimension. The logical is more readable there + than it is here, with all the trees. */ + tree lbound = gfc_conv_array_lbound (desc, dim); + tree end = info->end[dim]; + tree ubound = check_upper ? gfc_conv_array_ubound (desc, dim) : NULL_TREE; + + /* non_zerosized is true when the selected range is not + empty. */ + tree stride_pos = fold_build2_loc (input_location, GT_EXPR, logical_type_node, +info->stride[dim], gfc_index_zero_node); + tmp = fold_build2_loc (input_location, LE_EXPR, logical_type_node, +info->start[dim], end); + stride_pos = fold_build2_loc (input_location, TRUTH_AND_EXPR, + logical_type_node, stride_pos, tmp); + + tree stride_neg = fold_build2_loc (input_location, LT_EXPR, logical_type_node, +info->stride[dim], gfc_index_zero_node); + tmp = fold_build2_loc (input_location, GE_EXPR, logical_type_node, +info->start[dim], end); + stride_neg = fold_build2_loc (input_location, TRUTH_AND_EXPR, + logical_type_node, stride_neg, tmp); + tree non_zerosized = fold_build2_loc (input_location, TRUTH_OR_EXPR, + logical_type_node, stride_pos, + stride_neg); + + /* Check the start of the range against the lower and upper + bounds of the array, if the range is not empty. + If upper bound is present, include both bounds in the + error message. 
*/ + if (check_upper) +{ + tmp = fold_build2_loc (input_location, LT_EXPR, logical_type_node, +info->start[dim], lbound); + tmp = fold_build2_loc (input_location, TRUTH_AND_EXPR, logical_type_node, +non_zerosized, tmp); + tree tmp2 = fold_build2_loc (input_location, GT_EXPR, logical_type_node, + info->start[dim], ubound); + tmp2 = fold_build2_loc (input_location, TRUTH_AND_EXPR, logical_type_node, + non_zerosized, tmp2); + msg = xasprintf ("Index '%%ld' of dimension %d of array '%s' outside of " + "expected range (%%ld:%%ld)", dim + 1, expr_name); + gfc_trans_runtime_check (true, false, tmp, inner, expr_loc, msg, + fold_convert (long_integer_type_node, info->start[dim]), + fold_convert (long_integer_type_node, lbound), + fold_convert (long_integer_type_node, ubound)); + gfc_trans_runtime_check (true, false, tmp2, inner, expr_loc, msg, + fold_convert (long_integer_type_node, info->start[dim]), + fold_convert (long_integer_type_node, lbound), +
[gcc(refs/users/mikael/heads/inline_minmaxloc_without_dim_v12)] fortran: Inline integral MINLOC/MAXLOC with no DIM and scalar MASK [PR90608]
https://gcc.gnu.org/g:b99cc4f00f2c90e227410f1fb7bbe5d63fd8b729 commit b99cc4f00f2c90e227410f1fb7bbe5d63fd8b729 Author: Mikael Morin Date: Wed Jul 31 10:10:33 2024 +0200 fortran: Inline integral MINLOC/MAXLOC with no DIM and scalar MASK [PR90608] Regression-tested on x86_64-pc-linux-gnu. OK for master? -- >8 -- Enable the generation of inline code for MINLOC/MAXLOC when argument ARRAY is of integral type, DIM is not present, and MASK is present and is scalar (only absent MASK or rank 1 ARRAY were inlined before). Scalar masks are implemented with a wrapping condition around the code one would generate if MASK wasn't present, so they are easy to support once inline code without MASK is working. PR fortran/90608 gcc/fortran/ChangeLog: * trans-intrinsic.cc (gfc_conv_intrinsic_minmaxloc): Generate variable initialization for each dimension in the else branch of the toplevel condition. (gfc_inline_intrinsic_function_p): Return TRUE for scalar MASK. gcc/testsuite/ChangeLog: * gfortran.dg/maxloc_bounds_7.f90: Additionally accept the error message reported by the scalarizer. Diff: --- gcc/fortran/trans-intrinsic.cc| 13 - gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc index b8a7faf5459..cd7a43f58fb 100644 --- a/gcc/fortran/trans-intrinsic.cc +++ b/gcc/fortran/trans-intrinsic.cc @@ -5914,7 +5914,6 @@ gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op) /* For a scalar mask, enclose the loop in an if statement. */ if (maskexpr && maskss == NULL) { - gcc_assert (loop.dimen == 1); tree ifmask; gfc_init_se (&maskse, NULL); @@ -5929,7 +5928,8 @@ gfc_conv_intrinsic_minmaxloc (gfc_se * se, gfc_expr * expr, enum tree_code op) the pos variable the same way as above. 
*/ gfc_init_block (&elseblock); - gfc_add_modify (&elseblock, pos[0], gfc_index_zero_node); + for (int i = 0; i < loop.dimen; i++) + gfc_add_modify (&elseblock, pos[i], gfc_index_zero_node); elsetmp = gfc_finish_block (&elseblock); ifmask = conv_mask_condition (&maskse, maskexpr, optional_mask); tmp = build3_v (COND_EXPR, ifmask, tmp, elsetmp); @@ -11823,9 +11823,12 @@ gfc_inline_intrinsic_function_p (gfc_expr *expr) if (array->rank == 1) return true; - if (array->ts.type == BT_INTEGER - && dim == nullptr - && mask == nullptr) + if (array->ts.type != BT_INTEGER + || dim != nullptr) + return false; + + if (mask == nullptr + || mask->rank == 0) return true; return false; diff --git a/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 b/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 index 206a29b149d..3aa9d3dcebe 100644 --- a/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 +++ b/gcc/testsuite/gfortran.dg/maxloc_bounds_7.f90 @@ -1,6 +1,6 @@ ! { dg-do run } ! { dg-options "-fbounds-check" } -! { dg-shouldfail "Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2" } +! { dg-shouldfail "Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2|Array bound mismatch for dimension 1 of array 'res' .3/2." } module tst contains subroutine foo(res) @@ -18,4 +18,4 @@ program main integer :: res(3) call foo(res) end program main -! { dg-output "Fortran runtime error: Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2" } +! { dg-output "Fortran runtime error: Incorrect extent in return value of MAXLOC intrinsic: is 3, should be 2|Array bound mismatch for dimension 1 of array 'res' .3/2." }
[gcc r15-2932] c++/coroutines: fix passing *this to promise type, again [PR116327]
https://gcc.gnu.org/g:303bed670af962c01b77a4f0c51de97f70e8167e commit r15-2932-g303bed670af962c01b77a4f0c51de97f70e8167e Author: Patrick Palka Date: Thu Aug 15 10:20:18 2024 -0400 c++/coroutines: fix passing *this to promise type, again [PR116327] In r15-2210 we got rid of the unnecessary cast to lvalue reference when passing *this to the promise type ctor, and as a drive-by change we also simplified the code to use cp_build_fold_indirect_ref. But it turns out cp_build_fold_indirect_ref does too much here, namely it has a shortcut for returning current_class_ref if the operand is current_class_ptr. The problem with that shortcut is current_class_ref might have gotten clobbered earlier if it appeared in the function body, since rewrite_param_uses walks and rewrites in-place all local variable uses to their corresponding frame copy. So later cp_build_fold_indirect_ref for *this will instead return the clobbered current_class_ref i.e. *frame_ptr->this, which doesn't make sense here since we're in the ramp function and not the actor function where frame_ptr is in scope. This patch fixes this by using the build_fold_indirect_ref instead of cp_build_fold_indirect_ref. PR c++/116327 PR c++/104981 PR c++/115550 gcc/cp/ChangeLog: * coroutines.cc (morph_fn_to_coro): Use build_fold_indirect_ref instead of cp_build_fold_indirect_ref. gcc/testsuite/ChangeLog: * g++.dg/coroutines/pr104981-preview-this.C: Improve coverage by adding a non-static data member use within the coroutine member function. * g++.dg/coroutines/pr116327-preview-this.C: New test. 
Reviewed-by: Jason Merrill Diff: --- gcc/cp/coroutines.cc | 8 ++-- .../g++.dg/coroutines/pr104981-preview-this.C | 4 +++- .../g++.dg/coroutines/pr116327-preview-this.C | 22 ++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc index 145ec4b1d16..f7791cbfb9a 100644 --- a/gcc/cp/coroutines.cc +++ b/gcc/cp/coroutines.cc @@ -4850,7 +4850,9 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer) if (parm_i->this_ptr || parm_i->lambda_cobj) { /* We pass a reference to *this to the allocator lookup. */ - tree this_ref = cp_build_fold_indirect_ref (arg); + /* It's unsafe to use the cp_ version here since current_class_ref +might've gotten clobbered earlier during rewrite_param_uses. */ + tree this_ref = build_fold_indirect_ref (arg); vec_safe_push (args, this_ref); } else @@ -5070,7 +5072,9 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer) if (parm.this_ptr || parm.lambda_cobj) { /* We pass a reference to *this to the param preview. */ - tree this_ref = cp_build_fold_indirect_ref (arg); + /* It's unsafe to use the cp_ version here since current_class_ref +might've gotten clobbered earlier during rewrite_param_uses. 
*/ + tree this_ref = build_fold_indirect_ref (arg); vec_safe_push (promise_args, this_ref); } else if (parm.rv_ref) diff --git a/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C b/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C index 81eb963db4a..9f1e3970ce3 100644 --- a/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C +++ b/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C @@ -23,8 +23,10 @@ struct PromiseType { }; struct Derived : Base { + int m = 41; Result f() { - co_return 42; + ++m; + co_return m; } }; diff --git a/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C b/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C new file mode 100644 index 000..27b69a41392 --- /dev/null +++ b/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C @@ -0,0 +1,22 @@ +// PR c++/116327 - ICE in coroutine with parameter preview on lambda with captures + +#include <coroutine> + +struct coroutine{ + struct promise_type{ +promise_type(const auto &...){} +std::suspend_never initial_suspend(){ return {}; } +std::suspend_always final_suspend()noexcept{ return {}; } +void unhandled_exception(){} +coroutine get_return_object(){ return {}; } +void return_value(int)noexcept{} + }; +}; + +int main(){ + auto f = [a=0](auto) -> coroutine { +co_return 2; + }; + f(0); + return 0; +}
[gcc r15-2933] c++: c->B::m access resolved through current inst [PR116320]
https://gcc.gnu.org/g:484f139ccd3b631a777802e810a632678b42ffab commit r15-2933-g484f139ccd3b631a777802e810a632678b42ffab Author: Patrick Palka Date: Thu Aug 15 10:23:54 2024 -0400 c++: c->B::m access resolved through current inst [PR116320] Here when checking the access of (the injected-class-name) B in c->B::m at parse time, we notice its context B (now the type) is a base of the object type C, so we proceed to use C as the effective qualifying type. But this C is the dependent specialization not the primary template type, so it has empty TYPE_BINFO, which leads to a segfault later from perform_or_defer_access_check. The reason the DERIVED_FROM_P (B, C) test guarding this code path works despite C having empty TYPE_BINFO is because of its currently_open_class logic (added in r9-713-gd9338471b91bbe) which replaces a dependent specialization with the primary template type if we're inside it. So the safest fix seems to be to call currently_open_class in the caller as well. PR c++/116320 gcc/cp/ChangeLog: * semantics.cc (check_accessibility_of_qualified_id): Try currently_open_class when using the object type as the effective qualifying type. gcc/testsuite/ChangeLog: * g++.dg/template/access42.C: New test. Reviewed-by: Jason Merrill Diff: --- gcc/cp/semantics.cc | 11 --- gcc/testsuite/g++.dg/template/access42.C | 17 + 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc index e58612660c9..5ab2076b673 100644 --- a/gcc/cp/semantics.cc +++ b/gcc/cp/semantics.cc @@ -2516,9 +2516,14 @@ check_accessibility_of_qualified_id (tree decl, OBJECT_TYPE. */ && CLASS_TYPE_P (object_type) && DERIVED_FROM_P (scope, object_type)) -/* If we are processing a `->' or `.' expression, use the type of the - left-hand side. */ -qualifying_type = object_type; +{ + /* If we are processing a `->' or `.' expression, use the type of the +left-hand side. 
*/ + if (tree open = currently_open_class (object_type)) + qualifying_type = open; + else + qualifying_type = object_type; +} else if (nested_name_specifier) { /* If the reference is to a non-static member of the diff --git a/gcc/testsuite/g++.dg/template/access42.C b/gcc/testsuite/g++.dg/template/access42.C new file mode 100644 index 000..f1dcbce80c2 --- /dev/null +++ b/gcc/testsuite/g++.dg/template/access42.C @@ -0,0 +1,17 @@ +// PR c++/116320 +// { dg-do compile { target c++11 } } + +template struct C; +template using C_ptr = C*; + +struct B { int m; using B_typedef = B; }; + +template +struct C : B { + void f(C_ptr c) { +c->B::m; +c->B_typedef::m; + } +}; + +template struct C;
[gcc r14-10586] c++/coroutines: fix passing *this to promise type, again [PR116327]
https://gcc.gnu.org/g:63c51e09d160a44fdce1199e8efe9d293f773a2c commit r14-10586-g63c51e09d160a44fdce1199e8efe9d293f773a2c Author: Patrick Palka Date: Thu Aug 15 10:20:18 2024 -0400 c++/coroutines: fix passing *this to promise type, again [PR116327] In r15-2210 we got rid of the unnecessary cast to lvalue reference when passing *this to the promise type ctor, and as a drive-by change we also simplified the code to use cp_build_fold_indirect_ref. But it turns out cp_build_fold_indirect_ref does too much here, namely it has a shortcut for returning current_class_ref if the operand is current_class_ptr. The problem with that shortcut is current_class_ref might have gotten clobbered earlier if it appeared in the function body, since rewrite_param_uses walks and rewrites in-place all local variable uses to their corresponding frame copy. So later cp_build_fold_indirect_ref for *this will instead return the clobbered current_class_ref i.e. *frame_ptr->this, which doesn't make sense here since we're in the ramp function and not the actor function where frame_ptr is in scope. This patch fixes this by using the build_fold_indirect_ref instead of cp_build_fold_indirect_ref. PR c++/116327 PR c++/104981 PR c++/115550 gcc/cp/ChangeLog: * coroutines.cc (morph_fn_to_coro): Use build_fold_indirect_ref instead of cp_build_fold_indirect_ref. gcc/testsuite/ChangeLog: * g++.dg/coroutines/pr104981-preview-this.C: Improve coverage by adding a non-static data member use within the coroutine member function. * g++.dg/coroutines/pr116327-preview-this.C: New test. 
Reviewed-by: Jason Merrill (cherry picked from commit 303bed670af962c01b77a4f0c51de97f70e8167e) Diff: --- gcc/cp/coroutines.cc | 8 ++-- .../g++.dg/coroutines/pr104981-preview-this.C | 4 +++- .../g++.dg/coroutines/pr116327-preview-this.C | 22 ++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc index 71e64960c5a..f5cfc80ca39 100644 --- a/gcc/cp/coroutines.cc +++ b/gcc/cp/coroutines.cc @@ -4618,7 +4618,9 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer) if (parm_i->this_ptr || parm_i->lambda_cobj) { /* We pass a reference to *this to the allocator lookup. */ - tree this_ref = cp_build_fold_indirect_ref (arg); + /* It's unsafe to use the cp_ version here since current_class_ref +might've gotten clobbered earlier during rewrite_param_uses. */ + tree this_ref = build_fold_indirect_ref (arg); vec_safe_push (args, this_ref); } else @@ -4838,7 +4840,9 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer) if (parm.this_ptr || parm.lambda_cobj) { /* We pass a reference to *this to the param preview. */ - tree this_ref = cp_build_fold_indirect_ref (arg); + /* It's unsafe to use the cp_ version here since current_class_ref +might've gotten clobbered earlier during rewrite_param_uses. 
*/ + tree this_ref = build_fold_indirect_ref (arg); vec_safe_push (promise_args, this_ref); } else if (parm.rv_ref) diff --git a/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C b/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C index 81eb963db4a..9f1e3970ce3 100644 --- a/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C +++ b/gcc/testsuite/g++.dg/coroutines/pr104981-preview-this.C @@ -23,8 +23,10 @@ struct PromiseType { }; struct Derived : Base { + int m = 41; Result f() { - co_return 42; + ++m; + co_return m; } }; diff --git a/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C b/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C new file mode 100644 index 000..27b69a41392 --- /dev/null +++ b/gcc/testsuite/g++.dg/coroutines/pr116327-preview-this.C @@ -0,0 +1,22 @@ +// PR c++/116327 - ICE in coroutine with parameter preview on lambda with captures + +#include + +struct coroutine{ + struct promise_type{ +promise_type(const auto &...){} +std::suspend_never initial_suspend(){ return {}; } +std::suspend_always final_suspend()noexcept{ return {}; } +void unhandled_exception(){} +coroutine get_return_object(){ return {}; } +void return_value(int)noexcept{} + }; +}; + +int main(){ + auto f = [a=0](auto) -> coroutine { +co_return 2; + }; + f(0); + return 0; +}
[gcc r15-2934] Add corank to gfc_expr.
https://gcc.gnu.org/g:a3f1cdd8ed46f9816b31ab162ae4dac547d34ebc commit r15-2934-ga3f1cdd8ed46f9816b31ab162ae4dac547d34ebc Author: Andre Vehreschild Date: Fri Aug 9 12:47:18 2024 +0200 Add corank to gfc_expr. Compute the corank of an expression alongside the regular rank. This saves costly calls to gfc_get_corank (), which has consequently been removed. In some locations the code needed some adaptation to model the difference between expr.corank and gfc_get_corank correctly. The latter always returned the codimension of the expression and not its current corank, i.e. the resolution of all indices. This commit is preparatory to fixing PR fortran/110033 and may contain parts of that fix already. gcc/fortran/ChangeLog: * arith.cc (reduce_unary): Use expr.corank. (reduce_binary_ac): Same. (reduce_binary_ca): Same. (reduce_binary_aa): Same. * array.cc (gfc_match_array_ref): Same. * check.cc (dim_corank_check): Same. (gfc_check_move_alloc): Same. (gfc_check_image_index): Same. * class.cc (gfc_add_class_array_ref): Same. (finalize_component): Same. * data.cc (gfc_assign_data_value): Same. * decl.cc (match_clist_expr): Same. (add_init_expr_to_sym): Same. * expr.cc (simplify_intrinsic_op): Same. (simplify_parameter_variable): Same. (gfc_check_assign_symbol): Same. (gfc_get_variable_expr): Same. (gfc_add_full_array_ref): Same. (gfc_lval_expr_from_sym): Same. (gfc_get_corank): Removed. * frontend-passes.cc (callback_reduction): Use expr.corank. (create_var): Same. (combine_array_constructor): Same. (optimize_minmaxloc): Same. * gfortran.h (gfc_get_corank): Add corank to gfc_expr. * intrinsic.cc (gfc_get_intrinsic_function_symbol): Use expr.corank. (gfc_convert_type_warn): Same. (gfc_convert_chartype): Same. * iresolve.cc (resolve_bound): Same. (gfc_resolve_cshift): Same. (gfc_resolve_eoshift): Same. (gfc_resolve_logical): Same. (gfc_resolve_matmul): Same. * match.cc (copy_ts_from_selector_to_associate): Same. * matchexp.cc (gfc_get_parentheses): Same. 
* parse.cc (parse_associate): Same. * primary.cc (gfc_match_rvalue): Same. * resolve.cc (resolve_structure_cons): Same. (resolve_actual_arglist): Same. (resolve_elemental_actual): Same. (resolve_generic_f0): Same. (resolve_unknown_f): Same. (resolve_operator): Same. (gfc_expression_rank): Same and set dimen_type for coarray to default. (gfc_op_rank_conformable): Use expr.corank. (add_caf_get_intrinsic): Same. (resolve_variable): Same. (gfc_fixup_inferred_type_refs): Same. (check_host_association): Same. (resolve_compcall): Same. (resolve_expr_ppc): Same. (resolve_assoc_var): Same. (fixup_array_ref): Same. (resolve_select_type): Same. (add_comp_ref): Same. (get_temp_from_expr): Same. (resolve_fl_var_and_proc): Same. (resolve_symbol): Same. * symbol.cc (gfc_is_associate_pointer): Same. * trans-array.cc (walk_coarray): Same. (gfc_conv_expr_descriptor): Same. (gfc_walk_array_ref): Same. * trans-array.h (gfc_walk_array_ref): Same. * trans-expr.cc (gfc_get_ultimate_alloc_ptr_comps_caf_token): Same. * trans-intrinsic.cc (trans_this_image): Same. (trans_image_index): Same. (conv_intrinsic_cobound): Same. (gfc_walk_intrinsic_function): Same. (conv_intrinsic_move_alloc): Same. * trans-stmt.cc (gfc_trans_lock_unlock): Same. (trans_associate_var): Same and adapt to slightly different behaviour of expr.corank and gfc_get_corank. (gfc_trans_allocate): Same. * trans.cc (gfc_add_finalizer_call): Same. Diff: --- gcc/fortran/arith.cc | 4 + gcc/fortran/array.cc | 16 ++- gcc/fortran/check.cc | 18 +-- gcc/fortran/class.cc | 3 + gcc/fortran/data.cc| 1 + gcc/fortran/decl.cc| 2 + gcc/fortran/expr.cc| 51 +++-- gcc/fortran/frontend-passes.cc | 5 + gcc/fortran/gfortran.h | 2 +- gcc/fortran/intrinsic.cc | 3 + gcc/fortran/iresolve.cc| 20 +++- gcc/fortran/match.cc | 30 +++-- gcc/fortran/matchexp.cc| 1 + gcc/fortran/parse.cc | 39 --- gcc/fortran/primary.cc
[gcc r15-2935] Fix Coarray in associate not a coarray. [PR110033]
https://gcc.gnu.org/g:dbf4c574b92bc692a0380a2b5ee25028321e735f commit r15-2935-gdbf4c574b92bc692a0380a2b5ee25028321e735f Author: Andre Vehreschild Date: Wed Jul 24 09:39:45 2024 +0200 Fix Coarray in associate not a coarray. [PR110033] A coarray used in an associate did not become a coarray in the block of the associate. This patch fixes that and the same also in select type statements. PR fortran/110033 gcc/fortran/ChangeLog: * class.cc (gfc_is_class_scalar_expr): Coarray refs that ref only self, aka this image, are regarded as scalar, too. * resolve.cc (resolve_assoc_var): Ignore this image coarray refs and do not build a new class type. * trans-expr.cc (gfc_get_caf_token_offset): Get the caf token from the descriptor for associated variables. (gfc_conv_variable): Same. (gfc_trans_pointer_assignment): Assign token to temporary associate variable, too. (gfc_trans_scalar_assign): Add flag that assign is for associate and use it to assign the token. (is_assoc_assign): Detect that expressions are for associate assign. (gfc_trans_assignment_1): Treat associate assigns like pointer assignments where possible. * trans-stmt.cc (trans_associate_var): Set same_class only for class-targets. * trans.h (gfc_trans_scalar_assign): Add flag to trans_scalar_assign for marking associate assignments. gcc/testsuite/ChangeLog: * gfortran.dg/coarray/associate_1.f90: New test. Diff: --- gcc/fortran/class.cc | 38 +- gcc/fortran/resolve.cc| 40 --- gcc/fortran/trans-expr.cc | 87 +++ gcc/fortran/trans-stmt.cc | 2 +- gcc/fortran/trans.h | 5 +- gcc/testsuite/gfortran.dg/coarray/associate_1.f90 | 36 ++ 6 files changed, 163 insertions(+), 45 deletions(-) diff --git a/gcc/fortran/class.cc b/gcc/fortran/class.cc index 88fbba2818a..f9e0d416e48 100644 --- a/gcc/fortran/class.cc +++ b/gcc/fortran/class.cc @@ -379,27 +379,33 @@ gfc_is_class_scalar_expr (gfc_expr *e) return false; /* Is this a class object? 
*/ - if (e->symtree - && e->symtree->n.sym->ts.type == BT_CLASS - && CLASS_DATA (e->symtree->n.sym) - && !CLASS_DATA (e->symtree->n.sym)->attr.dimension - && (e->ref == NULL - || (e->ref->type == REF_COMPONENT - && strcmp (e->ref->u.c.component->name, "_data") == 0 - && e->ref->next == NULL))) + if (e->symtree && e->symtree->n.sym->ts.type == BT_CLASS + && CLASS_DATA (e->symtree->n.sym) + && !CLASS_DATA (e->symtree->n.sym)->attr.dimension + && (e->ref == NULL + || (e->ref->type == REF_COMPONENT + && strcmp (e->ref->u.c.component->name, "_data") == 0 + && (e->ref->next == NULL + || (e->ref->next->type == REF_ARRAY + && e->ref->next->u.ar.codimen > 0 + && e->ref->next->u.ar.dimen == 0 + && e->ref->next->next == NULL) return true; /* Or is the final reference BT_CLASS or _data? */ for (ref = e->ref; ref; ref = ref->next) { - if (ref->type == REF_COMPONENT - && ref->u.c.component->ts.type == BT_CLASS - && CLASS_DATA (ref->u.c.component) - && !CLASS_DATA (ref->u.c.component)->attr.dimension - && (ref->next == NULL - || (ref->next->type == REF_COMPONENT - && strcmp (ref->next->u.c.component->name, "_data") == 0 - && ref->next->next == NULL))) + if (ref->type == REF_COMPONENT && ref->u.c.component->ts.type == BT_CLASS + && CLASS_DATA (ref->u.c.component) + && !CLASS_DATA (ref->u.c.component)->attr.dimension + && (ref->next == NULL + || (ref->next->type == REF_COMPONENT + && strcmp (ref->next->u.c.component->name, "_data") == 0 + && (ref->next->next == NULL + || (ref->next->next->type == REF_ARRAY + && ref->next->next->u.ar.codimen > 0 + && ref->next->next->u.ar.dimen == 0 + && ref->next->next->next == NULL) return true; } diff --git a/gcc/fortran/resolve.cc b/gcc/fortran/resolve.cc index ffc3721efbe..71312e0e415 100644 --- a/gcc/fortran/resolve.cc +++ b/gcc/fortran/resolve.cc @@ -9750,6 +9750,9 @@ resolve_assoc_var (gfc_symbol* sym, bool resolve_target) correct this now. 
*/ gfc_typespec *ts = &target->ts; gfc_ref *ref; + /* Internal_ref is true, when this is ref'ing only _data and co-ref. + */ + bool inter
[gcc r15-2936] late-combine: Preserve INSN_CODE when modifying notes [PR116343]
https://gcc.gnu.org/g:70ae0daeb76f28a3135f4a74d6e440fb1d9821fa commit r15-2936-g70ae0daeb76f28a3135f4a74d6e440fb1d9821fa Author: Richard Sandiford Date: Thu Aug 15 16:54:02 2024 +0100 late-combine: Preserve INSN_CODE when modifying notes [PR116343] When it removes a definition, late-combine tries to update all uses in notes. It does this using the same insn_propagation class that it uses for patterns. However, insn_propagation uses validate_change, which in turn resets the INSN_CODE. This is inefficient in the best case, since it forces the pattern to be rerecognised even though changing a note can't affect the INSN_CODE. But in the PR it's a correctness problem: resetting INSN_CODE means we lose the NOOP_MOVE_INSN_CODE, which in turn means that rtl-ssa doesn't queue it for deletion. This patch adds a routine specifically for propagating into notes. A belt-and-braces fix would be to rerecognise noop moves in function_info::change_insns, but I can't think of a good reason why that would be necessary, and it could paper over latent bugs. gcc/ PR testsuite/116343 * recog.h (insn_propagation::apply_to_note): Declare. * recog.cc (insn_propagation::apply_to_note): New function. * late-combine.cc (insn_combination::substitute_note): Use apply_to_note instead of apply_to_rvalue. * rtl-ssa/changes.cc (rtl_ssa::changes_are_worthwhile): Improve dumping of costs for noop moves. gcc/testsuite/ PR testsuite/116343 * gcc.dg/torture/pr116343.c: New test. 
Diff: --- gcc/late-combine.cc | 2 +- gcc/recog.cc| 13 + gcc/recog.h | 1 + gcc/rtl-ssa/changes.cc | 5 - gcc/testsuite/gcc.dg/torture/pr116343.c | 18 ++ 5 files changed, 37 insertions(+), 2 deletions(-) diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc index 2b62e2956ed..1d81b386c3d 100644 --- a/gcc/late-combine.cc +++ b/gcc/late-combine.cc @@ -338,7 +338,7 @@ insn_combination::substitute_note (insn_info *use_insn, rtx note, || REG_NOTE_KIND (note) == REG_EQUIV) { insn_propagation prop (use_insn->rtl (), m_dest, m_src); - return (prop.apply_to_rvalue (&XEXP (note, 0)) + return (prop.apply_to_note (&XEXP (note, 0)) && (can_propagate || prop.num_replacements == 0)); } return true; diff --git a/gcc/recog.cc b/gcc/recog.cc index 23e4820180f..615aaabc551 100644 --- a/gcc/recog.cc +++ b/gcc/recog.cc @@ -1469,6 +1469,19 @@ insn_propagation::apply_to_rvalue (rtx *loc) return res; } +/* Like apply_to_rvalue, but specifically for the case where *LOC is in + a note. This never changes the INSN_CODE. */ + +bool +insn_propagation::apply_to_note (rtx *loc) +{ + auto old_code = INSN_CODE (insn); + bool res = apply_to_rvalue (loc); + if (INSN_CODE (insn) != old_code) +INSN_CODE (insn) = old_code; + return res; +} + /* Check whether INSN matches a specific alternative of an .md pattern. */ bool diff --git a/gcc/recog.h b/gcc/recog.h index 87a5803dec0..1dccce78ba4 100644 --- a/gcc/recog.h +++ b/gcc/recog.h @@ -121,6 +121,7 @@ public: insn_propagation (rtx_insn *, rtx, rtx, bool = true); bool apply_to_pattern (rtx *); bool apply_to_rvalue (rtx *); + bool apply_to_note (rtx *); /* Return true if we should accept a substitution into the address of memory expression MEM. 
Undoing changes OLD_NUM_CHANGES and up restores diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc index a30f000191e..0476296607b 100644 --- a/gcc/rtl-ssa/changes.cc +++ b/gcc/rtl-ssa/changes.cc @@ -228,7 +228,10 @@ rtl_ssa::changes_are_worthwhile (array_slice changes, for (const insn_change *change : changes) if (!change->is_deletion ()) { - fprintf (dump_file, " %c %d", sep, change->new_cost); + if (INSN_CODE (change->rtl ()) == NOOP_MOVE_INSN_CODE) + fprintf (dump_file, " %c nop", sep); + else + fprintf (dump_file, " %c %d", sep, change->new_cost); sep = '+'; } if (weighted_new_cost != 0) diff --git a/gcc/testsuite/gcc.dg/torture/pr116343.c b/gcc/testsuite/gcc.dg/torture/pr116343.c new file mode 100644 index 000..ad13f0fc21c --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr116343.c @@ -0,0 +1,18 @@ +// { dg-additional-options "-fschedule-insns -fno-thread-jumps -fno-dce" } + +int a, b, c; +volatile int d; +int e(int f, int g) { return g > 1 ? 1 : f >> g; } +int main() { + int *i = &a; + long j[1]; + if (a) +while (1) { + a ^= 1; + if (*i) +while (1) + ; + b = c && e((d, 1) >= 1, j[0]); +} + return 0; +}
[gcc r15-2937] Tweak base/index disambiguation in decompose_normal_address [PR116236]
https://gcc.gnu.org/g:3673b7054ec268c445620b9c52d25e65bc9a7f96 commit r15-2937-g3673b7054ec268c445620b9c52d25e65bc9a7f96 Author: Richard Sandiford Date: Thu Aug 15 16:54:03 2024 +0100 Tweak base/index disambiguation in decompose_normal_address [PR116236] The PR points out that, for an address like: (plus (zero_extend X) Y) decompose_normal_address doesn't establish a strong preference between treating X as the base or Y as the base. As the comment in the patch says, zero_extend isn't enough on its own to assume an index, at least not on POINTERS_EXTEND_UNSIGNED targets. But in a construct like the one above, X and Y have different modes, and it seems reasonable to assume that the one with the expected address mode is the base. This matters on targets like m68k that support index extension and that require different classes for bases and indices. gcc/ PR middle-end/116236 * rtlanal.cc (decompose_normal_address): Try to distinguish bases and indices based on mode, before resorting to "baseness". Diff: --- gcc/rtlanal.cc | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc index 4158a531bdd..71207ee4f41 100644 --- a/gcc/rtlanal.cc +++ b/gcc/rtlanal.cc @@ -6724,20 +6724,36 @@ decompose_normal_address (struct address_info *info) } else if (out == 2) { + auto address_mode = targetm.addr_space.address_mode (info->as); + rtx inner_op0 = *inner_ops[0]; + rtx inner_op1 = *inner_ops[1]; + int base; + /* If one inner operand has the expected mode for a base and the other +doesn't, assume that the other one is the index. This is useful +for addresses such as: + + (plus (zero_extend X) Y) + +zero_extend is not in itself enough to assume an index, since bases +can be zero-extended on POINTERS_EXTEND_UNSIGNED targets. But if +Y has address mode and X doesn't, there should be little doubt that +Y is the base. 
*/ + if (GET_MODE (inner_op0) == address_mode + && GET_MODE (inner_op1) != address_mode) + base = 0; + else if (GET_MODE (inner_op1) == address_mode + && GET_MODE (inner_op0) != address_mode) + base = 1; /* In the event of a tie, assume the base comes first. */ - if (baseness (*inner_ops[0], info->mode, info->as, PLUS, - GET_CODE (*ops[1])) - >= baseness (*inner_ops[1], info->mode, info->as, PLUS, - GET_CODE (*ops[0]))) - { - set_address_base (info, ops[0], inner_ops[0]); - set_address_index (info, ops[1], inner_ops[1]); - } + else if (baseness (inner_op0, info->mode, info->as, PLUS, +GET_CODE (*ops[1])) + >= baseness (inner_op1, info->mode, info->as, PLUS, + GET_CODE (*ops[0]))) + base = 0; else - { - set_address_base (info, ops[1], inner_ops[1]); - set_address_index (info, ops[0], inner_ops[0]); - } + base = 1; + set_address_base (info, ops[base], inner_ops[base]); + set_address_index (info, ops[1 - base], inner_ops[1 - base]); } else gcc_assert (out == 0);
[gcc r15-2938] c++: fix up cpp23/consteval-if3.C test [PR115583]
https://gcc.gnu.org/g:580fe7979f3c873eae885568d2c17c9e110670b4 commit r15-2938-g580fe7979f3c873eae885568d2c17c9e110670b4 Author: Patrick Palka Date: Thu Aug 15 14:38:47 2024 -0400 c++: fix up cpp23/consteval-if3.C test [PR115583] Compiling with optimizations is needed to trigger the bug fixed by r15-2369. PR c++/115583 gcc/testsuite/ChangeLog: * g++.dg/cpp23/consteval-if13.C: Compile with -O. Diff: --- gcc/testsuite/g++.dg/cpp23/consteval-if13.C | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/g++.dg/cpp23/consteval-if13.C b/gcc/testsuite/g++.dg/cpp23/consteval-if13.C index b98bbc33d13..b10ec18b3c6 100644 --- a/gcc/testsuite/g++.dg/cpp23/consteval-if13.C +++ b/gcc/testsuite/g++.dg/cpp23/consteval-if13.C @@ -1,5 +1,6 @@ // PR c++/115583 // { dg-do compile { target c++23 } } +// { dg-additional-options -O } consteval int f(int i) { return i;
[gcc r15-2939] fortran: Fix bootstrap in resolve.cc [PR116387]
https://gcc.gnu.org/g:0f8b11968472ff12674d67fd856610646b373bd0 commit r15-2939-g0f8b11968472ff12674d67fd856610646b373bd0 Author: Jakub Jelinek Date: Thu Aug 15 22:50:07 2024 +0200 fortran: Fix bootstrap in resolve.cc [PR116387] The r15-2934 change broke bootstrap: ../../gcc/fortran/resolve.cc: In function ‘bool resolve_operator(gfc_expr*)’: ../../gcc/fortran/resolve.cc:4649:22: error: too many arguments for format [-Werror=format-extra-args] 4649 | gfc_error ("Inconsistent coranks for operator at %%L and %%L", | ^~ The following patch fixes that by using %L rather than %%L, the call has 2 location arguments. 2024-08-15 Jakub Jelinek PR bootstrap/116387 * resolve.cc (resolve_operator): Use %L rather than %%L in format string. Diff: --- gcc/fortran/resolve.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/fortran/resolve.cc b/gcc/fortran/resolve.cc index 71312e0e415..12973c6bc85 100644 --- a/gcc/fortran/resolve.cc +++ b/gcc/fortran/resolve.cc @@ -4646,7 +4646,7 @@ resolve_operator (gfc_expr *e) } else { - gfc_error ("Inconsistent coranks for operator at %%L and %%L", + gfc_error ("Inconsistent coranks for operator at %L and %L", &op1->where, &op2->where); return false; }
[gcc r15-2940] i386: Improve split of *extendv2di2_highpart_stv_noavx512vl.
https://gcc.gnu.org/g:b6fb4f7f651d2aa89548c5833fe2679af2638df5 commit r15-2940-gb6fb4f7f651d2aa89548c5833fe2679af2638df5 Author: Roger Sayle Date: Thu Aug 15 22:02:05 2024 +0100 i386: Improve split of *extendv2di2_highpart_stv_noavx512vl. This patch follows up on the previous patch to fix PR target/116275 by improving the code STV (ultimately) generates for highpart sign extensions like (x<<8)>>8. The arithmetic right shift is able to take advantage of the available common subexpressions from the preceding left shift. Hence previously with -O2 -m32 -mavx -mno-avx512vl we'd generate: vpsllq $8, %xmm0, %xmm0 vpsrad $8, %xmm0, %xmm1 vpsrlq $8, %xmm0, %xmm0 vpblendw$51, %xmm0, %xmm1, %xmm0 But with improved splitting, we now generate three instructions: vpslld $8, %xmm1, %xmm0 vpsrad $8, %xmm0, %xmm0 vpblendw$51, %xmm1, %xmm0, %xmm0 This patch also implements Uros' suggestion that the pre-reload splitter could introduce a new pseudo to hold the intermediate to potentially help reload with register allocation, which applies when not performing the above optimization, i.e. on TARGET_XOP. 2024-08-15 Roger Sayle Uros Bizjak gcc/ChangeLog * config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): Split to an improved implementation on !TARGET_XOP. On TARGET_XOP, use a new pseudo for the intermediate to simplify register allocation. gcc/testsuite/ChangeLog * g++.target/i386/pr116275-2.C: New test case. 
Diff: --- gcc/config/i386/i386.md| 32 -- gcc/testsuite/g++.target/i386/pr116275-2.C | 19 ++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index efbab2f25ec..36108e5c2c9 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17872,10 +17872,38 @@ && ix86_pre_reload_split ()" "#" "&& 1" - [(set (match_dup 0) + [(set (match_dup 4) (ashift:V2DI (match_dup 1) (match_dup 2))) (set (match_dup 0) - (ashiftrt:V2DI (match_dup 0) (match_dup 2)))]) + (ashiftrt:V2DI (match_dup 4) (match_dup 2)))] +{ + if (!TARGET_XOP) +{ + rtx op0 = operands[0]; + rtx op2 = operands[2]; + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + rtx tmp3 = gen_reg_rtx (V4SImode); + rtx tmp4 = gen_reg_rtx (V4SImode); + emit_move_insn (tmp1, lowpart_subreg (V4SImode, operands[1], V2DImode)); + emit_insn (gen_ashlv4si3 (tmp2, tmp1, op2)); + emit_insn (gen_ashrv4si3 (tmp3, tmp2, op2)); + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + sel[0] = 0; + sel[1] = 5; + sel[2] = 2; + sel[3] = 7; + vec_perm_indices indices(sel, 2, 4); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, tmp4, + tmp1, tmp3, indices); + gcc_assert (ok); + emit_move_insn (op0, lowpart_subreg (V2DImode, tmp4, V4SImode)); + DONE; +} + else +operands[4] = gen_reg_rtx (V2DImode); +}) ;; Rotate instructions diff --git a/gcc/testsuite/g++.target/i386/pr116275-2.C b/gcc/testsuite/g++.target/i386/pr116275-2.C new file mode 100644 index 000..98d3c19e59c --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr116275-2.C @@ -0,0 +1,19 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -mavx -mno-avx512vl -std=c++11" } */ + +struct SymbolDesc push_back(SymbolDesc); +struct SymbolDesc { + long long ELFLocalSymIdx; +}; +struct Expected { + long long &operator*(); +}; +void SymbolizableObjectFileaddSymbol() { + Expected SymbolAddressOrErr; + long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8; + 
push_back({SymbolAddress}); +} + +/* { dg-final { scan-assembler "vpslld" } } */ +/* { dg-final { scan-assembler-not "vpsllq" } } */ +/* { dg-final { scan-assembler-not "vpsrlq" } } */
[gcc r15-2941] RISC-V: use fclass insns to implement isfinite, isnormal and isinf builtins
https://gcc.gnu.org/g:b0d041f0d4cace06433bf18ae53c40376f2088a7 commit r15-2941-gb0d041f0d4cace06433bf18ae53c40376f2088a7 Author: Vineet Gupta Date: Thu Aug 15 09:24:27 2024 -0700 RISC-V: use fclass insns to implement isfinite,isnormal and isinf builtins Currently these builtins use float compare instructions which require FP flags to be saved/restored which could be costly in uarch. RV Base ISA already has FCLASS.{d,s,h} instruction to compare/identify FP values w/o disturbing FP exception flags. Now that upstream supports the corresponding optabs, wire them up in the backend. gcc/ChangeLog: * config/riscv/riscv.md: define_insn for fclass insn. define_expand for isfinite, isnormal, isinf. gcc/testsuite/ChangeLog: * gcc.target/riscv/fclass.c: New tests. Tested-by: Edwin Lu # pre-commit-CI #2060 Signed-off-by: Vineet Gupta Diff: --- gcc/config/riscv/riscv.md | 63 + gcc/testsuite/gcc.target/riscv/fclass.c | 38 2 files changed, 101 insertions(+) diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 5e3ef789e42..f8d8162c0f9 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -68,6 +68,7 @@ UNSPEC_FMAX UNSPEC_FMINM UNSPEC_FMAXM + UNSPEC_FCLASS ;; Stack tie UNSPEC_TIE @@ -3478,6 +3479,68 @@ (set_attr "mode" "") (set (attr "length") (const_int 16))]) +;; fclass instruction output bitmap +;; 0 negative infinity +;; 1 negative normal number. +;; 2 negative subnormal number. +;; 3 -0 +;; 4 +0 +;; 5 positive subnormal number. +;; 6 positive normal number. +;; 7 positive infinity +;; 8 signaling NaN. 
+;; 9 quiet NaN + +(define_insn "fclass" + [(set (match_operand:X0 "register_operand" "=r") + (unspec [(match_operand:ANYF 1 "register_operand" " f")] + UNSPEC_FCLASS))] + "TARGET_HARD_FLOAT" + "fclass.\t%0,%1"; + [(set_attr "type" "fcmp") + (set_attr "mode" "")]) + +;; Implements optab for isfinite, isnormal, isinf + +(define_int_iterator FCLASS_MASK [126 66 129]) +(define_int_attr fclass_optab + [(126"isfinite") + (66 "isnormal") + (129"isinf")]) + +(define_expand "2" + [(match_operand 0 "register_operand" "=r") + (match_operand:ANYF 1 "register_operand" " f") + (const_int FCLASS_MASK)] + "TARGET_HARD_FLOAT" +{ + if (GET_MODE (operands[0]) != SImode + && GET_MODE (operands[0]) != word_mode) +FAIL; + + rtx t = gen_reg_rtx (word_mode); + rtx t_op0 = gen_reg_rtx (word_mode); + + if (TARGET_64BIT) +emit_insn (gen_fclassdi (t, operands[1])); + else +emit_insn (gen_fclasssi (t, operands[1])); + + riscv_emit_binary (AND, t, t, GEN_INT ()); + rtx cmp = gen_rtx_NE (word_mode, t, const0_rtx); + emit_insn (gen_cstore4 (t_op0, cmp, t, const0_rtx)); + + if (TARGET_64BIT) +{ + t_op0 = gen_lowpart (SImode, t_op0); + SUBREG_PROMOTED_VAR_P (t_op0) = 1; + SUBREG_PROMOTED_SET (t_op0, SRP_SIGNED); +} + + emit_move_insn (operands[0], t_op0); + DONE; +}) + (define_insn "*seq_zero_" [(set (match_operand:GPR 0 "register_operand" "=r") (eq:GPR (match_operand:X 1 "register_operand" " r") diff --git a/gcc/testsuite/gcc.target/riscv/fclass.c b/gcc/testsuite/gcc.target/riscv/fclass.c new file mode 100644 index 000..ea0f173ecf4 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/fclass.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target hard_float } */ +/* { dg-options "-march=rv64gc -mabi=lp64d -ftrapping-math" { target { rv64 } } } */ +/* { dg-options "-march=rv32gc -mabi=ilp32d -ftrapping-math" { target { rv32 } } } */ + +int d_isfinite(double a) +{ + return __builtin_isfinite(a); +} + +int d_isnormal(double a) +{ + return __builtin_isnormal(a); +} + +int 
d_isinf(double a) +{ + return __builtin_isinf(a); +} + +int f_isfinite(float a) +{ + return __builtin_isfinite(a); +} + +int f_isnormal(float a) +{ + return __builtin_isnormal(a); +} + +int f_isinf(float a) +{ + return __builtin_isinf(a); +} + +/* { dg-final { scan-assembler-not {\mfrflags} } } */ +/* { dg-final { scan-assembler-not {\mfsflags} } } */ +/* { dg-final { scan-assembler-times {\tfclass} 6 } } */
[gcc r15-2942] PHIOPT: Fix comment before factor_out_conditional_operation
https://gcc.gnu.org/g:9381d52893a77edf2983d72b41f64063ee7cd4bd commit r15-2942-g9381d52893a77edf2983d72b41f64063ee7cd4bd Author: Andrew Pinski Date: Sun Nov 5 19:27:51 2023 -0800 PHIOPT: Fix comment before factor_out_conditional_operation I didn't update the comment before factor_out_conditional_operation correctly. This updates it to be correct and mentions unary operations rather than just conversions. Pushed as obvious. gcc/ChangeLog: * tree-ssa-phiopt.cc (factor_out_conditional_operation): Update comment. Diff: --- gcc/tree-ssa-phiopt.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc index f05ca727503..aa414f6 100644 --- a/gcc/tree-ssa-phiopt.cc +++ b/gcc/tree-ssa-phiopt.cc @@ -212,7 +212,7 @@ replace_phi_edge_with_variable (basic_block cond_block, } /* PR66726: Factor operations out of COND_EXPR. If the arguments of the PHI - stmt are CONVERT_STMT, factor out the conversion and perform the conversion + stmt are Unary operator, factor out the operation and perform the operation to the result of PHI stmt. COND_STMT is the controlling predicate. Return the newly-created PHI, if any. */
[gcc r15-2944] libstdc++-v3: testsuite: Prune uncapitalized "in function" linker warning
https://gcc.gnu.org/g:98e1b19f6690f5e19031ba4f843c22208bbfb14a commit r15-2944-g98e1b19f6690f5e19031ba4f843c22208bbfb14a Author: Hans-Peter Nilsson Date: Wed Aug 14 04:38:05 2024 +0200 libstdc++-v3: testsuite: Prune uncapitalized "in function" linker warning Newer newlib versions trigger warnings about certain functions not implemented (_getentropy) when testing libstdc++-v3. Since 2018 (circa binutils-2.31) the "in function" prefix isn't capitalized for those "not implemented" warnings when generated from the linker (a GNU ld feature used by newlib). Dejagnu up to and including at least dejagnu-1.6.3 (and git @ 42979bd3b9) assumes a capital "In function", leaving that part unpruned, and boom we have thousands of "excess errors" from the libstdc++-v3 testsuite. While gcc/testsuite/lib/prune.exp:prune_gcc_output already deals with this quirk with a vastly more generic pattern, I choose this simpler tweak. libstdc++-v3: * testsuite/lib/prune.exp (libstdc++-dg-prune): Prune uncapitalized "in function" warning from linker. Diff: --- libstdc++-v3/testsuite/lib/prune.exp | 9 + 1 file changed, 9 insertions(+) diff --git a/libstdc++-v3/testsuite/lib/prune.exp b/libstdc++-v3/testsuite/lib/prune.exp index 071dcf34c1e..4250e2d39e7 100644 --- a/libstdc++-v3/testsuite/lib/prune.exp +++ b/libstdc++-v3/testsuite/lib/prune.exp @@ -80,6 +80,15 @@ proc libstdc++-dg-prune { system text } { # Ignore dsymutil warning (tool bug is actually in the linker) regsub -all "(^|\n)\[^\n\]*could not find object file symbol for symbol\[^\n\]*" $text "" text +# This pattern, except requiring a capitalized "In" and with a +# sub-pattern matching a subsequent line "is not implemented and will +# always fail", is part of the standard dejagnu prune_warnings function. +# There's also a separate single-line pattern pruning the "is not +# implemented and will always fail". 
Since that pattern is processed +# before this ${tool}-dg-prune function is called, we have to handle +# the single uncapitalized "in function" line. +regsub -all "(^|\n)\[^\n\]*: in function\[^\n\]*" $text "" text + # If exceptions are disabled, mark tests expecting exceptions to be enabled # as unsupported. if { ![check_effective_target_exceptions_enabled] } {
[gcc r15-2945] libstdc++-v3: Handle iconv as optional for newlib builds [PR116362]
https://gcc.gnu.org/g:1b8b53ef75c143cddc114705c97c74d9c8f7a64b commit r15-2945-g1b8b53ef75c143cddc114705c97c74d9c8f7a64b Author: Hans-Peter Nilsson Date: Tue Aug 13 19:23:43 2024 +0200 libstdc++-v3: Handle iconv as optional for newlib builds [PR116362] Support for iconv in newlib seems to have been always assumed present by libstdc++-v3, but is default off. Though, it hasn't been used before recent libstdc++ changes that actually call iconv functions. This now leads to failures exposed by running the test-suite, unless the newlib being used has been explicitly configured with --enable-newlib-iconv. When failing, there are undefined references to iconv, iconv_open or iconv_close for multiple tests. Thankfully there's a macro in newlib.h that we can check to detect presence of iconv support for the newlib build that's used. libstdc++-v3: PR libstdc++/116362 * configure.ac: Check newlib configuration whether iconv is enabled. * configure: Regenerate. Diff: --- libstdc++-v3/configure| 26 +- libstdc++-v3/configure.ac | 10 +- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure index fe525308ae2..305675eaa1e 100755 --- a/libstdc++-v3/configure +++ b/libstdc++-v3/configure @@ -28571,7 +28571,31 @@ _ACEOF -$as_echo "#define HAVE_ICONV 1" >>confdefs.h +# Support for iconv in newlib is configurable. +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ + + #ifndef _ICONV_ENABLED + #error + #endif + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_newlib_iconv_enabled=yes +else + ac_newlib_iconv_enabled=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +if test "$ac_newlib_iconv_enabled" = yes; then + $as_echo "#define HAVE_ICONV 1" >>confdefs.h + +fi $as_echo "#define HAVE_MEMALIGN 1" >>confdefs.h diff --git a/libstdc++-v3/configure.ac b/libstdc++-v3/configure.ac index ccb24a82be7..4049f54bd5a 100644 --- a/libstdc++-v3/configure.ac +++ b/libstdc++-v3/configure.ac @@ -376,7 +376,15 @@ dnl # rather than hardcoding that information. frexpl hypotl ldexpl log10l logl modfl powl sinhl sinl sqrtl tanhl tanl]) -AC_DEFINE(HAVE_ICONV) +# Support for iconv in newlib is configurable. +AC_TRY_COMPILE([#include ], [ + #ifndef _ICONV_ENABLED + #error + #endif], [ac_newlib_iconv_enabled=yes], [ac_newlib_iconv_enabled=no]) +if test "$ac_newlib_iconv_enabled" = yes; then + AC_DEFINE(HAVE_ICONV) +fi + AC_DEFINE(HAVE_MEMALIGN) case "${target}" in
[gcc/aoliva/heads/testme] (2 commits) Dump aliases in -fcallgraph-info
The branch 'aoliva/heads/testme' was updated to point to: d2b89c77861... Dump aliases in -fcallgraph-info It previously pointed to: 17d9d479afd... Dump aliases in -fcallgraph-info Diff: !!! WARNING: THE FOLLOWING COMMITS ARE NO LONGER ACCESSIBLE (LOST): --- 17d9d47... Dump aliases in -fcallgraph-info ebf9b1b... Optimize initialization of small padded objects Summary of changes (added commits): --- d2b89c7... Dump aliases in -fcallgraph-info 7b50738... Optimize initialization of small padded objects
[gcc(refs/users/aoliva/heads/testme)] Optimize initialization of small padded objects
https://gcc.gnu.org/g:7b50738c0cce248ecb98c8e2bf5f8115c4a90e74 commit 7b50738c0cce248ecb98c8e2bf5f8115c4a90e74 Author: Alexandre Oliva Date: Wed Aug 14 21:59:28 2024 -0300 Optimize initialization of small padded objects When small objects containing padding bits (or bytes) are fully initialized, we will often store them in registers, and setting bitfields and other small fields will attempt to preserve the uninitialized padding bits, which tends to be expensive. Zero-initializing registers, OTOH, tends to be cheap. So, if we're optimizing, zero-initialize such small padded objects even if that's not needed for correctness. We can't zero-initialize all such padding objects, though: if there's no padding whatsoever, and all fields are initialized with nonzero, the zero initialization would be flagged as dead. That's why we introduce machinery to detect whether objects have padding bits. I considered distinguishing between bitfields, units and larger padding elements, but I didn't pursue that distinction. Since the object's zero-initialization subsumes fields' zero-initialization, the empty string test in builtin-snprintf-6.c's test_assign_aggregate would regress without the addition of native_encode_constructor. for gcc/ChangeLog * expr.cc (categorize_ctor_elements_1): Change p_complete to int, to distinguish complete initialization in presence or absence of uninitialized padding bits. (categorize_ctor_elements): Likewise. Adjust all callers... * expr.h (categorize_ctor_elements): ... and declaration. (type_has_padding_at_level_p): New. * gimple-fold.cc (type_has_padding_at_level_p): New. * fold-const.cc (native_encode_constructor): New. (native_encode_expr): Call it. * gimplify.cc (gimplify_init_constructor): Clear small non-addressable non-volatile objects with padding or other uninitialized fields as an optimization. for gcc/testsuite/ChangeLog * gcc.dg/init-pad-1.c: New. 
Diff: --- gcc/expr.cc | 20 ++-- gcc/expr.h| 3 ++- gcc/fold-const.cc | 33 ++ gcc/gimple-fold.cc| 50 +++ gcc/gimplify.cc | 14 ++- gcc/testsuite/gcc.dg/init-pad-1.c | 18 ++ 6 files changed, 129 insertions(+), 9 deletions(-) diff --git a/gcc/expr.cc b/gcc/expr.cc index 2089c2b86a9..a701c67b348 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -7096,7 +7096,7 @@ count_type_elements (const_tree type, bool for_ctor_p) static bool categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, HOST_WIDE_INT *p_unique_nz_elts, - HOST_WIDE_INT *p_init_elts, bool *p_complete) + HOST_WIDE_INT *p_init_elts, int *p_complete) { unsigned HOST_WIDE_INT idx; HOST_WIDE_INT nz_elts, unique_nz_elts, init_elts, num_fields; @@ -7218,7 +7218,10 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, if (*p_complete && !complete_ctor_at_level_p (TREE_TYPE (ctor), num_fields, elt_type)) -*p_complete = false; +*p_complete = 0; + else if (*p_complete > 0 + && type_has_padding_at_level_p (TREE_TYPE (ctor))) +*p_complete = -1; *p_nz_elts += nz_elts; *p_unique_nz_elts += unique_nz_elts; @@ -7239,7 +7242,10 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, and place it in *P_ELT_COUNT. * whether the constructor is complete -- in the sense that every meaningful byte is explicitly given a value -- - and place it in *P_COMPLETE. + and place it in *P_COMPLETE: + - 0 if any field is missing + - 1 if all fields are initialized, and there's no padding + - -1 if all fields are initialized, but there's padding Return whether or not CTOR is a valid static constant initializer, the same as "initializer_constant_valid_p (CTOR, TREE_TYPE (CTOR)) != 0". 
*/ @@ -7247,12 +7253,12 @@ categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts, bool categorize_ctor_elements (const_tree ctor, HOST_WIDE_INT *p_nz_elts, HOST_WIDE_INT *p_unique_nz_elts, - HOST_WIDE_INT *p_init_elts, bool *p_complete) + HOST_WIDE_INT *p_init_elts, int *p_complete) { *p_nz_elts = 0; *p_unique_nz_elts = 0; *p_init_elts = 0; - *p_complete = true; + *p_complete = 1; return categorize_ctor_elements_1 (ctor, p_nz_elts, p_unique_nz_elts, p_init_elts, p_complete); @@ -7313,7 +7319,7 @@ mostly_zeros_p (const_tree exp) if (TREE_CODE (exp)
[gcc(refs/users/aoliva/heads/testme)] Dump aliases in -fcallgraph-info
https://gcc.gnu.org/g:d2b89c77861c4a773efada3954e910b6623f8eb5 commit d2b89c77861c4a773efada3954e910b6623f8eb5 Author: Alexandre Oliva Date: Thu Aug 15 02:00:18 2024 -0300 Dump aliases in -fcallgraph-info Dump ICF-unified decls, thunks, aliases and whatnot along with their ultimate targets, with edges from the alias to the target. Add support for dropping the source file's suffix when forming from dump-base, so that auxiliary files can be scanned, such as the .ci files generated by -fcallgraph-info, as in the testcase. for gcc/ChangeLog * toplev.cc (dump_final_alias_vcg): New. (dump_final_node_vcg): Dump aliases along with node. for gcc/testsuite/ChangeLog * lib/scandump.exp (dump-base): Support {} in dump base suffix to drop it. * gcc.dg/callgraph-info-1.c: New. Diff: --- gcc/testsuite/gcc.dg/callgraph-info-1.c | 7 +++ gcc/testsuite/lib/scandump.exp | 4 gcc/toplev.cc | 37 + 3 files changed, 48 insertions(+) diff --git a/gcc/testsuite/gcc.dg/callgraph-info-1.c b/gcc/testsuite/gcc.dg/callgraph-info-1.c new file mode 100644 index 000..853ff9554ee --- /dev/null +++ b/gcc/testsuite/gcc.dg/callgraph-info-1.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-fcallgraph-info" } */ + +void f() {} +void g() __attribute__ ((__alias__ ("f"))); + +/* { dg-final { scan-dump-times "ci" "triangle" 1 "ci" {{}} } } */ diff --git a/gcc/testsuite/lib/scandump.exp b/gcc/testsuite/lib/scandump.exp index 14536ae7379..adf9886b61c 100644 --- a/gcc/testsuite/lib/scandump.exp +++ b/gcc/testsuite/lib/scandump.exp @@ -37,6 +37,10 @@ proc dump-base { args } { # gcc-defs to base compilation dumps only on the source basename. set dumpbase $src if { [string length $dumpbase_suf] != 0 } { + # Accept {} as dump base suffix to drop the source suffix entirely. 
+ if { "$dumpbase_suf" == "{}" } { + set dumpbase_suf "" + } regsub {[.][^.]*$} $src $dumpbase_suf dumpbase } return $dumpbase diff --git a/gcc/toplev.cc b/gcc/toplev.cc index eee4805b504..f308fb15108 100644 --- a/gcc/toplev.cc +++ b/gcc/toplev.cc @@ -914,6 +914,37 @@ dump_final_callee_vcg (FILE *f, location_t location, tree callee) fputs ("\" }\n", f); } +/* Callback for cgraph_node::call_for_symbol_thunks_and_aliases to dump to F_ a + node and an edge from ALIAS->DECL to CURRENT_FUNCTION_DECL. */ + +static bool +dump_final_alias_vcg (cgraph_node *alias, void *f_) +{ + FILE *f = (FILE *)f_; + + if (alias->decl == current_function_decl) +return false; + + dump_final_node_vcg_start (f, alias->decl); + fputs ("\" shape : triangle }\n", f); + + fputs ("edge: { sourcename: \"", f); + print_decl_identifier (f, alias->decl, PRINT_DECL_UNIQUE_NAME); + fputs ("\" targetname: \"", f); + print_decl_identifier (f, current_function_decl, PRINT_DECL_UNIQUE_NAME); + location_t location = DECL_SOURCE_LOCATION (alias->decl); + if (LOCATION_LOCUS (location) != UNKNOWN_LOCATION) +{ + expanded_location loc; + fputs ("\" label: \"", f); + loc = expand_location (location); + fprintf (f, "%s:%d:%d", loc.file, loc.line, loc.column); +} + fputs ("\" }\n", f); + + return false; +} + /* Dump final cgraph node in VCG format. */ static void @@ -950,6 +981,12 @@ dump_final_node_vcg (FILE *f) dump_final_callee_vcg (f, c->location, c->decl); vec_free (cfun->su->callees); cfun->su->callees = NULL; + + cgraph_node *node = cgraph_node::get (current_function_decl); + if (!node) +return; + node->call_for_symbol_thunks_and_aliases (dump_final_alias_vcg, f, + true, false); } /* Output stack usage and callgraph info, as requested. */
[gcc r14-10588] Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.
https://gcc.gnu.org/g:4e7735a8d87559bbddfe3a985786996e22241f8d commit r14-10588-g4e7735a8d87559bbddfe3a985786996e22241f8d Author: liuhongt Date: Mon Aug 12 14:35:31 2024 +0800 Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area. gcc/ChangeLog: PR target/116174 * config/i386/i386.cc (ix86_align_loops): Move this to .. * config/i386/i386-features.cc (ix86_align_loops): .. here. (class pass_align_tight_loops): New class. (make_pass_align_tight_loops): New function. * config/i386/i386-passes.def: Insert pass_align_tight_loops after pass_insert_endbr_and_patchable_area. * config/i386/i386-protos.h (make_pass_align_tight_loops): New declare. gcc/testsuite/ChangeLog: * gcc.target/i386/pr116174.c: New test. (cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) Diff: --- gcc/config/i386/i386-features.cc | 191 +++ gcc/config/i386/i386-passes.def | 3 + gcc/config/i386/i386-protos.h| 1 + gcc/config/i386/i386.cc | 146 --- gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ 5 files changed, 207 insertions(+), 146 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index e3e004d5526..7de19d42363 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } +/* When a hot loop can be fit into one cacheline, + force align the loop without considering the max skip. */ +static void +ix86_align_loops () +{ + basic_block bb; + + /* Don't do this when we don't know cache line size. 
*/ + if (ix86_cost->prefetch_block == 0) +return; + + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; + FOR_EACH_BB_FN (bb, cfun) +{ + rtx_insn *label = BB_HEAD (bb); + bool has_fallthru = 0; + edge e; + edge_iterator ei; + + if (!LABEL_P (label)) + continue; + + profile_count fallthru_count = profile_count::zero (); + profile_count branch_count = profile_count::zero (); + + FOR_EACH_EDGE (e, ei, bb->preds) + { + if (e->flags & EDGE_FALLTHRU) + has_fallthru = 1, fallthru_count += e->count (); + else + branch_count += e->count (); + } + + if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) + continue; + + if (bb->loop_father + && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) + && (has_fallthru + ? (!(single_succ_p (bb) + && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) +&& optimize_bb_for_speed_p (bb) +&& branch_count + fallthru_count > count_threshold +&& (branch_count > fallthru_count * param_align_loop_iterations)) + /* In case there'no fallthru for the loop. +Nops inserted won't be executed. */ + : (branch_count > count_threshold +|| (bb->count > bb->prev_bb->count * 10 +&& (bb->prev_bb->count +<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2) + { + rtx_insn* insn, *end_insn; + HOST_WIDE_INT size = 0; + bool padding_p = true; + basic_block tbb = bb; + unsigned cond_branch_num = 0; + bool detect_tight_loop_p = false; + + for (unsigned int i = 0; i != bb->loop_father->num_nodes; + i++, tbb = tbb->next_bb) + { + /* Only handle continuous cfg layout. */ + if (bb->loop_father != tbb->loop_father) + { + padding_p = false; + break; + } + + FOR_BB_INSNS (tbb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + size += ix86_min_insn_size (insn); + + /* We don't know size of inline asm. +Don't align loop for call. 
*/ + if (asm_noperands (PATTERN (insn)) >= 0 + || CALL_P (insn)) + { + size = -1; + break; + } + } + + if (size == -1 || size > ix86_cost->prefetch_block) + { + padding_p = false; + break; + } + + FOR_EACH_EDGE (e, ei, tbb->succs) + { + /* It could be part of the loop. */ + if (e->dest == bb) + { + detect_tight_loop_p
[gcc r14-10589] aarch64: Fix invalid nested subregs [PR115464]
https://gcc.gnu.org/g:32b21292adb6ad6b5e1d60d923a773e4d0daca7b commit r14-10589-g32b21292adb6ad6b5e1d60d923a773e4d0daca7b Author: Richard Sandiford Date: Fri Aug 16 07:53:01 2024 +0100 aarch64: Fix invalid nested subregs [PR115464] The testcase extracts one arm_neon.h vector from a pair (one subreg) and then reinterprets the result as an SVE vector (another subreg). Each subreg makes sense individually, but we can't fold them together into a single subreg: it's 32 bytes -> 16 bytes -> 16*N bytes, but the interpretation of 32 bytes -> 16*N bytes depends on whether N==1 or N>1. Since the second subreg makes sense individually, simplify_subreg should bail out rather than ICE on it. simplify_gen_subreg will then do the same (because it already checks validate_subreg). This leaves simplify_gen_subreg returning null, requiring the caller to take appropriate action. I think this is relatively likely to occur elsewhere, so the patch adds a helper for forcing a subreg, allowing a temporary pseudo to be created where necessary. I'll follow up by using force_subreg in more places. This patch is intended to be a minimal backportable fix for the PR. gcc/ PR target/115464 * simplify-rtx.cc (simplify_context::simplify_subreg): Don't try to fold two subregs together if their relationship isn't known at compile time. * explow.h (force_subreg): Declare. * explow.cc (force_subreg): New function. * config/aarch64/aarch64-sve-builtins-base.cc (svset_neonq_impl::expand): Use it instead of simplify_gen_subreg. gcc/testsuite/ PR target/115464 * gcc.target/aarch64/sve/acle/general/pr115464.c: New test. 
(cherry picked from commit 0970ff46ba6330fc80e8736fc05b2eaeeae0b6a0) Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 2 +- gcc/explow.cc | 15 +++ gcc/explow.h | 2 ++ gcc/simplify-rtx.cc | 5 + .../gcc.target/aarch64/sve/acle/general/pr115464.c| 13 + 5 files changed, 36 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 0d2edf3f19e..c9182594bc1 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -1174,7 +1174,7 @@ public: Advanced SIMD argument as an SVE vector. */ if (!BYTES_BIG_ENDIAN && is_undef (CALL_EXPR_ARG (e.call_expr, 0))) - return simplify_gen_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0); + return force_subreg (mode, e.args[1], GET_MODE (e.args[1]), 0); rtx_vector_builder builder (VNx16BImode, 16, 2); for (unsigned int i = 0; i < 16; i++) diff --git a/gcc/explow.cc b/gcc/explow.cc index 8e5f6b8e680..f6843398c4b 100644 --- a/gcc/explow.cc +++ b/gcc/explow.cc @@ -745,6 +745,21 @@ force_reg (machine_mode mode, rtx x) return temp; } +/* Like simplify_gen_subreg, but force OP into a new register if the + subreg cannot be formed directly. */ + +rtx +force_subreg (machine_mode outermode, rtx op, + machine_mode innermode, poly_uint64 byte) +{ + rtx x = simplify_gen_subreg (outermode, op, innermode, byte); + if (x) +return x; + + op = copy_to_mode_reg (innermode, op); + return simplify_gen_subreg (outermode, op, innermode, byte); +} + /* If X is a memory ref, copy its contents to a new temp reg and return that reg. Otherwise, return X. */ diff --git a/gcc/explow.h b/gcc/explow.h index 16aa02cfb68..cbd1fcb7eb3 100644 --- a/gcc/explow.h +++ b/gcc/explow.h @@ -42,6 +42,8 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode); Args are mode (in case value is a constant) and the value. 
*/ extern rtx force_reg (machine_mode, rtx); +extern rtx force_subreg (machine_mode, rtx, machine_mode, poly_uint64); + /* Return given rtx, copied into a new temp reg if it was in memory. */ extern rtx force_not_mem (rtx); diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index dceaa1c..729d408aa55 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -7612,6 +7612,11 @@ simplify_context::simplify_subreg (machine_mode outermode, rtx op, poly_uint64 innermostsize = GET_MODE_SIZE (innermostmode); rtx newx; + /* Make sure that the relationship between the two subregs is +known at compile time. */ + if (!ordered_p (outersize, innermostsize)) + return NULL_RTX; + if (outermode == innermostmode && known_eq (byte, 0U) && known_eq (SUBREG_BYTE (op), 0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464.c
[gcc r14-10590] aarch64: Add another use of force_subreg [PR115464]
https://gcc.gnu.org/g:86dacfb06b90371458d58872f461d358a0834305 commit r14-10590-g86dacfb06b90371458d58872f461d358a0834305 Author: Richard Sandiford Date: Fri Aug 16 07:53:02 2024 +0100 aarch64: Add another use of force_subreg [PR115464] This patch includes the testcase from r15-1399 plus a minimal fix for it, without the other proactive uses of force_subreg. We can backport other force_subreg calls later if they're shown to be needed. gcc/ PR target/115464 * config/aarch64/aarch64-sve-builtins-base.cc (svset_neonq_impl::expand): Use force_subreg instead of lowpart_subreg. gcc/testsuite/ PR target/115464 * gcc.target/aarch64/sve/acle/general/pr115464_2.c: New test. Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 4 +++- .../gcc.target/aarch64/sve/acle/general/pr115464_2.c | 11 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index c9182594bc1..241a249503f 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -1185,7 +1185,9 @@ public: if (BYTES_BIG_ENDIAN) return e.use_exact_insn (code_for_aarch64_sve_set_neonq (mode)); insn_code icode = code_for_vcond_mask (mode, mode); -e.args[1] = lowpart_subreg (mode, e.args[1], GET_MODE (e.args[1])); +e.args[1] = force_subreg (mode, e.args[1], GET_MODE (e.args[1]), + subreg_lowpart_offset (mode, +GET_MODE (e.args[1]))); e.add_output_operand (icode); e.add_input_operand (icode, e.args[1]); e.add_input_operand (icode, e.args[0]); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c new file mode 100644 index 000..f561c34f732 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr115464_2.c @@ -0,0 +1,11 @@ +/* { dg-options "-O2" } */ + +#include +#include +#include + +svuint16_t +convolve4_4_x (uint16x8x2_t permute_tbl, svuint16_t a) +{ 
+return svset_neonq_u16 (a, permute_tbl.val[1]); +}
[gcc r15-2946] aarch64: Improve popcount for bytes [PR113042]
https://gcc.gnu.org/g:fcc3af9949880476c4ed01a98bd7f5d7f29b7b16 commit r15-2946-gfcc3af9949880476c4ed01a98bd7f5d7f29b7b16 Author: Andrew Pinski Date: Mon Jun 10 00:39:54 2024 + aarch64: Improve popcount for bytes [PR113042] For popcount for bytes, we don't need the reduction addition after the vector cnt instruction as we are only counting one byte's popcount. This changes the popcount extend to cover all ALLI rather than GPI. Changes since v1: * v2 - Use ALLI iterator and combine all into one pattern. Add new testcases popcnt[6-8].c. * v3 - Simplify TARGET_CSSC path. Use convert_to_mode instead of gen_zero_extend* directly. Some other small cleanups. Bootstrapped and tested on aarch64-linux-gnu with no regressions. PR target/113042 gcc/ChangeLog: * config/aarch64/aarch64.md (popcount2): Update pattern to support ALLI modes. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt5.c: New test. * gcc.target/aarch64/popcnt6.c: New test. * gcc.target/aarch64/popcnt7.c: New test. * gcc.target/aarch64/popcnt8.c: New test. Signed-off-by: Andrew Pinski Diff: --- gcc/config/aarch64/aarch64.md | 37 +++--- gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 +++ gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 +++ gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 +++ gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 +++ 5 files changed, 98 insertions(+), 13 deletions(-) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 665a333903c..12dcc16529a 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5341,9 +5341,9 @@ ;; MOV w0, v2.b[0] (define_expand "popcount2" - [(set (match_operand:GPI 0 "register_operand") - (popcount:GPI (match_operand:GPI 1 "register_operand")))] - "TARGET_CSSC || TARGET_SIMD" + [(set (match_operand:ALLI 0 "register_operand") + (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] + "TARGET_CSSC ? 
GET_MODE_BITSIZE (mode) >= 32 : TARGET_SIMD" { if (!TARGET_CSSC) { @@ -5351,18 +5351,29 @@ rtx v1 = gen_reg_rtx (V8QImode); rtx in = operands[1]; rtx out = operands[0]; - if(mode == SImode) - { - rtx tmp; - tmp = gen_reg_rtx (DImode); - /* If we have SImode, zero extend to DImode, pop count does -not change if we have extra zeros. */ - emit_insn (gen_zero_extendsidi2 (tmp, in)); - in = tmp; - } + /* SImode and HImode should be zero extended to DImode. +popcount does not change if we have extra zeros. */ + if (mode == SImode || mode == HImode) + in = convert_to_mode (DImode, in, true); + emit_move_insn (v, gen_lowpart (V8QImode, in)); emit_insn (gen_popcountv8qi2 (v1, v)); - emit_insn (gen_aarch64_zero_extend_reduc_plus_v8qi (out, v1)); + /* QImode, just extract from the v8qi vector. */ + if (mode == QImode) + emit_move_insn (out, gen_lowpart (QImode, v1)); + /* HI and SI, reduction is zero extended to SImode. */ + else if (mode == SImode || mode == HImode) + { + rtx out1 = gen_reg_rtx (SImode); + emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1)); + emit_move_insn (out, gen_lowpart (mode, out1)); + } + /* DImode, reduction is zero extended to DImode. */ + else + { + gcc_assert (mode == DImode); + emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1)); + } DONE; } }) diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c new file mode 100644 index 000..406369d9b29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h8: +** ldr b[0-9]+, \[x0\] +** cnt v[0-9]+.8b, v[0-9]+.8b +** smovw0, v[0-9]+.b\[0\] +** ret +*/ +/* We should not need the addv here since we only need a byte popcount. 
*/ + +unsigned h8 (const unsigned char *a) { + return __builtin_popcountg (a[0]); +} diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c new file mode 100644 index 000..e882cb24126 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR target/113042 */ + +#pragma GCC target "+nocssc" + +/* +** h8: +** ldr h[0-9]+, \[x0\] +** cnt v[0-9]+.8b, v[0-9]+.8b +** addvb[0-9]+, v[
[gcc r13-8976] aarch64: Fix expansion of svsudot [PR114607]
https://gcc.gnu.org/g:22c6a11686d3f20f8682c2fbe9e33867a7e8af0e commit r13-8976-g22c6a11686d3f20f8682c2fbe9e33867a7e8af0e Author: Richard Sandiford Date: Fri Aug 16 07:58:24 2024 +0100 aarch64: Fix expansion of svsudot [PR114607] Not sure how this happened, but: svsudot is supposed to be expanded as USDOT with the operands swapped. However, a thinko in the expansion of svsudot meant that the arguments weren't in fact swapped; the attempted swap was just a no-op. And the testcases blithely accepted that. gcc/ PR target/114607 * config/aarch64/aarch64-sve-builtins-base.cc (svusdot_impl::expand): Fix botched attempt to swap the operands for svsudot. gcc/testsuite/ PR target/114607 * gcc.target/aarch64/sve/acle/asm/sudot_s32.c: New test. (cherry picked from commit 2c1c2485a4b1aca746ac693041e51ea6da5c64ca) Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc | 2 +- gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c | 8 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index cd9cace3c9b..34f2d8c6e4e 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -2403,7 +2403,7 @@ public: version) is through the USDOT instruction but with the second and third inputs swapped. */ if (m_su) - e.rotate_inputs_left (1, 2); + e.rotate_inputs_left (1, 3); /* The ACLE function has the same order requirements as for svdot. While there's no requirement for the RTL pattern to have the same sort of order as that for dot_prod, it's easier to read. 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c index 4b452619eee..e06b69affab 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c @@ -6,7 +6,7 @@ /* ** sudot_s32_tied1: -** usdot z0\.s, z2\.b, z4\.b +** usdot z0\.s, z4\.b, z2\.b ** ret */ TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t, @@ -17,7 +17,7 @@ TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t, ** sudot_s32_tied2: ** mov (z[0-9]+)\.d, z0\.d ** movprfx z0, z4 -** usdot z0\.s, z2\.b, \1\.b +** usdot z0\.s, \1\.b, z2\.b ** ret */ TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t, @@ -27,7 +27,7 @@ TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t, /* ** sudot_w0_s32_tied: ** mov (z[0-9]+\.b), w0 -** usdot z0\.s, z2\.b, \1 +** usdot z0\.s, \1, z2\.b ** ret */ TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t, @@ -37,7 +37,7 @@ TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t, /* ** sudot_9_s32_tied: ** mov (z[0-9]+\.b), #9 -** usdot z0\.s, z2\.b, \1 +** usdot z0\.s, \1, z2\.b ** ret */ TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t,
[gcc r13-8977] aarch64: Fix bogus cnot optimisation [PR114603]
https://gcc.gnu.org/g:959d6529df206c1983be14383da081f374416e47 commit r13-8977-g959d6529df206c1983be14383da081f374416e47 Author: Richard Sandiford Date: Fri Aug 16 07:58:25 2024 +0100 aarch64: Fix bogus cnot optimisation [PR114603] aarch64-sve.md had a pattern that combined: cmpeq pb.T, pa/z, zc.T, #0 mov zd.T, pb/z, #1 into: cnot zd.T, pa/m, zc.T But this is only valid if pa.T is a ptrue. In other cases, the original would set inactive elements of zd.T to 0, whereas the combined form would copy elements from zc.T. gcc/ PR target/114603 * config/aarch64/aarch64-sve.md (@aarch64_pred_cnot): Replace with... (@aarch64_ptrue_cnot): ...this, requiring operand 1 to be a ptrue. (*cnot): Require operand 1 to be a ptrue. * config/aarch64/aarch64-sve-builtins-base.cc (svcnot_impl::expand): Use aarch64_ptrue_cnot for _x operations that are predicated with a ptrue. Represent other _x operations as fully-defined _m operations. gcc/testsuite/ PR target/114603 * gcc.target/aarch64/sve/acle/general/cnot_1.c: New test. (cherry picked from commit 67cbb1c638d6ab3a9cb77e674541e2b291fb67df) Diff: --- gcc/config/aarch64/aarch64-sve-builtins-base.cc| 25 ++ gcc/config/aarch64/aarch64-sve.md | 20 - .../gcc.target/aarch64/sve/acle/general/cnot_1.c | 23 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 34f2d8c6e4e..852f569461a 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -496,15 +496,22 @@ public: expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); -if (e.pred == PRED_x) - { - /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs - a ptrue hint. 
*/ - e.add_ptrue_hint (0, e.gp_mode (0)); - return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode)); - } - -return e.use_cond_insn (code_for_cond_cnot (mode), 0); +machine_mode pred_mode = e.gp_mode (0); +/* The underlying _x pattern is effectively: + +dst = src == 0 ? 1 : 0 + + rather than an UNSPEC_PRED_X. Using this form allows autovec + constructs to be matched by combine, but it means that the + predicate on the src == 0 comparison must be all-true. + + For simplicity, represent other _x operations as fully-defined _m + operations rather than using a separate bespoke pattern. */ +if (e.pred == PRED_x + && gen_lowpart (pred_mode, e.args[0]) == CONSTM1_RTX (pred_mode)) + return e.use_pred_x_insn (code_for_aarch64_ptrue_cnot (mode)); +return e.use_cond_insn (code_for_cond_cnot (mode), + e.pred == PRED_x ? 1 : 0); } }; diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 7533b956686..0a05aecd1a3 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3225,24 +3225,24 @@ ;; - CNOT ;; - -;; Predicated logical inverse. -(define_expand "@aarch64_pred_cnot" +;; Logical inverse, predicated with a ptrue. 
+(define_expand "@aarch64_ptrue_cnot" [(set (match_operand:SVE_FULL_I 0 "register_operand") (unspec:SVE_FULL_I [(unspec: [(match_operand: 1 "register_operand") - (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (const_int SVE_KNOWN_PTRUE) (eq: - (match_operand:SVE_FULL_I 3 "register_operand") - (match_dup 4))] + (match_operand:SVE_FULL_I 2 "register_operand") + (match_dup 3))] UNSPEC_PRED_Z) - (match_dup 5) - (match_dup 4)] + (match_dup 4) + (match_dup 3)] UNSPEC_SEL))] "TARGET_SVE" { -operands[4] = CONST0_RTX (mode); -operands[5] = CONST1_RTX (mode); +operands[3] = CONST0_RTX (mode); +operands[4] = CONST1_RTX (mode); } ) @@ -3251,7 +3251,7 @@ (unspec:SVE_I [(unspec: [(match_operand: 1 "register_operand" "Upl, Upl") - (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (const_int SVE_KNOWN_PTRUE) (eq: (match_operand:SVE_I 2 "register_operand" "0, w") (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnot_1.c new file mode 100644 index 000..b1a489f0cf0 --- /dev/null +++ b/gcc/tests