Re: [FYI] pass --enable-obsolete to build configure
On Nov 02 2019, Alexandre Oliva wrote: > @@ -903,6 +904,7 @@ infodir > docdir > oldincludedir > includedir > +runstatedir > localstatedir > sharedstatedir > sysconfdir Don't add unreleated changes. Andreas. -- Andreas Schwab, sch...@linux-m68k.org GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different."
Re: [PATCH v2] [PR91979] Updated the fix:
On 11/1/19 11:47 PM, Kamlesh Kumar wrote: Changlogs gcc -- 2019-11-02 Kamlesh Kumar PR c++/91979 - mangling nullptr expression * mangle.c (write_template_arg_literal): Handle nullptr mangling. * testsuite/g++.dg/cpp0x/nullptr27.C: Modify Test. * testsuite/g++.dg/cpp0x/pr91979.C: New Test. libiberty --- 2019-11-02 Kamlesh Kumar * cp-demangle.c (d_expr_primary): Handle nullptr demangling. * testsuite/demangle-expected: Added test. gcc/c-family -- 2019-11-02 Kamlesh Kumar * c-opts.c (c_common_post_options): Updated latest_abi_version. --- gcc/c-family/c-opts.c | 2 +- gcc/cp/mangle.c| 4 +++- gcc/testsuite/g++.dg/cpp0x/nullptr27.C | 2 +- gcc/testsuite/g++.dg/cpp0x/pr91979.C | 15 +++ libiberty/cp-demangle.c| 11 +++ libiberty/testsuite/demangle-expected | 4 6 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/g++.dg/cpp0x/pr91979.C diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 0fffe60b140..d4c77be5cd5 100644 --- a/gcc/c-family/c-opts.c +++ b/gcc/c-family/c-opts.c @@ -937,7 +937,7 @@ c_common_post_options (const char **pfilename) /* Change flag_abi_version to be the actual current ABI level, for the benefit of c_cpp_builtins, and to make comparison simpler. */ - const int latest_abi_version = 13; + const int latest_abi_version = 14; Please also add a comment about ABI v14 above fabi-version= in gcc/common.opt. /* Generate compatibility aliases for ABI v11 (7.1) by default. */ const int abi_compat_default = 11; diff --git a/gcc/cp/mangle.c b/gcc/cp/mangle.c index a9333b84349..234a975781e 100644 --- a/gcc/cp/mangle.c +++ b/gcc/cp/mangle.c @@ -3400,7 +3400,9 @@ write_template_arg_literal (const tree value) case INTEGER_CST: gcc_assert (!same_type_p (TREE_TYPE (value), boolean_type_node) || integer_zerop (value) || integer_onep (value)); - write_integer_cst (value); + if (abi_version_at_least(14) + && !NULLPTR_TYPE_P (TREE_TYPE (value))) This logic is wrong: it will never emit the value for ABI version lower than 14. And you need a space before (. + write_integer_cst (value); break; case REAL_CST: diff --git a/gcc/testsuite/g++.dg/cpp0x/nullptr27.C b/gcc/testsuite/g++.dg/cpp0x/nullptr27.C index 2510dc80634..edd11606266 100644 --- a/gcc/testsuite/g++.dg/cpp0x/nullptr27.C +++ b/gcc/testsuite/g++.dg/cpp0x/nullptr27.C @@ -1,7 +1,7 @@ // PR c++/52706 // { dg-do compile { target c++11 } } // { dg-options "-fabi-version=0" } -// { dg-final { scan-assembler "_Z1fIDnLDn0EEiT_" } } +// { dg-final { scan-assembler "_Z1fIDnLDnEEiT_" } } template int f(T); Please also add a test for the old mangled name with -fabi-version=13. diff --git a/gcc/testsuite/g++.dg/cpp0x/pr91979.C b/gcc/testsuite/g++.dg/cpp0x/pr91979.C new file mode 100644 index 000..7fcd56b27f0 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp0x/pr91979.C @@ -0,0 +1,15 @@ +// PR c++/91989 +// { dg-do compile { target c++11 } } +// { dg-final { scan-assembler "_Z3fooILPv0EEvPN9enable_ifIXeqT_LDnEEvE4typeE" } } + +template +struct enable_if {}; + +template +struct enable_if { typedef T type; }; + +template +void foo(typename enable_if::type* = 0) {} + +template void foo<(void *)0>(void *); + diff --git a/libiberty/cp-demangle.c b/libiberty/cp-demangle.c index 5b674d7d93c..3fb1c56409e 100644 --- a/libiberty/cp-demangle.c +++ b/libiberty/cp-demangle.c @@ -3577,6 +3577,17 @@ d_expr_primary (struct d_info *di) && type->u.s_builtin.type->print != D_PRINT_DEFAULT) di->expansion -= type->u.s_builtin.type->len; + if (type->type == DEMANGLE_COMPONENT_BUILTIN_TYPE + && strncmp (type->u.s_builtin.type->name, + cplus_demangle_builtin_types[33].name, 17) == 0) Let's use strcmp rather than hardcode 17 here. + { + if (d_peek_char (di) == 'E') + { + d_advance (di, 1); + return type; + } + } + /* Rather than try to interpret the literal value, we just collect it as a string. Note that it's possible to have a floating point literal here. The ABI specifies that the diff --git a/libiberty/testsuite/demangle-expected b/libiberty/testsuite/demangle-expected index 61681484d0e..f68a8a71aaf 100644 --- a/libiberty/testsuite/demangle-expected +++ b/libiberty/testsuite/demangle-expected @@ -1446,3 +1446,7 @@ Foo()::X::fn _ZZZ3FooIiEfvENKUlT_E_clIcEEDaS0_EN1X2fnEv Foo()::{lambda(auto:1)#1}::operator()(char) const::X::fn() Foo()::{lambda(auto:1)#1}::operator()(char) const::X::fn +#PR91979 demangling nullptr expression + +_Z3fooILPv0EEvPN9enable_ifIXeqT_LDnEEvE4typeE +void foo<(void*)0>(enable_if<((void*)0)==((declty
[Darwin, testsuite, committed] Fix PR 79274
The solution for initialising global TLS variables does not apply to platforms using emulated TLS. On later branches this test requires native TLS. Rather than make that change (especially late on gcc-7) I’ve XFAILed the test for this on Darwin. tested on x86_64-darwin16, powerpc-darwin9, x86_64-linux-gnu applied to gcc-8 and gcc-7 branches thanks Iain gcc/testsuite/ 2019-11-03 Iain Sandoe PR c++/79274 * g++.dg/tls/pr77285-2.C: XFAIL test for Darwin. diff --git a/gcc/testsuite/g++.dg/tls/pr77285-2.C b/gcc/testsuite/g++.dg/tls/pr77285-2.C index bac273a4d6..459ecc6555 100644 --- a/gcc/testsuite/g++.dg/tls/pr77285-2.C +++ b/gcc/testsuite/g++.dg/tls/pr77285-2.C @@ -3,7 +3,7 @@ // { dg-require-effective-target tls } // { dg-final { scan-assembler "_Z4var1B3tag" } } // { dg-final { scan-assembler "_Z4var2B3tag" } } -// { dg-final { scan-assembler "_ZTH4var1B3tag" } } +// { dg-final { scan-assembler "_ZTH4var1B3tag" { xfail *-*-darwin* } } } // { dg-final { scan-assembler "_ZTW4var1B3tag" } } struct __attribute__((abi_tag("tag"))) X { ~X () {} int i = 0; };
[PATCH v3] Updated the Fix:
ChangeLog Entries: gcc/cp -- 2019-11-02 Kamlesh Kumar PR c++/91979 - mangling nullptr expression * cp/mangle.c (write_template_arg_literal): Handle nullptr mangling. gcc -- 2019-11-02 Kamlesh Kumar * common.opt (-fabi-version): Added Description. * testsuite/g++.dg/cpp0x/nullptr27.C: Modify Test. * testsuite/g++.dg/cpp0x/nullptr43.C: New Test. * testsuite/g++.dg/cpp0x/nullptr44.C: New Test. libiberty --- 2019-11-02 Kamlesh Kumar * cp-demangle.c (d_expr_primary): Handle nullptr demangling. * testsuite/demangle-expected: Added test. gcc/c-family -- 2019-11-02 Kamlesh Kumar * c-opts.c (c_common_post_options): Updated latest_abi_version. --- gcc/c-family/c-opts.c | 2 +- gcc/common.opt | 2 ++ gcc/cp/mangle.c| 4 +++- gcc/testsuite/g++.dg/cpp0x/nullptr27.C | 2 +- gcc/testsuite/g++.dg/cpp0x/nullptr43.C | 9 + gcc/testsuite/g++.dg/cpp0x/nullptr44.C | 15 +++ libiberty/cp-demangle.c| 11 +++ libiberty/testsuite/demangle-expected | 4 8 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/g++.dg/cpp0x/nullptr43.C create mode 100644 gcc/testsuite/g++.dg/cpp0x/nullptr44.C diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 0fffe60b140..d4c77be5cd5 100644 --- a/gcc/c-family/c-opts.c +++ b/gcc/c-family/c-opts.c @@ -937,7 +937,7 @@ c_common_post_options (const char **pfilename) /* Change flag_abi_version to be the actual current ABI level, for the benefit of c_cpp_builtins, and to make comparison simpler. */ - const int latest_abi_version = 13; + const int latest_abi_version = 14; /* Generate compatibility aliases for ABI v11 (7.1) by default. */ const int abi_compat_default = 11; diff --git a/gcc/common.opt b/gcc/common.opt index cc279f411d7..fdd923e3c35 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -951,6 +951,8 @@ Driver Undocumented ; 13: Fixes the accidental change in 12 to the calling convention for classes ; with deleted copy constructor and trivial move constructor. ; Default in G++ 8.2. +; 14: Corrects the mangling of nullptr expression. +; Default in G++ 10. ; ; Additional positive integers will be assigned as new versions of ; the ABI become the default version of the ABI. diff --git a/gcc/cp/mangle.c b/gcc/cp/mangle.c index a9333b84349..a32ff2a2210 100644 --- a/gcc/cp/mangle.c +++ b/gcc/cp/mangle.c @@ -3400,7 +3400,9 @@ write_template_arg_literal (const tree value) case INTEGER_CST: gcc_assert (!same_type_p (TREE_TYPE (value), boolean_type_node) || integer_zerop (value) || integer_onep (value)); - write_integer_cst (value); + if (!(abi_version_at_least (14) + && NULLPTR_TYPE_P (TREE_TYPE (value + write_integer_cst (value); break; case REAL_CST: diff --git a/gcc/testsuite/g++.dg/cpp0x/nullptr27.C b/gcc/testsuite/g++.dg/cpp0x/nullptr27.C index 2510dc80634..edd11606266 100644 --- a/gcc/testsuite/g++.dg/cpp0x/nullptr27.C +++ b/gcc/testsuite/g++.dg/cpp0x/nullptr27.C @@ -1,7 +1,7 @@ // PR c++/52706 // { dg-do compile { target c++11 } } // { dg-options "-fabi-version=0" } -// { dg-final { scan-assembler "_Z1fIDnLDn0EEiT_" } } +// { dg-final { scan-assembler "_Z1fIDnLDnEEiT_" } } template int f(T); diff --git a/gcc/testsuite/g++.dg/cpp0x/nullptr43.C b/gcc/testsuite/g++.dg/cpp0x/nullptr43.C new file mode 100644 index 000..fbdb6cd8e9d --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp0x/nullptr43.C @@ -0,0 +1,9 @@ +// PR c++/91979 +// { dg-do compile { target c++11 } } +// { dg-options "-fabi-version=13" } +// { dg-final { scan-assembler "_Z1fIDnLDn0EEiT_" } } + +template +int f(T); + +int i2 = f(nullptr); diff --git a/gcc/testsuite/g++.dg/cpp0x/nullptr44.C b/gcc/testsuite/g++.dg/cpp0x/nullptr44.C new file mode 100644 index 000..9ceba14fc98 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp0x/nullptr44.C @@ -0,0 +1,15 @@ +// PR c++/91979 +// { dg-do compile { target c++11 } } +// { dg-final { scan-assembler "_Z3fooILPv0EEvPN9enable_ifIXeqT_LDnEEvE4typeE" } } + +template +struct enable_if {}; + +template +struct enable_if { typedef T type; }; + +template +void foo(typename enable_if::type* = 0) {} + +template void foo<(void *)0>(void *); + diff --git a/libiberty/cp-demangle.c b/libiberty/cp-demangle.c index 5b674d7d93c..3150efff80d 100644 --- a/libiberty/cp-demangle.c +++ b/libiberty/cp-demangle.c @@ -3577,6 +3577,17 @@ d_expr_primary (struct d_info *di) && type->u.s_builtin.type->print != D_PRINT_DEFAULT) di->expansion -= type->u.s_builtin.type->len; + if (type->type == DEMANGLE_COMPONENT_BUILTIN_TYPE + && strcmp (type->u.s_builtin.type->name, + cplus_demangle_builtin_types[33].name) == 0) + { + if (d_
Re: [RFH][libgcc] fp-bit bit ordering (PR 78804)
On Fri, 2019-10-11 at 23:27 +0900, Oleg Endo wrote: > On Thu, 2019-10-03 at 19:34 -0600, Jeff Law wrote: > > > > So probably the most interesting target for this test is v850-elf > > as > > it's got a reasonably well functioning simulator, hard and soft FP > > targets, little endian, and I'm familiar with its current set of > > failures. > > > > I can confirm that your patch makes no difference in the test > > results > > (which includes execution results). > > > > In fact, there haven't been any problems on any target in my tester > > that > > I can tie back to this change. > > > > At this point I'd say let's go for it. > > > > Thanks, Jeff. I'll commit it to trunk if there are no further > objections some time next week. > I've just committed it as r277752. Personally I'd like to install it on GCC 8 and 9 branches as well. Any thoughts on that? Cheers, Oleg
Re: [PATCH][RFC] C++-style iterators for FOR_EACH_IMM_USE_STMT
On Wed, 2019-10-30 at 10:27 +0100, Richard Biener wrote: > > Hmm, not sure - I'd like to write > > for (gimple *use_stmt : imm_stmt_uses (SSAVAR)) >for (use_operand_p use_p : from > above>) > ... > > I don't see how that's possible. It would need to be "awkward" like > > for (auto it : imm_stmt_uses (SSAVAR)) >{ > gimple *use_stmt = *it; > for (use_operand_p use_p : it) >... >} > > so the first loops iteration object are the actual iterator and you'd > have to do extra indirection to get at the actual stmt you iterated > to. > > So I'd extend C++ (hah) to allow > > for (gimple *use_stmt : imm_stmt_uses (SSAVAR)) > for (use_operand_p use_p : auto) > ... > > where 'auto' magically selects the next iterator object in scope > [that matches]. > > ;) Have you applied for a patent yet? :D How about this one? for (gimple* use_stmt : imm_stmt_uses (SSAVAR)) for (use_operand_p use_p : imm_uses_on_stmt (*use_stmt)) ... where helper function "imm_uses_on_stmt" returns a range object that offers a begin and end function and its own iterator type. Another concept that could be interesting are filter iterators. We used a simplistic re-implementation (c++03) to avoid dragging in boost when working on AMS https://github.com/erikvarga/gcc/blob/master/gcc/filter_iterator.h Example uses are https://github.com/erikvarga/gcc/blob/master/gcc/ams.h#L845 https://github.com/erikvarga/gcc/blob/master/gcc/ams.cc#L3715 I think there are also some places in RTL where filter iterators could be used, e.g. "iterate over all MEMs in an RTL" could be made to look something like that: for (auto&& i : filter_rtl (my_rtl_obj, MEM_P)) ... Anyway, maybe it can plant some ideas. Cheers, Oleg
Introduce ipa_inline_call_context class
Hi, this patch refactors estimate_node_size_and_time so we can store the context call size/time estimates was calculated for. The goal is to prevent recalculation of these properties for calls from different call sites but with same context and instead cache the results. Honza Bootstrapped/regtested x86_64-linux, will commit it shortly. * ipa-fnsummary.c (ipa_call_context): New constructor. (estimate_node_size_and_time): Turn to ... (ipa_call_context::estimate_size_and_time): ... this one. (ipa_call_context::release): New. * ipa-fnsummary.h (ipa_call_context): New class. (estimate_node_size_and_time): Remove. (do_estimate_edge_time, do_estimate_edge_size, do_estimate_edge_hints): Update. === --- ipa-fnsummary.c (revision 277753) +++ ipa-fnsummary.c (working copy) @@ -2940,31 +2940,54 @@ estimate_calls_size_and_time (struct cgr } } +/* Default constructor for ipa call context. + Memory alloction of known_vals, known_contexts + and known_aggs vectors is owned by the caller, but can + be release by ipa_call_context::release. + + inline_param_summary is owned by the caller. */ +ipa_call_context::ipa_call_context (cgraph_node *node, + clause_t possible_truths, + clause_t nonspec_possible_truths, + vec known_vals, + vec +known_contexts, + vec known_aggs, + vec +inline_param_summary) +: m_node (node), m_possible_truths (possible_truths), + m_nonspec_possible_truths (nonspec_possible_truths), + m_inline_param_summary (inline_param_summary), + m_known_vals (known_vals), + m_known_contexts (known_contexts), + m_known_aggs (known_aggs) +{ +} + +/* Release memory used by known_vals/contexts/aggs vectors. */ + +void +ipa_call_context::release () +{ + m_known_vals.release (); + m_known_contexts.release (); + m_known_aggs.release (); +} -/* Estimate size and time needed to execute NODE assuming - POSSIBLE_TRUTHS clause, and KNOWN_VALS, KNOWN_AGGS and KNOWN_CONTEXTS - information about NODE's arguments. If non-NULL use also probability - information present in INLINE_PARAM_SUMMARY vector. +/* Estimate size and time needed to execute call in the given context. Additionally detemine hints determined by the context. Finally compute minimal size needed for the call that is independent on the call context and can be used for fast estimates. Return the values in RET_SIZE, RET_MIN_SIZE, RET_TIME and RET_HINTS. */ void -estimate_node_size_and_time (struct cgraph_node *node, -clause_t possible_truths, -clause_t nonspec_possible_truths, -vec known_vals, -vec known_contexts, -vec known_aggs, -int *ret_size, int *ret_min_size, -sreal *ret_time, -sreal *ret_nonspecialized_time, -ipa_hints *ret_hints, -vec -inline_param_summary) +ipa_call_context::estimate_size_and_time (int *ret_size, + int *ret_min_size, + sreal *ret_time, + sreal *ret_nonspecialized_time, + ipa_hints *ret_hints) { - class ipa_fn_summary *info = ipa_fn_summaries->get_create (node); + class ipa_fn_summary *info = ipa_fn_summaries->get_create (m_node); size_time_entry *e; int size = 0; sreal time = 0; @@ -2976,13 +2999,13 @@ estimate_node_size_and_time (struct cgra { bool found = false; fprintf (dump_file, " Estimating body: %s/%i\n" - " Known to be false: ", node->name (), - node->order); + " Known to be false: ", m_node->name (), + m_node->order); for (i = predicate::not_inlined_condition; i < (predicate::first_dynamic_condition + (int) vec_safe_length (info->conds)); i++) - if (!(possible_truths & (1 << i))) + if (!(m_possible_truths & (1 << i))) { if (found) fprintf (dump_file, ", "); @@ -2991,19 +3014,19 @@ estimate_node_size_and_time (struct cgra } } - estimate_calls_size_and_time (node, &size, &min_size, &time, &hints, possible_truths, - known_vals, known_contexts, known_aggs); + estimate_calls_size_and_time (m_node, &size, &min_size, &time, &hints, m_possible_truths, + m_known
Re: [PATCH 1/2] [ARM,testsuite] Skip tests incompatible with -mpure-code
Ping? https://gcc.gnu.org/ml/gcc-patches/2019-10/msg01357.html On Fri, 18 Oct 2019 at 15:18, Christophe Lyon wrote: > > Hi, > > All these tests fail when using -mpure-code: > * some force A or R profile > * some use Neon > * some use -fpic/-fPIC > all of which are not supported by this option. > > OK? > > Thanks, > > Christophe
Re: [PATCH 2/2] [ARM] Add support for -mpure-code in thumb-1 (v6m)
Ping? https://gcc.gnu.org/ml/gcc-patches/2019-10/msg01356.html On Fri, 18 Oct 2019 at 15:18, Christophe Lyon wrote: > > Hi, > > This patch extends support for -mpure-code to all thumb-1 processors, > by removing the need for MOVT. > > Symbol addresses are built using upper8_15, upper0_7, lower8_15 and > lower0_7 relocations, and constants are built using sequences of > movs/adds and lsls instructions. > > The extension of the *thumb1_movhf pattern uses always the same size > (6) although it can emit a shorter sequence when possible. This is > similar to what *arm32_movhf already does. > > CASE_VECTOR_PC_RELATIVE is now false with -mpure-code, to avoid > generating invalid assembly code with differences from symbols from > two different sections (the difference cannot be computed by the > assembler). > > Tests pr45701-[12].c needed a small adjustment to avoid matching > upper8_15 when looking for the r8 register. > > Test no-literal-pool.c is augmented with __fp16, so it now uses > -mfp16-format=ieee. > > Test thumb1-Os-mult.c generates an inline code sequence with > -mpure-code and computes the multiplication by using a sequence of > add/shift rather than using the multiply instruction, so we skip it in > presence of -mpure-code. > > With -mcpu=cortex-m0, the pure-code/no-literal-pool.c fails because > code like: > static char *p = "Hello World"; > char * > testchar () > { > return p + 4; > } > generates 2 indirections (I removed non-essential directives/code) > .section.rodata > .LC0: > .ascii "Hello World\000" > .data > p: > .word .LC0 > .section.rodata > .LC2: > .word p > .section .text,"0x2006",%progbits > testchar: > push{r7, lr} > add r7, sp, #0 > movsr3, #:upper8_15:#.LC2 > lslsr3, #8 > addsr3, #:upper0_7:#.LC2 > lslsr3, #8 > addsr3, #:lower8_15:#.LC2 > lslsr3, #8 > addsr3, #:lower0_7:#.LC2 > ldr r3, [r3] > ldr r3, [r3] > addsr3, r3, #4 > movsr0, r3 > mov sp, r7 > @ sp needed > pop {r7, pc} > > By contrast, when using -mcpu=cortex-m4, the code looks like: > .section.rodata > .LC0: > .ascii "Hello World\000" > .data > p: > .word .LC0 > testchar: > push{r7} > add r7, sp, #0 > movwr3, #:lower16:p > movtr3, #:upper16:p > ldr r3, [r3] > addsr3, r3, #4 > mov r0, r3 > mov sp, r7 > pop {r7} > bx lr > > I haven't found yet how to make code for cortex-m0 apply upper/lower > relocations to "p" instead of .LC2. The current code looks functional, > but could be improved. > > OK as-is? > > Thanks, > > Christophe
Optimize streaming in inline summaries
Hi, this patch makes inline_read_sections to allocate vector in proper size saving bit of register scaling overhead. Bootstrapped/regtested x86_64-linux, comitted. * ipa-fnsummary.c (inline_read_section): Set vector size ahead of time. Index: ipa-fnsummary.c === --- ipa-fnsummary.c (revision 277754) +++ ipa-fnsummary.c (working copy) @@ -3614,6 +3614,8 @@ inline_read_section (struct lto_file_dec count2 = streamer_read_uhwi (&ib); gcc_assert (!info || !info->conds); + if (info) +vec_safe_reserve_exact (info->conds, count2); for (j = 0; j < count2; j++) { struct condition c; @@ -3627,8 +3629,10 @@ inline_read_section (struct lto_file_dec c.by_ref = bp_unpack_value (&bp, 1); if (c.agg_contents) c.offset = streamer_read_uhwi (&ib); - c.param_ops = NULL; count3 = streamer_read_uhwi (&ib); + c.param_ops = NULL; + if (info) + vec_safe_reserve_exact (c.param_ops, count3); for (k = 0; k < count3; k++) { struct expr_eval_op op; @@ -3658,13 +3662,16 @@ inline_read_section (struct lto_file_dec fatal_error (UNKNOWN_LOCATION, "invalid fnsummary in LTO stream"); } - vec_safe_push (c.param_ops, op); + if (info) + c.param_ops->quick_push (op); } if (info) - vec_safe_push (info->conds, c); + info->conds->quick_push (c); } count2 = streamer_read_uhwi (&ib); gcc_assert (!info || !info->size_time_table); + if (info && count2) +vec_safe_reserve_exact (info->size_time_table, count2); for (j = 0; j < count2; j++) { class size_time_entry e; @@ -3675,7 +3682,7 @@ inline_read_section (struct lto_file_dec e.nonconst_predicate.stream_in (&ib); if (info) - vec_safe_push (info->size_time_table, e); + info->size_time_table->quick_push (e); } p.stream_in (&ib);
Implement ipa_inline_call_context cache
Hi, this patch implements simple single entry LRU cache for computing size/time of callgraph edges. My original patch contained a hashtable for all contexts but since I became more careful about memory allocation I decided to first go with this simple patch. For Firefox I get: node context cache: 1266834 hits, 871097 misses, 345654 initializations So we save about 51% of calculations which are the usual bottlenecks of the greedy inliner. Calculating the estimate involves walking all edges out of the given function and since we compute it for each caller, this may get quadratic. In the followup patches I will increase the effectivity of cache by dropping unnecesary information and then I will evaluate what size of cahce works best. Current perf profile of WPA on firefox looks as follows: 2.98% lto1-wpa-stream libc-2.24.so [.] _int_malloc 2.75% lto1-wpa-stream lto1 [.] streamer_tree_cache_lookup 2.16% lto1-wpa libc-2.24.so [.] _int_malloc 2.13% lto1-wpa lto1 [.] inflate_fast 1.96% lto1-wpa lto1 [.] record_target_from_binfo 1.90% lto1-wpa lto1 [.] types_same_for_odr 1.37% lto1-wpa lto1 [.] estimate_calls_size_and_time 1.37% lto1-wpa lto1 [.] symbol_table::remove_unreachable_nodes 1.23% lto1-wpa-stream lto1 [.] streamer_write_uhwi_stream 1.07% lto1-wpa lto1 [.] type_possibly_instantiated_p 1.07% lto1-wpa libc-2.24.so [.] malloc_consolidate 1.02% lto1-wpa-stream lto1 [.] DFS::DFS 1.02% lto1-wpa lto1 [.] unify_scc 1.02% lto1-wpa lto1 [.] sreal::operator/ 1.01% lto1-wpa-stream [kernel.kallsyms] [k] copy_page 0.97% lto1-wpa lto1 [.] fibonacci_heap::consolidate 0.89% lto1-wpa lto1 [.] evaluate_properties_for_edge 0.81% lto1-wpa [kernel.kallsyms] [k] copy_page_range 0.81% lto1-wpa lto1 [.] splay_tree_splay.part.0 0.80% lto1-wpa-stream lto1 [.] lto_output_tree 0.79% lto1-wpa lto1 [.] streamer_read_uhwi 0.75% lto1-wpa lto1 [.] gimple_get_virt_method_for_vtable 0.74% lto1-wpa lto1 [.] edge_badness 0.74% lto1-wpa libc-2.24.so [.] int_mallinfo 0.74% lto1-wpa lto1 [.] rename_statics 0.74% lto1-wpa lto1 [.] possible_polymorphic_call_targets_1 0.69% lto1-wpa lto1 [.] compare_tree_sccs_1 0.67% lto1-wpa libc-2.24.so [.] _int_free 0.65% lto1-wpa lto1 [.] fibonacci_heap::extract_minimum_node 0.61% lto1-wpa lto1 [.] htab_hash_string 0.58% lto1-wpa lto1 [.] ht_lookup_with_hash 0.58% lto1-wpa-stream lto1 [.] linemap_ordinary_map_lookup 0.57% lto1-wpa lto1 [.] streamer_read_tree_bitfields 0.56% lto1-wpa lto1 [.] cgraph_node::get_availability 0.56% lto1-wpa lto1 [.] hash_table, edge_growth_cache_entry*, simple_hashmap_traits >, edge_gr Cutting down record_target_from_binfo and types_same_for_odr is relatively easy (we do not need to walk may edges in unreachable function removal each time). estimate_calls_size_and_time was reduced by this patch (it used to be 1st of lto1-wpa). Bootstrapped/regtested x86_64-linux, will commit it shortly. Honza * ipa-fnsummary.c (ipa_call_context::duplicate_from): New member function. (ipa_call_context::release): Add ALL parameter. (ipa_call_context::equal_to): New member function. * ipa-fnsummary.h (ipa_call_context): Add empty constructor; duplicate_form, release, equal_to and exists_p member functoins. * ipa-inline-analysis.c (node_context_cache_entry): New class. (node_context_summary): Likewise. (node_context_cache, node_context_cache_hit, node_context_cache_miss, node_context_clear): New static vars. (initialize_growth_caches): New function. (free_growth_caches): Also delete node_context_cache; output stats. (do_estimate_edge_time): Cache contexts. (reset_node_cache): New function. * ipa-inline.c (reset_edge_caches): Reset also node cache. (inline_small_functions): Initialize growth caches. * ipa-inline.h (reset_node_cache, initialize_growth_caches): Declare. * ipa-predicate.h (inline_param_summary::equal_to): New. * ipa-prop.c (ipa_agg_jf_item::equal_to): New. * ipa-prop.h (ipa_agg_jf_item): Declare equal_to member function. (ipa_agg_jump_function): Implement equal_to member function. Index: ipa-fnsummary.c ==
[patch, fortran] PR92123 - [F2018/array-descriptor] Scalar allocatable/pointer with array descriptor (via bind(C)): ICE with select rank or error scalar variable with POINTER or ALLOCATABLE in procedu
The attached patch is verging on the obvious. Thanks to Tobias for spotting Vipul's messages on the J3 list. Regtests on FC30/x86_64 - OK for trunk and 9-branch? Paul 2019-11-03 Paul Thomas PR fortran/92123 *decl.c (gfc_verify_c_interop_param): Remove error asserting that pointer or allocatable variables in a bind C procedure are not supported. Delete some trailing spaces. * trans-stmt.c (trans_associate_var): Correct the attempt to treat scalar pointer or allocatable temporaries as if they are array descriptors. 2019-11-03 Paul Thomas PR fortran/92123 * gfortran.dg/bind_c_procs_3.f90 : New test. * gfortran.dg/ISO_Fortran_binding_15.c : New test. * gfortran.dg/ISO_Fortran_binding_15.f90 : Additional source. Index: gcc/fortran/decl.c === *** gcc/fortran/decl.c (revision 277531) --- gcc/fortran/decl.c (working copy) *** gfc_verify_c_interop_param (gfc_symbol * *** 1560,1574 sym->ns->proc_name->name)) retval = false; - if ((sym->attr.allocatable || sym->attr.pointer) && !sym->as) - { - gfc_error ("Scalar variable %qs at %L with POINTER or " - "ALLOCATABLE in procedure %qs with BIND(C) is not yet" - " supported", sym->name, &(sym->declared_at), - sym->ns->proc_name->name); - retval = false; - } - if (sym->attr.optional == 1 && sym->attr.value) { gfc_error ("Variable %qs at %L cannot have both the OPTIONAL " --- 1560,1565 *** gfc_match_entry (void) *** 7547,7553 entry->attr.is_bind_c = 0; loc = entry->old_symbol != NULL ! ? entry->old_symbol->declared_at : gfc_current_locus; gfc_error_now ("BIND(C) attribute at %L can only be used for " "variables or common blocks", &loc); } --- 7538,7544 entry->attr.is_bind_c = 0; loc = entry->old_symbol != NULL ! ? entry->old_symbol->declared_at : gfc_current_locus; gfc_error_now ("BIND(C) attribute at %L can only be used for " "variables or common blocks", &loc); } *** gfc_match_derived_decl (void) *** 10288,10294 } /* In free source form, need to check for TYPE XXX as oppose to TYPEXXX. ! But, we need to simply return for TYPE(. */ if (m == MATCH_NO && gfc_current_form == FORM_FREE) { char c = gfc_peek_ascii_char (); --- 10279,10285 } /* In free source form, need to check for TYPE XXX as oppose to TYPEXXX. ! But, we need to simply return for TYPE(. */ if (m == MATCH_NO && gfc_current_form == FORM_FREE) { char c = gfc_peek_ascii_char (); Index: gcc/fortran/trans-stmt.c === *** gcc/fortran/trans-stmt.c (revision 277531) --- gcc/fortran/trans-stmt.c (working copy) *** trans_associate_var (gfc_symbol *sym, gf *** 1841,1850 if (rank > 0) copy_descriptor (&se.post, se.expr, desc, rank); else ! { ! tmp = gfc_conv_descriptor_data_get (desc); ! gfc_conv_descriptor_data_set (&se.post, se.expr, tmp); ! } /* The dynamic type could have changed too. */ if (sym->ts.type == BT_CLASS) --- 1841,1847 if (rank > 0) copy_descriptor (&se.post, se.expr, desc, rank); else ! gfc_conv_descriptor_data_set (&se.post, se.expr, desc); /* The dynamic type could have changed too. */ if (sym->ts.type == BT_CLASS) Index: gcc/testsuite/gfortran.dg/bind_c_procs_3.f90 === *** gcc/testsuite/gfortran.dg/bind_c_procs_3.f90 (nonexistent) --- gcc/testsuite/gfortran.dg/bind_c_procs_3.f90 (working copy) *** *** 0 --- 1,25 + ! { dg-do run } + ! + ! Test the fix for PR92123, in which 'dat' caused an error with the message + ! "Scalar variable 'dat' at ?? with POINTER or ALLOCATABLE in procedure Fsub + ! with BIND(C) is not yet supported." + ! + ! Contributed by Vipul Parekh + ! + module m +use, intrinsic :: iso_c_binding, only : c_int + contains +subroutine Fsub( dat ) bind(C, name="Fsub") + !.. Argument list + integer(c_int), allocatable, intent(out) :: dat + dat = 42 + return +end subroutine + end module m + +use, intrinsic :: iso_c_binding, only : c_int +use m, only : Fsub +integer(c_int), allocatable :: x +call Fsub( x ) +if (x .ne. 42) stop 1 + end Index: gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c === *** gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c (nonexistent) --- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c (working copy) *** *** 0 --- 1,41 + /* Test the fix for PR92123. */ + + /* Contributed by Vipul Parekh*/ + + #include + #include + #include "../../../l
Re: [patch, fortran] PR92123 - [F2018/array-descriptor] Scalar allocatable/pointer with array descriptor (via bind(C)): ICE with select rank or error scalar variable with POINTER or ALLOCATABLE in pro
On Sun, Nov 03, 2019 at 06:36:59PM +, Paul Richard Thomas wrote: > The attached patch is verging on the obvious. Thanks to Tobias for > spotting Vipul's messages on the J3 list. > > Regtests on FC30/x86_64 - OK for trunk and 9-branch? > OK for both. -- Steve
Re: [patch, fortran] Fix PR 92113
On Sat, Nov 02, 2019 at 10:38:32AM +0100, Thomas Koenig wrote: > > the attached patch fixes an 8/9/10 regression where, to fix PR 84487 > by not putting the initializers and vtabs into the read-only section > (for reasons of size, which could grow enormously) led to a regression > on POWER9 and other non-x86 architectures, where the initializer was > sometimes optimized away, depending on optimization levels. > > This was a strange beast to hunt down. This only showed up on > the testresults for gcc 8, so I tried to find out what commit > had fixed this on trunk, in order to backport. > > However, bisecting this I found that the test case actually > segfaults all the way up to current trunk when run by hand. > By running the testsuite, I didn't see it. This is strange, > and raises some issues about the testsuite and the possibility > of a latent issue, but I lack the knowledge to hunt this down. > > In the meantime, here is this patch, which puts the vtabs and > the initializers where the user actually specified something > into the read-only section again. > > Test case: Well, theoretically it is already there, so it makes > little sense to add a new one. > > Regression-tested on powerpc64le-unknown-linux-gnu, also > verified by hand that pr51434.f90 now passes with -O2 there. > > OK for trunk/9/8? OK for all three. It is, as you have indicated, troublesome that a segfaulting testcase isn't caught by the testsuite. -- Steve
Re: [Patch, Fortran] PR90374 Support d0.d, e0.d, es0.d, en0.d, g0.d
On Fri, Nov 01, 2019 at 03:48:04PM -0700, Jerry DeLisle wrote: > > The attached patch provides frontend and runtime modifications to allow the > subject format specifiers. These are allowed as default behavior and under > -std=f2018. > > It does not implement the ew.de0 specifier. I decided to do that part > separarately since it involves different places in the code. > > I will to a Changlog for the testsuite changes. In summary: > > modified: fmt_error_10.f to allow it to pass. > modified: fmt_error_7.f likewise. > modified: fmt_error_9.f likewise. > new file: fmt_zero_width.f90 to test the new features. > > Regression tested on x86_64-pc-linux-gnu. > > OK for trunk? > OK. --
Re: [patch, fortran] Fix PR 92113
Hi Steve, OK for trunk/9/8? OK for all three. Thanks, committed to trunk as r277760. I'll be AFK for a few days, so I will have to wait before committing this to gcc-9. Given the convoluted history of this bug, this might not be a bad thing. > It is, as you have indicated, troublesome that a segfaulting > testcase isn't caught by the testsuite. It certainly is, but I have no solution for this at the moment. Thanks for the review! Regards Thomas
Improve effectivity of ipa_context_cache
Hi, this, somewhat verbose, patch make ipa-context-cache to only store the information that is actually used by the size/time estimation. In particular we now track what parameters are used in ipa predicates, what parameters are used as targets of indirect calls and what parameters are used for polymorphic calls. So we have 3 new flags or every parameter of function: unsigned used_by_ipa_predicates : 1; unsigned used_by_indirect_call : 1; unsigned used_by_polymorphic_call : 1; This greatly improves effectivity of the cache (from 51% to 75%) and saves a lot of allocations because only info needed is actually stored and compared for equality. As followup I also plan to track what parameters are used by jump function and drop useless jump function prior inlining. That should save some memory and avoid extra legwork updating the info while we know it is not being useful. Martin, I would welcome double-checking that I did not missed any update. Basically the flags are: 1) set by functions recording the indirect call and recording the condition for predicates 2) updated by functions manipulating these once function is inlined 3) set by stream-in code (rather than being streamed as well, since it is trivial to re-compute). Bootstrapped/regtested x86_64-linux, comitted. Honza * ipa-fnsummary.c (set_cond_stmt_execution_predicate, set_switch_stmt_execution_predicate, compute_bb_predicates, will_be_nonconstant_expr_predicate, phi_result_unknown_predicate, analyze_function_body): Pass arround params summary. (ipa_call_context::duplicate_from): New comment; only duplicate useful values. (ipa_call_context::equal_to): Only compare useful values. (remap_edge_summaries): Pass params_summary. (remap_hint_predicate): Likewise. (ipa_merge_fn_summary_after_inlining): Likewise. (inline_read_section): Initialize params summary used flags. * ipa-predicate.c (predicate::remap_after_inlining): Pass around param_summary. (add_condition): Initialized used params summary flags. * ipa-predicate.h (inline_param_summary::equals_to): Make const. (inline_param_summary::useless_p): New predicate. (remap_after_inlining, add_condition): Update prototype * ipa-prop.c (ipa_populate_param_decls): Watch overflow in move_cost. (ipa_note_param_call): Add parameter POLYMORPHIC; update params summaries. (ipa_analyze_indirect_call_uses): Update use of ipa_note_param_call. (ipa_analyze_virtual_call_uses): Likewise. (update_indirect_edges_after_inlining): Update param summaries. (ipa_print_node_params): Print used flags. (ipa_read_indirect_edge_info): Update param summareis. * ipa-prop.h (ipa_param_descriptor): Add used_by_ipa_predicates, used_by_indirect_call and used_by_polymorphic_call. (ipa_set_param_used_by_ipa_predicates, ipa_set_param_used_by_indirect_call, ipa_set_param_used_by_polymorphic_call, ipa_is_param_used_by_ipa_predicates, ipa_is_param_used_by_indirect_call, ipa_is_param_used_by_polymorphic_call): New inline functions. Index: ipa-fnsummary.c === --- ipa-fnsummary.c (revision 277757) +++ ipa-fnsummary.c (working copy) @@ -1312,6 +1312,7 @@ fail: static void set_cond_stmt_execution_predicate (struct ipa_func_body_info *fbi, class ipa_fn_summary *summary, + class ipa_node_params *params_summary, basic_block bb) { gimple *last; @@ -1354,7 +1355,8 @@ set_cond_stmt_execution_predicate (struc && !dominated_by_p (CDI_POST_DOMINATORS, bb, e->dest)) { predicate p - = add_condition (summary, index, param_type, &aggpos, + = add_condition (summary, params_summary, index, +param_type, &aggpos, this_code, gimple_cond_rhs (last), param_ops); e->aux = edge_predicate_pool.allocate (); *(predicate *) e->aux = p; @@ -1387,7 +1389,8 @@ set_cond_stmt_execution_predicate (struc return; FOR_EACH_EDGE (e, ei, bb->succs) if (e->flags & EDGE_FALSE_VALUE) { - predicate p = add_condition (summary, index, param_type, &aggpos, + predicate p = add_condition (summary, params_summary, index, + param_type, &aggpos, predicate::is_not_constant, NULL_TREE); e->aux = edge_predicate_pool.allocate (); *(predicate *) e->aux = p; @@ -1401,6 +1404,7 @@ set_cond_stmt_execution_predicate (struc static void set_switch_stmt_execution_predicate (struct ipa_func_body_info *fbi, class ipa_fn_summary *
Re: [PATCH] combine: Don't generate IF_THEN_ELSE
On Thu, May 9, 2019 at 5:05 PM Segher Boessenkool wrote: > > On all targets I managed to test (21) this results in better code. Only > alpha ends up with slightly bigger code. > > Committing to trunk. This introduced: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92342 Thanks, Andrew Pinski > > > Segher > > > 2019-05-10 Segher Boessenkool > > * combine.c (combine_simplify_rtx): Don't make IF_THEN_ELSE RTL. > > --- > gcc/combine.c | 8 > 1 file changed, 8 deletions(-) > > diff --git a/gcc/combine.c b/gcc/combine.c > index 7b236225..8c4375f 100644 > --- a/gcc/combine.c > +++ b/gcc/combine.c > @@ -5937,14 +5937,6 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, > int in_dest, > mode, > VOIDmode, > cond, cop1), > mode); > - else > - return gen_rtx_IF_THEN_ELSE (mode, > -simplify_gen_relational > (cond_code, > - mode, > - > VOIDmode, > - cond, > - cop1), > -true_rtx, false_rtx); > > code = GET_CODE (x); > op0_mode = VOIDmode; > -- > 1.8.3.1 >
Re: [PR47785] COLLECT_AS_OPTIONS
Thanks for the reviews. On Sat, 2 Nov 2019 at 02:49, H.J. Lu wrote: > > On Thu, Oct 31, 2019 at 6:33 PM Kugan Vivekanandarajah > wrote: > > > > On Wed, 30 Oct 2019 at 03:11, H.J. Lu wrote: > > > > > > On Sun, Oct 27, 2019 at 6:33 PM Kugan Vivekanandarajah > > > wrote: > > > > > > > > Hi Richard, > > > > > > > > Thanks for the review. > > > > > > > > On Wed, 23 Oct 2019 at 23:07, Richard Biener > > > > wrote: > > > > > > > > > > On Mon, Oct 21, 2019 at 10:04 AM Kugan Vivekanandarajah > > > > > wrote: > > > > > > > > > > > > Hi Richard, > > > > > > > > > > > > Thanks for the pointers. > > > > > > > > > > > > > > > > > > > > > > > > On Fri, 11 Oct 2019 at 22:33, Richard Biener > > > > > > wrote: > > > > > > > > > > > > > > On Fri, Oct 11, 2019 at 6:15 AM Kugan Vivekanandarajah > > > > > > > wrote: > > > > > > > > > > > > > > > > Hi Richard, > > > > > > > > Thanks for the review. > > > > > > > > > > > > > > > > On Wed, 2 Oct 2019 at 20:41, Richard Biener > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > On Wed, Oct 2, 2019 at 10:39 AM Kugan Vivekanandarajah > > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > Hi, > > > > > > > > > > > > > > > > > > > > As mentioned in the PR, attached patch adds > > > > > > > > > > COLLECT_AS_OPTIONS for > > > > > > > > > > passing assembler options specified with -Wa, to the > > > > > > > > > > link-time driver. > > > > > > > > > > > > > > > > > > > > The proposed solution only works for uniform -Wa options > > > > > > > > > > across all > > > > > > > > > > TUs. As mentioned by Richard Biener, supporting non-uniform > > > > > > > > > > -Wa flags > > > > > > > > > > would require either adjusting partitioning according to > > > > > > > > > > flags or > > > > > > > > > > emitting multiple object files from a single LTRANS CU. We > > > > > > > > > > could > > > > > > > > > > consider this as a follow up. > > > > > > > > > > > > > > > > > > > > Bootstrapped and regression tests on arm-linux-gcc. Is > > > > > > > > > > this OK for trunk? > > > > > > > > > > > > > > > > > > While it works for your simple cases it is unlikely to work > > > > > > > > > in practice since > > > > > > > > > your implementation needs the assembler options be present at > > > > > > > > > the link > > > > > > > > > command line. I agree that this might be the way for people > > > > > > > > > to go when > > > > > > > > > they face the issue but then it needs to be documented > > > > > > > > > somewhere > > > > > > > > > in the manual. > > > > > > > > > > > > > > > > > > That is, with COLLECT_AS_OPTION (why singular? I'd expected > > > > > > > > > COLLECT_AS_OPTIONS) available to cc1 we could stream this > > > > > > > > > string > > > > > > > > > to lto_options and re-materialize it at link time (and > > > > > > > > > diagnose mismatches > > > > > > > > > even if we like). > > > > > > > > OK. I will try to implement this. So the idea is if we provide > > > > > > > > -Wa,options as part of the lto compile, this should be available > > > > > > > > during link time. Like in: > > > > > > > > > > > > > > > > arm-linux-gnueabihf-gcc -march=armv7-a -mthumb -O2 -flto > > > > > > > > -Wa,-mimplicit-it=always,-mthumb -c test.c > > > > > > > > arm-linux-gnueabihf-gcc -flto test.o > > > > > > > > > > > > > > > > I am not sure where should we stream this. Currently, > > > > > > > > cl_optimization > > > > > > > > has all the optimization flag provided for compiler and it is > > > > > > > > autogenerated and all the flags are integer values. Do you have > > > > > > > > any > > > > > > > > preference or example where this should be done. > > > > > > > > > > > > > > In lto_write_options, I'd simply append the contents of > > > > > > > COLLECT_AS_OPTIONS > > > > > > > (with -Wa, prepended to each of them), then recover them in > > > > > > > lto-wrapper > > > > > > > for each TU and pass them down to the LTRANS compiles (if they > > > > > > > agree > > > > > > > for all TUs, otherwise I'd warn and drop them). > > > > > > > > > > > > Attached patch streams it and also make sure that the options are > > > > > > the > > > > > > same for all the TUs. Maybe it is a bit restrictive. > > > > > > > > > > > > What is the best place to document COLLECT_AS_OPTIONS. We don't seem > > > > > > to document COLLECT_GCC_OPTIONS anywhere ? > > > > > > > > > > Nowhere, it's an implementation detail then. > > > > > > > > > > > Attached patch passes regression and also fixes the original ARM > > > > > > kernel build issue with tumb2. > > > > > > > > > > Did you try this with multiple assembler options? I see you stream > > > > > them as -Wa,-mfpu=xyz,-mthumb but then compare the whole > > > > > option strings so a mismatch with -Wa,-mthumb,-mfpu=xyz would be > > > > > diagnosed. If there's a spec induced -Wa option do we get to see > > > > > that as well? I can imagine -march=xyz enabling a -Wa option > > > > > for example. > > > > > > > > > > + *collect_as = XNEWVEC (cha
[PATCH v2] PR92090: Fix testcase failures by r276469
-finline-functions is enabled by default for O2 since r276469, update the test cases with -fno-inline-functions. v2: disable inlining for the failed cases. Add two more failed cases not listed in BZ. Tested on P8LE, P8BE and P9LE. gcc/testsuite/ChangeLog: 2019-10-30 Xiong Hu Luo PR92090 * g++.dg/tree-ssa/pr61034.C: Add -fno-inline-functions --param max-inline-insns-single-O2=200. * gcc.dg/atomic/c11-atomic-exec-5.c: Likewise. * gcc.target/powerpc/pr72804.c: Likewie. * gcc.target/powerpc/pr79439-1.c: Add -fno-inline-functions. * gcc.target/powerpc/vsx-builtin-7.c: Likewise. --- gcc/testsuite/g++.dg/tree-ssa/pr61034.C | 2 +- gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr72804.c | 2 +- gcc/testsuite/gcc.target/powerpc/pr79439-1.c | 2 +- gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C index 2e3dfecacb4..338de1ebb13 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C @@ -1,5 +1,5 @@ // { dg-do compile } -// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14" } +// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14 -fno-inline-functions --param max-inline-insns-single-O2=200" } #define assume(x) if(!(x))__builtin_unreachable() diff --git a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c index 692c64ad207..0d70a769fdd 100644 --- a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c +++ b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c @@ -3,7 +3,7 @@ iterations of the compare-and-exchange loop are needed, exceptions get properly cleared). */ /* { dg-do run } */ -/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE -D_POSIX_C_SOURCE=200809L" } */ +/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE -D_POSIX_C_SOURCE=200809L -fno-inline-functions --param max-inline-insns-single-O2=200" } */ /* { dg-add-options ieee } */ /* { dg-additional-options "-mfp-trap-mode=sui" { target alpha*-*-* } } */ /* { dg-additional-options "-D_XOPEN_SOURCE=600" { target *-*-solaris2* } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c b/gcc/testsuite/gcc.target/powerpc/pr72804.c index b83b6350d75..0fc3df1d89b 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr72804.c +++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c @@ -1,6 +1,6 @@ /* { dg-do compile { target { lp64 } } } */ /* { dg-skip-if "" { powerpc*-*-darwin* } } */ -/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-require-effective-target powerpc_vsx_ok -fno-inline-functions --param max-inline-insns-single-O2=200 } */ /* { dg-options "-O2 -mvsx" } */ __int128_t diff --git a/gcc/testsuite/gcc.target/powerpc/pr79439-1.c b/gcc/testsuite/gcc.target/powerpc/pr79439-1.c index 5732a236c8e..539c96f571e 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr79439-1.c +++ b/gcc/testsuite/gcc.target/powerpc/pr79439-1.c @@ -1,5 +1,5 @@ /* { dg-do compile { target { powerpc*-*-linux* && lp64 } } } */ -/* { dg-options "-O2 -fpic -fno-reorder-blocks" } */ +/* { dg-options "-O2 -fpic -fno-reorder-blocks -fno-inline-functions" } */ /* On the Linux 64-bit ABIs, we eliminate NOP in the 'rec' call even if -fpic is used. The recursive call should call the local alias. The diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c index 5d31309f272..0780b01ffab 100644 --- a/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c +++ b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c @@ -1,7 +1,7 @@ /* { dg-do compile { target { powerpc*-*-* } } } */ /* { dg-skip-if "" { powerpc*-*-darwin* } } */ /* { dg-require-effective-target powerpc_vsx_ok } */ -/* { dg-options "-O2 -mdejagnu-cpu=power7" } */ +/* { dg-options "-O2 -mdejagnu-cpu=power7 -fno-inline-functions" } */ /* Test simple extract/insert/slat operations. Make sure all types are supported with various options. */ -- 2.21.0.777.g83232e3864
Re: [PATCH] Add explicit description for -finline
On 2019/11/2 00:23, Joseph Myers wrote: > On Thu, 31 Oct 2019, Xiong Hu Luo wrote: > >> +@code{-finline} enables inlining of function declared \"inline\". >> +@code{-finline} is enabled at levels -O1, -O2, -O3 and -Os, but not -Og. > > Use @option{} to mark up option names (both -finline and all the -O > options in this paragraph). Use @code{} to mark up keyword names, not > \"\". > Thanks. So shall I commit the tiny patch with below updates? diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 1407d019d14..ea0d407fe11 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -8576,6 +8576,10 @@ optimizing. Single functions can be exempted from inlining by marking them with the @code{noinline} attribute. +@option{-finline} enables inlining of function declared @code{inline}. +@option{-finline} is enabled at levels @option{-O1}, @option{-O2}, @option{-O3} +and @option{-Os}, but not @option{-Og}. + @item -finline-small-functions @opindex finline-small-functions Integrate functions into their callers when their body is smaller than expected
[PATCH V3] rs6000: Refine small loop unroll in loop_unroll_adjust hook
Hi, In this patch, loop unroll adjust hook is introduced for powerpc. We can do target related hueristic adjustment in this hook. In this patch, small loops is unrolled 2 times for O2 and O3 by default. With this patch, we can see some improvement for spec2017. This patch enhanced a little for [Patch V2] to enable small loops unroll for O3 by default like O2. Bootstrapped and regtested on powerpc64le. Is this ok for trunk? Jiufu BR. gcc/ 2019-11-04 Jiufu Guo PR tree-optimization/88760 * config/rs6000/rs6000.c (rs6000_option_override_internal): Remove code which changes PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS. (TARGET_LOOP_UNROLL_ADJUST): Add loop unroll adjust hook. (rs6000_loop_unroll_adjust): New hook for loop unroll adjust. Unrolling small loop 2 times for -O2 and -O3. (rs6000_function_specific_save): Save unroll_small_loops flag. (rs6000_function_specific_restore): Restore unroll_small_loops flag. * gcc/config/rs6000/rs6000.opt (unroll_small_loops): New internal flag. gcc.testsuite/ 2019-11-04 Jiufu Guo PR tree-optimization/88760 * gcc.dg/pr59643.c: Update back to r277550. --- gcc/config/rs6000/rs6000.c | 38 -- gcc/config/rs6000/rs6000.opt | 7 +++ gcc/testsuite/gcc.dg/pr59643.c | 3 --- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 9ed5151..5e1a75d 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1428,6 +1428,9 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_VECTORIZE_DESTROY_COST_DATA #define TARGET_VECTORIZE_DESTROY_COST_DATA rs6000_destroy_cost_data +#undef TARGET_LOOP_UNROLL_ADJUST +#define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust + #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS rs6000_init_builtins #undef TARGET_BUILTIN_DECL @@ -4540,25 +4543,20 @@ rs6000_option_override_internal (bool global_init_p) global_options.x_param_values, global_options_set.x_param_values); - /* unroll very small loops 2 time if no -funroll-loops. */ + /* If funroll-loops is not enabled explicitly, then enable small loops +unrolling for -O2, and do not turn fweb or frename-registers on. */ if (!global_options_set.x_flag_unroll_loops && !global_options_set.x_flag_unroll_all_loops) { - maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2, -global_options.x_param_values, -global_options_set.x_param_values); - - maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20, -global_options.x_param_values, -global_options_set.x_param_values); + unroll_small_loops = optimize >= 2 ? 1 : 0; - /* If fweb or frename-registers are not specificed in command-line, -do not turn them on implicitly. */ if (!global_options_set.x_flag_web) global_options.x_flag_web = 0; if (!global_options_set.x_flag_rename_registers) global_options.x_flag_rename_registers = 0; } + else + unroll_small_loops = 0; /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) can be optimized to @@ -5101,6 +5099,24 @@ rs6000_destroy_cost_data (void *data) free (data); } +/* Implement targetm.loop_unroll_adjust. */ + +static unsigned +rs6000_loop_unroll_adjust (unsigned nunroll, struct loop * loop) +{ + if (unroll_small_loops) +{ + /* TODO: This is hardcoded to 10 right now. It can be refined, for +example we may want to unroll very small loops more times (4 perhaps). +We also should use a PARAM for this. */ + if (loop->ninsns <= 10) + return MIN (2, nunroll); + else + return 0; +} + return nunroll; +} + /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a library with vectorized intrinsics. */ @@ -23472,6 +23488,7 @@ rs6000_function_specific_save (struct cl_target_option *ptr, { ptr->x_rs6000_isa_flags = opts->x_rs6000_isa_flags; ptr->x_rs6000_isa_flags_explicit = opts->x_rs6000_isa_flags_explicit; + ptr->x_unroll_small_loops = opts->x_unroll_small_loops; } /* Restore the current options */ @@ -23483,6 +23500,7 @@ rs6000_function_specific_restore (struct gcc_options *opts, { opts->x_rs6000_isa_flags = ptr->x_rs6000_isa_flags; opts->x_rs6000_isa_flags_explicit = ptr->x_rs6000_isa_flags_explicit; + opts->x_unroll_small_loops = ptr->x_unroll_small_loops; (void) rs6000_option_override_internal (false); } diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 1f37a92..9cd5b4e 100644 --- a/gcc/config/rs6000/rs6000.opt +++
[PATCH, rs6000] Make load cost more in vectorization cost for P8/P9
Hi, To align with rs6000_insn_cost costing more for load type insns, this patch is to make load insns cost more in vectorization cost function. Considering that the result of load usually is used somehow later (true-dep) but store won't, we keep the store as before. The SPEC2017 performance evaluation on Power8 shows 525.x264_r +9.56%, 511.povray_r +2.08%, 527.cam4_r 1.16% gains, no significant degradation, SPECINT geomean +0.88%, SPECFP geomean +0.26%. The SPEC2017 performance evaluation on Power9 shows no significant improvement or degradation, SPECINT geomean +0.04%, SPECFP geomean +0.04%. The SPEC2006 performance evaluation on Power8 shows 454.calculix +4.41% gain but 416.gamess -1.19% and 453.povray -3.83% degradation. I looked into the two degradation bmks, the degradation were NOT due to hotspot changes by vectorization, were all side effects. SPECINT geomean +0.10%, SPECFP geomean no changed considering the degradation. Bootstrapped and regress tested on powerpc64le-linux-gnu. Is OK for trunk? BR, Kewen --- gcc/ChangeLog 2019-11-04 Kewen Lin * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Make scalar_load, vector_load, unaligned_load and vector_gather_load cost a bit more on Power8 and up. diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 5876714..876c7ef 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4763,15 +4763,22 @@ rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, switch (type_of_cost) { case scalar_stmt: - case scalar_load: case scalar_store: case vector_stmt: - case vector_load: case vector_store: case vec_to_scalar: case scalar_to_vec: case cond_branch_not_taken: return 1; + case scalar_load: + case vector_load: + /* Like rs6000_insn_cost, make load insns cost a bit more. FIXME: the + benefits were observed on Power8 and up, we can unify it if similar + profits are measured on Power6 and Power7. */ + if (TARGET_P8_VECTOR) + return 2; + else + return 1; case vec_perm: /* Power7 has only one permute unit, make it a bit expensive. */ @@ -4792,8 +4799,9 @@ rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case unaligned_load: case vector_gather_load: + /* Like rs6000_insn_cost, make load insns cost a bit more. */ if (TARGET_EFFICIENT_UNALIGNED_VSX) - return 1; + return 2; if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) { @@ -4827,7 +4835,13 @@ rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* Misaligned loads are not supported. */ gcc_unreachable (); -return 2; + /* Like rs6000_insn_cost, make load insns cost a bit more. FIXME: the + benefits were observed on Power8 and up, we can unify it if similar + profits are measured on Power6 and Power7. */ + if (TARGET_P8_VECTOR) + return 4; + else + return 2; case unaligned_store: case vector_scatter_store: