[AARCH64] AArch64 backport to 4.7
[AARCH64] AArch64 backport to 4.7. This series of patches implements a back port of the AArch64 backend to gcc-4.7. This patch series is not intended to be applied directly to the gcc-4_7-branch branch. We have pushed a new branch into SVN to host this back port, located at: svn://gcc.gnu.org/gcc/branches/ARM/aarch64-4.7-branch This branch will track 4.7. Patches to this branch should be tagged "[AARCH64-4.7]" and must be approved by ARM personnel. The branch was constructed using the following steps: . Copied gcc-4_7-branch. . Copied the AArch64 port from ARM/aarch64-branch. . Applied the attached AArch64 backport patch. . Applied the attached backport patch to implement integer iterators. . Updated config.sub and config.guess from upstream gnuconfig. Thank you aarch64-backport.patch Description: Binary data aarch64-int-iterators-backport.patch Description: Binary data
[SH] Correct address cost estimations
Hello, This corrects the address cost estimations for SH. Tested on rev 191161 with make -k check RUNTESTFLAGS="--target_board=sh-sim \{-m2/-ml,-m2/-mb,-m2a/-mb,-m4/-ml,-m4/-mb,-m4a/-ml,-m4a/-mb}" and no new failures. With this applied CSiBE shows a total size decrease of 4668 bytes for '-O2 -m4-single -ml -mpretend-cmove'. OK? Cheers, Oleg ChangeLog: * config/sh/sh.c (sh_rtx_costs): Add handling of MEM, SIGN_EXTEND, ZERO_EXTEND and PARALLEL cases. (sh_address_cost): Correct rtx parsing and tweak cost estimations. Index: gcc/config/sh/sh.c === --- gcc/config/sh/sh.c (revision 191161) +++ gcc/config/sh/sh.c (working copy) @@ -3196,6 +3196,78 @@ } return false; +/* The cost of a mem access is mainly the cost of the address mode. */ +case MEM: + *total = sh_address_cost (XEXP (x, 0), GET_MODE (x), MEM_ADDR_SPACE (x), +true); + return true; + +/* The cost of a sign or zero extend depends on whether the source is a + reg or a mem. In case of a mem take the address into acount. */ +case SIGN_EXTEND: + if (REG_P (XEXP (x, 0))) + { + *total = COSTS_N_INSNS (1); + return true; + } + if (MEM_P (XEXP (x, 0))) + { + *total = sh_address_cost (XEXP (XEXP (x, 0), 0), +GET_MODE (XEXP (x, 0)), +MEM_ADDR_SPACE (XEXP (x, 0)), true); + return true; + } + return false; + +case ZERO_EXTEND: + if (REG_P (XEXP (x, 0))) + { + *total = COSTS_N_INSNS (1); + return true; + } + else if (TARGET_SH2A && MEM_P (XEXP (x, 0)) + && (GET_MODE (XEXP (x, 0)) == QImode + || GET_MODE (XEXP (x, 0)) == HImode)) + { + /* Handle SH2A's movu.b and movu.w insn. */ + *total = sh_address_cost (XEXP (XEXP (x, 0), 0), +GET_MODE (XEXP (x, 0)), +MEM_ADDR_SPACE (XEXP (x, 0)), true); + return true; + } + return false; + +/* mems for SFmode and DFmode can be inside a parallel due to + the way the fpscr is handled. */ +case PARALLEL: + for (int i = 0; i < XVECLEN (x, 0); i++) + { + rtx xx = XVECEXP (x, 0, i); + if (GET_CODE (xx) == SET && MEM_P (XEXP (xx, 0))) + { + *total = sh_address_cost (XEXP (XEXP (xx, 0), 0), + GET_MODE (XEXP (xx, 0)), + MEM_ADDR_SPACE (XEXP (xx, 0)), true); + return true; + } + if (GET_CODE (xx) == SET && MEM_P (XEXP (xx, 1))) + { + *total = sh_address_cost (XEXP (XEXP (xx, 1), 0), + GET_MODE (XEXP (xx, 1)), + MEM_ADDR_SPACE (XEXP (xx, 1)), true); + return true; + } + } + + if (sh_1el_vec (x, VOIDmode)) + *total = outer_code != SET; + else if (sh_rep_vec (x, VOIDmode)) + *total = ((GET_MODE_UNIT_SIZE (GET_MODE (x)) + 3) / 4 + + (outer_code != SET)); + else + *total = COSTS_N_INSNS (3) + (outer_code != SET); + return true; + case CONST_INT: if (TARGET_SHMEDIA) { @@ -3271,7 +3343,10 @@ else *total = 10; return true; + case CONST_VECTOR: +/* FIXME: This looks broken. Only the last statement has any effect. + Probably this could be folded with the PARALLEL case? */ if (x == CONST0_RTX (GET_MODE (x))) *total = 0; else if (sh_1el_vec (x, VOIDmode)) @@ -3339,15 +3414,6 @@ *total = COSTS_N_INSNS (20); return true; -case PARALLEL: - if (sh_1el_vec (x, VOIDmode)) - *total = outer_code != SET; - if (sh_rep_vec (x, VOIDmode)) - *total = ((GET_MODE_UNIT_SIZE (GET_MODE (x)) + 3) / 4 - + (outer_code != SET)); - *total = COSTS_N_INSNS (3) + (outer_code != SET); - return true; - case FLOAT: case FIX: *total = 100; @@ -3430,36 +3496,47 @@ /* Compute the cost of an address. */ static int -sh_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED, +sh_address_cost (rtx x, enum machine_mode mode, addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED) { + /* Simple reg, post-inc, pre-dec addressing. */ + if (REG_P (x) || GET_CODE (x) == POST_INC || GET_CODE (x) == PRE_DEC) +return 1; + /* 'reg + disp' addressing. */ - if (satisfies_constraint_Sdd (x)) + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) { - const HOST_WIDE_INT offset = disp_addr_displacement (x); - const enum machine_mode mode = GET_MODE (x); + const HOST_WIDE_INT offset = INTVAL (XEXP (x, 1)); - /* The displacement would fit into a 2 byte move insn. */ + if (offset == 0) + return 1; + + /* The displacement would fit into a 2 byte move insn. + HImode and QImode loads/stores with displacement put pressure on + R0 which will most likely require another reg copy. Thus account + a higher cost for that. */ if (offset > 0 && offset <= max_mov_insn_displacement (mode, false)) - return 0; + return (mode == HImode || mode == QImode) ? 2 : 1; /* The displacement would fit into a 4
Re: Obsolete picochip-* in 4.7.2+
On Tue, Sep 11, 2012 at 5:21 PM, Jakub Jelinek wrote: > Hi! > > As discussed on IRC, the picochip-* port doesn't have an active maintainer > anymore, this patch adds it to deprecated ports for 4.7.2+ so that it can be > removed in > GCC 4.8 unless somebody steps up to maintain it. > > Ok for trunk/4.7? Ok. Thanks, Richard. > 2012-09-11 Jakub Jelinek > > * config.gcc: Obsolete picochip-*. > > --- gcc/config.gcc 2012-09-05 14:52:14.428548941 +0200 > +++ gcc/config.gcc 2012-09-11 17:05:15.147522191 +0200 > @@ -245,7 +245,8 @@ md_file= > > # Obsolete configurations. > case ${target} in > - score-* \ > + picochip-* \ > + | score-* \ > ) > if test "x$enable_obsolete" != xyes; then >echo "*** Configuration ${target} is obsolete." >&2 > > --- gcc-4.7/changes.html10 Aug 2012 16:25:46 - 1.124 > +++ gcc-4.7/changes.html11 Sep 2012 15:15:38 - > @@ -29,7 +29,14 @@ > next release of GCC will have their sources permanently > removed. > > -The following ports for individual systems on > +All GCC ports for the following processor > +architectures have been declared obsolete: > + > + > + picoChip (picochip-*) > + > + > +The following ports for individual systems on > particular architectures have been obsoleted: > > > > > Jakub
Re: [PATCH,mmix] convert to constraints.md
- Original Message - > Nathan, again thanks. There are a few minor tweaks compared to your > version: Thanks for fixing this up! > - Keeping old layout of "mmix_reg_or_8bit_operand". That looked like > a spurious change and I prefer the ior construct to the > if_then_else. ISTR without this change, there were lots of assembly changes like: set rx, 6 cmp rz, ry, rx instead of the previous and better: cmp rz, ry, 6 (apologies if the assembly syntax isn't correct; hopefully the intent is clear) but if you double-checked that the assembly didn't change after your changes, maybe something else that you tweaked fixed this. > - Replacing undefined-constraint-"H" with "I" instead of removing it. > It was either renamed early or a genuine typo. Good catch. Hard not to see it; the gen* machinery complains about undefined constraints. :) -Nathan
Re: [C++ Patch] Remove uses of ATTRIBUTE_UNUSED in the function parameters
On Tue, Sep 11, 2012 at 5:37 PM, Jakub Jelinek wrote: > On Tue, Sep 11, 2012 at 05:29:12PM +0200, Paolo Carlini wrote: >> PS: slightly interesting, in a couple of cases - >> write_unnamed_type_name, wrap_cleanups_r - the parameters were >> actually used. > > Just a general comment, often an argument is only conditionally used, > e.g. depending on some preprocessor macro (e.g. target hook). In that > case unnamed parameter is not an option, but dropping ATTRIBUTE_UNUSED is > not desirable either. And note that we have ARG_UNUSED for parameters - to cope with older compilers not handling attributes here too well (I run into this when using gcc 3.3 as host compiler). Richard. > Jakub
Re: Backtrace library [3/3]
On 09/12/2012 12:55 AM, Ian Lance Taylor wrote: I have finished the initial implementation of the backtrace library I proposed at http://gcc.gnu.org/ml/gcc/2012-08/msg00317.html . I've separated the work into three patches. These patches only implement the backtrace library itself; actual use of the library will follow in separate patches. I'm trying to add a few comments below. I hope Thunderbird does not garble them too much. +backtrace_open (const char *filename, backtrace_error_callback error_callback, + void *data) +{ + int descriptor; + + descriptor = open (filename, O_RDONLY | O_CLOEXEC); + if (descriptor < 0) +{ + error_callback (data, filename, errno); + return -1; +} + if (O_CLOEXEC == 0) +{ + /* It doesn't matter if this fails for some reason. */ + fcntl (descriptor, F_SETFD, FD_CLOEXEC); +} You should call fcntl unconditionally. O_CLOEXEC might be non-zero during build, but could still be ignored by the kernel. +static void +fileline_initialize (backtrace_error_callback error_callback, void *data) +{ ... + if (executable_name != NULL) +descriptor = backtrace_open (executable_name, error_callback, data); + else +descriptor = backtrace_open ("/proc/self/exe", error_callback, data); You should try getauxval(AT_EXECFN) as well (needs recent glibc), so that this works with a mounted /proc. This library should only be used when getauxval(AT_SECURE) is zero, so that the program doesn't try to read files with elevated privileges to which the original user wouldn't have access. I don't think this has to be addressed within the library itself. Adding /usr/lib/debug support shouldn't be too hard, I will try to figure out the required path transformations (which are somewhat system-specific). -- Florian Weimer / Red Hat Product Security Team
Re: [PATCH] Combine location with block using block_locations
On Tue, Sep 11, 2012 at 5:32 PM, Michael Matz wrote: > Hi, > > On Tue, 11 Sep 2012, Dehao Chen wrote: > >> Looks like we have two choices: >> >> 1. Stream out block info, and use LTO_SET_PREVAIL for TREE_CHAIN(t) > > This will actually not work correctly in some cases. The problem is, if > the prevailing decl is already part of another chain (say in another > block_var list) you would break the current chain. Hence block vars need > special handling in the lto streamer (another reason why tree_chain is not > the most clever think to use for this chain). This problem area needs to > be solved somehow if block info is to be preserved correctly. Well. The issue is that at present we stream BLOCKs in the function section via its DECL_INTIAL. Which means we _never_ should get a non-prevailing DECL in BLOCK_VARS. You need to debug why that doesn't work anymore. Possibly the BLOCK leaks into decls it should not leak to via the location mechanism? >> 2. Don't stream out block info for LTO, and still call LTO_NO_PREVAIL >> (TREE_CHAIN (t)). > > That's also a large hammer as it basically will mean no debug info after > LTO :-/ Sigh, at this point I have no good solution that doesn't involve > quite some work, perhaps your hack is good enough for the time being, > though I hate it :) It hides a bug. If we replace anything in BLOCK_VARS then the risk is that you generate an infinite chain in some BLOCK_VARS list and thus get infinite loops somewhere in the compiler. So, no, that's not an option. Richard. > > Ciao, > Michael.
Re: [PATCH] Combine location with block using block_locations
On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: > Now I think we are facing a more complex problem. The data structure > we use to store the location_adhoc_data are file-static in linemap.c > in libcpp. These data structures are not guarded by GTY(()). > Meanwhile, as we have removed the block data structure from > gimple.gsbase as well as tree.exp (encoding them into an location_t). > This could cause block being GCed and the LOCATION_BLOCK becoming > dangling pointers. Uh. Note that it is quite important that we are able to garbage-collect unused BLOCKs, this is the whole point of removing unused BLOCK scopes in remove_unused_locals. So this indeed becomes much more complicated ... What would be desired is that the garbage collector can NULL an entry in the mapping table when it is not referenced in any other way (that other reference would be the BLOCK tree as stored in a FUNCTION_DECLs DECL_INITIAL). > I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from > gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can > help me. > > Another approach would be guard the location_adhoc_data and related > data structures in GTY(()). However, this is non-trivial because tree > is not visible in libcpp. At the same time, my implementation heavily > relies on hashtable to make the code efficient, thus it's quite tricky > to make "param_is" and "use_params" work. > > The final approach, which I'll try tomorrow, would be move all my > implementation from libcpp to gcc, and guard them with GTY(()). I > still haven't thought of any potential problem of this approach. Any > comments? I think moving the mapping to GC in a lazy manner as I described above would be the way to go. For hashtables GC already supports if_marked, not sure if similar support is available for arrays/vecs. Richard. > Thanks, > Dehao > > On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: >> I saw comments in tree-streamer-out.c: >> >> /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug information >> for early inlining so drop it on the floor instead of ICEing in >> dwarf2out.c. */ >> streamer_write_chain (ob, BLOCK_VARS (expr), ref_p); >> >> However, what the code is doing seemed contradictory with the comment. >> Or am I missing something? >> >> >> >> On Tue, Sep 11, 2012 at 8:32 AM, Michael Matz wrote: >>> Hi, >>> >>> On Tue, 11 Sep 2012, Dehao Chen wrote: >>> Looks like we have two choices: 1. Stream out block info, and use LTO_SET_PREVAIL for TREE_CHAIN(t) >>> >>> This will actually not work correctly in some cases. The problem is, if >>> the prevailing decl is already part of another chain (say in another >>> block_var list) you would break the current chain. Hence block vars need >>> special handling in the lto streamer (another reason why tree_chain is not >>> the most clever think to use for this chain). This problem area needs to >>> be solved somehow if block info is to be preserved correctly. >>> 2. Don't stream out block info for LTO, and still call LTO_NO_PREVAIL (TREE_CHAIN (t)). >>> >>> That's also a large hammer as it basically will mean no debug info after >>> LTO :-/ Sigh, at this point I have no good solution that doesn't involve >>> quite some work, perhaps your hack is good enough for the time being, >>> though I hate it :) >> >> I got it. Then I'll keep the patch as it is (remove the >> LTO_NO_PREVAIL), and work with Honza to resolve the issue he had, and >> then we should be good to check in? >> >> Thanks, >> Dehao >> >>> >>> >>> Ciao, >>> Michael.
[v3, build] Clear hardware capabilities on libstdc++.so with Sun as
Since the use of rdrand was introduced in src/c++11/random.cc, all execution tests involving libstdc++.so.6 fail on Solaris 10 and 11/x86 with a sufficiently recent native assembler that supports rdrand: either Solaris 10/x86 patch 119961-11 or Solaris 11.1 builds (haven't checked which one). The problem is that as tags src/c++11/random.o as needing RDRAND support: % file random.o random.o: ELF 32-bit LSB relocatable 80386 Version 1 [CMOV RDRAND] which is propagated to libstdc++.so.6 and causes the runtime linker to fail the execution if the hardware doesn't support that hardware capability, although rdrand is executed only if the cpuid indicates the support is present. The usual solution so far has been to clear the hardware capability using a linker map (as in libitm, cf. libitm/clearcap.map). Unfortunately, this doesn't work here: as can be seen with elfdump, RDRAND is set in a second mask (CA_SUNW_HW_2) since all bits in CA_SUNW_HW_1 are already used: % elfdump -H random.o Capabilities Section: .SUNW_cap Object Capabilities: index tag value [0] CA_SUNW_HW_1 0x20 [ CMOV ] [1] CA_SUNW_HW_2 0x1 [ RDRAND ] The old (v1) linker map syntax has no support for clearing that bit/mask, and while the v2 map syntax does, we cannot universally assume it's present on Solaris 10: while recent linker patches include it, older ones don't and ld and as can be updated independently. So I've settled for a different solution instead: Sun assemblers with hardware capability support also have a -nH switch to suppress their generation, thus I'm testing for that and use it if possible. This is exactly what the following patch does. Bootstrapped without regressions on i386-pc-solaris2.1[01] with affected versions of Sun as and gas 2.22. Results with gas are unchanged, with Sun as all failures are gone. x86_64-unknown-linux-gnu bootstrap is running to make sure nothing breaks there. Ok for mainline if that passes? Rainer 2012-09-11 Rainer Orth * acinclude.m4 (GLIBCXX_CHECK_ASSEMBLER_HWCAP): Define. * configure.ac: Call GLIBCXX_CHECK_ASSEMBLER_HWCAP. * fragment.am (CONFIG_CXXFLAGS): Add $(HWCAP_FLAGS). * configure: Regenerate. * Makefile.in: Regenerate. * doc/Makefile.in: Regenerate. * include/Makefile.in: Regenerate. * libsupc++/Makefile.in: Regenerate. * po/Makefile.in: Regenerate. * python/Makefile.in: Regenerate. * src/Makefile.in: Regenerate. * src/c++11/Makefile.in: Regenerate. * src/c++98/Makefile.in: Regenerate. * testsuite/Makefile.in: Regenerate. # HG changeset patch # Parent cc6aab46be72c37bfdfccd786ed5c332a7ce4cd9 Clear hardware capabilities on libstdc++.so with Sun as diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4 --- a/libstdc++-v3/acinclude.m4 +++ b/libstdc++-v3/acinclude.m4 @@ -169,6 +169,32 @@ AC_DEFUN([GLIBCXX_CHECK_COMPILER_FEATURE dnl +dnl Check if the assembler used supports disabling generation of hardware +dnl capabilities. This is only supported by Sun as at the moment. +dnl +dnl Defines: +dnl HWCAP_FLAGS='-Wa,-nH' if possible. +dnl +AC_DEFUN([GLIBCXX_CHECK_ASSEMBLER_HWCAP], [ + test -z "$HWCAP_FLAGS" && HWCAP_FLAGS='' + + ac_save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS -Wa,-nH" + + AC_MSG_CHECKING([for as that supports -Wa,-nH]) + AC_TRY_COMPILE([], [return 0;], [ac_hwcap_flags=yes],[ac_hwcap_flags=no]) + if test "$ac_hwcap_flags" = "yes"; then +HWCAP_FLAGS="-Wa,-nH $HWCAP_FLAGS" + fi + AC_MSG_RESULT($ac_hwcap_flags) + + CFLAGS="$ac_save_CFLAGS" + + AC_SUBST(HWCAP_FLAGS) +]) + + +dnl dnl If GNU ld is in use, check to see if tricky linker opts can be used. If dnl the native linker is in use, all variables will be defined to something dnl safe (like an empty string). diff --git a/libstdc++-v3/configure.ac b/libstdc++-v3/configure.ac --- a/libstdc++-v3/configure.ac +++ b/libstdc++-v3/configure.ac @@ -333,6 +333,9 @@ case "$target" in esac GLIBCXX_CONDITIONAL(GLIBCXX_LDBL_COMPAT, test $ac_ldbl_compat = yes) +# Check if assembler supports disabling hardware capability support. +GLIBCXX_CHECK_ASSEMBLER_HWCAP + # Check if assembler supports rdrand opcode. GLIBCXX_CHECK_X86_RDRAND diff --git a/libstdc++-v3/fragment.am b/libstdc++-v3/fragment.am --- a/libstdc++-v3/fragment.am +++ b/libstdc++-v3/fragment.am @@ -22,7 +22,7 @@ endif # These bits are all figured out from configure. Look in acinclude.m4 # or configure.ac to see how they are set. See GLIBCXX_EXPORT_FLAGS. CONFIG_CXXFLAGS = \ - $(SECTION_FLAGS) $(EXTRA_CXX_FLAGS) -frandom-seed=$@ + $(SECTION_FLAGS) $(HWCAP_FLAGS) $(EXTRA_CXX_FLAGS) -frandom-seed=$@ WARN_CXXFLAGS = \ $(WARN_FLAGS) $(WERROR_FLAG) -fdiagnostics-show-location=once -- - Rainer Orth, Center for Biotechnology, Bielefeld University
Re: [Fortran, Patch] PR 54225 - Fix ice-on-invalid-code with "*" in array refs
On 11/09/2012 07:54, Tobias Burnus wrote: > This patch fixes a GCC 4.7/4.8 regression for invalid code. > > Build and regtested on x86-64-linux. > OK for the trunk and 4.7? > Yes. PR 53306 is also fixed by your patch according to Dominique, so don't forget to include its testcase and to add the PR reference in the ChangeLogs. Thanks Mikael
Re: [C++ Patch] Remove uses of ATTRIBUTE_UNUSED in the function parameters
On 09/12/2012 10:57 AM, Richard Guenther wrote: And note that we have ARG_UNUSED for parameters - to cope with older compilers not handling attributes here too well (I run into this when using gcc 3.3 as host compiler). Ah, thanks, I didn't know that (both the problem and the solution): it seems (another) good reason to just get rid of as many ATTRIBUTE_UNUSED as possible! Paolo.
Re: [PATCH,mmix] convert to constraints.md
On Wed, 12 Sep 2012, Nathan Froyd wrote: > - Original Message - > > Nathan, again thanks. There are a few minor tweaks compared to your > > version: > > Thanks for fixing this up! > > > - Keeping old layout of "mmix_reg_or_8bit_operand". That looked like > > a spurious change and I prefer the ior construct to the > > if_then_else. > > ISTR without this change, there were lots of assembly changes like: > > set rx, 6 > cmp rz, ry, rx > > instead of the previous and better: > > cmp rz, ry, 6 > > (apologies if the assembly syntax isn't correct; hopefully the intent is > clear) Yes. My only guess is a typo in your previous edit, as the two constructs certainly should be equivalent in *what's* being recognized. If they aren't, we have a bug in the gen* machinery. If "my" construct is wrong, then that's a separate bug-fix, ehm. Seems worth it to double-check, not just for the sake of MMIX. > but if you double-checked that the assembly didn't change > after your changes, maybe something else that you tweaked fixed this. I have no clue; nothing in the patch below stands out - the missing "I"-constraint that may explain this (modulo that "H" wasn't recognizable either) was on an "or" operation, not a compare. But maybe one was close enough that it mattered. What revision was your baseline? My baseline was r190260 but configure-patches as posted (required to build, affecting crtstuff at most) and with: I'll try with your original patch and see it I can spot something. Index: gcc/config/mmix/predicates.md === --- gcc/config/mmix/predicates.md (revision 190682) +++ gcc/config/mmix/predicates.md (working copy) @@ -118,7 +118,7 @@ (define_predicate "mmix_symbolic_or_addr return 1; /* Fall through. */ default: - return address_operand (op, mode); + return mmix_extra_constraint (op, 'U', reload_in_progress || reload_completed); } }) Index: gcc/config/mmix/mmix.md === --- gcc/config/mmix/mmix.md (revision 190682) +++ gcc/config/mmix/mmix.md (working copy) @@ -274,7 +275,7 @@ (define_insn "anddi3" (define_insn "iordi3" [(set (match_operand:DI 0 "register_operand" "=r,r") (ior:DI (match_operand:DI 1 "register_operand" "%r,0") - (match_operand:DI 2 "mmix_reg_or_constant_operand" "rH,LS")))] + (match_operand:DI 2 "mmix_reg_or_constant_operand" "rI,LS")))] "" "@ OR %0,%1,%2 > > - Replacing undefined-constraint-"H" with "I" instead of removing it. > > It was either renamed early or a genuine typo. Good catch. > > Hard not to see it; the gen* machinery complains about undefined constraints. > :) Exactly! :) You don't congratulate the machine, but the machinist - and sometimes the inventor of the machine. (Thank you, Zack!) brgds, H-P
[PATCH] Fix PR54553
This makes sure -finline is saved across optimize attribute changes (-O0 changes it). Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2012-09-12 Richard Guenther PR middle-end/54553 * common.opt (finline): Mark with Optimization. Index: gcc/common.opt === --- gcc/common.opt (revision 191209) +++ gcc/common.opt (working copy) @@ -1289,7 +1289,7 @@ Perform indirect inlining ; General flag to enable inlining. Specifying -fno-inline will disable ; all inlining apart from always-inline functions. finline -Common Report Var(flag_no_inline,0) Init(0) +Common Report Var(flag_no_inline,0) Init(0) Optimization Enable inlining of function declared \"inline\", disabling disables all inlining finline-small-functions
Re: [PATCH] Add option for dumping to stderr (issue6190057)
On Wed, Sep 12, 2012 at 12:30 PM, Richard Guenther wrote: > On Wed, Sep 12, 2012 at 10:12 AM, Sharad Singhai wrote: >> Thanks for your comments. Please see my responses inline. >> >> On Tue, Sep 11, 2012 at 1:16 PM, Xinliang David Li >> wrote: >>> Can you resend your patch in text form (also need to resolve the >>> latest conflicts) so that it can be commented inline? >> >> I tried to include inline patch earlier but my message was bounced >> back from patches mailing list. I am trying it again. >> >>> Please also provide as summary a more up-to-date description of >>> 1) Command line option syntax and semantics >> >> I added some documentation in the patch. Here are the relevant bits >> from invoke.texi. >> >> `-fdump-tree-SWITCH-OPTIONS=FILENAME' >> Control the dumping at various stages of processing the >> intermediate language tree to a file. The file name is generated >> by appending a switch-specific suffix to the source file name, and >> the file is created in the same directory as the output file. In >> case of `=FILENAME' option, the dump is output on the given file >> instead of the auto named dump files. >> ... >> >> `=FILENAME' >> Instead of an auto named dump file, output into the given file >> name. The file names `stdout' and `stderr' are treated >> specially and are considered already open standard streams. >> For example, >> >>gcc -O2 -ftree-vectorize -fdump-tree-vect-details=foo.dump >> -fdump-tree-pre=stderr file.c >> >> outputs vectorizer dump into `foo.dump', while the PRE dump >> is output on to `stderr'. If two conflicting dump filenames >> are given for the same pass, then the latter option >> overrides the earlier one. >> >> `-fopt-info-PASS' >> `-fopt-info-PASS-OPTIONS' >> `-fopt-info-PASS-OPTIONS=FILENAME' >> Controls optimization dumps from various passes. If the `-OPTIONS' >> form is used, OPTIONS is a list of `-' separated options which >> controls the details of the dump. If OPTIONS is not specified, it >> defaults to `optimized'. If the FILENAME is not specified, it >> defaults to `stderr'. Note that the output FILENAME will be >> overwritten in case of multiple translation units. If a combined >> output from multiple the translation units is desired, `stderr' >> should be used instead. >> >> The PASS could be one of the tree or rtl passes. The following >> options are available > > I don't like that we have -PASS here. That makes it awfully similar > to -fdump-PASS-OPTIONS=FILENAME. Are we merely having > -fopt-info because OPTIONS are "different"? > >> `optimized' >> Print information when a particular optimization is >> successfully applied. It is up to the pass to decide which >> information is relevant. For example, the vectorizer pass >> prints the location of loop which got vectorized. >> >> `missed' >> Print information about missed optimizations. Individual >> passes control which information to include in the output. >> For example, >> >>gcc -O2 -ftree-vectorize -fopt-info-tree-vect-missed > > At least for -PASS better names should be available. And given the > lack of pass support for the new scheme (apart from the vectorizer) > we should enumerate the set of -PASS values we accept, currently > -vec only. IMHO it does not make sense to provide -fopt-info for > the myriad of passes we have - usually only high-level ones are interesting > to the user? > >> will print information about missed vectorization >> opportunities on to stderr. >> >> `note' >> Print verbose information about optimizations, such as certain >> transformations, more detailed information about decisions >> etc. >> >> `details' >> Print detailed information from a particular pass. This >> includes OPTIMIZED, MISSED, and NOTE. For example, >> >>gcc -O2 -ftree-vectorize >> -fopt-info-tree-vect-details=vect.details >> >> outputs detailed optimization report from the vectorization >> pass into `vect.details'. > > Can options be chained? like -fopt-info-vec-missed-note? > >> `-fopt-info-tree-all' >> `-fopt-info-tree-all-OPTIONS' >> `-fopt-info-tree-all-OPTIONS=FILENAME' >> This is similar to `-fopt-info' but instead of a single pass, it >> applies the dump options to all the tree passes. If the FILENAME >> is provided, the dump from all the passes is concatenated, >> otherwise the dump is output onto `stderr'. If OPTIONS is omitted, >> it defaults to `optimized'. >> >> gcc -O3 -fopt-info-tree-all-optimized-missed=tree.optdump >> >> This will output information about missed optimizations as well as >> optimized locations from all the tree pas
Unreviewed bootstrap patch
This patch Fix Solaris 9/x86 bootstrap http://gcc.gnu.org/ml/gcc-patches/2012-08/msg01494.html has remained unreviewed for 3 weeks. It is necessary to fix Solaris 9/x86 bootstrap after the cxx-conversion merge. Rainer -- - Rainer Orth, Center for Biotechnology, Bielefeld University
Re: [v3, build] Clear hardware capabilities on libstdc++.so with Sun as
Il 12/09/2012 11:26, Rainer Orth ha scritto: > Since the use of rdrand was introduced in src/c++11/random.cc, all > execution tests involving libstdc++.so.6 fail on Solaris 10 and 11/x86 > with a sufficiently recent native assembler that supports rdrand: either > Solaris 10/x86 patch 119961-11 or Solaris 11.1 builds (haven't checked > which one). The problem is that as tags src/c++11/random.o as needing > RDRAND support: > > % file random.o > random.o: ELF 32-bit LSB relocatable 80386 Version 1 [CMOV RDRAND] > > which is propagated to libstdc++.so.6 and causes the runtime linker to > fail the execution if the hardware doesn't support that hardware > capability, although rdrand is executed only if the cpuid indicates the > support is present. > > The usual solution so far has been to clear the hardware capability > using a linker map (as in libitm, cf. libitm/clearcap.map). > Unfortunately, this doesn't work here: as can be seen with elfdump, > RDRAND is set in a second mask (CA_SUNW_HW_2) since all bits in > CA_SUNW_HW_1 are already used: > > % elfdump -H random.o > > Capabilities Section: .SUNW_cap > > Object Capabilities: > index tag value >[0] CA_SUNW_HW_1 0x20 [ CMOV ] >[1] CA_SUNW_HW_2 0x1 [ RDRAND ] > > The old (v1) linker map syntax has no support for clearing that > bit/mask, and while the v2 map syntax does, we cannot universally assume > it's present on Solaris 10: while recent linker patches include it, > older ones don't and ld and as can be updated independently. > > So I've settled for a different solution instead: Sun assemblers with > hardware capability support also have a -nH switch to suppress their > generation, thus I'm testing for that and use it if possible. > > This is exactly what the following patch does. > > Bootstrapped without regressions on i386-pc-solaris2.1[01] with affected > versions of Sun as and gas 2.22. Results with gas are unchanged, with > Sun as all failures are gone. x86_64-unknown-linux-gnu bootstrap is > running to make sure nothing breaks there. > > Ok for mainline if that passes? > > Rainer > > > 2012-09-11 Rainer Orth > > * acinclude.m4 (GLIBCXX_CHECK_ASSEMBLER_HWCAP): Define. > * configure.ac: Call GLIBCXX_CHECK_ASSEMBLER_HWCAP. > * fragment.am (CONFIG_CXXFLAGS): Add $(HWCAP_FLAGS). > * configure: Regenerate. > * Makefile.in: Regenerate. > * doc/Makefile.in: Regenerate. > * include/Makefile.in: Regenerate. > * libsupc++/Makefile.in: Regenerate. > * po/Makefile.in: Regenerate. > * python/Makefile.in: Regenerate. > * src/Makefile.in: Regenerate. > * src/c++11/Makefile.in: Regenerate. > * src/c++98/Makefile.in: Regenerate. > * testsuite/Makefile.in: Regenerate. > > > > Ok. Paolo
Re: shrink-wrapping duplicates BBs across partitions.
The problem stems from tree-ssa-tail-merge that breaks bb->count, The CFG looks like 2 / \ /6 5 (0) | | 3 <- |/ \ | | 7 (1) 8 - | / 4 (1) (in parenthesis the bb->count from gcov) 2 / \ /6 / | | 3 <-- | / | | 5 (0) 8 -- | | 4 (1) so 5 and 4 are now in different partitions, producing an assertion because there is no EDGE_CROSSING between them. I can see 3 solutions to this 1) merge the BB counts in tree-ssa-tail-merge.c, so 5 is in the same partition that 4 2) don't tail-merge blocks that belong to different partitions. 3) add a EDGE_CROSSING flag on the edge between 4 and 5. 1) fixes the problem, so 5 and 4 are now in the same partition. The fix is quite trivial, as with attached. the other solution 2) is more conservative, and also fixes the problem. I don't think 3) is necessary. more ideas ? thanks, Christian On 09/11/2012 06:21 PM, Jakub Jelinek wrote: > On Tue, Sep 11, 2012 at 05:40:30PM +0200, Steven Bosscher wrote: >> On Tue, Sep 11, 2012 at 5:31 PM, Christian Bruel >> wrote: >>> Actually, the edge is fairly simple. I have >>> >>> BB5 (BB_COLD_PARTITION) -> BB10 (BB_HOT_PARTITION) -> EXIT >>> >>> and BB10 has no other incoming edges. and we are duplicating it. >> >> That is wrong, should never happen. Is there a test case to play with? >> It'd be good to have a PR for this. > > Isn't that the standard case when !HAVE_return ? Then you can have only a > single return through epilogue, and when the epilogue is in the hot > partition, even if cold code is returning, it needs to jump to the epilogue. > > Jakub > Index: tree-ssa-tail-merge.c === --- tree-ssa-tail-merge.c (revision 191129) +++ tree-ssa-tail-merge.c (working copy) @@ -1478,6 +1478,8 @@ bb2->frequency = BB_FREQ_MAX; bb1->frequency = 0; + bb2->count += bb1->count; + /* Do updates that use bb1, before deleting bb1. */ release_last_vdef (bb1); same_succ_flush_bb (bb1);
Re: Unreviewed bootstrap patch
On Wed, 12 Sep 2012, Rainer Orth wrote: > This patch > > Fix Solaris 9/x86 bootstrap > http://gcc.gnu.org/ml/gcc-patches/2012-08/msg01494.html > > has remained unreviewed for 3 weeks. It is necessary to fix Solaris > 9/x86 bootstrap after the cxx-conversion merge. Ok. Thanks, Richard.
Re: [PATCH] Combine location with block using block_locations
Hi, On Wed, 12 Sep 2012, Richard Guenther wrote: > > This will actually not work correctly in some cases. The problem is, > > if the prevailing decl is already part of another chain (say in > > another block_var list) you would break the current chain. Hence > > block vars need special handling in the lto streamer (another reason > > why tree_chain is not the most clever think to use for this chain). > > This problem area needs to be solved somehow if block info is to be > > preserved correctly. > > Well. The issue is that at present we stream BLOCKs in the function > section via its DECL_INTIAL. Which means we _never_ should get a > non-prevailing DECL in BLOCK_VARS. You need to debug why that doesn't > work anymore. The assert that triggers tests that there's no var_decl in TREE_CHAIN. It doesn't test that it's a prevailing decl. So we could assert that instead of the current check. > >> 2. Don't stream out block info for LTO, and still call LTO_NO_PREVAIL > >> (TREE_CHAIN (t)). > > > > That's also a large hammer as it basically will mean no debug info after > > LTO :-/ Sigh, at this point I have no good solution that doesn't involve > > quite some work, perhaps your hack is good enough for the time being, > > though I hate it :) > > It hides a bug. If we replace anything in BLOCK_VARS then the risk is > that you generate an infinite chain in some BLOCK_VARS list and thus > get infinite loops somewhere in the compiler. That's what I said for using SET_PREVAIL. But his hack would specifically not replace anything in TREE_CHAIN (and hence BLOCK_VARS), and indeed not check anything either. Ciao, Michael.
Re: Change double_int calls to new interface.
> Date: Tue, 11 Sep 2012 17:03:39 -0700 > From: Ian Lance Taylor > > On Tue, Sep 11, 2012 at 3:12 PM, Lawrence Crowl wrote: > > On 9/11/12, Andreas Schwab wrote: > >> Mark Kettenis writes: > >>> In file included from ../../../src/gcc/gcc/mcf.c:47:0: > >>> ../../../src/gcc/gcc/mcf.c: In function 'void dump_fixup_edge(FILE*, > >>> fixup_graph_type*, fixup_edge_p)': > >>> ../../../src/gcc/gcc/system.h:288:78: error: integer overflow in > >>> expression [-Werror=overflow] > >> > >> This is PR54528. > > > > The expression itself looks correct. I have not been able to > > duplicate the problem on x86. I am now waiting on access to the > > compile farm for access to a hppa system. Does anyone have more > > specific information on the condition that generates the error? > > I haven't tried, but I bet you can get it to happen if you build a > 32-bit compiler--that is, build a compiler using -m32 so that the > compiler itself runs 32 bit code. I don't think that helps since I don't see the problem on OpenBSD/i386 (i386-unknown-openbsd5.2) whith a strictly 32-bit compiler. As I wrote earlier, I suspect the key factor is HOST_WIDE_INT being 32-bit, which means 32-bit architectures that don't sey need_64bit_hwint in config.gcc. The fact that m68k is affected as well strengthens my suspicion. So I expect arm to show the problem as well. But people probably haven't noticed since they cross-compile. Anyway the "diff" below seems to fix the issue. Guess the replacement doesn't work! Should be a big enough clue for Lawrence to come up with a proper fix. Index: fold-const.c === --- fold-const.c(revision 191150) +++ fold-const.c(working copy) @@ -982,13 +982,15 @@ break; case MINUS_EXPR: -/* FIXME(crowl) Remove this code if the replacment works. + /* FIXME(crowl) Remove this code if the replacment works. */ +#if 1 neg_double (op2.low, op2.high, &res.low, &res.high); add_double (op1.low, op1.high, res.low, res.high, &res.low, &res.high); overflow = OVERFLOW_SUM_SIGN (res.high, op2.high, op1.high); -*/ +#else res = op1.add_with_sign (-op2, false, &overflow); +#endif break; case MULT_EXPR:
Re: [PATCH] Combine location with block using block_locations
On Wed, Sep 12, 2012 at 2:14 PM, Michael Matz wrote: > Hi, > > On Wed, 12 Sep 2012, Richard Guenther wrote: > >> > This will actually not work correctly in some cases. The problem is, >> > if the prevailing decl is already part of another chain (say in >> > another block_var list) you would break the current chain. Hence >> > block vars need special handling in the lto streamer (another reason >> > why tree_chain is not the most clever think to use for this chain). >> > This problem area needs to be solved somehow if block info is to be >> > preserved correctly. >> >> Well. The issue is that at present we stream BLOCKs in the function >> section via its DECL_INTIAL. Which means we _never_ should get a >> non-prevailing DECL in BLOCK_VARS. You need to debug why that doesn't >> work anymore. > > The assert that triggers tests that there's no var_decl in TREE_CHAIN. > It doesn't test that it's a prevailing decl. So we could assert that > instead of the current check. > >> >> 2. Don't stream out block info for LTO, and still call LTO_NO_PREVAIL >> >> (TREE_CHAIN (t)). >> > >> > That's also a large hammer as it basically will mean no debug info after >> > LTO :-/ Sigh, at this point I have no good solution that doesn't involve >> > quite some work, perhaps your hack is good enough for the time being, >> > though I hate it :) >> >> It hides a bug. If we replace anything in BLOCK_VARS then the risk is >> that you generate an infinite chain in some BLOCK_VARS list and thus >> get infinite loops somewhere in the compiler. > > That's what I said for using SET_PREVAIL. But his hack would specifically > not replace anything in TREE_CHAIN (and hence BLOCK_VARS), and indeed not > check anything either. Hm, but we shouldn't end up streaming any BLOCKs at this point (nor local TYPE_DECLs). Those are supposed to be in the local function sections only where no fixup for prevailing decls happens. Richard. > > Ciao, > Michael.
Re: [PATCH] Combine location with block using block_locations
Hi, On Wed, 12 Sep 2012, Richard Guenther wrote: > >> It hides a bug. If we replace anything in BLOCK_VARS then the risk > >> is that you generate an infinite chain in some BLOCK_VARS list and > >> thus get infinite loops somewhere in the compiler. > > > > That's what I said for using SET_PREVAIL. But his hack would > > specifically not replace anything in TREE_CHAIN (and hence > > BLOCK_VARS), and indeed not check anything either. > > Hm, but we shouldn't end up streaming any BLOCKs at this point (nor > local TYPE_DECLs). Those are supposed to be in the local function > sections only where no fixup for prevailing decls happens. That's true, something is fishy with the patch, will try to investigate. Ciao, Michael.
Remove obsolete compatibility notes in vec.h
Committed to trunk. * vec.h: Remove compatibility notes for previous distinction between vectors of objects and vectors of pointers. diff --git a/gcc/vec.h b/gcc/vec.h index 88891d7..8858f6a 100644 --- a/gcc/vec.h +++ b/gcc/vec.h @@ -31,23 +31,6 @@ along with GCC; see the file COPYING3. If not see sometimes backed by out-of-line generic functions. The vectors are designed to interoperate with the GTY machinery. - FIXME - Remove the following compatibility notes after a handler - class for vec_t is implemented. - - To preserve compatibility with the existing API, some functions - that manipulate vector elements implement two overloads: one taking - a pointer to the element and others that take a pointer to a - pointer to the element. - - This used to be implemented with three different families of macros - and structures: structure objects, scalar objects and of pointers. - Both the structure object and pointer variants passed pointers to - objects around -- in the former case the pointers were stored into - the vector and in the latter case the pointers were dereferenced and - the objects copied into the vector. The scalar object variant was - suitable for int-like objects, and the vector elements were returned - by value. - There are both 'index' and 'iterate' accessors. The index accessor is implemented by operator[]. The iterator returns a boolean iteration condition and updates the iteration variable passed by @@ -124,7 +107,6 @@ along with GCC; see the file COPYING3. If not see VEC_safe_push(tree,gc,s->v,decl); // append some decl onto the end for (ix = 0; VEC_iterate(tree,s->v,ix,elt); ix++) { do something with elt } - */ #if ENABLE_CHECKING
[PATCH] Fix PR54489 - FRE needing AVAIL_OUT
This removes the need for FRE to compute AVAIL_OUT which can consume an unreasonable amount of memory for testcases like int foo (int a) { int b = 0; #define X if (a) b = b + 1; #define XX X X X X X X X X X X #define XXX XX XX XX XX XX XX XX XX XX XX #define XXX XXX XXX XXX XXX XXX XXX XXX XXX XXX #define X #define XX X X X X X X X X X X #define XXX XX XX XX XX XX XX XX XX XX XX XX return b; } instead of computing AVAIL_OUT up-front for all basic-block just to find the SSA name whose definition dominates the current elimination point this re-organizes elimination to perform a domwalk, updating a SCCVN value-id (thus, SSA name) -> leader (dominating SSA name) mapping. It also simplifies PRE by removing that in_fre global flag and not sharing a common execute function. FRE and PRE now only share elimination (and value-numbering). It comes to my mind that if we manage to get rid of the AVAIL_OUT use from clean () and dependent_clean () we can delay PRE insertion (well, producing and inserting actual GIMPLE stmts) to elimination time as well, removing its need for AVAIL_OUT. But that's surely for a followup (and I bet sth else than PRE blows up at -O2 as well). Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2012-09-12 Richard Guenther PR tree-optimization/54489 * tree-ssa-pre.c: Include domwalk.h. (in_fre): Remove. (sccvn_valnum_from_value_id): New function. (debug_bitmap_sets_for): Simplify. (get_representative_for): Properly initialize the SCCVN valnum. (create_expression_by_pieces): Likewise. (insert_into_preds_of_block): Likewise. (can_PRE_operation): Remove. (make_values_for_phi): Simplify. (compute_avail): Likewise. (do_SCCVN_insertion): Remove. (eliminate_avail, eliminate_push_avail, eliminate_insert): New functions. (eliminate): Split and perform a domwalk. (eliminate_bb): Former eliminate part that is now dom-enter. (eliminate_leave_block): New function. (fini_eliminate): Likewise. (init_pre): Simplify. (fini_pre): Likewise. (execute_pre): Fold into do_pre and do_fre. (do_pre): Consume execute_pre. (do_fre): Likewise. * Makefile.in (tree-ssa-pre.o): Add domwalk.h dependency. Index: gcc/tree-ssa-pre.c === --- gcc/tree-ssa-pre.c (revision 191215) +++ gcc/tree-ssa-pre.c (working copy) @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. #include "tree-scalar-evolution.h" #include "params.h" #include "dbgcnt.h" +#include "domwalk.h" /* TODO: @@ -351,8 +352,6 @@ get_or_alloc_expr_for_name (tree name) return result; } -static bool in_fre = false; - /* An unordered bitmap set. One bitmap tracks values, the other, expressions. */ typedef struct bitmap_set @@ -637,6 +636,25 @@ get_expr_value_id (pre_expr expr) } } +/* Return a SCCVN valnum (SSA name or constant) for the PRE value-id VAL. */ + +static tree +sccvn_valnum_from_value_id (unsigned int val) +{ + bitmap_iterator bi; + unsigned int i; + bitmap exprset = VEC_index (bitmap, value_expressions, val); + EXECUTE_IF_SET_IN_BITMAP (exprset, 0, i, bi) +{ + pre_expr vexpr = expression_for_id (i); + if (vexpr->kind == NAME) + return VN_INFO (PRE_EXPR_NAME (vexpr))->valnum; + else if (vexpr->kind == CONSTANT) + return PRE_EXPR_CONSTANT (vexpr); +} + return NULL_TREE; +} + /* Remove an expression EXPR from a bitmapped set. */ static void @@ -1022,16 +1040,13 @@ DEBUG_FUNCTION void debug_bitmap_sets_for (basic_block bb) { print_bitmap_set (stderr, AVAIL_OUT (bb), "avail_out", bb->index); - if (!in_fre) -{ - print_bitmap_set (stderr, EXP_GEN (bb), "exp_gen", bb->index); - print_bitmap_set (stderr, PHI_GEN (bb), "phi_gen", bb->index); - print_bitmap_set (stderr, TMP_GEN (bb), "tmp_gen", bb->index); - print_bitmap_set (stderr, ANTIC_IN (bb), "antic_in", bb->index); - if (do_partial_partial) - print_bitmap_set (stderr, PA_IN (bb), "pa_in", bb->index); - print_bitmap_set (stderr, NEW_SETS (bb), "new_sets", bb->index); -} + print_bitmap_set (stderr, EXP_GEN (bb), "exp_gen", bb->index); + print_bitmap_set (stderr, PHI_GEN (bb), "phi_gen", bb->index); + print_bitmap_set (stderr, TMP_GEN (bb), "tmp_gen", bb->index); + print_bitmap_set (stderr, ANTIC_IN (bb), "antic_in", bb->index); + if (do_partial_partial) +print_bitmap_set (stderr, PA_IN (bb), "pa_in", bb->index); + print_bitmap_set (stderr, NEW_SETS (bb), "new_sets", bb->index); } /* Print out the expressions that have VAL to OUTFILE. */ @@ -1402,11 +1417,9 @@ get_representative_for (const pre_expr e that we will return. */
Re: [PATCH] Combine location with block using block_locations
Hi, On Wed, 12 Sep 2012, Michael Matz wrote: > > Hm, but we shouldn't end up streaming any BLOCKs at this point (nor > > local TYPE_DECLs). Those are supposed to be in the local function > > sections only where no fixup for prevailing decls happens. > > That's true, something is fishy with the patch, will try to investigate. ipa-prop creates the problem. Its tree mapping can contain expressions, expressions can have locations, locations now have blocks. The tree maps are stored as part of jump functions, and hence as part of node summaries. Node summaries are global, hence blocks, and therefore block vars can be placed in the global blob. That's not supposed to happen. The patch below fixes this instance of the problem and makes the testcase work with Dehaos patch with the LTO_NO_PREVAIL call added back in. Ciao, Michael. Index: lto-cgraph.c === --- lto-cgraph.c(revision 190803) +++ lto-cgraph.c(working copy) @@ -1373,6 +1373,7 @@ output_node_opt_summary (struct output_b mechanism to store function local declarations into summaries. */ gcc_assert (parm); streamer_write_uhwi (ob, parm_num); + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION (map->new_tree))); stream_write_tree (ob, map->new_tree, true); bp = bitpack_create (ob->main_stream); bp_pack_value (&bp, map->replace_p, 1); Index: ipa-prop.c === --- ipa-prop.c (revision 190803) +++ ipa-prop.c (working copy) @@ -1378,7 +1378,11 @@ ipa_compute_jump_functions_for_edge (str tree arg = gimple_call_arg (call, n); if (is_gimple_ip_invariant (arg)) - ipa_set_jf_constant (jfunc, arg); + { + arg = unshare_expr (arg); + SET_EXPR_LOCATION (arg, UNKNOWN_LOCATION); + ipa_set_jf_constant (jfunc, arg); + } else if (!is_gimple_reg_type (TREE_TYPE (arg)) && TREE_CODE (arg) == PARM_DECL) { @@ -3154,6 +3158,7 @@ ipa_write_jump_function (struct output_b stream_write_tree (ob, jump_func->value.known_type.component_type, true); break; case IPA_JF_CONST: + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION (jump_func->value.constant))); stream_write_tree (ob, jump_func->value.constant, true); break; case IPA_JF_PASS_THROUGH:
Re: [PATCH] Combine location with block using block_locations
Hi, On Wed, Sep 12, 2012 at 04:17:45PM +0200, Michael Matz wrote: > Hi, > > On Wed, 12 Sep 2012, Michael Matz wrote: > > > > Hm, but we shouldn't end up streaming any BLOCKs at this point (nor > > > local TYPE_DECLs). Those are supposed to be in the local function > > > sections only where no fixup for prevailing decls happens. > > > > That's true, something is fishy with the patch, will try to investigate. > > ipa-prop creates the problem. Its tree mapping can contain expressions, > expressions can have locations, locations now have blocks. The tree maps > are stored as part of jump functions, and hence as part of node summaries. > Node summaries are global, hence blocks, and therefore block vars can be > placed in the global blob. > > That's not supposed to happen. The patch below fixes this instance of the > problem and makes the testcase work with Dehaos patch with the > LTO_NO_PREVAIL call added back in. > > > Ciao, > Michael. > > Index: lto-cgraph.c > === > --- lto-cgraph.c (revision 190803) > +++ lto-cgraph.c (working copy) > @@ -1373,6 +1373,7 @@ output_node_opt_summary (struct output_b > mechanism to store function local declarations into summaries. */ >gcc_assert (parm); >streamer_write_uhwi (ob, parm_num); > + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION (map->new_tree))); >stream_write_tree (ob, map->new_tree, true); >bp = bitpack_create (ob->main_stream); >bp_pack_value (&bp, map->replace_p, 1); > Index: ipa-prop.c > === > --- ipa-prop.c(revision 190803) > +++ ipa-prop.c(working copy) > @@ -1378,7 +1378,11 @@ ipa_compute_jump_functions_for_edge (str >tree arg = gimple_call_arg (call, n); > >if (is_gimple_ip_invariant (arg)) > - ipa_set_jf_constant (jfunc, arg); > + { > + arg = unshare_expr (arg); > + SET_EXPR_LOCATION (arg, UNKNOWN_LOCATION); > + ipa_set_jf_constant (jfunc, arg); > + } >else if (!is_gimple_reg_type (TREE_TYPE (arg)) > && TREE_CODE (arg) == PARM_DECL) Perhaps it would be better if ipa_set_jf_constant did that, just in case we ever add another caller? Note that arithmetic functions also have their second operand tree stored in them and so perhaps ipa_set_jf_arith_pass_through should do the same. And I it is also necessary to do the same thing at the end of determine_known_aggregate_parts, i.e. before assignment to item->value. I can post a separate patch if necessary. I wasn't following this thread but I hope that streaming types does not cause this problem. If they do, there are quite a few in various jump functions and indirect call graph edges. Thanks, Martin > { > @@ -3154,6 +3158,7 @@ ipa_write_jump_function (struct output_b >stream_write_tree (ob, jump_func->value.known_type.component_type, > true); >break; > case IPA_JF_CONST: > + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION > (jump_func->value.constant))); >stream_write_tree (ob, jump_func->value.constant, true); >break; > case IPA_JF_PASS_THROUGH:
Re: Backtrace library [1/3]
On Tue, Sep 11, 2012 at 4:36 PM, Lawrence Crowl wrote: > On 9/11/12, Ian Lance Taylor wrote: >> This patch is the interface to and configury of libbacktrace. >> I've separated these out as the parts of libbacktrace that >> require the most review. The interface to libbacktrace is in >> the file backtrace.h. This is what callers will use. The file >> backtrace-supported.h is also available so that programs can see >> whether calling the backtrace library will work at all. > > The interface relies on global data in the library. Wouldn't it > be better to expose the state as an additional parameter to enable > concurrent access by different threads? That parameter could then > be modeled as 'this' parameter, addressing Gaby's suggesting. I went ahead and added a state parameter to the interface. I've attached the updated patch. Ian foo1.patch Description: Binary data
Re: [PATCH] Combine location with block using block_locations
On Wed, Sep 12, 2012 at 4:37 PM, Martin Jambor wrote: > Hi, > > On Wed, Sep 12, 2012 at 04:17:45PM +0200, Michael Matz wrote: >> Hi, >> >> On Wed, 12 Sep 2012, Michael Matz wrote: >> >> > > Hm, but we shouldn't end up streaming any BLOCKs at this point (nor >> > > local TYPE_DECLs). Those are supposed to be in the local function >> > > sections only where no fixup for prevailing decls happens. >> > >> > That's true, something is fishy with the patch, will try to investigate. >> >> ipa-prop creates the problem. Its tree mapping can contain expressions, >> expressions can have locations, locations now have blocks. The tree maps >> are stored as part of jump functions, and hence as part of node summaries. >> Node summaries are global, hence blocks, and therefore block vars can be >> placed in the global blob. >> >> That's not supposed to happen. The patch below fixes this instance of the >> problem and makes the testcase work with Dehaos patch with the >> LTO_NO_PREVAIL call added back in. >> >> >> Ciao, >> Michael. >> >> Index: lto-cgraph.c >> === >> --- lto-cgraph.c (revision 190803) >> +++ lto-cgraph.c (working copy) >> @@ -1373,6 +1373,7 @@ output_node_opt_summary (struct output_b >> mechanism to store function local declarations into summaries. */ >>gcc_assert (parm); >>streamer_write_uhwi (ob, parm_num); >> + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION (map->new_tree))); >>stream_write_tree (ob, map->new_tree, true); >>bp = bitpack_create (ob->main_stream); >>bp_pack_value (&bp, map->replace_p, 1); >> Index: ipa-prop.c >> === >> --- ipa-prop.c(revision 190803) >> +++ ipa-prop.c(working copy) >> @@ -1378,7 +1378,11 @@ ipa_compute_jump_functions_for_edge (str >>tree arg = gimple_call_arg (call, n); >> >>if (is_gimple_ip_invariant (arg)) >> - ipa_set_jf_constant (jfunc, arg); >> + { >> + arg = unshare_expr (arg); >> + SET_EXPR_LOCATION (arg, UNKNOWN_LOCATION); >> + ipa_set_jf_constant (jfunc, arg); >> + } >>else if (!is_gimple_reg_type (TREE_TYPE (arg)) >> && TREE_CODE (arg) == PARM_DECL) > > Perhaps it would be better if ipa_set_jf_constant did that, just in > case we ever add another caller? Note that arithmetic functions also > have their second operand tree stored in them and so perhaps > ipa_set_jf_arith_pass_through should do the same. > > And I it is also necessary to do the same thing at the end of > determine_known_aggregate_parts, i.e. before assignment to > item->value. I can post a separate patch if necessary. > > I wasn't following this thread but I hope that streaming types does > not cause this problem. If they do, there are quite a few in various > jump functions and indirect call graph edges. Well, or if jump functions would not use trees ... Richard. > Thanks, > > Martin > > >> { >> @@ -3154,6 +3158,7 @@ ipa_write_jump_function (struct output_b >>stream_write_tree (ob, jump_func->value.known_type.component_type, >> true); >>break; >> case IPA_JF_CONST: >> + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION >> (jump_func->value.constant))); >>stream_write_tree (ob, jump_func->value.constant, true); >>break; >> case IPA_JF_PASS_THROUGH:
Re: Backtrace library [3/3]
On Tue, Sep 11, 2012 at 10:16 PM, Jakub Jelinek wrote: > On Tue, Sep 11, 2012 at 03:55:04PM -0700, Ian Lance Taylor wrote: >> +/* The DWARF abbreviations for a compilation unit. This structure >> + only exists while reading the compilation unit. Most DWARF readers >> + seem to a hash table to map abbrev ID's to abbrev entries. >> + However, we primarily care about GCC, and GCC simply issues ID's in >> + numerical order starting at 1. So we simply keep a sorted vector, >> + and try to just look up the code. */ > > As soon as you run dwz on it, the abbrevs won't be in numerical order, > and even GCC really should be changed to sort the abbrevs by decreasing > use count if there are more than 127 abbrevs (special casing the base types > that are used for typed DWARF stack). OK. Still, the current code should work correctly. > The patch also doesn't handle DW_TAG_imported_unit/DW_TAG_partial_unit, > which is a show stopper if dwz is used (and > DW_FORM_GNU_strp_alt/DW_FORM_GNU_ref_alt isn't handled properly). > > BTW, the library doesn't seem to implement what gdb does, in particular > improved support for tail calls. In gdb that was (at least); > 2011-10-09 Jan Kratochvil > > Protect entry values against self tail calls. > ... > > 2011-10-09 Jan Kratochvil > > Recognize virtual tail call frames. Yes. I think these are issues that can be addressed after the initial version of the library is approved and committed. Ian
Re: [PATCH] Fix PR54489 - FRE needing AVAIL_OUT
On Wed, Sep 12, 2012 at 4:02 PM, Richard Guenther wrote: > for a followup (and I bet sth else than PRE blows up at -O2 as well). Actually, the only thing that really blows up is that enemy of scalability, VRP. With -O2 -fno-tree-{pre,fre,vrp}, the slowest part of the compiler on this test case are dominance computation, dominator-based optimization, and tree CFG cleanup, but they all scale linearly with the number of X'es in the test case :-) Ciao! Steven
Re: Backtrace library [3/3]
On Wed, Sep 12, 2012 at 2:01 AM, Florian Weimer wrote: > > +backtrace_open (const char *filename, backtrace_error_callback > error_callback, > + void *data) > +{ > + int descriptor; > + > + descriptor = open (filename, O_RDONLY | O_CLOEXEC); > + if (descriptor < 0) > +{ > + error_callback (data, filename, errno); > + return -1; > +} > + if (O_CLOEXEC == 0) > +{ > + /* It doesn't matter if this fails for some reason. */ > + fcntl (descriptor, F_SETFD, FD_CLOEXEC); > +} > > You should call fcntl unconditionally. O_CLOEXEC might be non-zero during > build, but could still be ignored by the kernel. OK, done. > +static void > +fileline_initialize (backtrace_error_callback error_callback, void *data) > +{ > ... > + if (executable_name != NULL) > +descriptor = backtrace_open (executable_name, error_callback, data); > + else > +descriptor = backtrace_open ("/proc/self/exe", error_callback, data); > > You should try getauxval(AT_EXECFN) as well (needs recent glibc), so that > this works with a mounted /proc. I'm going to postpone this--my glibc doesn't have this function. > This library should only be used when getauxval(AT_SECURE) is zero, so that > the program doesn't try to read files with elevated privileges to which the > original user wouldn't have access. I don't think this has to be addressed > within the library itself. The library doesn't try to elevate privileges, so, yes, I think this is entirely the responsibility of the program calling the library. > Adding /usr/lib/debug support shouldn't be too hard, I will try to figure > out the required path transformations (which are somewhat system-specific). Thanks! Ian
Re: [PATCH] Combine location with block using block_locations
Hi, On Wed, Sep 12, 2012 at 04:47:11PM +0200, Richard Guenther wrote: > On Wed, Sep 12, 2012 at 4:37 PM, Martin Jambor wrote: > > On Wed, Sep 12, 2012 at 04:17:45PM +0200, Michael Matz wrote: > >> On Wed, 12 Sep 2012, Michael Matz wrote: > >> > >> > > Hm, but we shouldn't end up streaming any BLOCKs at this point (nor > >> > > local TYPE_DECLs). Those are supposed to be in the local function > >> > > sections only where no fixup for prevailing decls happens. > >> > > >> > That's true, something is fishy with the patch, will try to investigate. > >> > >> ipa-prop creates the problem. Its tree mapping can contain expressions, > >> expressions can have locations, locations now have blocks. The tree maps > >> are stored as part of jump functions, and hence as part of node summaries. > >> Node summaries are global, hence blocks, and therefore block vars can be > >> placed in the global blob. > >> > >> That's not supposed to happen. The patch below fixes this instance of the > >> problem and makes the testcase work with Dehaos patch with the > >> LTO_NO_PREVAIL call added back in. > >> > >> > >> Ciao, > >> Michael. > >> > >> Index: lto-cgraph.c > >> === > >> --- lto-cgraph.c (revision 190803) > >> +++ lto-cgraph.c (working copy) > >> @@ -1373,6 +1373,7 @@ output_node_opt_summary (struct output_b > >> mechanism to store function local declarations into summaries. > >> */ > >>gcc_assert (parm); > >>streamer_write_uhwi (ob, parm_num); > >> + gcc_assert (IS_UNKNOWN_LOCATION (EXPR_LOCATION (map->new_tree))); > >>stream_write_tree (ob, map->new_tree, true); > >>bp = bitpack_create (ob->main_stream); > >>bp_pack_value (&bp, map->replace_p, 1); > >> Index: ipa-prop.c > >> === > >> --- ipa-prop.c(revision 190803) > >> +++ ipa-prop.c(working copy) > >> @@ -1378,7 +1378,11 @@ ipa_compute_jump_functions_for_edge (str > >>tree arg = gimple_call_arg (call, n); > >> > >>if (is_gimple_ip_invariant (arg)) > >> - ipa_set_jf_constant (jfunc, arg); > >> + { > >> + arg = unshare_expr (arg); > >> + SET_EXPR_LOCATION (arg, UNKNOWN_LOCATION); > >> + ipa_set_jf_constant (jfunc, arg); > >> + } > >>else if (!is_gimple_reg_type (TREE_TYPE (arg)) > >> && TREE_CODE (arg) == PARM_DECL) > > > > Perhaps it would be better if ipa_set_jf_constant did that, just in > > case we ever add another caller? Note that arithmetic functions also > > have their second operand tree stored in them and so perhaps > > ipa_set_jf_arith_pass_through should do the same. > > > > And I it is also necessary to do the same thing at the end of > > determine_known_aggregate_parts, i.e. before assignment to > > item->value. I can post a separate patch if necessary. > > > > I wasn't following this thread but I hope that streaming types does > > not cause this problem. If they do, there are quite a few in various > > jump functions and indirect call graph edges. > > Well, or if jump functions would not use trees ... > Eventually, yes, but it is not a short-term goal. At least I'd like to replace the type-based devirtualization with pointer tracking first, which will avoid the need to store (hopefully) all types and I will have a better idea how to represent the jump functions in a different way. I will still need access to VMT initializers to keep indirect inlining, indirect call inlining hints and IPA-CP devirtualization working, though. But as I said, this is all a long-term plan. Martin
Finish up PR rtl-optimization/44194
This is the PR about the useless spilling to memory of structures that are returned in registers. It was essentially addressed last year by Easwaran with an enhancement of the RTL DSE pass, but Easwaran also noted that we still spill to memory in the simplest cases, e.g. gcc.dg/pr44194-1.c, because expand_call creates a temporary on the stack to store the value returned in registers... The attached patch solves this problem by copying the value into pseudos instead by means of emit_group_move_into_temps. This is sufficient to get rid of the remaining memory accesses for gcc.dg/pr44194-1.c on x86-64 for example, but not on strict-alignment platforms like SPARC64. The problem is that, on strict-alignment platforms, emit_group_store will use bitfield techniques (store_bit_field) to store the returned value, and the bitfield routines (store_bit_field and extract_bit_field) have these lines: /* We may be accessing data outside the field, which means we can alias adjacent data. */ if (MEM_P (op0)) { op0 = shallow_copy_rtx (op0); set_mem_alias_set (op0, 0); set_mem_expr (op0, 0); } Now the enhancement implemented in the RTL DSE pass by Easwaran is precisely based on the MEM_EXPR of MEM objects. The patch solves this problem by implementing a variant of adjust_address along the lines of the comment at the end of adjust_address_1: /* At some point, we should validate that this offset is within the object, if all the appropriate values are known. */ return new_rtx; i.e. adjust_bitfield_address will drop the underlying object of the MEM if it cannot prove that the adjusted memory access is still within its bounds. The bitfield manipulation routines in expmed.c are then changed to invoke adjust_bitfield_address instead of adjust_address and the above special lines in store_bit_field and extract_bit_field are eliminated. While I was at it, I also fixed a probable oversight in extract_bit_field_1 that has bothered me for a while: in the multi-word case, extract_bit_field_1 recurses on extract_bit_field instead of itself (unlike store_bit_field_1), which short-circuits the FALLBACK_P parameter. Tested on x86-64/Linux and SPARC64/Solaris. Comments? 2012-09-12 Eric Botcazou PR rtl-optimization/44194 * calls.c (expand_call): In the PARALLEL case, copy the return value into pseudos instead of spilling it onto the stack. * emit-rtl.c (adjust_address_1): Rename ADJUST into ADJUST_ADDRESS and add new ADJUST_OBJECT parameter. If ADJUST_OBJECT is set, drop the underlying object if it cannot be proved that the adjusted memory access is still within its bounds. (adjust_automodify_address_1): Adjust call to adjust_address_1. (widen_memory_access): Likewise. * expmed.c (store_bit_field_1): Call adjust_bitfield_address instead of adjust_address. Do not drop the underlying object of a MEM. (store_fixed_bit_field): Likewise. (extract_bit_field_1): Likewise. Fix oversight in recursion. (extract_fixed_bit_field): Likewise. * expr.h (adjust_address_1): Adjust prototype. (adjust_address): Adjust call to adjust_address_1. (adjust_address_nv): Likewise. (adjust_bitfield_address): New macro. (adjust_bitfield_address_nv): Likewise. * expr.c (expand_assignment): Handle a PARALLEL in more cases. (store_expr): Likewise. (store_field): Likewise. * dse.c: Fix typos in the head comment. -- Eric Botcazou Index: expr.c === --- expr.c (revision 191198) +++ expr.c (working copy) @@ -4870,8 +4870,16 @@ expand_assignment (tree to, tree from, b /* Handle calls that return values in multiple non-contiguous locations. The Irix 6 ABI has examples of this. */ if (GET_CODE (to_rtx) == PARALLEL) - emit_group_load (to_rtx, value, TREE_TYPE (from), - int_size_in_bytes (TREE_TYPE (from))); + { + if (GET_CODE (value) == PARALLEL) + emit_group_move (to_rtx, value); + else + emit_group_load (to_rtx, value, TREE_TYPE (from), + int_size_in_bytes (TREE_TYPE (from))); + } + else if (GET_CODE (value) == PARALLEL) + emit_group_store (to_rtx, value, TREE_TYPE (from), + int_size_in_bytes (TREE_TYPE (from))); else if (GET_MODE (to_rtx) == BLKmode) emit_block_move (to_rtx, value, expr_size (from), BLOCK_OP_NORMAL); else @@ -4903,9 +4911,16 @@ expand_assignment (tree to, tree from, b else temp = expand_expr (from, NULL_RTX, GET_MODE (to_rtx), EXPAND_NORMAL); + /* Handle calls that return values in multiple non-contiguous locations. + The Irix 6 ABI has examples of this. */ if (GET_CODE (to_rtx) == PARALLEL) - emit_group_load (to_rtx, temp, TREE_TYPE (from), - int_size_in_bytes (TREE_TYPE (from))); + { + if (GET_CODE (temp) == PARALLEL) + emit_group_move (t
Re: [PATCH,i386] Enable prefetchw in processor alias table for AMD targets
On Tue, Sep 11, 2012 at 11:03 AM, wrote: > Hi Maintainers, > > This patch enables "prefetchw" ISA in the processor alias table for targets > amdfam10,barcelona and bdver1,2 and btver1,2. > > GCC regression test passes with the patch. > > Ok for trunk? > > Change log: > > 2012-09-11 Venkataramanan Kumar > > * config/i386/i386.c (processor_alias_table): Enable PTA_PRFCHW > for targets amdfam10, barcelona, bdver1, bdver2, btver1 and btver2. Please note that amdfam10 and barcelona are already generating prefetchw due to PTA_3DNOW flag, so these targets can be removed from the patch. The patch is OK for mainline with that change. Please commit the patch. Thanks, Uros.
[AArch64] fix missing Dwarf call frame information in the epilogue
This patch fixes a bug in the AArch64 epilogue expander which failed to generate consistent information of REG_FRAME_RELATED_EXPR reg notes in some cases; for instance, given the following C source code: int bar (unsigned int); int foo (void) { return bar (0xcafe); } --- CUT --- The -O0 -g code gen for the function 'foo' is: .cfi_startproc stp x29, x30, [sp, -16]! .LCFI0: .cfi_def_cfa_offset 16 .cfi_offset 29, -16 .cfi_offset 30, -8 add x29, sp, 0 .LCFI1: .cfi_def_cfa_register 29 .LM2: mov w0, 57005 bl bar .LM3: ldp x29, x30, [sp], 16 .cfi_restore 30 .cfi_restore 29 ret .cfi_endproc --- CUT --- As can be seen, the CFA reg is X29 from the ADD instruction until the end of the func; this is wrong as the LDP instruction restores X29; the CFA reg should be set to SP after LDP. The patch passes the gcc and g++ testsuites. Is it OK for the aarch64 branch? Thanks, Yufeng gcc/ChangeLog 2012-09-12 Yufeng Zhang * config/aarch64/aarch64.c (aarch64_expand_prologue): Add new local variable 'cfa_reg' and replace stack_pointer_rtx with it in the REG_FRAME_RELATED_EXPR reg note for the load-pair with writeback instruction. gcc/testsuite/ChangeLog 2012-09-12 Yufeng Zhang * gcc.target/aarch64/dwarf-cfa-reg.c: New file.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2d7eba7..f452d3d 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1786,7 +1786,7 @@ aarch64_expand_prologue (void) - original_frame_size - cfun->machine->frame.saved_regs_size); - /* Store pairs and load pairs have a range only of +/- 512. */ + /* Store pairs and load pairs have a range only -512 to 504. */ if (offset >= 512) { /* When the frame has a large size, an initial decrease is done on @@ -1934,6 +1934,7 @@ aarch64_expand_epilogue (bool for_sibcall) HOST_WIDE_INT original_frame_size, frame_size, offset; HOST_WIDE_INT fp_offset; rtx insn; + rtx cfa_reg; aarch64_layout_frame (); original_frame_size = get_frame_size () + cfun->machine->saved_varargs_size; @@ -1946,7 +1947,9 @@ aarch64_expand_epilogue (bool for_sibcall) - original_frame_size - cfun->machine->frame.saved_regs_size); - /* Store pairs and load pairs have a range only of +/- 512. */ + cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx; + + /* Store pairs and load pairs have a range only -512 to 504. */ if (offset >= 512) { offset = original_frame_size + cfun->machine->frame.saved_regs_size; @@ -1977,6 +1980,10 @@ aarch64_expand_epilogue (bool for_sibcall) hard_frame_pointer_rtx, GEN_INT (- fp_offset))); RTX_FRAME_RELATED_P (insn) = 1; + /* As SP is set to (FP - fp_offset), according to the rules in + dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated + from the value of SP from now on. */ + cfa_reg = stack_pointer_rtx; } aarch64_save_or_restore_callee_save_registers @@ -2016,10 +2023,15 @@ aarch64_expand_epilogue (bool for_sibcall) GEN_INT (offset), GEN_INT (GET_MODE_SIZE (DImode) + offset))); RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1; + /* The REG_FRAME_RELATED_EXPR to be attached to the above insn + is either SP = SP + offset or SP = FP + offset, depending on + which register had been used to calculate the CFA before + the insn; note that FP had the same value as SP before the + above insn. */ aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, cfa_reg, GEN_INT (offset; } @@ -2040,7 +2052,6 @@ aarch64_expand_epilogue (bool for_sibcall) RTX_FRAME_RELATED_P (insn) = 1; } } - else { insn = emit_insn (gen_add2_insn (stack_pointer_rtx, diff --git a/gcc/testsuite/gcc.target/aarch64/dwarf-cfa-reg.c b/gcc/testsuite/gcc.target/aarch64/dwarf-cfa-reg.c new file mode 100644 index 000..cce8815 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/dwarf-cfa-reg.c @@ -0,0 +1,14 @@ +/* Verify that CFA register is restored to SP after FP is restored. */ +/* { dg-do compile } */ +/* { dg-options "-O0 -gdwarf-2" } */ +/* { dg-final { scan-assembler ".cfi_restore 30" } } */ +/* { dg-final { scan-assembler ".cfi_restore 29" } } */ +/* { dg-final { scan-assembler ".cfi_def_cfa 31, 0" } } */ +/* { dg-final { scan-assembler "ret" } } */ + +int bar (unsigned int); + +int foo (void) +{ + return bar (0xcafe); +}
Re: Backtrace library [1/3]
On Tue, 11 Sep 2012, Ian Lance Taylor wrote: > The configury is fairly standard. Note that libbacktrace is built as > both a host library (to link into the compilers) and as a target library > (to link into libgo and possibly other libraries). Under what circumstances will the library be built for the target - only if a relevant language such as Go is being built, or unconditionally? If built unconditionally, will the library build OK for the target in situations where no target headers are yet available? (For such builds of compilers used to bootstrap libc it would be usual to use various --disable- options to disable libraries not needed to build libc, and to use --enable-languages=c, but if this library is used on the host side by the compiler then the generic --disable-libbacktrace might not suffice since that would disable the host copy, required by the compiler itself, as well as the target copy.) -- Joseph S. Myers jos...@codesourcery.com
Re: Backtrace library [1/3]
On 09/12/2012 01:08 AM, Ian Lance Taylor wrote: The interface is somewhat constrained in that, on systems that support anonymous mmap, it does not call malloc. That makes it possible to do a symbolic backtrace from a signal handler. It would also make it possible to have a traceback of a segmentation fault caused by corruption of the malloc arena ... -- Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290 Saturnushof 14, 3738 XG Maartensdijk, The Netherlands At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/ Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
LTO partitioning reorg 4/n
Hi, this patch unifies the code as promised in the previous patch. It also cleans up the APIs. We now have enum symbol_class specifying how the symbol should be handled and add_symbol_to_partition handling all the magic of what needs to be dragged into partition and what not. I also commonized some code that was duplicated across different partitioners and added "max" partitioner that puts every symbol into new partition if possible. The patch also fixed undefined reference symbol seen with max partitioning on a testcase. Here we fold through a reference to constant variable into a reference to constant pool. Since initializer of the constant variable is not seen by the partitioner, we do not duplicate the constant pool entry into the unit and we end up with undefined reference. Fixed thus. I will try to prepare fix for 4.7, too. I think it is the problem Andi is seeing with kernel build. Bootstrapped/regtested x86_64-linux. Will commit it after bit of further testing. * common.opt (flto-partition): Add "max". * invoke.texi (flto-partition): Document "max" * lto.c (do_whole_program_analysis): Care timevars, statistics and AUX pointer cleaning. Add max partitioning. * lto-partition.c (enum symbol_class): New. (get_symbol_class): New function. (symbol_partitioned_p): New function. (add_references_to_partition): Remove. (add_aliases_to_partition): Remove. (add_cgraph_node_to_partition_1): Remove. (add_cgraph_node_to_partition): Remove. (add_symbol_to_partition): New function. (add_symbol_to_partition_1): New function. (contained_in_symbol): New function. (partition_cgraph_node_p): Remove. (partition_varpool_node_p): Remove. (partition_symbol_p): Remove. (lto_1_to_1_map): Cleanup. (lto_max_map): New. (lto_balanced_map): Update. (lto_promote_cross_file_statics): Update. * lto-partition.h (lto_max_map): Declare. * timevar.def (TV_WHOPR_PARTITIONING): New timevar. Index: common.opt === --- common.opt (revision 191215) +++ common.opt (working copy) @@ -1439,12 +1439,16 @@ Link-time optimization with number of pa flto-partition=1to1 Common Var(flag_lto_partition_1to1) -Partition functions and vars at linktime based on object files they originate from +Partition symbols and vars at linktime based on object files they originate from flto-partition=balanced Common Var(flag_lto_partition_balanced) Partition functions and vars at linktime into approximately same sized buckets +flto-partition=max +Common Var(flag_lto_partition_max) +Put every symbol into separate partition + flto-partition=none Common Var(flag_lto_partition_none) Disable partioning and streaming Index: lto/lto.c === --- lto/lto.c (revision 191215) +++ lto/lto.c (working copy) @@ -2604,11 +2604,28 @@ lto_wpa_write_files (void) fprintf (cgraph_dump_file, "Writing partition %s to file %s, %i insns\n", part->name, temp_filename, part->insns); + fprintf (cgraph_dump_file, " Symbols in partition: "); for (lsei = lsei_start_in_partition (part->encoder); !lsei_end_p (lsei); lsei_next_in_partition (&lsei)) { symtab_node node = lsei_node (lsei); - fprintf (cgraph_dump_file, "%s ", symtab_node_name (node)); + fprintf (cgraph_dump_file, "%s ", symtab_node_asm_name (node)); + } + fprintf (cgraph_dump_file, "\n Symbols in boundary: "); + for (lsei = lsei_start (part->encoder); !lsei_end_p (lsei); + lsei_next (&lsei)) + { + symtab_node node = lsei_node (lsei); + if (!lto_symtab_encoder_in_partition_p (part->encoder, node)) + { + fprintf (cgraph_dump_file, "%s ", symtab_node_asm_name (node)); + if (symtab_function_p (node) + && lto_symtab_encoder_encode_body_p (part->encoder, cgraph (node))) + fprintf (cgraph_dump_file, "(body included)"); + else if (symtab_variable_p (node) + && lto_symtab_encoder_encode_initializer_p (part->encoder, varpool (node))) + fprintf (cgraph_dump_file, "(initializer included)"); + } } fprintf (cgraph_dump_file, "\n"); } @@ -3093,6 +3107,8 @@ print_lto_report_1 (void) static void do_whole_program_analysis (void) { + symtab_node node; + timevar_start (TV_PHASE_OPT_GEN); /* Note that since we are in WPA mode, materialize_cgraph will not @@ -3127,17 +3143,31 @@ do_whole_program_analysis (void) dump_cgraph (cgraph_dump_file); dump_varpool (cgraph_dump_file); } +#ifdef ENABLE_CHECKING verif
Re: [PATCH] Combine location with block using block_locations
On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther wrote: > On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: >> Now I think we are facing a more complex problem. The data structure >> we use to store the location_adhoc_data are file-static in linemap.c >> in libcpp. These data structures are not guarded by GTY(()). >> Meanwhile, as we have removed the block data structure from >> gimple.gsbase as well as tree.exp (encoding them into an location_t). >> This could cause block being GCed and the LOCATION_BLOCK becoming >> dangling pointers. > > Uh. Note that it is quite important that we are able to garbage-collect > unused > BLOCKs, this is the whole point of removing unused BLOCK scopes in > remove_unused_locals. So this indeed becomes much more complicated ... > What would be desired is that the garbage collector can NULL an entry in > the mapping table when it is not referenced in any other way (that other > reference would be the BLOCK tree as stored in a FUNCTION_DECLs DECL_INITIAL). It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS are created for a large C++ program. This patch saves memory by shrinking tree size, is it a net win or loss without GC those BLOCKS? thanks, David > >> I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from >> gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can >> help me. >> >> Another approach would be guard the location_adhoc_data and related >> data structures in GTY(()). However, this is non-trivial because tree >> is not visible in libcpp. At the same time, my implementation heavily >> relies on hashtable to make the code efficient, thus it's quite tricky >> to make "param_is" and "use_params" work. >> >> The final approach, which I'll try tomorrow, would be move all my >> implementation from libcpp to gcc, and guard them with GTY(()). I >> still haven't thought of any potential problem of this approach. Any >> comments? > > I think moving the mapping to GC in a lazy manner as I described above > would be the way to go. For hashtables GC already supports if_marked, > not sure if similar support is available for arrays/vecs. > > Richard. > >> Thanks, >> Dehao >> >> On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: >>> I saw comments in tree-streamer-out.c: >>> >>> /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug >>> information >>> for early inlining so drop it on the floor instead of ICEing in >>> dwarf2out.c. */ >>> streamer_write_chain (ob, BLOCK_VARS (expr), ref_p); >>> >>> However, what the code is doing seemed contradictory with the comment. >>> Or am I missing something? >>> >>> >>> >>> On Tue, Sep 11, 2012 at 8:32 AM, Michael Matz wrote: Hi, On Tue, 11 Sep 2012, Dehao Chen wrote: > Looks like we have two choices: > > 1. Stream out block info, and use LTO_SET_PREVAIL for TREE_CHAIN(t) This will actually not work correctly in some cases. The problem is, if the prevailing decl is already part of another chain (say in another block_var list) you would break the current chain. Hence block vars need special handling in the lto streamer (another reason why tree_chain is not the most clever think to use for this chain). This problem area needs to be solved somehow if block info is to be preserved correctly. > 2. Don't stream out block info for LTO, and still call LTO_NO_PREVAIL > (TREE_CHAIN (t)). That's also a large hammer as it basically will mean no debug info after LTO :-/ Sigh, at this point I have no good solution that doesn't involve quite some work, perhaps your hack is good enough for the time being, though I hate it :) >>> >>> I got it. Then I'll keep the patch as it is (remove the >>> LTO_NO_PREVAIL), and work with Honza to resolve the issue he had, and >>> then we should be good to check in? >>> >>> Thanks, >>> Dehao >>> Ciao, Michael.
[PATCH] proposed fix for PRs target/54516 & rtl-optimization/54540
Although the mersenne twister code is new, the build/runtime failures of PRs target/54516 and PR rtl-optimization/54540 can be tracked back to the addition of the variant add %0("=rk"), %1("0"), %2("rk") // size=2 to the thumb2 backend. This is causing reload to try to do clever things when we have a large stack adjustment. Prior to reload we have a sequence of the form: r:SI = sp:SI + (-some_const1) sp:SI = r:SI + (-some_const2) Since there is no thumb2 instruction to directly perform the second operation some reloading is needed. Reload determines that the new variant described above is a viable candidate and sets about trying to create reloads for it. The first thing it does is rearranges the operands to create sp:SI = (-some_const2) + r:SI which now needs precisely one reload. The tie operation forces a reload into sp of the value -some_const2, not realizing that putting random values into SP is potentially very dangerous to the health of your program (or system if you're running at a privileged level). It seems to me that find_dummy_reload is doing an unsafe thing by permitting the use of a register in fixed_regs[] as a reload register -- writing to fixed regs could have unspecified side effects on the machine, so they should only be used in the ways that earlier passes have deemed to be safe, and using them as a temporary is not one of those. So this patch fixes the problem by forcing find_dummy_reload to reject OUT as a valid reload register for IN when OUT overlaps a register mentioned in fixed_regs[]. Bootstrapped on arm-linux-gnueabi with no regressions. No new tests are needed as this fixes the execution failures in the mersenne twister tests. * reload.c (find_dummy_reload): Don't use OUT as a reload reg for IN if it overlaps a fixed register. OK?--- reload.c(revision 191208) +++ reload.c(local) @@ -2036,7 +2036,12 @@ find_dummy_reload (rtx real_in, rtx real However, we only ignore IN in its role as this reload. If the insn uses IN elsewhere and it contains OUT, that counts. We can't be sure it's the "same" operand -so it might not go through this reload. */ +so it might not go through this reload. + + We also need to avoid using OUT if it, or part of it, is a + fixed register. Modifying such registers, even transiently, + may have undefined effects on the machine, such as modifying + the stack pointer. */ saved_rtx = *inloc; *inloc = const0_rtx; @@ -2049,7 +2054,8 @@ find_dummy_reload (rtx real_in, rtx real for (i = 0; i < nwords; i++) if (! TEST_HARD_REG_BIT (reg_class_contents[(int) rclass], -regno + i)) +regno + i) + || fixed_regs[regno + i]) break; if (i == nwords)
Re: [Patch ARM] implement bswap16
On 11/09/12 16:04, Christophe Lyon wrote: > On 11 September 2012 12:52, Richard Earnshaw wrote: >> Try something like: >> >> short foo(int); >> >> short swaps (short x, int y) >> { >> int z = x; >> if (y) >> z = __builtin_bswap16(x); >> return foo (z); >> } >> >> If that's not enough, try adding 1 to z before calling foo. >> > > Thanks, it works. > It's surprising however that 'return z' isn't enough. > > Here is a new version of the patch, which also transforms the 32 bits > arm_rev/thumb1_rev into arm_rev/arm_rev_cond. > > I have enhanced the testcase too. > > Christophe.= > > > bswap16.patch > > Thanks, This is OK. Ramana has just pointed out to me that predication these days can be set on a per-alternative basis, so all those extra patterns for cond-exec can be eliminated after all (sigh!). I'm not going to insist that you fix that (unless you want to); We've been round this patch too many times now and the code you've written isn't wrong... R.
Re: [PATCH] proposed fix for PRs target/54516 & rtl-optimization/54540
Richard Earnshaw wrote: > It seems to me that find_dummy_reload is doing an unsafe thing by > permitting the use of a register in fixed_regs[] as a reload register -- > writing to fixed regs could have unspecified side effects on the > machine, so they should only be used in the ways that earlier passes > have deemed to be safe, and using them as a temporary is not one of those. This sounds reasonable, agreed. > * reload.c (find_dummy_reload): Don't use OUT as a reload reg > for IN if it overlaps a fixed register. OK. Thanks, Ulrich -- Dr. Ulrich Weigand GNU Toolchain for Linux on System z and Cell BE ulrich.weig...@de.ibm.com
Re: [PATCH] Combine location with block using block_locations
There are two parts that needs memory management: 1. The BLOCK structure. This is managed by GC. I originally thought that removing blocks from tree.gsbase would paralyze GC. This turned out not to be a concern because DECL_INITIAL will still mark those used tree nodes. This patch may decrease the memory consumption by removing blocks from tree/gimple. However, as it makes more blocks become used, they also increase the memory consumption. 2. The data structure in libcpp that maintains the hashtable for the location->block mapping. This is relatively minor because for the largest source I've seen, it only maintains less than 100K entries in the array (less than 1M total memory consumption). However, as it is a global data structure, it may make LTO unhappy. Honza is helping testing the memory consumption on LTO (but we first need to make this patch work for LTO). If the LTO result turns out ok, we probably don't want to put these under GC because: 1. it'll make things much more complicated. 2. using self managed memory is more efficient (as this is frequently used in many passes). 3. not using GC actually saves memory because even though the block is in the map, it can still be GCed as soon as it's not reachable from DECL_INITIAL. I've tested this on some very large C++ files (each one takes more than 10s to build), the memory consumption does not see noticeable increase/decrease. Thanks, Dehao On Wed, Sep 12, 2012 at 9:39 AM, Xinliang David Li wrote: > On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther > wrote: >> On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: >>> Now I think we are facing a more complex problem. The data structure >>> we use to store the location_adhoc_data are file-static in linemap.c >>> in libcpp. These data structures are not guarded by GTY(()). >>> Meanwhile, as we have removed the block data structure from >>> gimple.gsbase as well as tree.exp (encoding them into an location_t). >>> This could cause block being GCed and the LOCATION_BLOCK becoming >>> dangling pointers. >> >> Uh. Note that it is quite important that we are able to garbage-collect >> unused >> BLOCKs, this is the whole point of removing unused BLOCK scopes in >> remove_unused_locals. So this indeed becomes much more complicated ... >> What would be desired is that the garbage collector can NULL an entry in >> the mapping table when it is not referenced in any other way (that other >> reference would be the BLOCK tree as stored in a FUNCTION_DECLs >> DECL_INITIAL). > > It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS > are created for a large C++ program. This patch saves memory by > shrinking tree size, is it a net win or loss without GC those BLOCKS? > > thanks, > > David > > >> >>> I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from >>> gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can >>> help me. >>> >>> Another approach would be guard the location_adhoc_data and related >>> data structures in GTY(()). However, this is non-trivial because tree >>> is not visible in libcpp. At the same time, my implementation heavily >>> relies on hashtable to make the code efficient, thus it's quite tricky >>> to make "param_is" and "use_params" work. >>> >>> The final approach, which I'll try tomorrow, would be move all my >>> implementation from libcpp to gcc, and guard them with GTY(()). I >>> still haven't thought of any potential problem of this approach. Any >>> comments? >> >> I think moving the mapping to GC in a lazy manner as I described above >> would be the way to go. For hashtables GC already supports if_marked, >> not sure if similar support is available for arrays/vecs. >> >> Richard. >> >>> Thanks, >>> Dehao >>> >>> On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: I saw comments in tree-streamer-out.c: /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug information for early inlining so drop it on the floor instead of ICEing in dwarf2out.c. */ streamer_write_chain (ob, BLOCK_VARS (expr), ref_p); However, what the code is doing seemed contradictory with the comment. Or am I missing something? On Tue, Sep 11, 2012 at 8:32 AM, Michael Matz wrote: > Hi, > > On Tue, 11 Sep 2012, Dehao Chen wrote: > >> Looks like we have two choices: >> >> 1. Stream out block info, and use LTO_SET_PREVAIL for TREE_CHAIN(t) > > This will actually not work correctly in some cases. The problem is, if > the prevailing decl is already part of another chain (say in another > block_var list) you would break the current chain. Hence block vars need > special handling in the lto streamer (another reason why tree_chain is not > the most clever think to use for this chain). This problem area needs to > be solved somehow if block info is to be preserved correctly. > >> 2. Don't stream out block info for LTO,
Re: PATCH: PR target/54445: TLS array lookup with negative constant is not combined into a single instruction
On Sun, Sep 2, 2012 at 7:20 AM, H.J. Lu wrote: > Hi, > > When x86-64 TLS support was added by: > > http://gcc.gnu.org/ml/gcc-patches/2002-10/msg01262.html > > it didn't allow negative offset. Jakub, do you remember the reason for > it? I tested this patch on Linux/x86-64 and used the new GCC to build > glibc for x86-64 and x32. There are no regressions. OK to install? > > Thanks. > > > H.J. > > gcc/ > > 2012-09-02 H.J. Lu > > PR target/54445 > * config/i386/predicates.md (x86_64_immediate_operand): Allow > negative offset for UNSPEC_DTPOFF/UNSPEC_NTPOFF. > > gcc/testsuite/ > 2012-09-02 H.J. Lu > > PR target/54445 > * gcc.target/i386/pr54445-1.c: New file. > * gcc.target/i386/pr54445-2.c: Likewise. > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index 55e4b56..159594e 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -228,8 +228,7 @@ > { > case UNSPEC_DTPOFF: > case UNSPEC_NTPOFF: > - if (offset > 0 > - && trunc_int_for_mode (offset, SImode) == offset) > + if (trunc_int_for_mode (offset, SImode) == offset) > return true; > } > break; > diff --git a/gcc/testsuite/gcc.target/i386/pr54445-1.c > b/gcc/testsuite/gcc.target/i386/pr54445-1.c > new file mode 100644 > index 000..72ef84e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr54445-1.c > @@ -0,0 +1,24 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +__thread unsigned char tls_array[64]; > + > +unsigned char > +__attribute__ ((noinline)) > +tls_array_lookup_with_negative_constant(long long int position) { > + return tls_array[position - 1]; > +} > + > +int > +main () > +{ > + int i; > + > + for (i = 0; i < sizeof (tls_array) / sizeof (tls_array[0]); i++) > +tls_array[i] = i; > + > + for (i = 0; i < sizeof (tls_array) / sizeof (tls_array[0]); i++) > +if (i != tls_array_lookup_with_negative_constant (i + 1)) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr54445-2.c > b/gcc/testsuite/gcc.target/i386/pr54445-2.c > new file mode 100644 > index 000..5151c13 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr54445-2.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile { target { *-*-linux* && { ! { ia32 } } } } } */ > +/* { dg-options "-O2 -fno-pic" } */ > + > +__thread unsigned char tls_array[64]; > + > +unsigned char > +tls_array_lookup_with_negative_constant(long long int position) { > + return tls_array[position - 1]; > +} > + > +/* { dg-final { scan-assembler "mov(b|zbl)\[ > \t\](%fs:)?tls_array@tpoff-1\\(%" } } */ Hi Uros, Is this OK? Thanks. -- H.J.
Re: [PATCH] Combine location with block using block_locations
On Wed, Sep 12, 2012 at 10:05 AM, Dehao Chen wrote: > There are two parts that needs memory management: > > 1. The BLOCK structure. This is managed by GC. I originally thought > that removing blocks from tree.gsbase would paralyze GC. This turned > out not to be a concern because DECL_INITIAL will still mark those > used tree nodes. This patch may decrease the memory consumption by > removing blocks from tree/gimple. However, as it makes more blocks > become used, they also increase the memory consumption. You mean when you also make the location table GC root. Can you share the mem-stats information for the large program with and without your patch? thanks, David > 2. The data structure in libcpp that maintains the hashtable for the > location->block mapping. This is relatively minor because for the > largest source I've seen, it only maintains less than 100K entries in > the array (less than 1M total memory consumption). However, as it is a > global data structure, it may make LTO unhappy. Honza is helping > testing the memory consumption on LTO (but we first need to make this > patch work for LTO). If the LTO result turns out ok, we probably don't > want to put these under GC because: 1. it'll make things much more > complicated. 2. using self managed memory is more efficient (as this > is frequently used in many passes). 3. not using GC actually saves > memory because even though the block is in the map, it can still be > GCed as soon as it's not reachable from DECL_INITIAL. > > I've tested this on some very large C++ files (each one takes more > than 10s to build), the memory consumption does not see noticeable > increase/decrease. > > Thanks, > Dehao > > On Wed, Sep 12, 2012 at 9:39 AM, Xinliang David Li wrote: >> On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther >> wrote: >>> On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: Now I think we are facing a more complex problem. The data structure we use to store the location_adhoc_data are file-static in linemap.c in libcpp. These data structures are not guarded by GTY(()). Meanwhile, as we have removed the block data structure from gimple.gsbase as well as tree.exp (encoding them into an location_t). This could cause block being GCed and the LOCATION_BLOCK becoming dangling pointers. >>> >>> Uh. Note that it is quite important that we are able to garbage-collect >>> unused >>> BLOCKs, this is the whole point of removing unused BLOCK scopes in >>> remove_unused_locals. So this indeed becomes much more complicated ... >>> What would be desired is that the garbage collector can NULL an entry in >>> the mapping table when it is not referenced in any other way (that other >>> reference would be the BLOCK tree as stored in a FUNCTION_DECLs >>> DECL_INITIAL). >> >> It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS >> are created for a large C++ program. This patch saves memory by >> shrinking tree size, is it a net win or loss without GC those BLOCKS? >> >> thanks, >> >> David >> >> >>> I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can help me. Another approach would be guard the location_adhoc_data and related data structures in GTY(()). However, this is non-trivial because tree is not visible in libcpp. At the same time, my implementation heavily relies on hashtable to make the code efficient, thus it's quite tricky to make "param_is" and "use_params" work. The final approach, which I'll try tomorrow, would be move all my implementation from libcpp to gcc, and guard them with GTY(()). I still haven't thought of any potential problem of this approach. Any comments? >>> >>> I think moving the mapping to GC in a lazy manner as I described above >>> would be the way to go. For hashtables GC already supports if_marked, >>> not sure if similar support is available for arrays/vecs. >>> >>> Richard. >>> Thanks, Dehao On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: > I saw comments in tree-streamer-out.c: > > /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug > information > for early inlining so drop it on the floor instead of ICEing in > dwarf2out.c. */ > streamer_write_chain (ob, BLOCK_VARS (expr), ref_p); > > However, what the code is doing seemed contradictory with the comment. > Or am I missing something? > > > > On Tue, Sep 11, 2012 at 8:32 AM, Michael Matz wrote: >> Hi, >> >> On Tue, 11 Sep 2012, Dehao Chen wrote: >> >>> Looks like we have two choices: >>> >>> 1. Stream out block info, and use LTO_SET_PREVAIL for TREE_CHAIN(t) >> >> This will actually not work correctly in some cases. The problem is, if >> the prevailing decl is already part of another chain (say in another >> blo
Re: Finish up PR rtl-optimization/44194
On Wed, Sep 12, 2012 at 8:37 AM, Eric Botcazou wrote: > This is the PR about the useless spilling to memory of structures that are > returned in registers. It was essentially addressed last year by Easwaran > with > an enhancement of the RTL DSE pass, but Easwaran also noted that we still > spill > to memory in the simplest cases, e.g. gcc.dg/pr44194-1.c, because expand_call > creates a temporary on the stack to store the value returned in registers... > > The attached patch solves this problem by copying the value into pseudos > instead by means of emit_group_move_into_temps. This is sufficient to get rid > of the remaining memory accesses for gcc.dg/pr44194-1.c on x86-64 for example, > but not on strict-alignment platforms like SPARC64. > > The problem is that, on strict-alignment platforms, emit_group_store will use > bitfield techniques (store_bit_field) to store the returned value, and the > bitfield routines (store_bit_field and extract_bit_field) have these lines: > > /* We may be accessing data outside the field, which means > we can alias adjacent data. */ > if (MEM_P (op0)) > { > op0 = shallow_copy_rtx (op0); > set_mem_alias_set (op0, 0); > set_mem_expr (op0, 0); > } > > Now the enhancement implemented in the RTL DSE pass by Easwaran is precisely > based on the MEM_EXPR of MEM objects. > > The patch solves this problem by implementing a variant of adjust_address > along > the lines of the comment at the end of adjust_address_1: > > /* At some point, we should validate that this offset is within the object, > if all the appropriate values are known. */ > return new_rtx; > > i.e. adjust_bitfield_address will drop the underlying object of the MEM if it > cannot prove that the adjusted memory access is still within its bounds. > The bitfield manipulation routines in expmed.c are then changed to invoke > adjust_bitfield_address instead of adjust_address and the above special lines > in store_bit_field and extract_bit_field are eliminated. > > While I was at it, I also fixed a probable oversight in extract_bit_field_1 > that has bothered me for a while: in the multi-word case, extract_bit_field_1 > recurses on extract_bit_field instead of itself (unlike store_bit_field_1), > which short-circuits the FALLBACK_P parameter. > > Tested on x86-64/Linux and SPARC64/Solaris. Comments? > > > 2012-09-12 Eric Botcazou > > PR rtl-optimization/44194 > * calls.c (expand_call): In the PARALLEL case, copy the return value > into pseudos instead of spilling it onto the stack. > * emit-rtl.c (adjust_address_1): Rename ADJUST into ADJUST_ADDRESS and > add new ADJUST_OBJECT parameter. > If ADJUST_OBJECT is set, drop the underlying object if it cannot be > proved that the adjusted memory access is still within its bounds. > (adjust_automodify_address_1): Adjust call to adjust_address_1. > (widen_memory_access): Likewise. > * expmed.c (store_bit_field_1): Call adjust_bitfield_address instead > of adjust_address. Do not drop the underlying object of a MEM. > (store_fixed_bit_field): Likewise. > (extract_bit_field_1): Likewise. Fix oversight in recursion. > (extract_fixed_bit_field): Likewise. > * expr.h (adjust_address_1): Adjust prototype. > (adjust_address): Adjust call to adjust_address_1. > (adjust_address_nv): Likewise. > (adjust_bitfield_address): New macro. > (adjust_bitfield_address_nv): Likewise. > * expr.c (expand_assignment): Handle a PARALLEL in more cases. > (store_expr): Likewise. > (store_field): Likewise. > > * dse.c: Fix typos in the head comment. Will it help http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54315 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=28831 Thanks. -- H.J.
Re: [PATCH] Combine location with block using block_locations
There is another bug in the patch (not covered by unittests, discovered through spec benchmarks). When we remove unused locals, we do not mark the block as used for debug stmt, but gimple-streamer-out will still stream out blocks for debug stmt. There can be 2 fixes: 1. --- a/gcc/gimple-streamer-out.c +++ b/gcc/gimple-streamer-out.c @@ -77,7 +77,8 @@ output_gimple_stmt (struct output_block *ob, gimple stmt) lto_output_location (ob, LOCATION_LOCUS (gimple_location (stmt))); /* Emit the lexical block holding STMT. */ - stream_write_tree (ob, gimple_block (stmt), true); + if (!is_gimple_debug (stmt)) +stream_write_tree (ob, gimple_block (stmt), true); /* Emit the operands. */ switch (gimple_code (stmt)) 2. --- a/gcc/tree-ssa-live.c +++ b/gcc/tree-ssa-live.c @@ -726,9 +726,6 @@ remove_unused_locals (void) gimple stmt = gsi_stmt (gsi); tree b = gimple_block (stmt); - if (is_gimple_debug (stmt)) - continue; - if (gimple_clobber_p (stmt)) { have_local_clobbers = true; Either fix could work. Any suggests which one should we go? Thanks, Dehao On Wed, Sep 12, 2012 at 10:05 AM, Dehao Chen wrote: > There are two parts that needs memory management: > > 1. The BLOCK structure. This is managed by GC. I originally thought > that removing blocks from tree.gsbase would paralyze GC. This turned > out not to be a concern because DECL_INITIAL will still mark those > used tree nodes. This patch may decrease the memory consumption by > removing blocks from tree/gimple. However, as it makes more blocks > become used, they also increase the memory consumption. > 2. The data structure in libcpp that maintains the hashtable for the > location->block mapping. This is relatively minor because for the > largest source I've seen, it only maintains less than 100K entries in > the array (less than 1M total memory consumption). However, as it is a > global data structure, it may make LTO unhappy. Honza is helping > testing the memory consumption on LTO (but we first need to make this > patch work for LTO). If the LTO result turns out ok, we probably don't > want to put these under GC because: 1. it'll make things much more > complicated. 2. using self managed memory is more efficient (as this > is frequently used in many passes). 3. not using GC actually saves > memory because even though the block is in the map, it can still be > GCed as soon as it's not reachable from DECL_INITIAL. > > I've tested this on some very large C++ files (each one takes more > than 10s to build), the memory consumption does not see noticeable > increase/decrease. > > Thanks, > Dehao > > On Wed, Sep 12, 2012 at 9:39 AM, Xinliang David Li wrote: >> On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther >> wrote: >>> On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: Now I think we are facing a more complex problem. The data structure we use to store the location_adhoc_data are file-static in linemap.c in libcpp. These data structures are not guarded by GTY(()). Meanwhile, as we have removed the block data structure from gimple.gsbase as well as tree.exp (encoding them into an location_t). This could cause block being GCed and the LOCATION_BLOCK becoming dangling pointers. >>> >>> Uh. Note that it is quite important that we are able to garbage-collect >>> unused >>> BLOCKs, this is the whole point of removing unused BLOCK scopes in >>> remove_unused_locals. So this indeed becomes much more complicated ... >>> What would be desired is that the garbage collector can NULL an entry in >>> the mapping table when it is not referenced in any other way (that other >>> reference would be the BLOCK tree as stored in a FUNCTION_DECLs >>> DECL_INITIAL). >> >> It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS >> are created for a large C++ program. This patch saves memory by >> shrinking tree size, is it a net win or loss without GC those BLOCKS? >> >> thanks, >> >> David >> >> >>> I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can help me. Another approach would be guard the location_adhoc_data and related data structures in GTY(()). However, this is non-trivial because tree is not visible in libcpp. At the same time, my implementation heavily relies on hashtable to make the code efficient, thus it's quite tricky to make "param_is" and "use_params" work. The final approach, which I'll try tomorrow, would be move all my implementation from libcpp to gcc, and guard them with GTY(()). I still haven't thought of any potential problem of this approach. Any comments? >>> >>> I think moving the mapping to GC in a lazy manner as I described above >>> would be the way to go. For hashtables GC already supports if_marked, >>> not sure if similar support is available for a
Re: Backtrace library [1/3]
On 9/12/12, Ian Lance Taylor wrote: > On Sep 11, 2012 Lawrence Crowl wrote: > > On 9/11/12, Ian Lance Taylor wrote: > > > This patch is the interface to and configury of libbacktrace. > > > I've separated these out as the parts of libbacktrace that > > > require the most review. The interface to libbacktrace is in > > > the file backtrace.h. This is what callers will use. The file > > > backtrace-supported.h is also available so that programs can > > > see whether calling the backtrace library will work at all. > > > > The interface relies on global data in the library. Wouldn't it > > be better to expose the state as an additional parameter to > > enable concurrent access by different threads? That parameter > > could then be modeled as 'this' parameter, addressing Gaby's > > suggesting. > > I went ahead and added a state parameter to the interface. > I've attached the updated patch. How about typing it as a pointer to an incomplete struct? extern void *backtrace_create_state (... becomes, e.g., struct backtrace_state; extern backtrace_state *backtrace_create_state (... -- Lawrence Crowl
Re: Change double_int calls to new interface.
On 9/12/12, Mark Kettenis wrote: >> Date: Tue, 11 Sep 2012 17:03:39 -0700 >> From: Ian Lance Taylor >> >> On Tue, Sep 11, 2012 at 3:12 PM, Lawrence Crowl >> wrote: >> > On 9/11/12, Andreas Schwab wrote: >> >> Mark Kettenis writes: >> >>> In file included from ../../../src/gcc/gcc/mcf.c:47:0: >> >>> ../../../src/gcc/gcc/mcf.c: In function 'void dump_fixup_edge(FILE*, >> >>> fixup_graph_type*, fixup_edge_p)': >> >>> ../../../src/gcc/gcc/system.h:288:78: error: integer overflow in >> >>> expression [-Werror=overflow] >> >> >> >> This is PR54528. >> > >> > The expression itself looks correct. I have not been able to >> > duplicate the problem on x86. I am now waiting on access to the >> > compile farm for access to a hppa system. Does anyone have more >> > specific information on the condition that generates the error? >> >> I haven't tried, but I bet you can get it to happen if you build a >> 32-bit compiler--that is, build a compiler using -m32 so that the >> compiler itself runs 32 bit code. > > I don't think that helps since I don't see the problem on OpenBSD/i386 > (i386-unknown-openbsd5.2) whith a strictly 32-bit compiler. As I > wrote earlier, I suspect the key factor is HOST_WIDE_INT being 32-bit, > which means 32-bit architectures that don't sey need_64bit_hwint in > config.gcc. The fact that m68k is affected as well strengthens my > suspicion. So I expect arm to show the problem as well. But people > probably haven't noticed since they cross-compile. > > Anyway the "diff" below seems to fix the issue. Guess the replacement > doesn't work! Should be a big enough clue for Lawrence to come up > with a proper fix. > > Index: fold-const.c > === > --- fold-const.c (revision 191150) > +++ fold-const.c (working copy) > @@ -982,13 +982,15 @@ >break; > > case MINUS_EXPR: > -/* FIXME(crowl) Remove this code if the replacment works. > + /* FIXME(crowl) Remove this code if the replacment works. */ > +#if 1 >neg_double (op2.low, op2.high, &res.low, &res.high); >add_double (op1.low, op1.high, res.low, res.high, > &res.low, &res.high); >overflow = OVERFLOW_SUM_SIGN (res.high, op2.high, op1.high); > -*/ > +#else >res = op1.add_with_sign (-op2, false, &overflow); > +#endif >break; > > case MULT_EXPR: My suspicions were on the shift operation. I will update the last patch I sent out with a fix. I will probably need help testing. -- Lawrence Crowl
Loop stride optimization hint
Hi, for Fortran one of common reason to inline is because array descriptor is known and defines loop stride. This patch makes ipa-inline-analysis to notice these cases. Bootstrapped/regtested x86_64-linux, will commit it tomorrow if there are no complains. Honza * ipa-inline-analysis.c (dump_inline_hints): Dump loop stride. (set_hint_predicate): New function. (reset_inline_summary): Reset loop stride. (remap_predicate_after_duplication): New function. (remap_hint_predicate_after_duplication): New function. (inline_node_duplication_hook): Update. (dump_inline_summary): Dump stride summaries. (estimate_function_body_sizes): Compute strides. (remap_hint_predicate): New function. (inline_merge_summary): Use it. (inline_read_section): Read stride. (inline_write_summary): Write stride. * ipa-inline.c (want_inline_small_function_p): Handle strides. (edge_badness): Likewise. * ipa-inline.h (inline_hints_vals): Add stride hint. (inline_summary): Update stride. * gcc.dg/ipa/inlinehint-2.c: New testcase. Index: ipa-inline-analysis.c === *** ipa-inline-analysis.c (revision 191228) --- ipa-inline-analysis.c (working copy) *** dump_inline_hints (FILE *f, inline_hints *** 634,639 --- 634,644 hints &= ~INLINE_HINT_loop_iterations; fprintf (f, " loop_iterations"); } + if (hints & INLINE_HINT_loop_stride) + { + hints &= ~INLINE_HINT_loop_stride; + fprintf (f, " loop_stride"); + } gcc_assert (!hints); } *** edge_set_predicate (struct cgraph_edge * *** 719,724 --- 724,749 } } + /* Set predicate for hint *P. */ + + static void + set_hint_predicate (struct predicate **p, struct predicate new_predicate) + { + if (false_predicate_p (&new_predicate) + || true_predicate_p (&new_predicate)) + { + if (*p) + pool_free (edge_predicate_pool, *p); + *p = NULL; + } + else + { + if (!*p) + *p = (struct predicate *)pool_alloc (edge_predicate_pool); + **p = new_predicate; + } + } + /* KNOWN_VALS is partial mapping of parameters of NODE to constant values. KNOWN_AGGS is a vector of aggreggate jump functions for each parameter. *** reset_inline_summary (struct cgraph_node *** 953,958 --- 978,988 pool_free (edge_predicate_pool, info->loop_iterations); info->loop_iterations = NULL; } + if (info->loop_stride) + { + pool_free (edge_predicate_pool, info->loop_stride); + info->loop_stride = NULL; + } VEC_free (condition, gc, info->conds); VEC_free (size_time_entry,gc, info->entry); for (e = node->callees; e; e = e->next_callee) *** inline_node_removal_hook (struct cgraph_ *** 975,980 --- 1005,1056 memset (info, 0, sizeof (inline_summary_t)); } + /* Remap predicate P of former function to be predicate of duplicated functoin. +POSSIBLE_TRUTHS is clause of possible truths in the duplicated node, +INFO is inline summary of the duplicated node. */ + + static struct predicate + remap_predicate_after_duplication (struct predicate *p, + clause_t possible_truths, + struct inline_summary *info) + { + struct predicate new_predicate = true_predicate (); + int j; + for (j = 0; p->clause[j]; j++) + if (!(possible_truths & p->clause[j])) + { + new_predicate = false_predicate (); + break; + } + else + add_clause (info->conds, &new_predicate, + possible_truths & p->clause[j]); + return new_predicate; + } + + /* Same as remap_predicate_after_duplication but handle hint predicate *P. +Additionally care about allocating new memory slot for updated predicate +and set it to NULL when it becomes true or false (and thus uninteresting). + */ + + static void + remap_hint_predicate_after_duplication (struct predicate **p, + clause_t possible_truths, + struct inline_summary *info) + { + struct predicate new_predicate; + + if (!*p) + return; + + new_predicate = remap_predicate_after_duplication (*p, +possible_truths, +info); + /* We do not want to free previous predicate; it is used by node origin. */ + *p = NULL; + set_hint_predicate (p, new_predicate); + } + /* Hook that is called by cgraph.c when a node is duplicated. */ *** inline_node_duplication_hook (struct cgr *** 1042,1057 to be true. */ for (i = 0; VEC_iterate (size_time_entry, entry, i, e); i++) { ! struct predica
Re: PATCH: PR target/54445: TLS array lookup with negative constant is not combined into a single instruction
On Wed, Sep 12, 2012 at 7:17 PM, H.J. Lu wrote: >> When x86-64 TLS support was added by: >> >> http://gcc.gnu.org/ml/gcc-patches/2002-10/msg01262.html >> >> it didn't allow negative offset. Jakub, do you remember the reason for >> it? I tested this patch on Linux/x86-64 and used the new GCC to build >> glibc for x86-64 and x32. There are no regressions. OK to install? > >> 2012-09-02 H.J. Lu >> >> PR target/54445 >> * config/i386/predicates.md (x86_64_immediate_operand): Allow >> negative offset for UNSPEC_DTPOFF/UNSPEC_NTPOFF. >> >> gcc/testsuite/ >> 2012-09-02 H.J. Lu >> >> PR target/54445 >> * gcc.target/i386/pr54445-1.c: New file. >> * gcc.target/i386/pr54445-2.c: Likewise. The spec does not require positive offsets, and truncations will be detected by the linker anyway. So, looks OK to me. Thanks, Uros.
[4.7 PATCH, i386]: Enable prefetchw for selected AMD targets
Hello! Newer AMD processors does not enable 3dNOW anymore. However, prefetchw depends on TARGET_3DNOW, so it is not generated anymore. Following patch is a 4.7 version of mainline patch [1]. 2012-09-12 Uros Bizjak * config/i386/i386.h (x86_prefetchw): New global variable. (TARGET_PREFETCHW): New macro. * config/i386/i386.c (PTA_PREFETCHW): Ditto. (processor_alias_table): Add PTA_PREFETCHW to bdver1, bdver2 and btver1. (ix86_option_override_internal): Set x86_prefetchw for PTA_PREFETCHW targets. * config/i386/i386.md (prefetch): Expand to prefetchw for TARGET_PREFETCHW. (*prefetch_3dnow_): Also enable for TARGET_PREFETCHW. Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}. Will be committed to 4.7 branch after [1] is committed to mainline. [1] http://gcc.gnu.org/ml/gcc-patches/2012-09/msg00670.html Uros. Index: config/i386/i386.c === --- config/i386/i386.c (revision 191227) +++ config/i386/i386.c (working copy) @@ -2428,9 +2428,12 @@ enum processor_type ix86_tune; /* Which instruction set architecture to use. */ enum processor_type ix86_arch; -/* true if sse prefetch instruction is not NOOP. */ +/* True if processor has SSE prefetch instruction. */ int x86_prefetch_sse; +/* True if processor has prefetchw instruction. */ +int x86_prefetchw; + /* -mstackrealign option */ static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer"; @@ -2931,6 +2934,8 @@ ix86_option_override_internal (bool main_args_p) #define PTA_XOP(HOST_WIDE_INT_1 << 29) #define PTA_AVX2 (HOST_WIDE_INT_1 << 30) #define PTA_BMI2 (HOST_WIDE_INT_1 << 31) +#define PTA_PREFETCHW (HOST_WIDE_INT_1 << 32) + /* if this reaches 64, need to widen struct pta flags below */ static struct pta @@ -3038,19 +3043,19 @@ ix86_option_override_internal (bool main_args_p) PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM}, {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 - | PTA_XOP | PTA_LWP}, + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 + | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX + | PTA_FMA4 | PTA_XOP | PTA_LWP}, {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 - | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 + | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX + | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C | PTA_FMA}, {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64, -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 -| PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16}, + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16}, {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO, 0 /* flags are only used for -march switch. */ }, {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64, @@ -3358,6 +3363,8 @@ ix86_option_override_internal (bool main_args_p) ix86_isa_flags |= OPTION_MASK_ISA_F16C; if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)) x86_prefetch_sse = true; + if (processor_alias_table[i].flags & PTA_PREFETCHW) + x86_prefetchw = true; break; } Index: config/i386/i386.h === --- config/i386/i386.h (revision 191226) +++ config/i386/i386.h (working copy) @@ -450,9 +450,11 @@ extern unsigned char ix86_arch_features[X86_ARCH_L #define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387) extern int x86_prefetch_sse; - #define TARGET_PREFETCH_SSEx86_prefetch_sse +extern int x86_prefetchw; +#define TARGET_PREFETCHW x86_prefetchw + #define ASSEMBLER_DIALECT (ix86_asm_dialect) #define TARGET_SSE_MATH((ix86_fpmath & FPMATH_SSE) != 0) Index: config/i386/i386.md === --- config/i386/i386.md (revision 191227) +++ config/i386/i386.md (working copy) @@ -17671,12 +17671,14 @@ gcc_assert (locality >= 0 && locality <= 3); gcc_assert (GET_MODE (operands[0]) == Pmode || GET_MODE
Re: [4.7 PATCH, i386]: Enable prefetchw for selected AMD targets
On Wed, Sep 12, 2012 at 11:20 AM, Uros Bizjak wrote: > Hello! > > Newer AMD processors does not enable 3dNOW anymore. However, prefetchw > depends on TARGET_3DNOW, so it is not generated anymore. Following > patch is a 4.7 version of mainline patch [1]. > > 2012-09-12 Uros Bizjak > > * config/i386/i386.h (x86_prefetchw): New global variable. > (TARGET_PREFETCHW): New macro. > * config/i386/i386.c (PTA_PREFETCHW): Ditto. > (processor_alias_table): Add PTA_PREFETCHW to > bdver1, bdver2 and btver1. > (ix86_option_override_internal): Set x86_prefetchw for > PTA_PREFETCHW targets. > * config/i386/i386.md (prefetch): Expand to prefetchw > for TARGET_PREFETCHW. > (*prefetch_3dnow_): Also enable for TARGET_PREFETCHW. > > Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu > {,-m32}. Will be committed to 4.7 branch after [1] is committed to > mainline. > > [1] http://gcc.gnu.org/ml/gcc-patches/2012-09/msg00670.html Do we need to update driver-i386.c? -- H.J.
Re: [AArch64] fix missing Dwarf call frame information in the epilogue
On 09/12/2012 09:10 AM, Yufeng Zhang wrote: > aarch64_set_frame_expr (gen_rtx_SET > (Pmode, > stack_pointer_rtx, > -gen_rtx_PLUS (Pmode, stack_pointer_rtx, > +gen_rtx_PLUS (Pmode, cfa_reg, >GEN_INT (offset; We'd prefer to use plus_constant (Pmode, cfa_reg, offset) instead of the explicit call to gen_rtx_PLUS and GEN_INT. It would appear that the entire aarch64.c file ought to be audited for that. Also, use of the REG_CFA_* notes is strongly encouraged over use of REG_FRAME_RELATED_EXPR. There's all sorts of work involved in turning R_F_R_E into R_CFA_* notes, depending on a rather large state machine. This state machine was developed when only prologues were annotated for unwinding, and therefore one cannot expect it to work reliably for epilogues. A long-term goal is to convert all targets to use R_CFA_* exclusively, as that preserves much more information present in the structure of the code of the prologue generator. It means less work within the compiler, and eventually being able to remove a rather large hunk of state-machine code. r~
Re: [4.7 PATCH, i386]: Enable prefetchw for selected AMD targets
On Wed, Sep 12, 2012 at 8:30 PM, H.J. Lu wrote: >> Newer AMD processors does not enable 3dNOW anymore. However, prefetchw >> depends on TARGET_3DNOW, so it is not generated anymore. Following >> patch is a 4.7 version of mainline patch [1]. >> >> 2012-09-12 Uros Bizjak >> >> * config/i386/i386.h (x86_prefetchw): New global variable. >> (TARGET_PREFETCHW): New macro. >> * config/i386/i386.c (PTA_PREFETCHW): Ditto. >> (processor_alias_table): Add PTA_PREFETCHW to >> bdver1, bdver2 and btver1. >> (ix86_option_override_internal): Set x86_prefetchw for >> PTA_PREFETCHW targets. >> * config/i386/i386.md (prefetch): Expand to prefetchw >> for TARGET_PREFETCHW. >> (*prefetch_3dnow_): Also enable for TARGET_PREFETCHW. >> >> Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu >> {,-m32}. Will be committed to 4.7 branch after [1] is committed to >> mainline. >> >> [1] http://gcc.gnu.org/ml/gcc-patches/2012-09/msg00670.html > > Do we need to update driver-i386.c? No, the patch is not a backport of -mprfchw, but statically enables prefetchw for targets that have prefetchw, but don't define TARGET_3DNOW anymore. Uros.
Re: Backtrace library [1/3]
On Wed, Sep 12, 2012 at 9:23 AM, Joseph S. Myers wrote: > On Tue, 11 Sep 2012, Ian Lance Taylor wrote: > >> The configury is fairly standard. Note that libbacktrace is built as >> both a host library (to link into the compilers) and as a target library >> (to link into libgo and possibly other libraries). > > Under what circumstances will the library be built for the target - only > if a relevant language such as Go is being built, or unconditionally? My intent is to only build it when something needs it, e.g., libgo. I don't know if I've expressed that intent correctly. > If built unconditionally, will the library build OK for the target in > situations where no target headers are yet available? (For such builds of > compilers used to bootstrap libc it would be usual to use various > --disable- options to disable libraries not needed to build libc, and to > use --enable-languages=c, but if this library is used on the host side by > the compiler then the generic --disable-libbacktrace might not suffice > since that would disable the host copy, required by the compiler itself, > as well as the target copy.) The library does currently assume that a few header files are available, so building it would presumably break in this scenario. We could introduce --disable-target-libbacktrace easily enough. Ian
Re: Backtrace library [1/3]
On Wed, Sep 12, 2012 at 10:31 AM, Lawrence Crowl wrote: > > How about typing it as a pointer to an incomplete struct? > > extern void *backtrace_create_state (... > > becomes, e.g., > > struct backtrace_state; > extern backtrace_state *backtrace_create_state (... Yeah, that is probably the way to do it. I made that change. I won't bother to send around the patches again. Ian
[PATCH, i386]: Change x86_prefetch_sse to unsigned char
Hello! This will match "fake" bool type that is used throughout gcc sources. We can't use bool there, since the header is used in libgcc which doesn't include system.h 2012-09-12 Uros Bizjak * config/i386/i386.c (x86_prefetch_sse): Change to unsigned char. * config/i386/i386.h (x86_prefetch_sse): Ditto. Tested on x86_64-pc-linux-gnu {,-m32} and committed to mainline SVN. Uros. Index: config/i386/i386.c === --- config/i386/i386.c (revision 191226) +++ config/i386/i386.c (working copy) @@ -2517,8 +2517,8 @@ /* Which instruction set architecture to use. */ enum processor_type ix86_arch; -/* true if sse prefetch instruction is not NOOP. */ -int x86_prefetch_sse; +/* True if processor has SSE prefetch instruction. */ +unsigned char x86_prefetch_sse; /* -mstackrealign option */ static const char ix86_force_align_arg_pointer_string[] Index: config/i386/i386.h === --- config/i386/i386.h (revision 191224) +++ config/i386/i386.h (working copy) @@ -458,8 +458,7 @@ #define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387) -extern int x86_prefetch_sse; - +extern unsigned char x86_prefetch_sse; #define TARGET_PREFETCH_SSEx86_prefetch_sse #define ASSEMBLER_DIALECT (ix86_asm_dialect)
Re: Loop stride optimization hint
Jan Hubicka wrote: for Fortran one of common reason to inline is because array descriptor is known and defines loop stride. This patch makes ipa-inline-analysis to notice these cases. Thanks for your Fortran inlining work. + t(int s, void **p) + { + int i; + for (i;i<1;i+=s) + p[i]=0; + } + m(void **p) + { + t (10); Shouldn't that be t(10, p)? Tobias
Re: [PATCH] Combine location with block using block_locations
Attached is the memory consumption report for a very large source file. Looks like this patch actually reduced the memory consumption by 2%. Dehao On Thu, Sep 13, 2012 at 1:18 AM, Xinliang David Li wrote: > On Wed, Sep 12, 2012 at 10:05 AM, Dehao Chen wrote: >> There are two parts that needs memory management: >> >> 1. The BLOCK structure. This is managed by GC. I originally thought >> that removing blocks from tree.gsbase would paralyze GC. This turned >> out not to be a concern because DECL_INITIAL will still mark those >> used tree nodes. This patch may decrease the memory consumption by >> removing blocks from tree/gimple. However, as it makes more blocks >> become used, they also increase the memory consumption. > > You mean when you also make the location table GC root. > > Can you share the mem-stats information for the large program with and > without your patch? > > thanks, > > David > >> 2. The data structure in libcpp that maintains the hashtable for the >> location->block mapping. This is relatively minor because for the >> largest source I've seen, it only maintains less than 100K entries in >> the array (less than 1M total memory consumption). However, as it is a >> global data structure, it may make LTO unhappy. Honza is helping >> testing the memory consumption on LTO (but we first need to make this >> patch work for LTO). If the LTO result turns out ok, we probably don't >> want to put these under GC because: 1. it'll make things much more >> complicated. 2. using self managed memory is more efficient (as this >> is frequently used in many passes). 3. not using GC actually saves >> memory because even though the block is in the map, it can still be >> GCed as soon as it's not reachable from DECL_INITIAL. >> >> I've tested this on some very large C++ files (each one takes more >> than 10s to build), the memory consumption does not see noticeable >> increase/decrease. >> >> Thanks, >> Dehao >> >> On Wed, Sep 12, 2012 at 9:39 AM, Xinliang David Li >> wrote: >>> On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther >>> wrote: On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: > Now I think we are facing a more complex problem. The data structure > we use to store the location_adhoc_data are file-static in linemap.c > in libcpp. These data structures are not guarded by GTY(()). > Meanwhile, as we have removed the block data structure from > gimple.gsbase as well as tree.exp (encoding them into an location_t). > This could cause block being GCed and the LOCATION_BLOCK becoming > dangling pointers. Uh. Note that it is quite important that we are able to garbage-collect unused BLOCKs, this is the whole point of removing unused BLOCK scopes in remove_unused_locals. So this indeed becomes much more complicated ... What would be desired is that the garbage collector can NULL an entry in the mapping table when it is not referenced in any other way (that other reference would be the BLOCK tree as stored in a FUNCTION_DECLs DECL_INITIAL). >>> >>> It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS >>> are created for a large C++ program. This patch saves memory by >>> shrinking tree size, is it a net win or loss without GC those BLOCKS? >>> >>> thanks, >>> >>> David >>> >>> > I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from > gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can > help me. > > Another approach would be guard the location_adhoc_data and related > data structures in GTY(()). However, this is non-trivial because tree > is not visible in libcpp. At the same time, my implementation heavily > relies on hashtable to make the code efficient, thus it's quite tricky > to make "param_is" and "use_params" work. > > The final approach, which I'll try tomorrow, would be move all my > implementation from libcpp to gcc, and guard them with GTY(()). I > still haven't thought of any potential problem of this approach. Any > comments? I think moving the mapping to GC in a lazy manner as I described above would be the way to go. For hashtables GC already supports if_marked, not sure if similar support is available for arrays/vecs. Richard. > Thanks, > Dehao > > On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: >> I saw comments in tree-streamer-out.c: >> >> /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug >> information >> for early inlining so drop it on the floor instead of ICEing in >> dwarf2out.c. */ >> streamer_write_chain (ob, BLOCK_VARS (expr), ref_p); >> >> However, what the code is doing seemed contradictory with the comment. >> Or am I missing something? >> >> >> >> On Tue, Sep 11, 2012 at 8:32 AM, Michael Matz wrote: >>> Hi, >>> >>> On Tue, 11 Se
[PATCH] Fix PR gcov-profile/54487 (profiledbootstrap intermittent failures) (issue6496113)
This fixes PR gcov-profile/54487 where the gcda files were not locked by the profile-use read, enabling writes by other instrumented compiles to change the profile in the middle of the profile use read. The GCOV_LOCKED macro was not set because it was guarded by HOST_HAS_F_SETLKW, which was never set. The fix is to add a compile test in the configure to set it. Tested with bootstrap and profiledbootstrap on x86_64-unknown-linux-gnu. Ok for trunk? Thanks, Teresa 2012-09-12 Teresa Johnson * configure.ac(HOST_HAS_F_SETLKW): Set based on compile test using F_SETLKW with fcntl. * configure, config.in: Regenerate. Index: configure === --- configure (revision 191225) +++ configure (working copy) @@ -10731,6 +10731,41 @@ $as_echo "#define HAVE_CLOCK_T 1" >>confdefs.h fi +# Check if F_SETLKW is supported by fcntl. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for F_SETLKW" >&5 +$as_echo_n "checking for F_SETLKW... " >&6; } +if test "${ac_cv_f_setlkw+set}" = set; then : + $as_echo_n "(cached) " >&6 +else + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include "fcntl.h" + +int +main () +{ +struct flock fl; fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; fl.l_pid = 0; return fcntl (1, F_SETLKW, &fl); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_f_setlkw=yes +else + ac_cv_f_setlkw=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f_setlkw" >&5 +$as_echo "$ac_cv_f_setlkw" >&6; } +if test $ac_cv_f_setlkw = yes; then + +$as_echo "#define HOST_HAS_F_SETLKW 1" >>confdefs.h + +fi + # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. CFLAGS="$saved_CFLAGS" CXXFLAGS="$saved_CXXFLAGS" @@ -17742,7 +1,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 17745 "configure" +#line 17780 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -17848,7 +17883,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 17851 "configure" +#line 17886 "configure" #include "confdefs.h" #if HAVE_DLFCN_H Index: config.in === --- config.in (revision 191225) +++ config.in (working copy) @@ -1600,6 +1600,12 @@ #endif +/* Define if F_SETLKW supported by fcntl. */ +#ifndef USED_FOR_TARGET +#undef HOST_HAS_F_SETLKW +#endif + + /* Define as const if the declaration of iconv() needs const. */ #ifndef USED_FOR_TARGET #undef ICONV_CONST Index: configure.ac === --- configure.ac(revision 191225) +++ configure.ac(working copy) @@ -1159,6 +1159,16 @@ if test $gcc_cv_type_clock_t = yes; then [Define if defines clock_t.]) fi +# Check if F_SETLKW is supported by fcntl. +AC_CACHE_CHECK(for F_SETLKW, ac_cv_f_setlkw, [ +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#include "fcntl.h" +]], [[struct flock fl; fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; fl.l_pid = 0; return fcntl (1, F_SETLKW, &fl);]])],[ac_cv_f_setlkw=yes],[ac_cv_f_setlkw=no])]) +if test $ac_cv_f_setlkw = yes; then + AC_DEFINE(HOST_HAS_F_SETLKW, 1, + [Define if F_SETLKW supported by fcntl.]) +fi + # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. CFLAGS="$saved_CFLAGS" CXXFLAGS="$saved_CXXFLAGS" -- This patch is available for review at http://codereview.appspot.com/6496113
Re: [PATCH] Fix PR gcov-profile/54487 (profiledbootstrap intermittent failures) (issue6496113)
On Wed, Sep 12, 2012 at 01:45:12PM -0700, Teresa Johnson wrote: > This fixes PR gcov-profile/54487 where the gcda files were not locked > by the profile-use read, enabling writes by other instrumented compiles > to change the profile in the middle of the profile use read. The GCOV_LOCKED > macro was not set because it was guarded by HOST_HAS_F_SETLKW, which was > never set. The fix is to add a compile test in the configure to set it. > > Tested with bootstrap and profiledbootstrap on x86_64-unknown-linux-gnu. > Ok for trunk? > > Thanks, > Teresa > > 2012-09-12 Teresa Johnson > Please include PR gcov-profile/54487 here in the ChangeLog entry. > * configure.ac(HOST_HAS_F_SETLKW): Set based on compile Space before (. > test using F_SETLKW with fcntl. > * configure, config.in: Regenerate. > > --- configure.ac (revision 191225) > +++ configure.ac (working copy) > @@ -1159,6 +1159,16 @@ if test $gcc_cv_type_clock_t = yes; then >[Define if defines clock_t.]) > fi > > +# Check if F_SETLKW is supported by fcntl. > +AC_CACHE_CHECK(for F_SETLKW, ac_cv_f_setlkw, [ > +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ > +#include "fcntl.h" Please use #include instead. > +]], [[struct flock fl; fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; > fl.l_pid = 0; return fcntl (1, F_SETLKW, > &fl);]])],[ac_cv_f_setlkw=yes],[ac_cv_f_setlkw=no])]) And split this overlong line, there is no reason why you can't use a newline e.g. after every ; in the test proglet. > +if test $ac_cv_f_setlkw = yes; then > + AC_DEFINE(HOST_HAS_F_SETLKW, 1, > + [Define if F_SETLKW supported by fcntl.]) > +fi > + > # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. > CFLAGS="$saved_CFLAGS" > CXXFLAGS="$saved_CXXFLAGS" Ok for trunk with those changes, IMHO it would be worthwhile to put this into 4.7 too, I've seen several unexplained profiledbootstrap errors on that branch in the past already when using make -jN with high N. Jakub
Re: [PATCH] Combine location with block using block_locations
For the largest bucket (size==80), the size reduction is 20%. Not bad. David On Wed, Sep 12, 2012 at 1:44 PM, Dehao Chen wrote: > Attached is the memory consumption report for a very large source > file. Looks like this patch actually reduced the memory consumption by > 2%. > > Dehao > > On Thu, Sep 13, 2012 at 1:18 AM, Xinliang David Li wrote: >> On Wed, Sep 12, 2012 at 10:05 AM, Dehao Chen wrote: >>> There are two parts that needs memory management: >>> >>> 1. The BLOCK structure. This is managed by GC. I originally thought >>> that removing blocks from tree.gsbase would paralyze GC. This turned >>> out not to be a concern because DECL_INITIAL will still mark those >>> used tree nodes. This patch may decrease the memory consumption by >>> removing blocks from tree/gimple. However, as it makes more blocks >>> become used, they also increase the memory consumption. >> >> You mean when you also make the location table GC root. >> >> Can you share the mem-stats information for the large program with and >> without your patch? >> >> thanks, >> >> David >> >>> 2. The data structure in libcpp that maintains the hashtable for the >>> location->block mapping. This is relatively minor because for the >>> largest source I've seen, it only maintains less than 100K entries in >>> the array (less than 1M total memory consumption). However, as it is a >>> global data structure, it may make LTO unhappy. Honza is helping >>> testing the memory consumption on LTO (but we first need to make this >>> patch work for LTO). If the LTO result turns out ok, we probably don't >>> want to put these under GC because: 1. it'll make things much more >>> complicated. 2. using self managed memory is more efficient (as this >>> is frequently used in many passes). 3. not using GC actually saves >>> memory because even though the block is in the map, it can still be >>> GCed as soon as it's not reachable from DECL_INITIAL. >>> >>> I've tested this on some very large C++ files (each one takes more >>> than 10s to build), the memory consumption does not see noticeable >>> increase/decrease. >>> >>> Thanks, >>> Dehao >>> >>> On Wed, Sep 12, 2012 at 9:39 AM, Xinliang David Li >>> wrote: On Wed, Sep 12, 2012 at 2:13 AM, Richard Guenther wrote: > On Wed, Sep 12, 2012 at 7:06 AM, Dehao Chen wrote: >> Now I think we are facing a more complex problem. The data structure >> we use to store the location_adhoc_data are file-static in linemap.c >> in libcpp. These data structures are not guarded by GTY(()). >> Meanwhile, as we have removed the block data structure from >> gimple.gsbase as well as tree.exp (encoding them into an location_t). >> This could cause block being GCed and the LOCATION_BLOCK becoming >> dangling pointers. > > Uh. Note that it is quite important that we are able to garbage-collect > unused > BLOCKs, this is the whole point of removing unused BLOCK scopes in > remove_unused_locals. So this indeed becomes much more complicated ... > What would be desired is that the garbage collector can NULL an entry in > the mapping table when it is not referenced in any other way (that other > reference would be the BLOCK tree as stored in a FUNCTION_DECLs > DECL_INITIAL). It would be nice to GC those unused BLOCKS. I wonder how many BLOCKS are created for a large C++ program. This patch saves memory by shrinking tree size, is it a net win or loss without GC those BLOCKS? thanks, David > >> I tried to manipulate GTY to make it recognize the LOCATION_BLOCK from >> gimple.gsbase.location. However, neigher nested_ptr nor mark_hook can >> help me. >> >> Another approach would be guard the location_adhoc_data and related >> data structures in GTY(()). However, this is non-trivial because tree >> is not visible in libcpp. At the same time, my implementation heavily >> relies on hashtable to make the code efficient, thus it's quite tricky >> to make "param_is" and "use_params" work. >> >> The final approach, which I'll try tomorrow, would be move all my >> implementation from libcpp to gcc, and guard them with GTY(()). I >> still haven't thought of any potential problem of this approach. Any >> comments? > > I think moving the mapping to GC in a lazy manner as I described above > would be the way to go. For hashtables GC already supports if_marked, > not sure if similar support is available for arrays/vecs. > > Richard. > >> Thanks, >> Dehao >> >> On Tue, Sep 11, 2012 at 9:00 AM, Dehao Chen wrote: >>> I saw comments in tree-streamer-out.c: >>> >>> /* Do not stream BLOCK_SOURCE_LOCATION. We cannot handle debug >>> information >>> for early inlining so drop it on the floor instead of ICEing in >>> dwarf2out.c. */ >>> streamer_write_chain (ob, BLOCK_VARS (expr), ref_
Re: [PATCH] Prevent cselib substitution of FP, SP, SFP
Hi Jakub The same problem also affects gcc4.6, http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54398. Could this be ported to 4.6 branch? thanks Carrot On Mon, Feb 13, 2012 at 11:54 AM, Jakub Jelinek wrote: > > On Wed, Jan 04, 2012 at 05:21:38PM +, Marcus Shawcroft wrote: > > Alias analysis by DSE based on CSELIB expansion assumes that > > references to the stack frame from different base registers (ie FP, SP) > > never alias. > > > > The comment block in cselib explains that cselib does not allow > > substitution of FP, SP or SFP specifically in order not to break DSE. > > Looks reasonable, appart from coding style (no spaces around -> and > no {} around return p->loc;), I just wonder if having a separate > loop in expand_loc just for this isn't too expensive. On sane targets > IMHO hard frame pointer in the prologue should be initialized from sp, not > the other way around, thus hard frame pointer based VALUEs should have > hard frame pointer earlier in the locs list (when there is > hfp = sp (+ optionally some const) > insn, we first cselib_lookup_from_insn the rhs and add to locs > of the new VALUE (plus (VALUE of sp) (const_int)), then process the > lhs and add it to locs, moving the plus to locs->next). > So I think the following patch could be enough (bootstrapped/regtested > on x86_64-linux and i686-linux). > There is AVR though, which has really weirdo prologue - PR50063, > but I think it should just use UNSPEC for that or something similar, > setting sp from hfp seems unnecessary and especially for values with long > locs chains could make cselib more expensive. > > Richard, what do you think about this? > > 2012-02-13 Jakub Jelinek > > * cselib.c (expand_loc): Return sp, fp, hfp or cfa base reg right > away if seen. > > --- gcc/cselib.c.jj 2012-02-13 11:07:15.0 +0100 > +++ gcc/cselib.c2012-02-13 18:15:17.531776145 +0100 > @@ -1372,8 +1372,18 @@ expand_loc (struct elt_loc_list *p, stru >unsigned int regno = UINT_MAX; >struct elt_loc_list *p_in = p; > > - for (; p; p = p -> next) > + for (; p; p = p->next) > { > + /* Return these right away to avoid returning stack pointer based > +expressions for frame pointer and vice versa, which is something > +that would confuse DSE. See the comment in > cselib_expand_value_rtx_1 > +for more details. */ > + if (REG_P (p->loc) > + && (REGNO (p->loc) == STACK_POINTER_REGNUM > + || REGNO (p->loc) == FRAME_POINTER_REGNUM > + || REGNO (p->loc) == HARD_FRAME_POINTER_REGNUM > + || REGNO (p->loc) == cfa_base_preserved_regno)) > + return p->loc; >/* Avoid infinite recursion trying to expand a reg into a > the same reg. */ >if ((REG_P (p->loc)) > > > Jakub
Re: Backtrace library [1/3]
On Wed, 12 Sep 2012, Ian Lance Taylor wrote: > On Wed, Sep 12, 2012 at 9:23 AM, Joseph S. Myers > wrote: > > On Tue, 11 Sep 2012, Ian Lance Taylor wrote: > > > >> The configury is fairly standard. Note that libbacktrace is built as > >> both a host library (to link into the compilers) and as a target library > >> (to link into libgo and possibly other libraries). > > > > Under what circumstances will the library be built for the target - only > > if a relevant language such as Go is being built, or unconditionally? > > My intent is to only build it when something needs it, e.g., libgo. I > don't know if I've expressed that intent correctly. I think that if a library is listed in target_libs in config-lang.in for at least one language, and if all languages with it so listed are disabled, then that library will be disabled - but if no languages list it in config-lang.in, it will be enabled by default. -- Joseph S. Myers jos...@codesourcery.com
Re: [PATCH] Fix PR gcov-profile/54487 (profiledbootstrap intermittent failures) (issue6496113)
On Wed, Sep 12, 2012 at 1:54 PM, Jakub Jelinek wrote: > On Wed, Sep 12, 2012 at 01:45:12PM -0700, Teresa Johnson wrote: >> This fixes PR gcov-profile/54487 where the gcda files were not locked >> by the profile-use read, enabling writes by other instrumented compiles >> to change the profile in the middle of the profile use read. The GCOV_LOCKED >> macro was not set because it was guarded by HOST_HAS_F_SETLKW, which was >> never set. The fix is to add a compile test in the configure to set it. >> >> Tested with bootstrap and profiledbootstrap on x86_64-unknown-linux-gnu. >> Ok for trunk? >> >> Thanks, >> Teresa >> >> 2012-09-12 Teresa Johnson >> > > Please include > PR gcov-profile/54487 > here in the ChangeLog entry. > >> * configure.ac(HOST_HAS_F_SETLKW): Set based on compile > > Space before (. > >> test using F_SETLKW with fcntl. >> * configure, config.in: Regenerate. >> >> --- configure.ac (revision 191225) >> +++ configure.ac (working copy) >> @@ -1159,6 +1159,16 @@ if test $gcc_cv_type_clock_t = yes; then >>[Define if defines clock_t.]) >> fi >> >> +# Check if F_SETLKW is supported by fcntl. >> +AC_CACHE_CHECK(for F_SETLKW, ac_cv_f_setlkw, [ >> +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ >> +#include "fcntl.h" > > Please use > #include > instead. > >> +]], [[struct flock fl; fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; >> fl.l_pid = 0; return fcntl (1, F_SETLKW, >> &fl);]])],[ac_cv_f_setlkw=yes],[ac_cv_f_setlkw=no])]) > > And split this overlong line, there is no reason why you can't use a newline > e.g. after every ; in the test proglet. > >> +if test $ac_cv_f_setlkw = yes; then >> + AC_DEFINE(HOST_HAS_F_SETLKW, 1, >> + [Define if F_SETLKW supported by fcntl.]) >> +fi >> + >> # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. >> CFLAGS="$saved_CFLAGS" >> CXXFLAGS="$saved_CXXFLAGS" > > Ok for trunk with those changes, IMHO it would be worthwhile to put this > into 4.7 too, I've seen several unexplained profiledbootstrap errors on > that branch in the past already when using make -jN with high N. Ok, thanks. I will fix the issues you pointed out above and retest before committing. I'll prepare a backport patch for 4.7 as well. Teresa > > Jakub -- Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413
Re: shrink-wrapping duplicates BBs across partitions.
On Wed, Sep 12, 2012 at 4:47 AM, Christian Bruel wrote: > The problem stems from tree-ssa-tail-merge that breaks bb->count, The > CFG looks like > > 2 >/ \ > /6 > 5 (0) | > | 3 <- > |/ \ | > | 7 (1) 8 - > | / > 4 (1) > > (in parenthesis the bb->count from gcov) > > 2 >/ \ > /6 > / | > | 3 <-- > | / | | > 5 (0) 8 -- > | > | > 4 (1) > > so 5 and 4 are now in different partitions, producing an assertion > because there is no EDGE_CROSSING between them. > > I can see 3 solutions to this > > 1) merge the BB counts in tree-ssa-tail-merge.c, so 5 is in the same > partition that 4 This looks correct as we already do that for the frequency. > > 2) don't tail-merge blocks that belong to different partitions. I don't think you detect this on the tree level as the partitioning has not happened yet. Thanks, Andrew Pinski > > 3) add a EDGE_CROSSING flag on the edge between 4 and 5. > > 1) fixes the problem, so 5 and 4 are now in the same partition. The fix > is quite trivial, as with attached. > > the other solution 2) is more conservative, and also fixes the problem. > > I don't think 3) is necessary. > > more ideas ? > > thanks, > > Christian > > > On 09/11/2012 06:21 PM, Jakub Jelinek wrote: >> On Tue, Sep 11, 2012 at 05:40:30PM +0200, Steven Bosscher wrote: >>> On Tue, Sep 11, 2012 at 5:31 PM, Christian Bruel >>> wrote: Actually, the edge is fairly simple. I have BB5 (BB_COLD_PARTITION) -> BB10 (BB_HOT_PARTITION) -> EXIT and BB10 has no other incoming edges. and we are duplicating it. >>> >>> That is wrong, should never happen. Is there a test case to play with? >>> It'd be good to have a PR for this. >> >> Isn't that the standard case when !HAVE_return ? Then you can have only a >> single return through epilogue, and when the epilogue is in the hot >> partition, even if cold code is returning, it needs to jump to the epilogue. >> >> Jakub >>
Loop stride inline hint
Hi, this patch makes inliner to realize that it is good idea to inline when loop stride becomes constant. This is mostly to help fortran testcases where it is important to inline to get array descriptors. Bootstrapped/regtested x86_64-linux, comitted. Honza * ipa-inline-analysis.c (dump_inline_hints): Dump loop stride. (set_hint_predicate): New function. (reset_inline_summary): Reset loop stride. (remap_predicate_after_duplication): New function. (remap_hint_predicate_after_duplication): New function. (inline_node_duplication_hook): Update. (dump_inline_summary): Dump stride summaries. (estimate_function_body_sizes): Compute strides. (remap_hint_predicate): New function. (inline_merge_summary): Use it. (inline_read_section): Read stride. (inline_write_summary): Write stride. * ipa-inline.c (want_inline_small_function_p): Handle strides. (edge_badness): Likewise. * ipa-inline.h (inline_hints_vals): Add stride hint. (inline_summary): Update stride. * gcc.dg/ipa/inlinehint-2.c: New testcase. Index: ipa-inline.c === *** ipa-inline.c(revision 191228) --- ipa-inline.c(working copy) *** want_inline_small_function_p (struct cgr *** 481,487 else if (DECL_DECLARED_INLINE_P (callee->symbol.decl) && growth >= MAX_INLINE_INSNS_SINGLE && !(hints & (INLINE_HINT_indirect_call !| INLINE_HINT_loop_iterations))) { e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; want_inline = false; --- 481,488 else if (DECL_DECLARED_INLINE_P (callee->symbol.decl) && growth >= MAX_INLINE_INSNS_SINGLE && !(hints & (INLINE_HINT_indirect_call !| INLINE_HINT_loop_iterations !| INLINE_HINT_loop_stride))) { e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; want_inline = false; *** want_inline_small_function_p (struct cgr *** 533,539 inlining given function is very profitable. */ else if (!DECL_DECLARED_INLINE_P (callee->symbol.decl) && growth >= ((hints & (INLINE_HINT_indirect_call ! | INLINE_HINT_loop_iterations)) ? MAX (MAX_INLINE_INSNS_AUTO, MAX_INLINE_INSNS_SINGLE) : MAX_INLINE_INSNS_AUTO)) --- 534,541 inlining given function is very profitable. */ else if (!DECL_DECLARED_INLINE_P (callee->symbol.decl) && growth >= ((hints & (INLINE_HINT_indirect_call ! | INLINE_HINT_loop_iterations ! | INLINE_HINT_loop_stride)) ? MAX (MAX_INLINE_INSNS_AUTO, MAX_INLINE_INSNS_SINGLE) : MAX_INLINE_INSNS_AUTO)) *** edge_badness (struct cgraph_edge *edge, *** 866,872 fprintf (dump_file, "Badness overflow\n"); } if (hints & (INLINE_HINT_indirect_call ! | INLINE_HINT_loop_iterations)) badness /= 8; if (dump) { --- 868,875 fprintf (dump_file, "Badness overflow\n"); } if (hints & (INLINE_HINT_indirect_call ! | INLINE_HINT_loop_iterations ! | INLINE_HINT_loop_stride)) badness /= 8; if (dump) { Index: ipa-inline.h === *** ipa-inline.h(revision 191228) --- ipa-inline.h(working copy) *** typedef struct GTY(()) condition *** 46,52 They are represtented as bitmap of the following values. */ enum inline_hints_vals { INLINE_HINT_indirect_call = 1, ! INLINE_HINT_loop_iterations = 2 }; typedef int inline_hints; --- 46,53 They are represtented as bitmap of the following values. */ enum inline_hints_vals { INLINE_HINT_indirect_call = 1, ! INLINE_HINT_loop_iterations = 2, ! INLINE_HINT_loop_stride = 4 }; typedef int inline_hints; *** struct GTY(()) inline_summary *** 120,128 conditions conds; VEC(size_time_entry,gc) *entry; ! /* Predicate on when some loop in the function sbecomes to have known bounds. */ struct predicate * GTY((skip)) loop_iterations; }; --- 121,132 conditions conds; VEC(size_time_entry,gc) *entry; ! /* Predicate on when some loop in the function becomes to have known bounds. */ struct predicate * GTY((skip)) loop_iterations; + /* Predicate on when some loop in the function becomes to have known + stride. */ + st
Re: Loop stride optimization hint
Hi, On Wed, Sep 12, 2012 at 07:57:16PM +0200, Jan Hubicka wrote: > Hi, > for Fortran one of common reason to inline is because array descriptor is > known and defines > loop stride. This patch makes ipa-inline-analysis to notice these cases. > > Bootstrapped/regtested x86_64-linux, will commit it tomorrow if there are no > complains. > > Honza > > * ipa-inline-analysis.c (dump_inline_hints): Dump loop stride. > (set_hint_predicate): New function. > (reset_inline_summary): Reset loop stride. > (remap_predicate_after_duplication): New function. > (remap_hint_predicate_after_duplication): New function. > (inline_node_duplication_hook): Update. > (dump_inline_summary): Dump stride summaries. > (estimate_function_body_sizes): Compute strides. > (remap_hint_predicate): New function. > (inline_merge_summary): Use it. > (inline_read_section): Read stride. > (inline_write_summary): Write stride. > * ipa-inline.c (want_inline_small_function_p): Handle strides. > (edge_badness): Likewise. > * ipa-inline.h (inline_hints_vals): Add stride hint. > (inline_summary): Update stride. > > * gcc.dg/ipa/inlinehint-2.c: New testcase. ... > *** estimate_function_body_sizes (struct cgr > *** 2390,2395 > --- 2442,2448 > struct loop *loop; > loop_iterator li; > predicate loop_iterations = true_predicate (); > + predicate loop_stride = true_predicate (); > > if (dump_file && (dump_flags & TDF_DETAILS)) > flow_loops_dump (dump_file, NULL, 0); > *** estimate_function_body_sizes (struct cgr > *** 2398,2405 > { > VEC (edge, heap) *exits; > edge ex; > ! unsigned int j; > struct tree_niter_desc niter_desc; > > exits = get_loop_exit_edges (loop); > FOR_EACH_VEC_ELT (edge, exits, j, ex) > --- 2451,2459 > { > VEC (edge, heap) *exits; > edge ex; > ! unsigned int j, i; > struct tree_niter_desc niter_desc; > + basic_block *body = get_loop_body (loop); > > exits = get_loop_exit_edges (loop); > FOR_EACH_VEC_ELT (edge, exits, j, ex) > *** estimate_function_body_sizes (struct cgr > *** 2416,2427 > loop_iterations = and_predicates (info->conds, > &loop_iterations, &will_be_nonconstant); > } > VEC_free (edge, heap, exits); > } > ! if (!true_predicate_p (&loop_iterations)) > ! { > ! inline_summary (node)->loop_iterations = (struct predicate > *)pool_alloc (edge_predicate_pool); > ! *inline_summary (node)->loop_iterations = loop_iterations; > ! } > scev_finalize (); > } > inline_summary (node)->self_time = time; > --- 2470,2508 > loop_iterations = and_predicates (info->conds, > &loop_iterations, &will_be_nonconstant); > } > VEC_free (edge, heap, exits); > + > + for (i = 0; i < loop->num_nodes; i++) > + { > + gimple_stmt_iterator gsi; > + for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next > (&gsi)) > + { > + gimple stmt = gsi_stmt (gsi); > + affine_iv iv; > + ssa_op_iter iter; > + tree use; > + > + FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE) > + { > + predicate will_be_nonconstant; > + > + if (!simple_iv (loop, loop_containing_stmt (stmt), use, > &iv, true) > + || is_gimple_min_invariant (iv.step)) > + continue; > + will_be_nonconstant > += will_be_nonconstant_expr_predicate (parms_info, info, > + iv.step, > nonconstant_names); > + if (!true_predicate_p (&will_be_nonconstant) > + && !false_predicate_p (&will_be_nonconstant)) > + /* This is slightly inprecise. We may want to > represent each loop with > +independent predicate. */ > + loop_stride = and_predicates (info->conds, > &loop_stride, &will_be_nonconstant); > + } > + } > + } > + free (body); > } > ! set_hint_predicate (&inline_summary (node)->loop_iterations, > loop_iterations); > ! set_hint_predicate (&inline_summary (node)->loop_stride, loop_stride); > scev_finalize (); > } > inline_summary (node)->self_time = time; Well, I know i's not that important but some lines are clearly wider than 80 characters and makes it more difficult to read for people like me. In fact, the already committed loop_iterations computation also often exceeds the limit. I have also been wondering whether we
Re: [PATCH] limited C++ parsing support for gengtype
On Wed, Sep 12, 2012 at 4:54 PM, Aaron Gray wrote: > On 11 September 2012 23:45, Gabriel Dos Reis > wrote: >> On Tue, Sep 11, 2012 at 3:41 PM, Diego Novillo wrote: >> @@ -778,6 +791,7 @@ type (options_p *optsp, bool nested) return resolve_typedef (s, &lexer_line); case STRUCT: +case CLASS: >>> >>> >>> I think that as far as gengtype is concerned, 'struct' and 'class' should be >>> exactly the same thing. So, all the handling for 'CLASS' you added should >>> not be needed. >> >> >> 100% agreed. > > The reason I included a CLASS type distinct from STRUCT was for > reporting debugging information when I get to that stage. In general, patches are easier to assess when they are self-contained (e.g. each change is justified by the purpose of the patch.) My recommendation would be for your patches to focus on one topic at a time. -- Gaby
Re: Loop stride optimization hint
> > ! set_hint_predicate (&inline_summary (node)->loop_iterations, > > loop_iterations); > > ! set_hint_predicate (&inline_summary (node)->loop_stride, > > loop_stride); > > scev_finalize (); > > } > > inline_summary (node)->self_time = time; > > Well, I know i's not that important but some lines are clearly wider > than 80 characters and makes it more difficult to read for people like > me. In fact, the already committed loop_iterations computation also > often exceeds the limit. Yeah, I will reindent that. > > I have also been wondering whether we really want to and all the > different will_be_nonconstant predicates to produce the final hint > predicates. IIUC we won't get the hint unless all loop > iterations/strides are known. The loop iterations predicate in the > fairly simple pr48636.f90 is: > > loop iterations:(op0[ref offset: 192] changed || op0[ref offset: 256] > changed || op0[ref offset: 224] changed) && (op0[ref offset: 96] changed || > op0[ref offset: 160] changed || op0[ref offset: 128] changed) > > With predicates like these, it's going to be very difficult for IPA-CP > to put together a combination of known values and aggregate contents > for just some contexts so that we hit the sweet spot. The current > approach of trying one known context-specific value at a time > certainly will not work in real cases, at the same time, it is really > built around independent assessments of the values... Well, I intentionally made the code to make it easy to drop multiple predicates, i.e. simply replace current predicate pointer by vector of independent predicates, so each loop can be handled independently. In the case of pr48636.f90 I would however expect that we should be able to figure out tha tthe whole loop descriptor is constant and match the above as true? Honza
[PING^2] C++ conversion - pull in cstdlib
Hello, On Sun, 2012-09-02 at 01:19 +0200, Oleg Endo wrote: > On Sat, 2012-09-01 at 18:25 +0200, Oleg Endo wrote: > > On Sat, 2012-09-01 at 16:17 +, Joseph S. Myers wrote: > > > On Sat, 1 Sep 2012, Oleg Endo wrote: > > > > > > > Ping! > > > > > > > > This allows one to include e.g. in GCC source files. > > > > Since the switch to C++ has been made, this should be OK to do now, I > > > > guess. > > > > > > This is not a review, but have you tested building the Ada front end with > > > this patch applied? Given recent issues relating to how Ada uses > > > system.h, I think any such changes need testing for Ada. > > > > > > > No I haven't. C and C++ only. Good to know, thanks. Will try. > > > > OK, now I have. ada, c, c++, fortran, go, java, objc, obj-c++ do build > here. > Would it be OK to install the patch originally posted here: http://gcc.gnu.org/ml/gcc-patches/2012-08/msg01761.html ? If not OK, it's also fine. Just let me know. It seems the issue can be worked around in individual source files by including before "system.h" (as it is already done in config/sh/sh.c). Cheers, Oleg
Re: [patch] Finish double_int conversion.
> Index: gcc/config/sparc/sparc.c > === > --- gcc/config/sparc/sparc.c (revision 191083) > +++ gcc/config/sparc/sparc.c (working copy) > @@ -10113,33 +10113,27 @@ sparc_fold_builtin (tree fndecl, int n_a > && TREE_CODE (arg1) == VECTOR_CST > && TREE_CODE (arg2) == INTEGER_CST) > { > - int overflow = 0; > - unsigned HOST_WIDE_INT low = TREE_INT_CST_LOW (arg2); > - HOST_WIDE_INT high = TREE_INT_CST_HIGH (arg2); > + bool overflow = false; > + double_int di_arg2 = TREE_INT_CST (arg2); > unsigned i; > > for (i = 0; i < VECTOR_CST_NELTS (arg0); ++i) > { > - unsigned HOST_WIDE_INT > - low0 = TREE_INT_CST_LOW (VECTOR_CST_ELT (arg0, i)), > - low1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (arg1, i)); > - HOST_WIDE_INT > - high0 = TREE_INT_CST_HIGH (VECTOR_CST_ELT (arg0, i)); > - HOST_WIDE_INT > - high1 = TREE_INT_CST_HIGH (VECTOR_CST_ELT (arg1, i)); > + double_int e0 = TREE_INT_CST (VECTOR_CST_ELT (arg0, i)); > + double_int e1 = TREE_INT_CST (VECTOR_CST_ELT (arg1, i)); > > - unsigned HOST_WIDE_INT l; > - HOST_WIDE_INT h; > + bool neg1_ovf, neg2_ovf, add1_ovf, add2_ovf; > > - overflow |= neg_double (low1, high1, &l, &h); > - overflow |= add_double (low0, high0, l, h, &l, &h); > - if (h < 0) > - overflow |= neg_double (l, h, &l, &h); > + double_int tmp = e1.neg_with_overflow (&neg1_ovf); > + tmp = e0.add_with_sign (tmp, false, &add1_ovf); > + if (tmp.is_negative ()) > + tmp = tmp.neg_with_overflow (&neg2_ovf); > > - overflow |= add_double (low, high, l, h, &low, &high); > + tmp = di_arg2.add_with_sign (tmp, false, &add2_ovf); > + overflow |= neg1_ovf | neg2_ovf | add1_ovf | add2_ovf; > } > > - gcc_assert (overflow == 0); > + gcc_assert (!overflow); > > return build_int_cst_wide (rtype, low, high); > } This cannot build because of the references to low and high in the last line. As Richard said, building a cross cc1 is very easy. -- Eric Botcazou
Re: Loop stride optimization hint
> > > > loop iterations:(op0[ref offset: 192] changed || op0[ref offset: 256] > > changed || op0[ref offset: 224] changed) && (op0[ref offset: 96] changed || > > op0[ref offset: 160] changed || op0[ref offset: 128] changed) Looking at the testcase, loop stride is (op0[ref offset: 384] changed) && (op0[ref offset: 192] changed) that is more trackable ;) The problem with iterations is simply that they verify that ubound,hbound and stride is known. Also the individual loops are combined by and_predicates. We hint when one of loops become to have known stride or iterations. We lose track then and do not hint for further loops on subsequent inlining. I am not sure how important it is - for fortran I guess the first inlining matter the most. Honza
Re: [SH] Correct address cost estimations
Oleg Endo wrote: > This corrects the address cost estimations for SH. > Tested on rev 191161 with > make -k check RUNTESTFLAGS="--target_board=sh-sim > \{-m2/-ml,-m2/-mb,-m2a/-mb,-m4/-ml,-m4/-mb,-m4a/-ml,-m4a/-mb}" > > and no new failures. > With this applied CSiBE shows a total size decrease of 4668 bytes for > '-O2 -m4-single -ml -mpretend-cmove'. > > OK? OK. Regards, kaz
Re: [patch] Fix PR rtl-optimization/54290
> Bootstrapped/regtested on x86_64-suse-linux. Does that look plausible? Do > we want to fix this on release branches as well? > > > 2012-09-02 Eric Botcazou > > PR rtl-optimization/54290 > * reload1.c (choose_reload_regs): Also take into account secondary MEMs > to remove address replacements for inherited reloads. Ulrich, would you mind taking a look when you have some time? TIA. http://gcc.gnu.org/ml/gcc-patches/2012-09/msg00037.html -- Eric Botcazou
Re: Scheduler: Allow breaking dependencies by modifying patterns
On 12/09/2012, at 4:34 AM, Vladimir Makarov wrote: > On 08/03/2012 08:05 AM, Bernd Schmidt wrote: >> ... > Ok, thanks. The changes are pretty straightforward. Only just a few > comments. > ... > > Thanks for the patch, Bernd. Sorry for the delay with the review. I thought > that Maxim writes his comments first. I reviewed the patch in http://gcc.gnu.org/ml/gcc-patches/2012-08/msg01028.html (comments were similar to yours), but didn't CC you. Anyway, thanks for the review! -- Maxim Kuvyrkov CodeSourcery / Mentor Graphics
[google/gcc-4_7] Fix problem with asm spec for -gsplit-dwarf
2012-09-12 Cary Coutant gcc/ * gcc.c (replace_extension_spec_func): Restrict search for extension to last component of path. Index: gcc/gcc.c === --- gcc/gcc.c (revision 191233) +++ gcc/gcc.c (working copy) @@ -8413,12 +8413,18 @@ replace_extension_spec_func (int argc, c char *name; char *p; char *result; + int i; if (argc != 2) fatal_error ("too few arguments to %%:replace-extension"); name = xstrdup (argv[0]); - p = strrchr (name, '.'); + + for (i = strlen(name) - 1; i >= 0; i--) +if (IS_DIR_SEPARATOR (name[i])) + break; + + p = strrchr (name + i + 1, '.'); if (p != NULL) *p = '\0';
Re: [google/gcc-4_7] Fix problem with asm spec for -gsplit-dwarf
On Wed, Sep 12, 2012 at 4:18 PM, Cary Coutant wrote: > 2012-09-12 Cary Coutant > > gcc/ > * gcc.c (replace_extension_spec_func): Restrict search for > extension to last component of path. > > Index: gcc/gcc.c > === > --- gcc/gcc.c (revision 191233) > +++ gcc/gcc.c (working copy) > @@ -8413,12 +8413,18 @@ replace_extension_spec_func (int argc, c >char *name; >char *p; >char *result; > + int i; > >if (argc != 2) > fatal_error ("too few arguments to %%:replace-extension"); > >name = xstrdup (argv[0]); > - p = strrchr (name, '.'); > + > + for (i = strlen(name) - 1; i >= 0; i--) > +if (IS_DIR_SEPARATOR (name[i])) > + break; > + > + p = strrchr (name + i + 1, '.'); >if (p != NULL) >*p = '\0'; > This is OK for Google 4.7. Sterling
Re: [google/gcc-4_7] Fix problem with asm spec for -gsplit-dwarf
>> 2012-09-12 Cary Coutant >> >> gcc/ >> * gcc.c (replace_extension_spec_func): Restrict search for >> extension to last component of path. > > This is OK for Google 4.7. Thanks, committed as r191234. -cary
Re: [C PATCH] Avoid ICEs due to save_expr instead of c_save_expr (PR c/54428)
On Fri, Aug 31, 2012 at 5:36 AM, Jakub Jelinek wrote: > Hi! > > This is another case of the issue that save_expr shouldn't be called > when parsing C (except for in_late_binary_op), but c_save_expr must be > called instead. > > I wonder if we shouldn't add a langhook, which would do > if (in_late_binary_op) save_expr_1 else c_save_expr > and ensure that when not parsing in_late_binary_op is set or something > similar, and if non-NULL, call the langhook from save_expr. > > Anyway, this patch fixes this issue even without such changes, > bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/4.7? > > 2012-08-31 Jakub Jelinek > > PR c/54428 > * c-convert.c (convert): Don't call fold_convert_loc if > TYPE_MAIN_VARIANT of a COMPLEX_TYPE is the same, unless e > is a COMPLEX_EXPR. Remove TYPE_MAIN_VARIANT check from > COMPLEX_TYPE -> COMPLEX_TYPE conversion. > > * gcc.c-torture/compile/pr54428.c: New test. > This caused: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54559 -- H.J.
Re: Backtrace library [1/3]
On Wed, Sep 12, 2012 at 2:04 PM, Joseph S. Myers wrote: > On Wed, 12 Sep 2012, Ian Lance Taylor wrote: > >> On Wed, Sep 12, 2012 at 9:23 AM, Joseph S. Myers >> wrote: >> > On Tue, 11 Sep 2012, Ian Lance Taylor wrote: >> > >> >> The configury is fairly standard. Note that libbacktrace is built as >> >> both a host library (to link into the compilers) and as a target library >> >> (to link into libgo and possibly other libraries). >> > >> > Under what circumstances will the library be built for the target - only >> > if a relevant language such as Go is being built, or unconditionally? >> >> My intent is to only build it when something needs it, e.g., libgo. I >> don't know if I've expressed that intent correctly. > > I think that if a library is listed in target_libs in config-lang.in for > at least one language, and if all languages with it so listed are > disabled, then that library will be disabled - but if no languages list it > in config-lang.in, it will be enabled by default. Looks right. If libbacktrace is approved and committed, I will included this patch to gcc/go/config-lang.in. I tested that it indeed causes libbacktrace to be built only for the host, not for the target, when not building Go. Ian Index: gcc/go/config-lang.in === --- gcc/go/config-lang.in (revision 191171) +++ gcc/go/config-lang.in (working copy) @@ -28,7 +28,7 @@ language="go" compilers="go1\$(exeext)" -target_libs="target-libgo target-libffi" +target_libs="target-libgo target-libffi target-libbacktrace" # The Go frontend is written in C++, so we need to build the C++ # compiler during stage 1.
Re: [PATCH] Fix PR gcov-profile/54487 (profiledbootstrap intermittent failures) (issue6496113)
On Wed, Sep 12, 2012 at 2:12 PM, Teresa Johnson wrote: > On Wed, Sep 12, 2012 at 1:54 PM, Jakub Jelinek wrote: >> On Wed, Sep 12, 2012 at 01:45:12PM -0700, Teresa Johnson wrote: >>> This fixes PR gcov-profile/54487 where the gcda files were not locked >>> by the profile-use read, enabling writes by other instrumented compiles >>> to change the profile in the middle of the profile use read. The GCOV_LOCKED >>> macro was not set because it was guarded by HOST_HAS_F_SETLKW, which was >>> never set. The fix is to add a compile test in the configure to set it. >>> >>> Tested with bootstrap and profiledbootstrap on x86_64-unknown-linux-gnu. >>> Ok for trunk? >>> >>> Thanks, >>> Teresa >>> >>> 2012-09-12 Teresa Johnson >>> >> >> Please include >> PR gcov-profile/54487 >> here in the ChangeLog entry. >> >>> * configure.ac(HOST_HAS_F_SETLKW): Set based on compile >> >> Space before (. >> >>> test using F_SETLKW with fcntl. >>> * configure, config.in: Regenerate. >>> >>> --- configure.ac (revision 191225) >>> +++ configure.ac (working copy) >>> @@ -1159,6 +1159,16 @@ if test $gcc_cv_type_clock_t = yes; then >>>[Define if defines clock_t.]) >>> fi >>> >>> +# Check if F_SETLKW is supported by fcntl. >>> +AC_CACHE_CHECK(for F_SETLKW, ac_cv_f_setlkw, [ >>> +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ >>> +#include "fcntl.h" >> >> Please use >> #include >> instead. >> >>> +]], [[struct flock fl; fl.l_whence = 0; fl.l_start = 0; fl.l_len = 0; >>> fl.l_pid = 0; return fcntl (1, F_SETLKW, >>> &fl);]])],[ac_cv_f_setlkw=yes],[ac_cv_f_setlkw=no])]) >> >> And split this overlong line, there is no reason why you can't use a newline >> e.g. after every ; in the test proglet. >> >>> +if test $ac_cv_f_setlkw = yes; then >>> + AC_DEFINE(HOST_HAS_F_SETLKW, 1, >>> + [Define if F_SETLKW supported by fcntl.]) >>> +fi >>> + >>> # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. >>> CFLAGS="$saved_CFLAGS" >>> CXXFLAGS="$saved_CXXFLAGS" >> >> Ok for trunk with those changes, IMHO it would be worthwhile to put this >> into 4.7 too, I've seen several unexplained profiledbootstrap errors on >> that branch in the past already when using make -jN with high N. > > Ok, thanks. I will fix the issues you pointed out above and retest > before committing. I'll prepare a backport patch for 4.7 as well. Patch committed. I have the 4_7 backport ready. It is basically the same patch. Also tested with bootstap and profiledbootstrap. Ok for gcc/4_7? Thanks, Teresa 2012-09-12 Teresa Johnson Backport from mainline. 2012-09-12 Teresa Johnson PR gcov-profile/54487 * configure.ac (HOST_HAS_F_SETLKW): Set based on compile test using F_SETLKW with fcntl. * configure, config.in: Regenerate. Index: configure === --- configure (revision 191237) +++ configure (working copy) @@ -10968,6 +10968,46 @@ $as_echo "#define HAVE_CLOCK_T 1" >>confdefs.h fi +# Check if F_SETLKW is supported by fcntl. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for F_SETLKW" >&5 +$as_echo_n "checking for F_SETLKW... " >&6; } +if test "${ac_cv_f_setlkw+set}" = set; then : + $as_echo_n "(cached) " >&6 +else + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include +int +main () +{ + +struct flock fl; +fl.l_whence = 0; +fl.l_start = 0; +fl.l_len = 0; +fl.l_pid = 0; +return fcntl (1, F_SETLKW, &fl); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_f_setlkw=yes +else + ac_cv_f_setlkw=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f_setlkw" >&5 +$as_echo "$ac_cv_f_setlkw" >&6; } +if test $ac_cv_f_setlkw = yes; then + +$as_echo "#define HOST_HAS_F_SETLKW 1" >>confdefs.h + +fi + # Restore CFLAGS, CXXFLAGS from before the gcc_AC_NEED_DECLARATIONS tests. CFLAGS="$saved_CFLAGS" CXXFLAGS="$saved_CXXFLAGS" @@ -17970,7 +18010,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 17973 "configure" +#line 18013 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -18076,7 +18116,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 18079 "configure" +#line 18119 "configure" #include "confdefs.h" #if HAVE_DLFCN_H Index: config.in === --- config.in (revision 191237) +++ config.in (working copy) @@ -1588,6 +1588,12 @@ #endif +/* Define if F_SETLKW supported by fcntl. */ +#ifndef USED_FOR_TARGET +#undef HOST_HAS_F_SETLKW +#endif + + /* Define as const if the declaration of iconv() needs const. */ #ifndef USED_FOR_TARGET #undef ICONV_CONST Index: configure.ac ===
Re: [PATCH] Fix PR gcov-profile/54487 (profiledbootstrap intermittent failures) (issue6496113)
On Wed, Sep 12, 2012 at 10:06:02PM -0700, Teresa Johnson wrote: > Patch committed. I have the 4_7 backport ready. It is basically the > same patch. Also tested with bootstap and profiledbootstrap. Ok for > gcc/4_7? Yes, thanks. > 2012-09-12 Teresa Johnson > > Backport from mainline. > 2012-09-12 Teresa Johnson > > PR gcov-profile/54487 > * configure.ac (HOST_HAS_F_SETLKW): Set based on compile > test using F_SETLKW with fcntl. > * configure, config.in: Regenerate. Jakub