Code generated for a simple memory copy loop
Hello, Here's a simple memory copy macro: #define MYMEMCOPY(dp, sp, len) \ do { \ long __len = len; \ while (--__len >= 0) \ (dp)[__len] = (sp)[__len]; \ } while (0) void foo(unsigned char *dp, const unsigned char *sp, unsigned long size) { MYMEMCOPY(dp, sp, size); } void bar(unsigned char *dp, const unsigned char *sp) { MYMEMCOPY(dp, sp, 128); } The code fragments generated for the foo and bar functions with -O and -O2 optimizations respectively are as follows: /* = With -O switch = */ /* function foo */ .L4: movzbl -1(%rcx), %eax movb %al, -1(%rdx) subq $1, %rcx subq $1, %rdx subq $1, %r8 jns .L4 /* function bar */ movl $126, %edx .L8: .LBB3: .loc 1 13 0 movzbl 1(%rdx,%rsi), %eax movb %al, 1(%rdx,%rdi) subq $1, %rdx cmpq $-2, %rdx jne .L8 /* = With -O2 switch =*/ /* function foo */ .L4: movzbl -1(%rsi), %eax addq $1, %rdi subq $1, %rsi movb %al, -1(%rcx) subq $1, %rcx cmpq %rdx, %rdi jne .L4 /* function bar */ movl $126, %edx .L9: .LBB3: .loc 1 13 0 movzbl 1(%rdx,%rsi), %eax movb %al, 1(%rdx,%rdi) subq $1, %rdx cmpq $-2, %rdx jne .L9 Now my questions are: (i) Why does the compiler generate an addq, cmpq and jne for the foo function with -O2? Isn't subq/jns more efficient, as seen from the output from -O? (ii) For function bar, why is the "cmpq $-2, %rdx" instruction generated? Won't it be better to count down from 128 to 0 instead of 126 to -2? Here's my OS and compiler version (I'm running a 64-bit FreeBSD): $ uname -a FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed Nov 12 18:54:21 PST 2008 r...@wc7:/usr/obj/usr/src/sys/SMKERNEL amd64 $ cc --version cc (GCC) 4.2.1 20070719 [FreeBSD] Copyright (C) 2007 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. And these are the commands I used to compile the program: cc -S -O -g test.c cc -S -O2 -g test.c Any pointers would be appreciated. Thanks! 
Regards, N Datta Add more friends to your messenger and enjoy! Go to http://messenger.yahoo.com/invite/
Re: Code generated for a simple memory copy loop
On Mon, Feb 16, 2009 at 11:19 AM, Narasimha Datta wrote: > Hello, > > Here's a simple memory copy macro: > > #define MYMEMCOPY(dp, sp, len) \ > do { \ >long __len = len; \ >while (--__len >= 0) \ >(dp)[__len] = (sp)[__len]; \ > } while (0) > > void foo(unsigned char *dp, const unsigned char *sp, unsigned long size) { >MYMEMCOPY(dp, sp, size); > } > > void bar(unsigned char *dp, const unsigned char *sp) { >MYMEMCOPY(dp, sp, 128); > } > > The code fragments generated for the foo and bar functions with -O and -O2 > optimizations respectively is as follows: > > /* = With -O switch = */ > /* function foo */ > .L4: >movzbl -1(%rcx), %eax >movb%al, -1(%rdx) >subq$1, %rcx >subq$1, %rdx >subq$1, %r8 >jns .L4 > > /* function bar */ >movl$126, %edx > .L8: > .LBB3: >.loc 1 13 0 >movzbl 1(%rdx,%rsi), %eax >movb%al, 1(%rdx,%rdi) >subq$1, %rdx >cmpq$-2, %rdx >jne .L8 > > /* = With -O2 switch =*/ > /* function foo */ > .L4: >movzbl -1(%rsi), %eax >addq$1, %rdi >subq$1, %rsi >movb%al, -1(%rcx) >subq$1, %rcx >cmpq%rdx, %rdi >jne .L4 > > /* function bar */ >movl$126, %edx > .L9: > .LBB3: >.loc 1 13 0 >movzbl 1(%rdx,%rsi), %eax >movb%al, 1(%rdx,%rdi) >subq$1, %rdx >cmpq$-2, %rdx >jne .L9 > > Now my questions are: > (i) Why does the compiler generate an addq, cmpq and jne for the foo function > with -O2? Isn't subq/jns more efficient, as seen from the output from -O? > (ii) For function bar, why is the "cmpq $-2, %rdx" instruction generated? > Won't it be better to count down from 128 to 0 instead of 126 to -2? > > Here's my OS and compiler version (I'm running a 64-bit FreeBSD): > $ uname -a > FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed Nov 12 18:54:21 PST 2008 >r...@wc7:/usr/obj/usr/src/sys/SMKERNEL amd64 > $ cc --version > cc (GCC) 4.2.1 20070719 [FreeBSD] > Copyright (C) 2007 Free Software Foundation, Inc. > This is free software; see the source for copying conditions. There is NO > warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
> > And these are the commands I used to compile the program: > cc -S -O -g test.c > cc -S -O2 -g test.c > > Any pointers would be appreciated. Thanks! 1) Try a more recent GCC 2) Use memcpy. It is properly inlined/optimized. Richard.
ARM : code less efficient with gcc-trunk ?
Hi, I am comparing the assembly generated by compilers targeting arm-wince platform and it seems that cross-compiler from gcc-trunk is less optimized than an old one based on gcc 4.1.x Here is the comparison obtained from objdump: cegcc-4.1.x : 00011000 : 11000: e92d40f0push{r4, r5, r6, r7, lr} 11004: e1a04000mov r4, r0 11008: e1a05001mov r5, r1 1100c: e1a06002mov r6, r2 11010: e1a07003mov r7, r3 11014: ebdebl 11394 <_fpreset> 11018: eb2abl 110c8 <_pei386_runtime_relocator> 1101c: eb99bl 11288 <__atexit_init> 11020: ebd3bl 11374 <__gccmain> 11024: e1a01005mov r1, r5 11028: e1a4mov r0, r4 1102c: e1a02006mov r2, r6 11030: e1a03007mov r3, r7 11034: eb05bl 11050 11038: e1a04000mov r4, r0 1103c: eb87bl 11260 <_cexit> 11040: e1a01004mov r1, r4 11044: e3a00042mov r0, #66 ; 0x42 11048: ebd4bl 113a0 1104c: eafeb 1104c cegcc-4.4.x 00011000 : 11000: e92d4010push{r4, lr} 11004: e1a04000mov r4, r0 11008: e24dd00csub sp, sp, #12 ; 0xc 1100c: e58d1008str r1, [sp, #8] 11010: e58d2004str r2, [sp, #4] 11014: e58d3000str r3, [sp] 11018: eb000120bl 114a0 <_fpreset> 1101c: eb43bl 11130 <_pei386_runtime_relocator> 11020: ebcebl 11360 <__atexit_init> 11024: eb000111bl 11470 <__gccmain> 11028: e59d1008ldr r1, [sp, #8] 1102c: e1a4mov r0, r4 11030: e59d2004ldr r2, [sp, #4] 11034: e59d3000ldr r3, [sp] 11038: eb28bl 110e0 1103c: e1a04000mov r4, r0 11040: ebbabl 11330 <_cexit> 11044: e1a01004mov r1, r4 11048: e3a00042mov r0, #66 ; 0x42 1104c: eb000116bl 114ac 11050: eafeb 11050 11054: e1a0nop (mov r0,r0) 11058: e1a0nop (mov r0,r0) 1105c: e1a0nop (mov r0,r0) If you have a look at address 11008-1100c you can see that old gcc is using registers but upcoming gcc-4.4 is using memory. I tried to put some optim flags -O2 but it doesn't modify the situation. Is there anything to do to improve this situation ? Is it a normal behavior ? 
Maybe my remark is not relevant because I didn't run any benchmarks, and I agree that just because gcc-trunk does not optimize this specific part, it does not follow that the result will be slower. I have also noticed that I now get some nop instructions, and when I ask gcc to generate assembly I can see that the alignment directive is different. I used to have .align 0 with gcc-4.1 and now I get a .align 4; how can I change that? And finally, maybe those nop insns prevent the compiler from optimizing ... Thanks
target.md: copy_to_mode_reg or force_reg?
Hi, in machine description expanders the functions copy_to_mode_reg and force_reg from explow.c can be used to ensure that an operand lives in a register. But which function should be used? What are the differences? The only difference I can discern from the comment is that an operand returned by force_reg must not be altered, i.e. overwritten afterwards. Are there any pitfalls using these functions with respect to reload_completed, reload_in_progress or no_new_pseudos? As far as I understand, these functions can only be used if no_new_pseudos is false. Thanks, Georg-Johann
Re: ARM : code less efficient with gcc-trunk ?
On Mon, Feb 16, 2009 at 12:19:52PM +0100, Vincent R. wrote: > 00011000 : >11000: e92d40f0push{r4, r5, r6, r7, lr} >11004: e1a04000mov r4, r0 >11008: e1a05001mov r5, r1 >1100c: e1a06002mov r6, r2 >11010: e1a07003mov r7, r3 > 00011000 : >11000: e92d4010push{r4, lr} >11004: e1a04000mov r4, r0 >11008: e24dd00csub sp, sp, #12 ; 0xc >1100c: e58d1008str r1, [sp, #8] >11010: e58d2004str r2, [sp, #4] >11014: e58d3000str r3, [sp] Notice how many more registers used to be pushed? I expect the new code is faster. > Maybe my remark is not relevant because I didn't try to do some benchmark If you find any code that *runs* slower, please report it as a bug in Bugzilla. -- Daniel Jacobowitz CodeSourcery
Re: ARM : code less efficient with gcc-trunk ?
On Monday 16 February 2009 11:19:52, Vincent R. wrote: > I used to have .align 0 with gcc-4.1 and now I get a .align 4, how can I > change that ? It was a bug in the patches I had sent you months ago. I've posted the latest patch I had here at cegcc-devel@ --- it should fix this. -- Pedro Alves
Re: IRA conflict graph & alternative selection
Steven Bosscher wrote: On Fri, Feb 13, 2009 at 8:53 PM, Jeff Law wrote: That is in brief how I see it and there are a lot of reload details missed (like virtual register eliminations or addressing displacement constraints etc). I suppose those would stay in reload? Ideally they'd all move into IRA. ...and so, IRA became evil to destroy evil... :-) Obviously the hope would be we could do things much more cleanly in the IRA code base. Modeling spill code generation as a set of transformations on the conflict graph has some potential. jeff
Re: IRA conflict graph & alternative selection
Ian Lance Taylor wrote: Paolo Bonzini writes: That is in brief how I see it and there are a lot of reload details missed (like virtual register eliminations or addressing displacement constraints etc). I suppose those would stay in reload? I see no reason for those to stay in reload (especially since I think reload should disappear entirely). It is reasonable to pick the total maximum size of the stack frame, and thus resolve all displacement constraints, before register allocation. Carefully relaxing these constraints during reload can give you slightly better results for some instructions, but only in very very few cases, and only in functions which already have unusually large stack frames. I don't consider that to be an important optimization. Given that, we can determine the maximum offset for all virtual registers before register allocation, which suffices for selection of insn constraint alternatives, and then determine the actual offset, once, after register allocation. I would agree that careful relaxation of displacements is no longer as important as it once was, I don't think we can just hand wave away the displacement issues 1. The stack frames don't have to be that big to bump up against these problems. 2. The code we generate if we have to reload the address because the displacement was out of range can be horrific 3. There are targets where other registers used in the insn determine the range of the displacement. ie, in a load from memory, the destination register used determines the valid range of displacements (+-16 bytes vs +-8k on one target I'm aware of. 4. Register eliminations complicates matters as well. Enough that I don't think you can set maximum offsets until you've finalized everything in the stack -- which implies that you're done spilling. Jeff
Re: IRA conflict graph & alternative selection
Vladimir Makarov wrote: Jeff Law wrote: I've been thinking further about instruction alternative selection prior to allocation and one of the questions in my mind is how this interacts with IRA. We select an alternative for each insn based on some "best guess" heuristic -- the selection of an alternative will often restrict the register classes available for each of the operands. Presumably we'd want to encode that information it the conflict graph so that IRA would allocate registers so as to fit the constraints of the early insn alternative selection. Right? In the case where the graph is uncolorable, do we allow IRA to override the alternative selection, or do we insert copies to simplify the conflict graph or some mixture of both? Thoughts? As for copies, I think it would be a bad decision to stick only to original (after the code selection) alternative and generate copies to satisfy this alternative. For example, if pseudo got memory instead of hard-register required by the alternative, it would be bad to generate a copy (ld/st in this case) if memory is accepted by the insn. That's why I mentioned the possibility of relaxing the conflict graph to allow other alternatives if we find that the graph is uncolorable.So if we initially wanted class A, but couldn't get it and the operand could accept class B, then we remove the conflict between the pseudo and the hard regs in class B and recolor. I have no idea how expensive this would be. This also implies that we're representing conflicts for register classes & memory in the conflict graph. Jeff
Re: IRA conflict graph & alternative selection
Michael Matz wrote: Hi, On Fri, 13 Feb 2009, Paolo Bonzini wrote: We'd want to encode [early insn alternative selection] information in the conflict graph so that IRA would allocate registers so as to fit the constraints of the early insn alternative selection. Right? In the case where the graph is uncolorable, do we allow IRA to override the alternative selection, or do we insert copies to simplify the conflict graph or some mixture of both? If the initial alternative selection was done cleverly (like chose the alternatives allowing the largest register sets which don't immediately create conflicting demands for a pseudo register) the opportunities for making an uncolorable graph colorable by chosing another alternative will be very small. This can only happen if that new alternative somehow allows for the uncolorable node a completely new set of register (like say float instead of integer regs), which would mean also selecting other alternatives for all instructions where this pseudo also is used. So it's not impossible, but I think it would happen relatively seldom that changing the alternatives improves the situation. Of course. However, we might want to pick a narrower class if it has a smaller cost. The mn103 targets come to mind. In general you're better off with d0-d3/a0-a3 as they're the cheapest (cost & space). However, you've got some extended registers which can be used just like d0-d3/a0-a3, but which are more expensive (but still cheaper than memory). Jeff
Re: IRA conflict graph & alternative selection
Hi, On Mon, 16 Feb 2009, Jeff Law wrote: > > If the initial alternative selection was done cleverly (like chose the > > alternatives allowing the largest register sets which don't > > immediately create conflicting demands for a pseudo register) the > > opportunities for making an uncolorable graph colorable by chosing > > another alternative will be very small. This can only happen if that > > new alternative somehow allows for the uncolorable node a completely > > new set of register (like say float instead of integer regs), which > > would mean also selecting other alternatives for all instructions > > where this pseudo also is used. > > > > So it's not impossible, but I think it would happen relatively seldom > > that changing the alternatives improves the situation. > > > Of course. However, we might want to pick a narrower class if it has a > smaller cost. The mn103 targets come to mind. In general you're better > off with d0-d3/a0-a3 as they're the cheapest (cost & space). However, > you've got some extended registers which can be used just like > d0-d3/a0-a3, but which are more expensive (but still cheaper than > memory). I'd rather model this as a set of preferable colors in the node. If they're still free when coloring the node, good; if not, too bad, but there are still others to choose from. This is more or less equivalent to choosing a different alternative, but more explicit for the coloring problem and with fewer ripple-down effects. Ciao, Michael.
Re: Incomplete Type on Pass By Value bug in g++ <4.3.0
On Fri, 2009-02-13 at 12:28 -0800, Joe Buck wrote: > On Fri, Feb 13, 2009 at 11:03:51AM -0800, Anthony Newnam wrote: > > Thanks Joe. > > > > As far as I know the problem I'm seeing isn't a regression but perhaps > > this script could still be useful. I don't really understand how it is > > supposed to work, since it doesn't appear be working off svn updates. > > I haven't looked at it in years, so I can't help you there. When Janis > first wrote the script gcc was still using CVS. But it should be useful > as a starting point. I've been using a different version since we moved to Subversion, and have intended to add it to contrib/. Maybe I'll do that now! > > Should I do something like a binary svn search between revisions > > 124707 and 132947? It takes such a long amount of time to compile g++, > > almost a half an hour with my quad core, that it didn't seem practical > > try to do build so many times. I guess there is probably a way to > > build g++ without the rest of gcc, but I haven't seen an option for > > it. > > Yes. It would suffice to only build phase 1 of g++, without any libraries, > to do what you want, so that should be faster. You don't need a full > bootstrap. If the endpoints you list are correct, that's 8240 revisions > to search, but a binary search only needs to try ceil(log2(8240)) > revisions, or 14. So if each build and test takes 30 minutes, you'll > have your answer in seven hours. I'll include my build script for the compiler only, which works back to early 2003. Please ping me daily until I do this! Janis
changed_allocation_pseudos
What purpose does changed_allocation_pseudos serve? AFAICT we set/clear the bitmap, but never use it for anything. It was added as part of the IRA integration. Did you have some purpose in mind for this bitmap? If not can we just remove it? Jeff
Re: IRA conflict graph & alternative selection
Jeff Law writes: > Ian Lance Taylor wrote: >> >> I see no reason for those to stay in reload (especially since I think >> reload should disappear entirely). It is reasonable to pick the total >> maximum size of the stack frame, and thus resolve all displacement >> constraints, before register allocation. Carefully relaxing these >> constraints during reload can give you slightly better results for some >> instructions, but only in very very few cases, and only in functions >> which already have unusually large stack frames. I don't consider that >> to be an important optimization. Given that, we can determine the >> maximum offset for all virtual registers before register allocation, >> which suffices for selection of insn constraint alternatives, and then >> determine the actual offset, once, after register allocation. >> > I would agree that careful relaxation of displacements is no longer as > important as it once was, I don't think we can just hand wave away > the displacement issues > > 1. The stack frames don't have to be that big to bump up against > these problems. > > 2. The code we generate if we have to reload the address because the > displacement was out of range can be horrific > > 3. There are targets where other registers used in the insn determine > the range of the displacement. ie, in a load from memory, the > destination register used determines the valid range of displacements > (+-16 bytes vs +-8k on one target I'm aware of. In all of these cases, the relaxation loop can only affect a handful of instructions: the cases where saving a few fewer registers moves the offset within range. Those few instructions can only occur in a handful of functions: the ones where the stack frame is so large that this becomes an issue at all. I'm not handwaving away displacement issues in general. I'm handwaving away the need to do relaxation, such that we adjust if we find that we need to save one more or one fewer register. 
If we eliminate that relaxation requirement, we can determine all displacements before register allocation. > 4. Register eliminations complicates matters as well. Enough that I > don't think you can set maximum offsets until you've finalized > everything in the stack -- which implies that you're done spilling. We clearly can set maximum offsets, if we are willing to sacrifice an optimization. I argue that that optimization is inconsequential for 99.9% of all code, and avoidable (through refactoring and good inline heuristics) for 100% of all code. Ian
GCC and the Visual Basic programmer....
Hi, This is posted partly to start a discussion, and partly as a technical enquiry. Granted, this is not strictly GCC related, but it does relate to the GCC toolchain. Before Java was GPL'ed, comments were made in relation to the so-called 'Java' trap. There is also another 'pit' into which some programmers fall: the use of a specific vendor's tools or run-times. Whilst looking into the feasibility of adding support for 16-bit code to ReactOS (a 'free' implementation of an otherwise proprietary API), someone mentioned that there were a considerable number of applications which were originally developed using a vendor's proprietary toolsets or runtimes... In the specific example, the two areas are code originally written in Visual Basic and Visual C++. In terms of Visual C++ code, source code can quite reasonably be patched/adapted to compile under 'free' compilers. However, a proportion of code written for Visual C++ makes use of proprietary runtimes such as MFC, the runtime EULA of which 'currently' prevents the use of MFC based applications with a 'free' OS like ReactOS or GNU based toolchains... Should there be an alternate but compatible implementation of MFC? In terms of Visual Basic code, there is of course no 'free' compiler for VB code written prior to VB.NET, and again the EULA for the runtime support would prevent use of the vendor's runtime on 'free' systems. Should there be a way of using Visual Basic style code without using the vendor's runtime? Technically speaking, I've been told VB used a p-code form rather than direct compilation to native code, so... Is there a way to automate the conversion/loading of this p-code into a form that would compile with a GNU derived toolchain? Does GCC use some form of 'intermediate' form? Personally, I'd like to see the issue of 'vendor' shackling reduced by co-operation and effort from those within the free software community, especially given the direction in which one vendor seems to be moving...
Re: GCC and the Visual Basic programmer....
On Tue, Feb 17, 2009 at 8:38 AM, Farlie A wrote: > In terms of Visual Basic code, there is of course no 'free' compiler for VB > code written prior > to VB.NET., and again the EULA for the runtime support would prevent use of > the Vendor's > runtime on 'free' systems.. > Should there be a way of using Visual Basic style code without using the > vendors runtime? > I take it you are aware of the mono project?
Re: GCC and the Visual Basic programmer....
On Tue, 17 Feb 2009 09:03:36 +1100, Carl wrote: > On Tue, Feb 17, 2009 at 8:38 AM, Farlie A > wrote: > >> In terms of Visual Basic code, there is of course no 'free' compiler for >> VB >> code written prior >> to VB.NET., and again the EULA for the runtime support would prevent use >> of >> the Vendor's >> runtime on 'free' systems.. >> Should there be a way of using Visual Basic style code without using the >> vendors runtime? >> > > I take it you are aware of the mono project? I think he mentioned prior to VB.NET so mono doesn't help... In addition VB is a lot different from VB.NET.
Re: IRA conflict graph & alternative selection
Ian Lance Taylor wrote: Jeff Law writes: Ian Lance Taylor wrote: I see no reason for those to stay in reload (especially since I think reload should disappear entirely). It is reasonable to pick the total maximum size of the stack frame, and thus resolve all displacement constraints, before register allocation. Carefully relaxing these constraints during reload can give you slightly better results for some instructions, but only in very very few cases, and only in functions which already have unusually large stack frames. I don't consider that to be an important optimization. Given that, we can determine the maximum offset for all virtual registers before register allocation, which suffices for selection of insn constraint alternatives, and then determine the actual offset, once, after register allocation. I would agree that careful relaxation of displacements is no longer as important as it once was, I don't think we can just hand wave away the displacement issues 1. The stack frames don't have to be that big to bump up against these problems. 2. The code we generate if we have to reload the address because the displacement was out of range can be horrific 3. There are targets where other registers used in the insn determine the range of the displacement. ie, in a load from memory, the destination register used determines the valid range of displacements (+-16 bytes vs +-8k on one target I'm aware of. In all of thse cases, the relaxation loop can only affect a handful of instructions: the cases where saving a few less registers moves the offset within range. Those few instructions can only occur in a handful of functions: the ones where the stack frame is so large that this becomes an issue at all. I disagree, particularly because of point #3.I don't see how you can hand wave it away, that is unless you plan on just making every load/store of a stack variable/spill be assumed to be out of the +-16 byte range which will generate absolutely horrible code. 
On that particular target it isn't uncommon to have situations where you think you're going to be able to use the +-8k instruction, but because of spilling you end up using a different register and suddenly you're stuck with only being able to use +-16 byte offsets. I'm not handwaving away displacement issues in general. I'm handwaving away the need to do relaxation, such that we adjust if we find that need to save one more or one fewer register. If we eliminate that relaxation requirement, we can determine all displacements before register allocation. I still don't see it as that simple. 4. Register eliminations complicates matters as well. Enough that I don't think you can set maximum offsets until you've finalized everything in the stack -- which implies that you're done spilling. We clearly can set maximum offsets, if we are willing to sacrifice an optimization. I argue that that optimization is inconsequential for 99.9% of all code, and avoidable (through refactoring and good inline heuristics) for 100% of all code. Without knowing the size of the frame, how do you plan on doing this without making the assumption that nothing is going to fit in the shorter displacement variants? How can you do this when the range of valid displacements can change because the register you used got spilled and you got a register from a different class (which in turn has a drastically smaller set of valid displacements). jeff
GCC 4.4.0 Status Report (2009-02-16)
Status ====== The trunk remains Stage 4, so only fixes for regressions (and changes to documentation) are allowed. As stated previously, the GCC 4.4 branch will be created when there are no open P1s and the total number of P1, P2, and P3 regressions is under 100. We've achieved that, but are still waiting for the FSF to provide instructions regarding the installation of the new run-time library license. I have pinged the FSF about that issue today. There are three open P1s: * PR39137, a problem with -mpreferred-stack-boundary=2 on x86 * PR39202, a crash with unions that appears target-independent * PR39204, a crash in compute_antic Quality Data ============ (Priority | # | Change from Last Report): P1 | 3 | 0; P2 | 77 | -4; P3 | 1 | 0; Total | 81 | -4 Previous Report =============== http://gcc.gnu.org/ml/gcc/2009-02/msg00168.html The next report for 4.4.0 will be sent by Richard.
Re: IRA conflict graph & alternative selection
Jeff Law writes: >>> I would agree that careful relaxation of displacements is no longer as >>> important as it once was, I don't think we can just hand wave away >>> the displacement issues >>> >>> 1. The stack frames don't have to be that big to bump up against >>> these problems. >>> >>> 2. The code we generate if we have to reload the address because the >>> displacement was out of range can be horrific >>> >>> 3. There are targets where other registers used in the insn determine >>> the range of the displacement. ie, in a load from memory, the >>> destination register used determines the valid range of displacements >>> (+-16 bytes vs +-8k on one target I'm aware of. >>> >> >> In all of thse cases, the relaxation loop can only affect a handful of >> instructions: the cases where saving a few less registers moves the >> offset within range. Those few instructions can only occur in a handful >> of functions: the ones where the stack frame is so large that this >> becomes an issue at all. >> > I disagree, particularly because of point #3.I don't see how you > can hand wave it away, that is unless you plan on just making every > load/store of a stack variable/spill be assumed to be out of the +-16 > byte range which will generate absolutely horrible code. No, that makes no sense. What I'm suggesting is that we fix the stack offsets of all local variables before register allocation, based on a conservative assessment of how many registers will be saved on the stack. Then we know during register allocation whether the memory reference will be in or out of the +- 16 byte range. What we lose is the ability to discover that our conservative assessment was overly conservative, and so actually some small number of instructions will be generated as out of range when they could have been in range. (Of course we will pick up some of those cases using peepholes). 
> Without knowing the size of the frame, how do you plan on doing this > without making the assumption that nothing is going to fit in the > shorter displacement variants? How can you do this when the range of > valid displacements can change because the register you used got > spilled and you got a register from a different class (which in turn > has a drastically smaller set of valid displacements). I'm saying that you guess the size of the frame, so your premise does not describe the approach that I am suggesting. Ian
Re: Code generated for a simple memory copy loop
Richard, thanks for the reply. I'd love to check out the generated code on a later gcc, but unfortunately we are not in a position to upgrade our gcc. We just use the default gcc that came with FreeBSD 7.0. I'm interested in understanding why gcc generates the code the way it does. I'm probably missing something, and I'd like to understand that. Is counting up better than counting down in some way (add v/s sub)? Thanks again for any help. Regards, N Datta --- On Mon, 16/2/09, Richard Guenther wrote: > From: Richard Guenther > Subject: Re: Code generated for a simple memory copy loop > To: "Narasimha Datta" > Cc: gcc@gcc.gnu.org > Date: Monday, 16 February, 2009, 3:54 PM > On Mon, Feb 16, 2009 at 11:19 AM, Narasimha Datta > wrote: > > Hello, > > > > Here's a simple memory copy macro: > > > > #define MYMEMCOPY(dp, sp, len) \ > > do { \ > >long __len = len; \ > >while (--__len >= 0) \ > >(dp)[__len] = (sp)[__len]; \ > > } while (0) > > > > void foo(unsigned char *dp, const unsigned char *sp, > unsigned long size) { > >MYMEMCOPY(dp, sp, size); > > } > > > > void bar(unsigned char *dp, const unsigned char *sp) { > >MYMEMCOPY(dp, sp, 128); > > } > > > > The code fragments generated for the foo and bar > functions with -O and -O2 optimizations respectively is as > follows: > > > > /* = With -O switch = */ > > /* function foo */ > > .L4: > >movzbl -1(%rcx), %eax > >movb%al, -1(%rdx) > >subq$1, %rcx > >subq$1, %rdx > >subq$1, %r8 > >jns .L4 > > > > /* function bar */ > >movl$126, %edx > > .L8: > > .LBB3: > >.loc 1 13 0 > >movzbl 1(%rdx,%rsi), %eax > >movb%al, 1(%rdx,%rdi) > >subq$1, %rdx > >cmpq$-2, %rdx > >jne .L8 > > > > /* = With -O2 switch =*/ > > /* function foo */ > > .L4: > >movzbl -1(%rsi), %eax > >addq$1, %rdi > >subq$1, %rsi > >movb%al, -1(%rcx) > >subq$1, %rcx > >cmpq%rdx, %rdi > >jne .L4 > > > > /* function bar */ > >movl$126, %edx > > .L9: > > .LBB3: > >.loc 1 13 0 > >movzbl 1(%rdx,%rsi), %eax > >movb%al, 1(%rdx,%rdi) > >subq$1, %rdx > >cmpq$-2, %rdx > >jne .L9 
> > > > Now my questions are: > > (i) Why does the compiler generate an addq, cmpq and > jne for the foo function with -O2? Isn't subq/jns more > efficient, as seen from the output from -O? > > (ii) For function bar, why is the "cmpq $-2, > %rdx" instruction generated? Won't it be better to > count down from 128 to 0 instead of 126 to -2? > > > > Here's my OS and compiler version (I'm running > a 64-bit FreeBSD): > > $ uname -a > > FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed > Nov 12 18:54:21 PST 2008 > r...@wc7:/usr/obj/usr/src/sys/SMKERNEL amd64 > > $ cc --version > > cc (GCC) 4.2.1 20070719 [FreeBSD] > > Copyright (C) 2007 Free Software Foundation, Inc. > > This is free software; see the source for copying > conditions. There is NO > > warranty; not even for MERCHANTABILITY or FITNESS FOR > A PARTICULAR PURPOSE. > > > > And these are the commands I used to compile the > program: > > cc -S -O -g test.c > > cc -S -O2 -g test.c > > > > Any pointers would be appreciated. Thanks! > > 1) Try a more recent GCC > 2) Use memcpy. It is properly inlined/optimized. > > Richard. Connect with friends all over the world. Get Yahoo! India Messenger at http://in.messenger.yahoo.com/?wm=n/