Code generated for a simple memory copy loop

2009-02-16 Thread Narasimha Datta
Hello,

Here's a simple memory copy macro:

#define MYMEMCOPY(dp, sp, len) \
do { \
long __len = len; \
while (--__len >= 0) \
(dp)[__len] = (sp)[__len]; \
} while (0)

void foo(unsigned char *dp, const unsigned char *sp, unsigned long size) {
MYMEMCOPY(dp, sp, size);
}

void bar(unsigned char *dp, const unsigned char *sp) {
MYMEMCOPY(dp, sp, 128);
}

The code fragments generated for the foo and bar functions with -O and -O2 
optimizations, respectively, are as follows:

/* = With -O switch = */
/* function foo */
.L4:
        movzbl  -1(%rcx), %eax
        movb    %al, -1(%rdx)
        subq    $1, %rcx
        subq    $1, %rdx
        subq    $1, %r8
        jns     .L4

/* function bar */
        movl    $126, %edx
.L8:
.LBB3:
        .loc 1 13 0
        movzbl  1(%rdx,%rsi), %eax
        movb    %al, 1(%rdx,%rdi)
        subq    $1, %rdx
        cmpq    $-2, %rdx
        jne     .L8

/* = With -O2 switch =*/
/* function foo */
.L4:
        movzbl  -1(%rsi), %eax
        addq    $1, %rdi
        subq    $1, %rsi
        movb    %al, -1(%rcx)
        subq    $1, %rcx
        cmpq    %rdx, %rdi
        jne     .L4

/* function bar */
        movl    $126, %edx
.L9:
.LBB3:
        .loc 1 13 0
        movzbl  1(%rdx,%rsi), %eax
        movb    %al, 1(%rdx,%rdi)
        subq    $1, %rdx
        cmpq    $-2, %rdx
        jne     .L9

Now my questions are:
(i) Why does the compiler generate an addq, cmpq and jne for the foo function 
with -O2? Isn't subq/jns more efficient, as seen from the output from -O?
(ii) For function bar, why is the "cmpq $-2, %rdx" instruction generated? Won't 
it be better to count down from 128 to 0 instead of 126 to -2?

Here's my OS and compiler version (I'm running a 64-bit FreeBSD):
$ uname -a
FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed Nov 12 18:54:21 PST 2008
 r...@wc7:/usr/obj/usr/src/sys/SMKERNEL  amd64
$ cc --version
cc (GCC) 4.2.1 20070719  [FreeBSD]
Copyright (C) 2007 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

And these are the commands I used to compile the program:
cc -S -O -g test.c
cc -S -O2 -g test.c

Any pointers would be appreciated. Thanks!

Regards,
N Datta



  Add more friends to your messenger and enjoy! Go to 
http://messenger.yahoo.com/invite/


Re: Code generated for a simple memory copy loop

2009-02-16 Thread Richard Guenther
On Mon, Feb 16, 2009 at 11:19 AM, Narasimha Datta  wrote:
> Hello,
>
> Here's a simple memory copy macro:
>
> #define MYMEMCOPY(dp, sp, len) \
> do { \
>long __len = len; \
>while (--__len >= 0) \
>(dp)[__len] = (sp)[__len]; \
> } while (0)
>
> void foo(unsigned char *dp, const unsigned char *sp, unsigned long size) {
>MYMEMCOPY(dp, sp, size);
> }
>
> void bar(unsigned char *dp, const unsigned char *sp) {
>MYMEMCOPY(dp, sp, 128);
> }
>
> The code fragments generated for the foo and bar functions with -O and -O2 
> optimizations respectively is as follows:
>
> /* = With -O switch = */
> /* function foo */
> .L4:
>movzbl  -1(%rcx), %eax
>movb%al, -1(%rdx)
>subq$1, %rcx
>subq$1, %rdx
>subq$1, %r8
>jns .L4
>
> /* function bar */
>movl$126, %edx
> .L8:
> .LBB3:
>.loc 1 13 0
>movzbl  1(%rdx,%rsi), %eax
>movb%al, 1(%rdx,%rdi)
>subq$1, %rdx
>cmpq$-2, %rdx
>jne .L8
>
> /* = With -O2 switch =*/
> /* function foo */
> .L4:
>movzbl  -1(%rsi), %eax
>addq$1, %rdi
>subq$1, %rsi
>movb%al, -1(%rcx)
>subq$1, %rcx
>cmpq%rdx, %rdi
>jne .L4
>
> /* function bar */
>movl$126, %edx
> .L9:
> .LBB3:
>.loc 1 13 0
>movzbl  1(%rdx,%rsi), %eax
>movb%al, 1(%rdx,%rdi)
>subq$1, %rdx
>cmpq$-2, %rdx
>jne .L9
>
> Now my questions are:
> (i) Why does the compiler generate an addq, cmpq and jne for the foo function 
> with -O2? Isn't subq/jns more efficient, as seen from the output from -O?
> (ii) For function bar, why is the "cmpq $-2, %rdx" instruction generated? 
> Won't it be better to count down from 128 to 0 instead of 126 to -2?
>
> Here's my OS and compiler version (I'm running a 64-bit FreeBSD):
> $ uname -a
> FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed Nov 12 18:54:21 PST 2008  
>r...@wc7:/usr/obj/usr/src/sys/SMKERNEL  amd64
> $ cc --version
> cc (GCC) 4.2.1 20070719  [FreeBSD]
> Copyright (C) 2007 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
> And these are the commands I used to compile the program:
> cc -S -O -g test.c
> cc -S -O2 -g test.c
>
> Any pointers would be appreciated. Thanks!

1) Try a more recent GCC
2) Use memcpy.  It is properly inlined/optimized.

Richard.


ARM : code less efficient with gcc-trunk ?

2009-02-16 Thread Vincent R.
Hi,

I am comparing the assembly generated by compilers targeting arm-wince
platform and it seems
that cross-compiler from gcc-trunk is less optimized than an old one based
on gcc 4.1.x
Here is the comparison obtained from objdump:


cegcc-4.1.x : 

00011000 :
   11000:   e92d40f0    push    {r4, r5, r6, r7, lr}
   11004:   e1a04000    mov     r4, r0
   11008:   e1a05001    mov     r5, r1
   1100c:   e1a06002    mov     r6, r2
   11010:   e1a07003    mov     r7, r3
   11014:   eb0000de    bl      11394 <_fpreset>
   11018:   eb00002a    bl      110c8 <_pei386_runtime_relocator>
   1101c:   eb000099    bl      11288 <__atexit_init>
   11020:   eb0000d3    bl      11374 <__gccmain>
   11024:   e1a01005    mov     r1, r5
   11028:   e1a00004    mov     r0, r4
   1102c:   e1a02006    mov     r2, r6
   11030:   e1a03007    mov     r3, r7
   11034:   eb000005    bl      11050
   11038:   e1a04000    mov     r4, r0
   1103c:   eb000087    bl      11260 <_cexit>
   11040:   e1a01004    mov     r1, r4
   11044:   e3a00042    mov     r0, #66 ; 0x42
   11048:   eb0000d4    bl      113a0
   1104c:   eafffffe    b       1104c

cegcc-4.4.x

00011000 :
   11000:   e92d4010    push    {r4, lr}
   11004:   e1a04000    mov     r4, r0
   11008:   e24dd00c    sub     sp, sp, #12 ; 0xc
   1100c:   e58d1008    str     r1, [sp, #8]
   11010:   e58d2004    str     r2, [sp, #4]
   11014:   e58d3000    str     r3, [sp]
   11018:   eb000120    bl      114a0 <_fpreset>
   1101c:   eb000043    bl      11130 <_pei386_runtime_relocator>
   11020:   eb0000ce    bl      11360 <__atexit_init>
   11024:   eb000111    bl      11470 <__gccmain>
   11028:   e59d1008    ldr     r1, [sp, #8]
   1102c:   e1a00004    mov     r0, r4
   11030:   e59d2004    ldr     r2, [sp, #4]
   11034:   e59d3000    ldr     r3, [sp]
   11038:   eb000028    bl      110e0
   1103c:   e1a04000    mov     r4, r0
   11040:   eb0000ba    bl      11330 <_cexit>
   11044:   e1a01004    mov     r1, r4
   11048:   e3a00042    mov     r0, #66 ; 0x42
   1104c:   eb000116    bl      114ac
   11050:   eafffffe    b       11050
   11054:   e1a00000    nop     (mov r0,r0)
   11058:   e1a00000    nop     (mov r0,r0)
   1105c:   e1a00000    nop     (mov r0,r0)

If you have a look at address 11008-1100c you can see that old gcc is using
registers 
but upcoming gcc-4.4 is using memory.

I tried to put some optim flags -O2 but it doesn't modify the situation.
Is there anything to do to improve this situation ? Is it a normal behavior
?
Maybe my remark is not relevant because I didn't try to do some benchmark
and I agree
that gcc-trunk not optimizing this specific part does not necessarily mean
it will be slower.
I have also noticed that now I get some nop instructions and when I ask gcc
to generate
assembly I can see that the alignment directive is different.
I used to have .align 0 with gcc-4.1 and now I get a .align 4, how can I
change that ?
And finally, maybe those nop insns prevent the compiler from optimizing ...



Thanks









targed.md: copy_to_mode_reg or force_reg?

2009-02-16 Thread Georg-Johann Lay

Hi,

in machine description expanders the functions copy_to_mode_reg and 
force_reg from explow.c can be used to ensure that an operand lives in a 
register.


But what function should be used?
What are the differences? The only difference I can discern from the 
comment is that an operand returned by force_reg must not be altered, 
i.e. overwritten afterwards.


Are there any pitfalls using these functions with respect to 
reload_completed, reload_in_progress or no_new_pseudos?


As far as I understand, these functions can only be used if 
no_new_pseudos is false.


Thanks, Georg-Johann



Re: ARM : code less efficient with gcc-trunk ?

2009-02-16 Thread Daniel Jacobowitz
On Mon, Feb 16, 2009 at 12:19:52PM +0100, Vincent R. wrote:
> 00011000 :
>11000: e92d40f0push{r4, r5, r6, r7, lr}
>11004: e1a04000mov r4, r0
>11008: e1a05001mov r5, r1
>1100c: e1a06002mov r6, r2
>11010: e1a07003mov r7, r3

> 00011000 :
>11000: e92d4010push{r4, lr}
>11004: e1a04000mov r4, r0
>11008: e24dd00csub sp, sp, #12 ; 0xc
>1100c: e58d1008str r1, [sp, #8]
>11010: e58d2004str r2, [sp, #4]
>11014: e58d3000str r3, [sp]

Notice how many more registers used to be pushed?  I expect the new
code is faster.

> Maybe my remark is not relevant because I didn't try to do some benchmark

If you find any code that *runs* slower, please report it as a bug in
Bugzilla.

-- 
Daniel Jacobowitz
CodeSourcery


Re: ARM : code less efficient with gcc-trunk ?

2009-02-16 Thread Pedro Alves
On Monday 16 February 2009 11:19:52, Vincent R. wrote:
> I used to have .align 0 with gcc-4.1 and now I get a .align 4, how can I
> change that ?

It was a bug in the patches I had sent you months ago.  I've posted the
latest patch I had here at cegcc-devel@ --- it should fix this.

-- 
Pedro Alves


Re: IRA conflict graph & alternative selection

2009-02-16 Thread Jeff Law

Steven Bosscher wrote:

On Fri, Feb 13, 2009 at 8:53 PM, Jeff Law  wrote:
  

That is in brief how I see it and there are a lot of reload details
missed (like virtual register eliminations or addressing displacement
constraints etc).



I suppose those would stay in reload?
  

Ideally they'd all move into IRA.



...and so, IRA became evil to destroy evil...  :-)
  
Obviously the hope would be we could do things much more cleanly in the 
IRA code base.  Modeling spill code generation as a set of 
transformations on the conflict graph has some potential. 


jeff



Re: IRA conflict graph & alternative selection

2009-02-16 Thread Jeff Law

Ian Lance Taylor wrote:

Paolo Bonzini  writes:

  

That is in brief how I see it and there are a lot of reload details
missed (like virtual register eliminations or addressing displacement
constraints etc).
  

I suppose those would stay in reload?



I see no reason for those to stay in reload (especially since I think
reload should disappear entirely).  It is reasonable to pick the total
maximum size of the stack frame, and thus resolve all displacement
constraints, before register allocation.  Carefully relaxing these
constraints during reload can give you slightly better results for some
instructions, but only in very very few cases, and only in functions
which already have unusually large stack frames.  I don't consider that
to be an important optimization.  Given that, we can determine the
maximum offset for all virtual registers before register allocation,
which suffices for selection of insn constraint alternatives, and then
determine the actual offset, once, after register allocation.
  
I would agree that careful relaxation of displacements is no longer as 
important as it once was, but I don't think we can just hand wave away the 
displacement issues:


 1. The stack frames don't have to be that big to bump up against these 
problems.


 2. The code we generate if we have to reload the address because the 
displacement was out of range can be horrific


 3. There are targets where other registers used in the insn determine 
the range of the displacement.  I.e., in a load from memory, the 
destination register used determines the valid range of displacements 
(+-16 bytes vs +-8k on one target I'm aware of).


 4. Register elimination complicates matters as well.  Enough that I 
don't think you can set maximum offsets until you've finalized 
everything in the stack -- which implies that you're done spilling.




Jeff





Re: IRA conflict graph & alternative selection

2009-02-16 Thread Jeff Law

Vladimir Makarov wrote:

Jeff Law wrote:
I've been thinking further about instruction alternative selection 
prior to allocation and one of the questions in my mind is how this 
interacts with IRA.


We select an alternative for each insn based on some "best guess" 
heuristic -- the selection of an alternative will often restrict the 
register classes available for each of the operands.  Presumably we'd 
want to encode that information in the conflict graph so that IRA 
would allocate registers so as to fit the constraints of the early 
insn alternative selection.  Right?   In the case where the graph is 
uncolorable, do we allow IRA to override the alternative selection, 
or do we insert copies to simplify the conflict graph or some mixture 
of both?


Thoughts?



As for copies,  I think it would be a bad decision to stick only to 
original (after the code selection) alternative and generate copies to 
satisfy this alternative.  For example, if pseudo got memory instead 
of hard-register required by the alternative, it would be bad to 
generate a copy (ld/st in this case) if memory is accepted by the insn.  
That's why I mentioned the possibility of relaxing the conflict graph to 
allow other alternatives if we find that the graph is uncolorable.  So 
if we initially wanted class A, but couldn't get it and the operand 
could accept class B, then we remove the conflict between the pseudo and 
the hard regs in class B and recolor.


I have no idea how expensive this would be.

This also implies that we're representing conflicts for register classes 
& memory in the conflict graph. 


Jeff





Re: IRA conflict graph & alternative selection

2009-02-16 Thread Jeff Law

Michael Matz wrote:

Hi,

On Fri, 13 Feb 2009, Paolo Bonzini wrote:

  
We'd want to encode [early insn alternative selection] information in 
the conflict graph so that IRA would allocate registers so as to fit 
the constraints of the early insn alternative selection.  Right?  In 
the case where the graph is uncolorable, do we allow IRA to override 
the alternative selection, or do we insert copies to simplify the 
conflict graph or some mixture of both?
  


If the initial alternative selection was done cleverly (like chose the 
alternatives allowing the largest register sets which don't immediately 
create conflicting demands for a pseudo register) the opportunities for 
making an uncolorable graph colorable by chosing another alternative will 
be very small.  This can only happen if that new alternative somehow 
allows for the uncolorable node a completely new set of register (like say 
float instead of integer regs), which would mean also selecting other 
alternatives for all instructions where this pseudo also is used.


So it's not impossible, but I think it would happen relatively seldom that 
changing the alternatives improves the situation.
  
Of course.  However, we might want to pick a narrower class if it has a 
smaller cost.  The mn103 targets come to mind.  In general you're better 
off with d0-d3/a0-a3 as they're the cheapest (cost & space).  However, 
you've got some extended registers which can be used just like 
d0-d3/a0-a3, but which are more expensive (but still cheaper than memory).



Jeff


Re: IRA conflict graph & alternative selection

2009-02-16 Thread Michael Matz
Hi,

On Mon, 16 Feb 2009, Jeff Law wrote:

> > If the initial alternative selection was done cleverly (like chose the 
> > alternatives allowing the largest register sets which don't 
> > immediately create conflicting demands for a pseudo register) the 
> > opportunities for making an uncolorable graph colorable by chosing 
> > another alternative will be very small.  This can only happen if that 
> > new alternative somehow allows for the uncolorable node a completely 
> > new set of register (like say float instead of integer regs), which 
> > would mean also selecting other alternatives for all instructions 
> > where this pseudo also is used.
> >
> > So it's not impossible, but I think it would happen relatively seldom 
> > that changing the alternatives improves the situation.
> >   
> Of course.  However, we might want to pick a narrower class if it has a 
> smaller cost.  The mn103 targets come to mind.  In general you're better 
> off with d0-d3/a0-a3 as they're the cheapest (cost & space).  However, 
> you've got some extended registers which can be used just like 
> d0-d3/a0-a3, but which are more expensive (but still cheaper than 
> memory).

I'd rather model this as a set of preferable colors in the node.  If 
they're still free when coloring the node, good, if not, too bad, but 
there are still others to choose from.  This is more or less equivalent to 
choosing a different alternative, but more explicit for the coloring 
problem and with fewer ripple-down effects.


Ciao,
Michael.


Re: Incomplete Type on Pass By Value bug in g++ <4.3.0

2009-02-16 Thread Janis Johnson
On Fri, 2009-02-13 at 12:28 -0800, Joe Buck wrote:
> On Fri, Feb 13, 2009 at 11:03:51AM -0800, Anthony Newnam wrote:
> > Thanks Joe.
> > 
> > As far as I know the problem I'm seeing isn't a regression but perhaps
> > this script could still be useful. I don't really understand how it is
> > supposed to work, since it doesn't appear be working off svn updates.
> 
> I haven't looked at it in years, so I can't help you there.  When Janis
> first wrote the script gcc was still using CVS.  But it should be useful
> as a starting point.

I've been using a different version since we moved to Subversion, and
have intended to add it to contrib/.  Maybe I'll do that now!

> > Should I do something like a binary svn search between revisions
> > 124707 and 132947? It takes such a long amount of time to compile g++,
> > almost a half an hour with my quad core, that it didn't seem practical
> > try to do build so many times. I guess there is probably a way to
> > build g++ without the rest of gcc, but I haven't seen an option for
> > it.
> 
> Yes. It would suffice to only build phase 1 of g++, without any libraries,
> to do what you want, so that should be faster.  You don't need a full
> bootstrap.  If the endpoints you list are correct, that's 8240 revisions
> to search, but a binary search only needs to try ceil(log2(8240))
> revisions, or 14.  So if each build and test takes 30 minutes, you'll
> have your answer in seven hours.

I'll include my build script for the compiler only, which works back to
early 2003.

Please ping me daily until I do this!

Janis



changed_allocation_pseudos

2009-02-16 Thread Jeff Law


What purpose does changed_allocation_pseudos serve?  AFAICT we set/clear 
the bitmap, but never use it for anything.  It was added as part of the 
IRA integration.  Did you have some purpose in mind for this bitmap?  If 
not can we just remove it?


Jeff


Re: IRA conflict graph & alternative selection

2009-02-16 Thread Ian Lance Taylor
Jeff Law  writes:

> Ian Lance Taylor wrote:
>>
>> I see no reason for those to stay in reload (especially since I think
>> reload should disappear entirely).  It is reasonable to pick the total
>> maximum size of the stack frame, and thus resolve all displacement
>> constraints, before register allocation.  Carefully relaxing these
>> constraints during reload can give you slightly better results for some
>> instructions, but only in very very few cases, and only in functions
>> which already have unusually large stack frames.  I don't consider that
>> to be an important optimization.  Given that, we can determine the
>> maximum offset for all virtual registers before register allocation,
>> which suffices for selection of insn constraint alternatives, and then
>> determine the actual offset, once, after register allocation.
>>   
> I would agree that careful relaxation of displacements is no longer as
> important as it once was, I don't think we can just  hand wave away
> the displacement issues
>
>  1. The stack frames don't have to be that big to bump up against
> these problems.
>
>  2. The code we generate if we have to reload the address because the
> displacement was out of range can be horrific
>
>  3. There are targets where other registers used in the insn determine
> the range of the displacement.  ie, in a load from memory, the
> destination register used determines the valid range of displacements
> (+-16 bytes vs +-8k on one target I'm aware of.

In all of these cases, the relaxation loop can only affect a handful of
instructions: the cases where saving a few less registers moves the
offset within range.  Those few instructions can only occur in a handful
of functions: the ones where the stack frame is so large that this
becomes an issue at all.

I'm not handwaving away displacement issues in general.  I'm handwaving
away the need to do relaxation, such that we adjust if we find that we need
to save one more or one fewer register.  If we eliminate that relaxation
requirement, we can determine all displacements before register
allocation.

>  4. Register eliminations complicates matters as well.  Enough that I
> don't think you can set maximum offsets until you've finalized
> everything in the stack -- which implies that you're done spilling.

We clearly can set maximum offsets, if we are willing to sacrifice an
optimization.  I argue that that optimization is inconsequential for
99.9% of all code, and avoidable (through refactoring and good inline
heuristics) for 100% of all code.

Ian


GCC and the Visual Basic programmer....

2009-02-16 Thread Farlie A


Hi,

This is posted partly to start a discussion, and partly as a technical 
enquiry.


Granted that this is not strictly GCC related, but in relation to the 
GCC toolchain.


Before Java was GPL'ed , comments were made in relation to the so called 
'Java' trap.


There is also another 'pit' into which some programmers fall: the use of 
a specific vendor's

tools or run-times.

Whilst looking into the feasibility of adding support for 16-bit code to 
ReactOS (a 'free' implementation
of an otherwise proprietary API) someone mentioned that there were a 
considerable number of applications
which were originally developed using a vendor's proprietary toolsets or 
runtimes...


In the specific example, the two areas are code originally written in 
Visual Basic and Visual C++.


In terms of Visual C++ code, source code can quite reasonably be 
patched/adapted to compile under
'free' compilers. However,  a proportion of code written for Visual C++ 
makes use of
proprietary runtimes such as MFC, the runtime EULA of which 'currently' 
prevents the use of MFC

based applications with a 'free' OS like ReactOS or GNU based toolchains...
Should there be an alternate but compatible implementation of MFC?

In terms of Visual Basic code, there is of course no 'free' compiler for 
VB code written prior
to VB.NET., and again the EULA for the runtime support would prevent use 
of the Vendor's

runtime on 'free' systems..
Should there be a way of using Visual Basic style code without using the 
vendors runtime?


Technically speaking, I've been told VB used a p-code form rather than 
direct compilation

to native code, so ..
Is there a way to automate the conversion/loading of this p-code into a 
form that would compile

with a GNU derived toolchain?
Does GCC use some form of 'intermediate' form?

Personally, I'd like to see the issue of 'vendor' shackling reduced by 
co-operation and effort  from those within the
free software community, especially given the direction in which one 
vendor seems to be moving...
















Re: GCC and the Visual Basic programmer....

2009-02-16 Thread Carl
On Tue, Feb 17, 2009 at 8:38 AM, Farlie A  wrote:

> In terms of Visual Basic code, there is of course no 'free' compiler for VB
> code written prior
> to VB.NET., and again the EULA for the runtime support would prevent use of
> the Vendor's
> runtime on 'free' systems..
> Should there be a way of using Visual Basic style code without using the
> vendors runtime?
>

I take it you are aware of the mono project?


Re: GCC and the Visual Basic programmer....

2009-02-16 Thread Vincent R.
On Tue, 17 Feb 2009 09:03:36 +1100, Carl  wrote:
> On Tue, Feb 17, 2009 at 8:38 AM, Farlie A 
> wrote:
> 
>> In terms of Visual Basic code, there is of course no 'free' compiler for
>> VB
>> code written prior
>> to VB.NET., and again the EULA for the runtime support would prevent use
>> of
>> the Vendor's
>> runtime on 'free' systems..
>> Should there be a way of using Visual Basic style code without using the
>> vendors runtime?
>>
> 
> I take it you are aware of the mono project?

I think he mentioned prior to VB.NET so mono doesn't help...
In addition, VB is a lot different from VB.NET.



Re: IRA conflict graph & alternative selection

2009-02-16 Thread Jeff Law

Ian Lance Taylor wrote:

Jeff Law  writes:

  

Ian Lance Taylor wrote:


I see no reason for those to stay in reload (especially since I think
reload should disappear entirely).  It is reasonable to pick the total
maximum size of the stack frame, and thus resolve all displacement
constraints, before register allocation.  Carefully relaxing these
constraints during reload can give you slightly better results for some
instructions, but only in very very few cases, and only in functions
which already have unusually large stack frames.  I don't consider that
to be an important optimization.  Given that, we can determine the
maximum offset for all virtual registers before register allocation,
which suffices for selection of insn constraint alternatives, and then
determine the actual offset, once, after register allocation.
  
  

I would agree that careful relaxation of displacements is no longer as
important as it once was, I don't think we can just  hand wave away
the displacement issues

 1. The stack frames don't have to be that big to bump up against
these problems.

 2. The code we generate if we have to reload the address because the
displacement was out of range can be horrific

 3. There are targets where other registers used in the insn determine
the range of the displacement.  ie, in a load from memory, the
destination register used determines the valid range of displacements
(+-16 bytes vs +-8k on one target I'm aware of.



In all of thse cases, the relaxation loop can only affect a handful of
instructions: the cases where saving a few less registers moves the
offset within range.  Those few instructions can only occur in a handful
of functions: the ones where the stack frame is so large that this
becomes an issue at all.
  
I disagree, particularly because of point #3.  I don't see how you can 
hand wave it away, that is unless you plan on just making every 
load/store of a stack variable/spill be assumed to be out of the +-16 
byte range which will generate absolutely horrible code.


On that particular target it isn't uncommon to have situations where you 
think you're going to be able to use the +-8k instruction, but because 
of spilling you end up using a different register and suddenly you're 
stuck with only being able to use +-16 byte offsets.




I'm not handwaving away displacement issues in general.  I'm handwaving
away the need to do relaxation, such that we adjust if we find that need
to save one more or one fewer register.  If we eliminate that relaxation
requirement, we can determine all displacements before register
allocation.
  
I still don't see it as that simple. 
  

 4. Register eliminations complicates matters as well.  Enough that I
don't think you can set maximum offsets until you've finalized
everything in the stack -- which implies that you're done spilling.



We clearly can set maximum offsets, if we are willing to sacrifice an
optimization.  I argue that that optimization is inconsequential for
99.9% of all code, and avoidable (through refactoring and good inline
heuristics) for 100% of all code.
  
Without knowing the size of the frame, how do you plan on doing this 
without making the assumption that nothing is going to fit in the 
shorter displacement variants?  How can you do this when the range of 
valid displacements can change because the register you used got spilled 
and you got a register from a different class (which in turn has a 
drastically smaller set of valid displacements).


jeff




GCC 4.4.0 Status Report (2009-02-16)

2009-02-16 Thread Mark Mitchell

Status
======

The trunk remains Stage 4, so only fixes for regressions (and changes
to documentation) are allowed.

As stated previously, the GCC 4.4 branch will be created when there
are no open P1s and the total number of P1, P2, and P3 regressions is
under 100.  We've achieved that, but are still waiting for the FSF to
provide instructions regarding the installation of the new run-time
library license.  I have pinged the FSF about that issue today.

There are three open P1s:

* PR39137, a problem with -mpreferred-stack-boundary=2 on x86

* PR39202, a crash with unions that appears target-independent

* PR39204, a crash in compute_antic

Quality Data
============

Priority          #   Change from Last Report
--------        ---   -----------------------
P1                3      0
P2               77   -  4
P3                1      0
--------        ---   -----------------------
Total            81   -  4

Previous Report
===============

http://gcc.gnu.org/ml/gcc/2009-02/msg00168.html

The next report for 4.4.0 will be sent by Richard.


Re: IRA conflict graph & alternative selection

2009-02-16 Thread Ian Lance Taylor
Jeff Law  writes:

>>> I would agree that careful relaxation of displacements is no longer as
>>> important as it once was, I don't think we can just  hand wave away
>>> the displacement issues
>>>
>>>  1. The stack frames don't have to be that big to bump up against
>>> these problems.
>>>
>>>  2. The code we generate if we have to reload the address because the
>>> displacement was out of range can be horrific
>>>
>>>  3. There are targets where other registers used in the insn determine
>>> the range of the displacement.  ie, in a load from memory, the
>>> destination register used determines the valid range of displacements
>>> (+-16 bytes vs +-8k on one target I'm aware of.
>>> 
>>
>> In all of thse cases, the relaxation loop can only affect a handful of
>> instructions: the cases where saving a few less registers moves the
>> offset within range.  Those few instructions can only occur in a handful
>> of functions: the ones where the stack frame is so large that this
>> becomes an issue at all.
>>   
> I disagree, particularly because of point #3.I don't see how you
> can hand wave it away, that is unless you plan on just making every
> load/store of a stack variable/spill be assumed to be out of the +-16
> byte range which will generate absolutely horrible code.

No, that makes no sense.  What I'm suggesting is that we fix the stack
offsets of all local variables before register allocation, based on a
conservative assessment of how many registers will be saved on the
stack.  Then we know during register allocation whether the memory
reference will be in or out of the +- 16 byte range.  What we lose is
the ability to discover that our conservative assessment was overly
conservative, and so actually some small number of instructions will be
generated as out of range when they could have been in range.  (Of
course we will pick up some of those cases using peepholes).


> Without knowing the size of the frame, how do you plan on doing this
> without making the assumption that nothing is going to fit in the
> shorter displacement variants?  How can you do this when the range of
> valid displacements can change because the register you used got
> spilled and you got a register from a different class (which in turn
> has a drastically smaller set of valid displacements).

I'm saying that you guess the size of the frame, so your premise does
not describe the approach that I am suggesting.

Ian


Re: Code generated for a simple memory copy loop

2009-02-16 Thread Narasimha Datta
Richard, thanks for the reply.

I'd love to check out the generated code on a later gcc, but unfortunately we 
are not in a position to upgrade our gcc. We just use the default gcc that came 
with FreeBSD 7.0.

I'm interested in understanding why gcc generates the code the way it does. I'm 
probably missing something, and I'd like to understand that. Is counting up 
better than counting down in some way (add v/s sub)?

Thanks again for any help.

Regards,
N Datta

--- On Mon, 16/2/09, Richard Guenther  wrote:

> From: Richard Guenther 
> Subject: Re: Code generated for a simple memory copy loop
> To: "Narasimha Datta" 
> Cc: gcc@gcc.gnu.org
> Date: Monday, 16 February, 2009, 3:54 PM
> On Mon, Feb 16, 2009 at 11:19 AM, Narasimha Datta
>  wrote:
> > Hello,
> >
> > Here's a simple memory copy macro:
> >
> > #define MYMEMCOPY(dp, sp, len) \
> > do { \
> >long __len = len; \
> >while (--__len >= 0) \
> >(dp)[__len] = (sp)[__len]; \
> > } while (0)
> >
> > void foo(unsigned char *dp, const unsigned char *sp,
> unsigned long size) {
> >MYMEMCOPY(dp, sp, size);
> > }
> >
> > void bar(unsigned char *dp, const unsigned char *sp) {
> >MYMEMCOPY(dp, sp, 128);
> > }
> >
> > The code fragments generated for the foo and bar
> functions with -O and -O2 optimizations respectively is as
> follows:
> >
> > /* = With -O switch = */
> > /* function foo */
> > .L4:
> >movzbl  -1(%rcx), %eax
> >movb%al, -1(%rdx)
> >subq$1, %rcx
> >subq$1, %rdx
> >subq$1, %r8
> >jns .L4
> >
> > /* function bar */
> >movl$126, %edx
> > .L8:
> > .LBB3:
> >.loc 1 13 0
> >movzbl  1(%rdx,%rsi), %eax
> >movb%al, 1(%rdx,%rdi)
> >subq$1, %rdx
> >cmpq$-2, %rdx
> >jne .L8
> >
> > /* = With -O2 switch =*/
> > /* function foo */
> > .L4:
> >movzbl  -1(%rsi), %eax
> >addq$1, %rdi
> >subq$1, %rsi
> >movb%al, -1(%rcx)
> >subq$1, %rcx
> >cmpq%rdx, %rdi
> >jne .L4
> >
> > /* function bar */
> >movl$126, %edx
> > .L9:
> > .LBB3:
> >.loc 1 13 0
> >movzbl  1(%rdx,%rsi), %eax
> >movb%al, 1(%rdx,%rdi)
> >subq$1, %rdx
> >cmpq$-2, %rdx
> >jne .L9
> >
> > Now my questions are:
> > (i) Why does the compiler generate an addq, cmpq and
> jne for the foo function with -O2? Isn't subq/jns more
> efficient, as seen from the output from -O?
> > (ii) For function bar, why is the "cmpq $-2,
> %rdx" instruction generated? Won't it be better to
> count down from 128 to 0 instead of 126 to -2?
> >
> > Here's my OS and compiler version (I'm running
> a 64-bit FreeBSD):
> > $ uname -a
> > FreeBSD xxx 7.0-RELEASE FreeBSD 7.0-RELEASE #0: Wed
> Nov 12 18:54:21 PST 2008
> r...@wc7:/usr/obj/usr/src/sys/SMKERNEL  amd64
> > $ cc --version
> > cc (GCC) 4.2.1 20070719  [FreeBSD]
> > Copyright (C) 2007 Free Software Foundation, Inc.
> > This is free software; see the source for copying
> conditions.  There is NO
> > warranty; not even for MERCHANTABILITY or FITNESS FOR
> A PARTICULAR PURPOSE.
> >
> > And these are the commands I used to compile the
> program:
> > cc -S -O -g test.c
> > cc -S -O2 -g test.c
> >
> > Any pointers would be appreciated. Thanks!
> 
> 1) Try a more recent GCC
> 2) Use memcpy.  It is properly inlined/optimized.
> 
> Richard.


  Connect with friends all over the world. Get Yahoo! India Messenger at 
http://in.messenger.yahoo.com/?wm=n/