[Bug middle-end/37678] New: Failure to generate post-increment addressing

TabonyEE at austin dot rr dot com Mon, 29 Sep 2008 18:02:22 -0700

This is an ia64 performance and code size regression.  Consider the following
function:


void sump(int *a, int *b, int *c, int len){
  int i;
  for(i = 0; i < len; i++){
    *a++ = *b++ + *c++;
  }
}

GCC 3.4.6 generated the following code.  Note that the memory accesses use the
post-increment addressing mode.

sump:
        .prologue
        .mii
        nop 0
        .save ar.lc, r2
        mov r2 = ar.lc
        .body
        sxt4 r14 = r35
        .mfb
        cmp4.ge p6, p7 = 0, r35
        nop 0
        (p6) br.cond.dpnt .L16
        ;;
        .mmi
        adds r14 = -1, r14
        ;;
        nop 0
        mov ar.lc = r14
.L17:
        .mmb
        ld4 r14 = [r33], 4
        ld4 r15 = [r34], 4
        nop 0
        ;;
        .mii
        nop 0
        add r14 = r14, r15
        ;;
        nop 0
        .mfb
        st4 [r32] = r14, 4
        nop 0
        br.cloop.sptk.few .L17
        ;;
.L16:
        .mib
        nop 0
        mov ar.lc = r2
        br.ret.sptk.many b0

Mainline (revision 140763) generated the following code.  Instead of using
post-increment addressing, the original pointers are kept in r34, r33, and r32,
while an offset, held in r14, is incremented by four each iteration.  The
offset is added to each of the three bases each iteration.  This pattern of
code is ideal for machines that have a base-plus-index addressing mode and not
a post-increment addressing mode, such as x86.  However, ia64 has a
post-increment addressing mode and not a base-plus-index addressing mode.

sump:
        .prologue
        .body
        .mmi
        adds r18 = -1, r35
        nop 0
        mov r14 = r0
        .mmb
        cmp4.ge p6, p7 = 0, r35
        nop 0
        (p6) br.ret.dpnt.many rp
        ;;
        .mmi
        nop 0
        addp4 r18 = r18, r0
        nop 0
        ;;
        .mii
        adds r18 = 1, r18
        nop 0
        ;;
        shladd r18 = r18, 2, r0
.L10:
        .mmi
        add r17 = r34, r14
        add r16 = r33, r14
        add r15 = r32, r14
        .mmi
        adds r14 = 4, r14
        ;;
        ld4 r17 = [r17]
        nop 0
        .mii
        ld4 r16 = [r16]
        cmp.ne p6, p7 = r18, r14
        ;;
        add r16 = r17, r16
        ;;
        .mib
        st4 [r15] = r16
        nop 0
        (p6) br.cond.dptk .L10
        .mib
        nop 0
        nop 0
        br.ret.sptk.many rp

Here is the intermediate representation just before the ivopts pass (just after
the cunroll pass).  Let's call this "IR1".  Note that the three pointers are
incremented each iteration.  This pattern, were it preserved, would later be
transformed into post-increment addressing by the auto-inc-dec pass.

;; Function sump (sump)

sump (int * a, int * b, int * c, int len)
{
  int i;
  int D.1279;
  int D.1278;
  int D.1277;

<bb 2>:
  if (len_9(D) > 0)
    goto <bb 3>;
  else
    goto <bb 6>;

<bb 3>:

<bb 4>:
  # i_25 = PHI <i_16(5), 0(3)>
  # a_26 = PHI <a_13(5), a_6(D)(3)>
  # b_27 = PHI <b_14(5), b_7(D)(3)>
  # c_28 = PHI <c_15(5), c_8(D)(3)>
  D.1277_10 = *b_27;
  D.1278_11 = *c_28;
  D.1279_12 = D.1278_11 + D.1277_10;
  *a_26 = D.1279_12;
  a_13 = a_26 + 4;
  b_14 = b_27 + 4;  
  c_15 = c_28 + 4;
  i_16 = i_25 + 1;
  if (len_9(D) > i_16)
    goto <bb 5>;  
  else
    goto <bb 6>;

<bb 5>:
  goto <bb 4>;

<bb 6>:
  return;

}

Here is the intermediate representation after the ivopts pass.  Let's call this
"IR2".  The code has been transformed into the pattern we see in the final
assembly.

;; Function sump (sump)

sump (int * a, int * b, int * c, int len)
{
  unsigned int D.1361;
  unsigned int D.1362;
  long unsigned int D.1363;
  long unsigned int D.1364;
  long unsigned int D.1365;
  int * D.1366;
  int * D.1360;
  long unsigned int D.1359;
  int * D.1358;
  long unsigned int D.1357;
  int * D.1356;
  long unsigned int D.1355;
  int * ivtmp.59;
  int i;
  int D.1279;
  int D.1278;
  int D.1277;

<bb 2>:
  if (len_9(D) > 0)
    goto <bb 3>;   
  else
    goto <bb 6>;

<bb 3>:

<bb 4>:
  # ivtmp.59_1 = PHI <ivtmp.59_24(5), 0B(3)>
  D.1355_20 = (long unsigned int) ivtmp.59_1;
  D.1356_22 = b_7(D) + D.1355_20;
  D.1277_10 = MEM[base: D.1356_22];
  D.1357_23 = (long unsigned int) ivtmp.59_1;
  D.1358_21 = c_8(D) + D.1357_23;
  D.1278_11 = MEM[base: D.1358_21];
  D.1279_12 = D.1278_11 + D.1277_10;
  D.1359_30 = (long unsigned int) ivtmp.59_1;
  D.1360_31 = a_6(D) + D.1359_30;
  MEM[base: D.1360_31] = D.1279_12;
  ivtmp.59_24 = ivtmp.59_1 + 4;
  D.1361_32 = (unsigned int) len_9(D);
  D.1362_33 = D.1361_32 + 4294967295; 
  D.1363_34 = (long unsigned int) D.1362_33;
  D.1364_35 = D.1363_34 + 1;
  D.1365_36 = D.1364_35 * 4;
  D.1366_37 = (int *) D.1365_36;
  if (ivtmp.59_24 != D.1366_37)
    goto <bb 5>;
  else
    goto <bb 6>;

<bb 5>:
  goto <bb 4>;

<bb 6>:
  return;

}

Here is the RTL representation (with extraneous information removed) going into
the auto-inc-dec pass (coming out of the regclass pass).  auto-inc-dec does not
recognize that this code can be transformed to use post-increment addressing. 
The only thing auto-inc-dec does to this code is add some death notes.


(set (reg:DI 341 [ a ]) (reg:DI in0))

(set (reg:DI 342 [ b ]) (reg:DI in1))

(set (reg:DI 343 [ c ]) (reg:DI in2))

(set (reg:SI 344 [ len ]) (reg:SI in3))

(set (reg:BI 345)
     (le:BI (reg:SI 344 [ len ])
            (const_int 0)))

(set (pc)
     (if_then_else (ne (reg:BI 345)
                       (const_int 0))
                   (label_ref 35)
                   (pc)))

(set (reg:DI 340 [ ivtmp.59 ]) (const_int 0))

(set (reg:SI 357)
     (plus:SI (reg:SI 344 [ len ])
              (const_int -1)))

(set (reg:DI 358)
     (zero_extend:DI (reg:SI 357)))

(set (reg:DI 359)
     (plus:DI (reg:DI 358)
              (const_int 1)))

(set (reg:DI 360)
     (ashift:DI (reg:DI 359)
                (const_int 2)))

(code_label 23)

(set (reg:DI 346)
     (plus:DI (reg:DI 341 [ a ])
              (reg:DI 340 [ ivtmp.59 ])))

(set (reg:DI 347)
     (plus:DI (reg:DI 343 [ c ])
              (reg:DI 340 [ ivtmp.59 ])))

(set (reg:DI 348)
     (plus:DI (reg:DI 342 [ b ])
              (reg:DI 340 [ ivtmp.59 ])))

(set (reg:SI 349)
     (mem:SI (reg:DI 347)))

(set (reg:SI 350)
     (mem:SI (reg:DI 348)))

(set (reg:SI 351)
     (plus:SI (reg:SI 349)
              (reg:SI 350)))

(set (mem:SI (reg:DI 346))
     (reg:SI 351))

(set (reg:DI 340 [ ivtmp.59 ])
     (plus:DI (reg/f:DI 340 [ ivtmp.59 ])
              (const_int 4)))

(set (reg:BI 356)
     (ne:BI (reg:DI 340 [ ivtmp.59 ])
            (reg:DI 360)))

(set (pc)
     (if_then_else (ne (reg:BI 356)
                       (const_int 0))
                   (label_ref 23)
                   (pc)))

(code_label 35)


Interestingly, bfin, which also has a post-increment addressing mode and not a
base-plus-index addressing mode, generates post-increment addressing for this
same function.  The IR looks like IR1 both before and after the ivopts pass. 
Why does ivopts decide to transform the code when compiling for ia64 and not
when compiling for bfin?

Here is another interesting data point.  Consider the following function:

void suma(int a[], int b[], int c[], int len){
  int i;
  for(i = 0; i < len; i++){
    a[i] = b[i] + c[i];
  }
}

Both GCC 3.4.6 and mainline generate the base-plus-index-like code for both
ia64 and bfin.  However, in GCC 3.4.6, there is an option named
"-freduce-all-givs".  Using that option causes GCC 3.4.6 to generate
post-increment addressing for the suma function for both ia64 and bfin.  The
transformation occurs in the RTL loop pass.  The incoming code has three base
pointers and one offset.  The output code has several copies of each pointer
that are each incremented, and one of each of those copies is used in the mem
expressions.  The other copies of the incremented pointers and the original
index register are dead and are later removed.

From the 3.4.6 code in loop.c:strength_reduce, it appears the -freduce-all-givs
option overrides a cost decision.  Could that be the difference between ia64
and bfin in mainline?  Is ivopts deciding that IR2 is cheaper on ia64 while IR1
is cheaper on bfin?  If so, how can the ia64 cost model be changed to cause
ivopts to generate IR1?

Another interesting question: is it possible for ivopts to transform the IR
for the suma function from IR2 into IR1?  Is this also a problem with costs, or
does ivopts simply lack the ability to make that transformation?

Another possibility is that ivopts is doing what people believe it should, and
the problem should be addressed in auto-inc-dec, or elsewhere.  Is that the
case?  Would it be possible to teach auto-inc-dec to recognize that this
base-plus-offset-like RTL can be transformed into auto-increments?  Is there
anywhere else this problem could be fixed?


-- 
           Summary: Failure to generate post-increment addressing
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: TabonyEE at austin dot rr dot com
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: ia64-unknown-elf


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37678

[Bug middle-end/37678] New: Failure to generate post-increment addressing

Reply via email to