http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54910



             Bug #: 54910

           Summary: ARM: Missed optimization of very simple ctz function

    Classification: Unclassified

           Product: gcc

           Version: 4.7.2

            Status: UNCONFIRMED

          Severity: minor

          Priority: P3

         Component: rtl-optimization

        AssignedTo: unassig...@gcc.gnu.org

        ReportedBy: li...@horizon.com

              Host: i386

            Target: arm-linux-gnueabi





Given the following function:



/*
 * Number of trailing zero bits in x.
 *
 * Works by moving the lowest non-zero part of x up into the top nibble:
 * each step tests whether anything would remain after shifting x left by
 * 16, 8 and then 4 bits, and only performs the shift if so.  `bit` tracks
 * which bit position of the original x currently sits at bit 28, so the
 * answer is `bit` plus the trailing zero count of the top nibble.
 *
 * Returns 32 for x == 0 (no shift is taken, giving 28 + ctz_table[0]).
 * Assumes unsigned is 32 bits wide: x >> 28 must yield a 4-bit table index.
 */

unsigned __attribute__((const))

ctz(unsigned x)

{

    /* ctz_table[n] is the trailing zero count of the 4-bit value n; n == 0 maps to 4. */
    static unsigned char const ctz_table[16] = {

        4, 0, 1, 0,  2, 0, 1, 0,

        3, 0, 1, 0,  2, 0, 1, 0

    };

    int bit = 28;    /* original bit position currently held at bit 28 of x */



    /* Each test is non-zero only if a set bit survives the shift. */
    if (x << 16)  x <<= 16, bit -= 16;

    if (x <<  8)  x <<=  8, bit -=  8;

    if (x <<  4)  x <<=  4, bit -=  4;

    /* The top nibble now selects the table entry. */
    return bit + ctz_table[x >> 28];

}

And the command line:



arm-linux-gnueabi-gcc-4.7 -W -Wall -O2 -mcpu=arm7tdmi -mthumb-interwork -marm

-S baz.c



I get the following ARM code (-O2, -mthumb-interwork):



    @ ctz, ARM-state output for the C function above.
    @ r0 holds x on entry and the result on return.
    @ Register use in the body: r3 = working copy of x, r2 = bit,
    @ r1 = scratch for the shifted value, later the table address.
    .align    2

    .global    ctz

    .type    ctz, %function

ctz:

    @ Function supports interworking.

    @ args = 0, pretend = 0, frame = 0

    @ frame_needed = 0, uses_anonymous_args = 0

    @ link register save eliminated.

    movs    r3, r0, asl #16    @ r3 = x << 16; Z is set when the result is zero

    moveq    r3, r0    @ zero: continue with x unshifted

    movne    r2, #12    @ non-zero: bit = 28 - 16

    moveq    r2, #28    @ zero: bit = 28

    movs    r1, r3, asl #8    @ r1 = r3 << 8; Z is set when the result is zero

    movne    r3, r1    @ non-zero: keep the shifted value

    subne    r2, r2, #8    @ non-zero: bit -= 8

    movs    r1, r3, asl #4    @ r1 = r3 << 4; these flags also guard the subne after the loads

    movne    r3, r1    @ non-zero: keep the shifted value

    ldr    r1, .L18    @ r1 = table address, read from the literal pool word at .L18

    @ r0 = byte at table[r3 >> 28]
    ldrb    r0, [r1, r3, lsr #28]    @ zero_extendqisi2

    subne    r2, r2, #4    @ non-zero (asl #4 test above): bit -= 4

    add    r0, r0, r2    @ result = table byte + bit

    bx    lr

    @ .L19 is not referenced anywhere in this listing.
.L19:

    .align    2

    @ Literal pool: one 4-byte word holding the address of .LANCHOR0.
.L18:

    .word    .LANCHOR0

    .size    ctz, .-ctz

    @ Read-only data for ctz: the 16-byte lookup table.
    .section    .rodata

    .align    2

    @ Anchor symbol set to the current location, i.e. the start of the table
    @ below; the word at .L18 in the function's literal pool refers to it.
.LANCHOR0 = . + 0

    .type    ctz_table.4122, %object

    .size    ctz_table.4122, 16

ctz_table.4122:

    @ Same 16 values as ctz_table in the C source.
    .byte    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0

    .ident    "GCC: (Debian 4.7.2-1) 4.7.2"





What strikes me as strange about this code is that it uses a 4-byte pointer

at .L18 to access a 16-byte table at .LANCHOR0.  Why the heck not just put

the table at .L18 directly and replace the ldr with an adr?  That would save

both space and time.





The Thumb code is similar, but also fails to eliminate the link register save,

despite the fact that this is an extremely simple leaf function:



    @ ctz, Thumb output for the same C function.
    @ r0 holds x on entry and the result on return.
    @ Register use in the body: r3 = working copy of x, r2 = bit,
    @ r1 = scratch for the shifted value, later the table address.
    .align    2

    .global    ctz

    .code    16

    .thumb_func

    .type    ctz, %function

ctz:

    push    {lr}    @ lr is saved although the body below contains no call

    lsl    r3, r0, #16    @ r3 = x << 16

    mov    r2, #12    @ bit = 28 - 16, assuming the shift is kept

    cmp    r3, #0

    bne    .L8    @ non-zero: keep the shifted value and bit = 12

    mov    r3, r0    @ zero: continue with x unshifted

    mov    r2, #28    @ zero: bit = 28

.L8:

    lsl    r1, r3, #8    @ r1 = r3 << 8

    beq    .L9    @ no cmp is emitted; the branch uses the flags left by the lsl above

    sub    r2, r2, #8    @ non-zero: bit -= 8

    mov    r3, r1    @ non-zero: keep the shifted value

.L9:

    lsl    r1, r3, #4    @ r1 = r3 << 4

    beq    .L10    @ as above, the branch uses the flags left by the lsl

    sub    r2, r2, #4    @ non-zero: bit -= 4

    mov    r3, r1    @ non-zero: keep the shifted value

.L10:

    @ NOTE(review): .L18 is not shown for this listing; presumably it is a
    @ literal pool word holding the table address, as in the ARM listing.
    ldr    r1, .L18

    lsr    r3, r3, #28    @ r3 = table index (top nibble)

    ldrb    r0, [r1, r3]    @ r0 = table byte

    @ sp needed for prologue

    add    r0, r0, r2    @ result = table byte + bit

    pop    {r1}    @ return address pushed from lr at entry

    bx    r1    @ return through the popped address

Reply via email to