Richard Henderson schrieb:
On 11/21/2011 11:31 AM, Georg-Johann Lay wrote:

;; The caveat is that if there are insns for some mode, there must also be a
;; respective move insn that describes reloads.  Therefore, this
;; implementation uses an accumulator-based model with two hard-coded,
;; accumulator-like registers
;;
;;    A[] = reg:DI 18
;;    B[] = reg:DI 10
;;
;; so that no DImode insn contains pseudos or needs reloading.

Well, rtl loop optimization will not work, but given that SSE

You mean "won't optimize" or "gives wrong code"?
What's SSE? I definitely need a GCC glossary.

optimizations ought to have been performed, that's probably
acceptable.

It's definitely a hack, but perhaps you'll be able to get away with
it.

Yes, I'm aware it's a hack. But the extremely bloated code -- see below -- is one of the reasons for the bad reputation of avr-gcc, even though only very few people are using 64-bit arithmetic.

I do wonder if you might even get smaller code if you force DImode
quantities into the stack (just hack use_register_for_decl locally
while testing; a new target hook if that pans out), and pass pointers
to the variables instead.  At the moment you're having to use 8*3
insns inline to put the quantities in place and take them back out
again.  With pointers this would seem to drop to 2*3.

I already thought about using pointers; but keep in mind that AVR has only 3 pointer registers. Moreover, I remember some post in gcc@ or gcc-help@ where someone asked how to write an addition or similar that works /only/ on memory, and the answer was "it's not possible", IIRC.

Anyways, if you compare the new code with /some/ move insns against the old code for, say,

/* Minimal test case: returns the long long sum of a and b.  */
long long add64 (long long a, long long b)
{
    return a + b;
}

that compiles with -Os to

;; add64 -- byte-wise multi-byte addition as emitted by the compiler.
;;
;; Operand bytes are read from r18..r25 and from r10..r17, and the sum
;; ends up in r18..r25 with the least significant byte in r18.
;; NOTE(review): presumably a arrives in r18..r25 and b in r10..r17 per
;; the avr-gcc argument-passing convention -- confirm against the ABI.
;;
;; No add-with-carry instruction is used.  For every byte the carry is
;; rebuilt by hand: after "add x,y" the sequence ldi/cp/brlo/ldi leaves
;; 1 in a scratch register if the 8-bit sum wrapped (x < y, unsigned)
;; and 0 otherwise.  r18 and r30 alternate as the carry scratch
;; registers; the two partial carries of each byte are merged with "or".
add64:
        ; Save r10..r17; they are restored in reverse order in the epilogue.
        push r10         ;  222 pushqi1/1       [length = 1]
        push r11         ;  223 pushqi1/1       [length = 1]
        push r12         ;  224 pushqi1/1       [length = 1]
        push r13         ;  225 pushqi1/1       [length = 1]
        push r14         ;  226 pushqi1/1       [length = 1]
        push r15         ;  227 pushqi1/1       [length = 1]
        push r16         ;  228 pushqi1/1       [length = 1]
        push r17         ;  229 pushqi1/1       [length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
        ; Byte 0: sum is kept in r10 for now; r30 = carry out of byte 0.
        add r10,r18      ;  24  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  25  *movqi/2        [length = 1]
        cp r10,r18       ;  26  *cmpqi/2        [length = 1]
        brlo .L2         ;  27  branch  [length = 1]
        ldi r30,lo8(0)   ;  28  *movqi/1        [length = 1]
.L2:
        ; Byte 1: r11 = partial sum, r18 = its carry (r18 is scratch from here on).
        add r11,r19      ;  30  addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  31  *movqi/2        [length = 1]
        cp r11,r19       ;  32  *cmpqi/2        [length = 1]
        brlo .L3         ;  33  branch  [length = 1]
        ldi r18,lo8(0)   ;  34  *movqi/1        [length = 1]
.L3:
        ; r19 = carry-in (r30) + partial sum; r30 = carry of that second add.
        mov r19,r30      ;  216 *movqi/1        [length = 1]
        add r19,r11      ;  36  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  37  *movqi/2        [length = 1]
        cp r19,r11       ;  38  *cmpqi/2        [length = 1]
        brlo .L4         ;  39  branch  [length = 1]
        ldi r30,lo8(0)   ;  40  *movqi/1        [length = 1]
.L4:
        ; r18 = carry out of byte 1 (either of the two adds wrapped).
        or r18,r30       ;  42  iorqi3/1        [length = 1]
        ; Byte 2: same pattern with the scratch roles swapped; result in r20,
        ; carry out in r30.
        add r12,r20      ;  44  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  45  *movqi/2        [length = 1]
        cp r12,r20       ;  46  *cmpqi/2        [length = 1]
        brlo .L5         ;  47  branch  [length = 1]
        ldi r30,lo8(0)   ;  48  *movqi/1        [length = 1]
.L5:
        mov r20,r18      ;  217 *movqi/1        [length = 1]
        add r20,r12      ;  50  addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  51  *movqi/2        [length = 1]
        cp r20,r12       ;  52  *cmpqi/2        [length = 1]
        brlo .L6         ;  53  branch  [length = 1]
        ldi r18,lo8(0)   ;  54  *movqi/1        [length = 1]
.L6:
        or r30,r18       ;  56  iorqi3/1        [length = 1]
        ; Byte 3: result in r21, carry out in r18.
        add r13,r21      ;  58  addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  59  *movqi/2        [length = 1]
        cp r13,r21       ;  60  *cmpqi/2        [length = 1]
        brlo .L7         ;  61  branch  [length = 1]
        ldi r18,lo8(0)   ;  62  *movqi/1        [length = 1]
.L7:
        mov r21,r30      ;  218 *movqi/1        [length = 1]
        add r21,r13      ;  64  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  65  *movqi/2        [length = 1]
        cp r21,r13       ;  66  *cmpqi/2        [length = 1]
        brlo .L8         ;  67  branch  [length = 1]
        ldi r30,lo8(0)   ;  68  *movqi/1        [length = 1]
.L8:
        or r18,r30       ;  70  iorqi3/1        [length = 1]
        ; Byte 4: result in r22, carry out in r30.
        add r14,r22      ;  72  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  73  *movqi/2        [length = 1]
        cp r14,r22       ;  74  *cmpqi/2        [length = 1]
        brlo .L9         ;  75  branch  [length = 1]
        ldi r30,lo8(0)   ;  76  *movqi/1        [length = 1]
.L9:
        mov r22,r18      ;  219 *movqi/1        [length = 1]
        add r22,r14      ;  78  addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  79  *movqi/2        [length = 1]
        cp r22,r14       ;  80  *cmpqi/2        [length = 1]
        brlo .L10        ;  81  branch  [length = 1]
        ldi r18,lo8(0)   ;  82  *movqi/1        [length = 1]
.L10:
        or r30,r18       ;  84  iorqi3/1        [length = 1]
        ; Byte 5: result in r23, carry out in r18.
        add r15,r23      ;  86  addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  87  *movqi/2        [length = 1]
        cp r15,r23       ;  88  *cmpqi/2        [length = 1]
        brlo .L11        ;  89  branch  [length = 1]
        ldi r18,lo8(0)   ;  90  *movqi/1        [length = 1]
.L11:
        mov r23,r30      ;  220 *movqi/1        [length = 1]
        add r23,r15      ;  92  addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  93  *movqi/2        [length = 1]
        cp r23,r15       ;  94  *cmpqi/2        [length = 1]
        brlo .L12        ;  95  branch  [length = 1]
        ldi r30,lo8(0)   ;  96  *movqi/1        [length = 1]
.L12:
        or r18,r30       ;  98  iorqi3/1        [length = 1]
        ; Byte 6: result in r24, carry out in r30.
        add r16,r24      ;  100 addqi3/1        [length = 1]
        ldi r30,lo8(1)   ;  101 *movqi/2        [length = 1]
        cp r16,r24       ;  102 *cmpqi/2        [length = 1]
        brlo .L13        ;  103 branch  [length = 1]
        ldi r30,lo8(0)   ;  104 *movqi/1        [length = 1]
.L13:
        mov r24,r18      ;  221 *movqi/1        [length = 1]
        add r24,r16      ;  106 addqi3/1        [length = 1]
        ldi r18,lo8(1)   ;  107 *movqi/2        [length = 1]
        cp r24,r16       ;  108 *cmpqi/2        [length = 1]
        brlo .L14        ;  109 branch  [length = 1]
        ldi r18,lo8(0)   ;  110 *movqi/1        [length = 1]
.L14:
        or r30,r18       ;  112 iorqi3/1        [length = 1]
        ; Byte 7 (top byte): add both operand bytes plus the carry-in from r30;
        ; no carry out is computed.  In between, the byte-0 sum that was parked
        ; in r10 is moved into r18 now that r18 is no longer needed as scratch.
        add r25,r17      ;  114 addqi3/1        [length = 1]
        mov r18,r10      ;  138 *movqi/1        [length = 1]
        add r25,r30      ;  145 addqi3/1        [length = 1]
/* epilogue start */
        ; Restore r10..r17 in the reverse of the push order.
        pop r17  ;  232 popqi   [length = 1]
        pop r16  ;  233 popqi   [length = 1]
        pop r15  ;  234 popqi   [length = 1]
        pop r14  ;  235 popqi   [length = 1]
        pop r13  ;  236 popqi   [length = 1]
        pop r12  ;  237 popqi   [length = 1]
        pop r11  ;  238 popqi   [length = 1]
        pop r10  ;  239 popqi   [length = 1]
        ret      ;  240 return_from_epilogue    [length = 1]

I'd say that the new code is way better -- even with 24 move instructions. And if there are more DI operations in a row, some moves might vanish because only registers 18 and 10 are used.

And I would even say that this approach is no worse than supplying movdi and letting IRA/reload do the work -- at least that's my impression from the code that I often see from IRA, like PR50775 for example.

Johann

Reply via email to