Richard Henderson wrote:
> On 11/21/2011 11:31 AM, Georg-Johann Lay wrote:
>> ;; The caveat is that if there are insns for some mode, there must also be a
>> ;; respective move insn that describes reloads.  Therefore, this
>> ;; implementation uses an accumulator-based model with two hard-coded,
>> ;; accumulator-like registers
>> ;;
>> ;;    A[] = reg:DI 18
>> ;;    B[] = reg:DI 10
>> ;;
>> ;; so that no DImode insn contains pseudos or needs reloading.
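
Just to make that convention concrete: its effect can be approximated in
GNU C with local register variables.  This is a rough sketch only, not
the actual expander code; it assumes avr-gcc and that __adddi3 follows
the A-in-r18 / B-in-r10 convention described above.

long long
add64_model (long long a, long long b)
{
  /* Pin the operands to the two fixed accumulator register groups.  */
  register long long A __asm__ ("r18") = a;  /* A[] = reg:DI 18 */
  register long long B __asm__ ("r10") = b;  /* B[] = reg:DI 10 */

  /* The only DImode operation: both operands are hard registers, so no
     DImode pseudo has to survive into reload.  (Use rcall on devices
     without CALL; clobbers of the real helper omitted for brevity.)  */
  __asm__ ("call __adddi3" : "+r" (A) : "r" (B));

  return A;  /* 8 QImode moves back out of the accumulator.  */
}
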
> Well, rtl loop optimization will not work, but given that SSE
> optimizations ought to have been performed, that's probably
> acceptable.

You mean "won't optimize" or "gives wrong code"?

What's SSE?  I definitely need a GCC glossary.
> It's definitely a hack, but perhaps you'll be able to get away with
> it.
Yes, I'm aware it's a hack.  But the extremely bloated code -- see
below -- is one of the reasons for avr-gcc's bad reputation, even
though only very few people use 64-bit arithmetic.
> I do wonder if you might even get smaller code if you force DImode
> quantities into the stack (just hack use_register_for_decl locally
> while testing; a new target hook if that pans out), and pass pointers
> to the variables instead.  At the moment you're having to use 8*3
> insns inline to put the quantities in place and take them back out
> again.  With pointers this would seem to drop to 2*3.
I already thought about using pointers; but remember that AVR only has
3 pointer registers.  Moreover, I remember some post on gcc@ or
gcc-help@ where someone asked how to write an addition or similar that
works /only/ on memory, and the answer was "it's not possible", IIRC.
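
For what it's worth, the pointer variant would look roughly like this
at the C level -- adddi3_mem is a hypothetical helper here, not an
existing libgcc routine:

static void
adddi3_mem (long long *dest, const long long *a, const long long *b)
{
  /* Stand-in for a byte-wise add loop inside the helper itself.  */
  *dest = *a + *b;
}

long long
add64_ptr (long long a, long long b)
{
  long long r;
  /* Only three two-byte pointers (2*3 moves) pass through registers
     instead of 8*3 single-byte moves -- but they all compete for the
     three pointer registers X, Y and Z.  */
  adddi3_mem (&r, &a, &b);
  return r;
}
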
Anyway, if you compare the new code -- with /some/ move insns -- against
the old code for, say,

long long add64 (long long a, long long b)
{
  return a + b;
}

which compiles with -Os to:
add64:
push r10 ; 222 pushqi1/1 [length = 1]
push r11 ; 223 pushqi1/1 [length = 1]
push r12 ; 224 pushqi1/1 [length = 1]
push r13 ; 225 pushqi1/1 [length = 1]
push r14 ; 226 pushqi1/1 [length = 1]
push r15 ; 227 pushqi1/1 [length = 1]
push r16 ; 228 pushqi1/1 [length = 1]
push r17 ; 229 pushqi1/1 [length = 1]
/* prologue: function */
/* frame size = 0 */
/* stack size = 8 */
.L__stack_usage = 8
add r10,r18 ; 24 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 25 *movqi/2 [length = 1]
cp r10,r18 ; 26 *cmpqi/2 [length = 1]
brlo .L2 ; 27 branch [length = 1]
ldi r30,lo8(0) ; 28 *movqi/1 [length = 1]
.L2:
add r11,r19 ; 30 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 31 *movqi/2 [length = 1]
cp r11,r19 ; 32 *cmpqi/2 [length = 1]
brlo .L3 ; 33 branch [length = 1]
ldi r18,lo8(0) ; 34 *movqi/1 [length = 1]
.L3:
mov r19,r30 ; 216 *movqi/1 [length = 1]
add r19,r11 ; 36 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 37 *movqi/2 [length = 1]
cp r19,r11 ; 38 *cmpqi/2 [length = 1]
brlo .L4 ; 39 branch [length = 1]
ldi r30,lo8(0) ; 40 *movqi/1 [length = 1]
.L4:
or r18,r30 ; 42 iorqi3/1 [length = 1]
add r12,r20 ; 44 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 45 *movqi/2 [length = 1]
cp r12,r20 ; 46 *cmpqi/2 [length = 1]
brlo .L5 ; 47 branch [length = 1]
ldi r30,lo8(0) ; 48 *movqi/1 [length = 1]
.L5:
mov r20,r18 ; 217 *movqi/1 [length = 1]
add r20,r12 ; 50 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 51 *movqi/2 [length = 1]
cp r20,r12 ; 52 *cmpqi/2 [length = 1]
brlo .L6 ; 53 branch [length = 1]
ldi r18,lo8(0) ; 54 *movqi/1 [length = 1]
.L6:
or r30,r18 ; 56 iorqi3/1 [length = 1]
add r13,r21 ; 58 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 59 *movqi/2 [length = 1]
cp r13,r21 ; 60 *cmpqi/2 [length = 1]
brlo .L7 ; 61 branch [length = 1]
ldi r18,lo8(0) ; 62 *movqi/1 [length = 1]
.L7:
mov r21,r30 ; 218 *movqi/1 [length = 1]
add r21,r13 ; 64 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 65 *movqi/2 [length = 1]
cp r21,r13 ; 66 *cmpqi/2 [length = 1]
brlo .L8 ; 67 branch [length = 1]
ldi r30,lo8(0) ; 68 *movqi/1 [length = 1]
.L8:
or r18,r30 ; 70 iorqi3/1 [length = 1]
add r14,r22 ; 72 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 73 *movqi/2 [length = 1]
cp r14,r22 ; 74 *cmpqi/2 [length = 1]
brlo .L9 ; 75 branch [length = 1]
ldi r30,lo8(0) ; 76 *movqi/1 [length = 1]
.L9:
mov r22,r18 ; 219 *movqi/1 [length = 1]
add r22,r14 ; 78 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 79 *movqi/2 [length = 1]
cp r22,r14 ; 80 *cmpqi/2 [length = 1]
brlo .L10 ; 81 branch [length = 1]
ldi r18,lo8(0) ; 82 *movqi/1 [length = 1]
.L10:
or r30,r18 ; 84 iorqi3/1 [length = 1]
add r15,r23 ; 86 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 87 *movqi/2 [length = 1]
cp r15,r23 ; 88 *cmpqi/2 [length = 1]
brlo .L11 ; 89 branch [length = 1]
ldi r18,lo8(0) ; 90 *movqi/1 [length = 1]
.L11:
mov r23,r30 ; 220 *movqi/1 [length = 1]
add r23,r15 ; 92 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 93 *movqi/2 [length = 1]
cp r23,r15 ; 94 *cmpqi/2 [length = 1]
brlo .L12 ; 95 branch [length = 1]
ldi r30,lo8(0) ; 96 *movqi/1 [length = 1]
.L12:
or r18,r30 ; 98 iorqi3/1 [length = 1]
add r16,r24 ; 100 addqi3/1 [length = 1]
ldi r30,lo8(1) ; 101 *movqi/2 [length = 1]
cp r16,r24 ; 102 *cmpqi/2 [length = 1]
brlo .L13 ; 103 branch [length = 1]
ldi r30,lo8(0) ; 104 *movqi/1 [length = 1]
.L13:
mov r24,r18 ; 221 *movqi/1 [length = 1]
add r24,r16 ; 106 addqi3/1 [length = 1]
ldi r18,lo8(1) ; 107 *movqi/2 [length = 1]
cp r24,r16 ; 108 *cmpqi/2 [length = 1]
brlo .L14 ; 109 branch [length = 1]
ldi r18,lo8(0) ; 110 *movqi/1 [length = 1]
.L14:
or r30,r18 ; 112 iorqi3/1 [length = 1]
add r25,r17 ; 114 addqi3/1 [length = 1]
mov r18,r10 ; 138 *movqi/1 [length = 1]
add r25,r30 ; 145 addqi3/1 [length = 1]
/* epilogue start */
pop r17 ; 232 popqi [length = 1]
pop r16 ; 233 popqi [length = 1]
pop r15 ; 234 popqi [length = 1]
pop r14 ; 235 popqi [length = 1]
pop r13 ; 236 popqi [length = 1]
pop r12 ; 237 popqi [length = 1]
pop r11 ; 238 popqi [length = 1]
pop r10 ; 239 popqi [length = 1]
ret ; 240 return_from_epilogue [length = 1]
I'd say that the new code is way better -- even with 24 move
instructions.  And if there are several DImode operations in a row,
some of the moves might vanish because only registers 18 and 10 are
used.
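
For example, with two additions in a row,

long long
add64_3 (long long a, long long b, long long c)
{
  return a + b + c;
}

the result of the first addition is already sitting in accumulator A
and can feed the second addition directly, so the move-out/move-in pair
in between can be dropped.
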
And I'd even say that this approach is no worse than supplying movdi
and letting IRA/reload do the work -- at least that's my impression
from the code that I often see from IRA, as in PR50775 for example.
Johann