On Thu, Jul 20, 2006 at 04:37:41PM +0200, Rask Ingemann Lambertsen wrote:
> ;; This is primarily a hack for the Nintendo DS external RAM.
> (define_insn "_arm_movqi_insn_swp"
> [(set (match_operand:QI 0 "reg_or_Qmem_operand" "=r,r,r,Q,Q")
> (match_operand:QI 1 "general_operand" "rI,K,m,r,r"))
> (clobber (match_scratch:QI 2 "=X,X,X,1,&r"))]
> "TARGET_ARM && TARGET_SWP_BYTE_WRITES
> && ( register_operand (operands[0], QImode)
> || register_operand (operands[1], QImode))"
> "@
> mov%?\\t%0, %1
> mvn%?\\t%0, #%B1
> ldr%?b\\t%0, %1
> swp%?b\\t%1, %1, [%|%m0]
> swp%?b\\t%2, %1, [%|%m0]"
> [(set_attr "type" "*,*,load1,store1,store1")
> (set_attr "predicable" "yes")]
> )
I found that this peephole optimization improves the code a whole lot:
;; The register allocator often leaves a redundant copy in front of a
;; byte store done with swpb.  Try to change
;;	mov	r2, r1
;;	swpb	r2, r2, [r0]
;; into
;;	swpb	r2, r1, [r0]
;; (and pretend it is just another way of allocating a scratch register).
;;
;; Operands:
;;   0 - destination memory of the byte store
;;   1 - original source register
;;   2 - copy of operand 1; the store insn clobbers it, so it is dead
;;       after the matched pair and can be reused as the swpb result
;;   3 - scratch clobber carried by the move insn
;;
;; The address of operand 0 must not mention operand 2: the copy into
;; operand 2 is deleted by this rewrite, so an address based on it would
;; be computed from a stale value.  It must not mention operand 1 either,
;; because substituting operand 1 as the source would then emit a swpb
;; whose source and base registers are the same, which the ARM
;; architecture defines as UNPREDICTABLE.
(define_peephole2
  [(parallel
    [(set (match_operand:QI 2 "register_operand")
	  (match_operand:QI 1 "register_operand"))
     (clobber (match_scratch:QI 3))])
   (parallel
    [(set (match_operand:QI 0 "memory_operand") (match_dup 2))
     (clobber (match_dup 2))])]
  "TARGET_ARM && TARGET_SWP_BYTE_WRITES
   && !reg_overlap_mentioned_p (operands[2], operands[0])
   && !reg_overlap_mentioned_p (operands[1], operands[0])"
  [(parallel
    [(set (match_dup 0) (match_dup 1))
     (clobber (match_dup 2))])]
)
Another way of improving the code was to swap the order of the two last
alternatives of _arm_movqi_insn_swp. There are a few differences in the
generated code, shown with "1,&r" to the left and "&r,1" to the right:
.L92: .L92:
ldr r2, [fp, #-144] | ldr r1, [fp, #-144]
ldr r3, [fp, #-152] ldr r3, [fp, #-152]
cmp r2, #0 | cmp r1, #0
add r2, r3, #2 add r2, r3, #2
ldreq r0, [fp, #-144] | moveq r0, r1
Above, the left-hand ("1,&r") version reloads from memory [fp, #-144] for no apparent reason.
.L141: .L141:
ldr r0, [fp, #-152] | ldr r2, [fp, #-152]
sub r3, r0, #2 | sub r3, r2, #2
cmp r5, r3 cmp r5, r3
beq .L142 beq .L142
cmp r5, #0 cmp r5, #0
movne r2, r0 | beq .L144
bne .L146 | b .L146
b .L144 <
Some sort of register allocation mismatch.
beq .L160 | beq .L155
cmp r0, #44 cmp r0, #44
cmpne r0, #59 cmpne r0, #59
beq .L160 | beq .L155
cmp r0, #61 cmp r0, #61
cmpne r0, #43 cmpne r0, #43
bne .L158 bne .L158
> .L155:
> mov ip, #95
> str r8, [fp, #-120]
> mov r0, #1
> swpb r2, ip, [r6]
> b .L159
.L160: .L160:
mov r3, #95 mov r3, #95
str r8, [fp, #-120] str r8, [fp, #-120]
mov r0, #1 mov r0, #1
swpb r1, r3, [r6] swpb r1, r3, [r6]
b .L159 b .L159
Code duplication, presumably because of the different register allocation.
ldr lr, [fp, #-104] ldr lr, [fp, #-104]
ldrb r2, [r1, ip] ldrb r2, [r1, ip]
add r3, r1, lr add r3, r1, lr
swpb r2, r2, [r3] | swpb lr, r2, [r3]
ldr r2, [fp, #-132] ldr r2, [fp, #-132]
add r1, r1, #1 add r1, r1, #1
> ldr lr, [fp, #-104]
add r2, r2, #1 add r2, r2, #1
cmp r1, r0 cmp r1, r0
str r2, [fp, #-132] str r2, [fp, #-132]
add r3, lr, r1 add r3, lr, r1
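Here, in the right-hand ("&r,1") version, the register allocator is just plain
stupid in not using the best alternative: it picks a separate scratch register
(lr), which then has to be reloaded. I suspect this is because only reload
allocates scratch registers and doesn't realize that the input register dies
in this insn.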
ldr r2, [fp, #-184] | ldr r5, [fp, #-184]
strh r3, [r4, #22] strh r3, [r4, #22]
strh r3, [r4, #14] strh r3, [r4, #14]
mov r0, r2, asr #16 <
ldrh r2, [fp, #-48] ldrh r2, [fp, #-48]
mov r1, #0 mov r1, #0
> mov r0, r5, asr #16
add r3, r4, #13 add r3, r4, #13
strh r2, [r4, #24] strh r2, [r4, #24]
strh r2, [r4, #18] strh r2, [r4, #18]
strh r2, [r4, #16] strh r2, [r4, #16]
swpb ip, r1, [r3] | swpb r6, r1, [r3]
strh r0, [r4, #20] strh r0, [r4, #20]
str r1, [r4, #28] str r1, [r4, #28]
ldr lr, [fp, #-184] | strh r5, [r4, #26]
strh lr, [r4, #26] <
Again, the left-hand version needlessly reloads from memory [fp, #-184]. One more example omitted.
--
Rask Ingemann Lambertsen