Applied this patch that improves IEEE double multiplication.
The old code spent time calling local helpers and preparing
their arguments.
Functions that use mul, like expl or sinl, are around 5% to 9% faster
now. The code size did not increase.
Johann
--
LibF7: Tweak IEEE double multiplication.
libgcc/config/avr/libf7/
* libf7-asm.sx (mul_mant) [AVR_HAVE_MUL]: Tweak code.
diff --git a/libgcc/config/avr/libf7/libf7-asm.sx b/libgcc/config/avr/libf7/libf7-asm.sx
index 4505764c126..01d1fa3e876 100644
--- a/libgcc/config/avr/libf7/libf7-asm.sx
+++ b/libgcc/config/avr/libf7/libf7-asm.sx
@@ -877,10 +877,14 @@ DEFUN ashldi3
;; R18.0 = 1: No rounding.
DEFUN mul_mant
+ ;; 10 = Y, R17...R10
do_prologue_saves 10
+ ;; T = R18.0: Skip rounding?
bst r18, 0
+ ;; Save result address for later.
push r25
push r24
+ ;; Load A's mantissa.
movw ZL, r22
LDD A0, Z+0+Off
LDD A1, Z+1+Off
@@ -913,26 +917,15 @@ DEFUN mul_mant
adc C6, ZERO
;; Done B6
- ;; 3 * 3 -> 0:a
- ;; 4 * 4 -> 2:1
- ;; 5 * 5 -> 4:3
- ldd BB, Z+3+Off $ mul A3, BB $ movw TT0, r0
- ldd BB, Z+4+Off $ mul A4, BB $ movw TT2, r0
- ldd BB, Z+5+Off $ mul A5, BB
-
- ADD CA, TT0 $ adc C0, TT1
- adc C1, TT2 $ adc C2, TT3
- adc C3, r0 $ adc C4, r1
- brcc .+2
- adiw C5, 1
-
;; 6 * 5 -> 5:4
;; 4 * 5 -> 3:2
;; 2 * 5 -> 1:0
;; 0 * 5 -> a:-
+ ldd BB, Z+5+Off
mul A0, BB
- ;; A0 done
+ ;; Done A0
#define Atmp A0
+#define Null A0
mov Atmp, r1
mul A6, BB $ movw TT2, r0
@@ -942,82 +935,127 @@ DEFUN mul_mant
ADD CA, Atmp
adc C0, r0 $ adc C1, r1
adc C2, TT0 $ adc C3, TT1
- adc C4, TT2 $ adc C5, TT3 $ clr ZERO
- adc C6, ZERO
+ adc C4, TT2 $ adc C5, TT3 $ clr Null
+ adc C6, Null
;; 1 * 5 -> 0:a
;; 3 * 5 -> 2:1
- ;; 6 * 4 -> 4:3
+ ;; 5 * 5 -> 4:3
mul A1, BB $ movw TT0, r0
mul A3, BB $ movw TT2, r0
+ mul A5, BB
+
+ ADD CA, TT0 $ adc C0, TT1
+ adc C1, TT2 $ adc C2, TT3
+ adc C3, r0 $ adc C4, r1
+ adc C5, Null $ adc C6, Null
+ ;; Done B5
+
+ ;; 2 * 4 -> 0:a
+ ;; 4 * 4 -> 2:1
+ ;; 6 * 4 -> 4:3
ldd BB, Z+4+Off
+ mul A2, BB $ movw TT0, r0
+ mul A4, BB $ movw TT2, r0
mul A6, BB
ADD CA, TT0 $ adc C0, TT1
adc C1, TT2 $ adc C2, TT3
- adc C3, r0 $ adc C4, r1 $ clr ZERO
- adc C5, ZERO $ adc C6, ZERO
- ;; B5 done
+ adc C3, r0 $ adc C4, r1
+ adc C5, Null $ adc C6, Null
+ ;; 1 * 4 -> a:-
+ ;; 3 * 4 -> 1:0
+ ;; 5 * 4 -> 3:2
+ mul A1, BB $ mov TT1, r1
+ mul A3, BB $ movw TT2, r0
+ mul A5, BB
+ ;; Done A1
+ ;; Done B4
+ ADD CA, TT1
+ adc C0, TT2 $ adc C1, TT3
+ adc C2, r0 $ adc C3, r1
+ ;; Accumulate carry for C3 in TT1.
+ ;; Accumulate carry for C4 in A1.
+#define Cry3 TT1
+#define Cry4 A1
+ clr Cry3
+ clr Cry4
+ rol Cry4
+
+ ;; 6 * 2 -> 2:1
;; 6 * 3 -> 3:2
- ;; 6 * 1 -> 1:0
- ;; 4 * 1 -> a:-
- mov TT0, A6 $ ldd TMP, Z+3+Off
- mov BB, A4 $ ldd Atmp, Z+1+Off
- rcall .Lmul.help.3
+ ;; 5 * 3 -> 2:1
+ ldd BB, Z+2+Off
+ mul A6, BB
+ add C1, r0
+ adc C2, r1
+ adc Cry3, Null
- ;; 5 * 4 -> 3:2
- ;; 5 * 2 -> 1:0
- ;; 3 * 2 -> a:-
- mov TT0, A5 $ ldd TMP, Z+4+Off
- mov BB, A3 $ ldd Atmp, Z+2+Off
- rcall .Lmul.help.3
+ ldd BB, Z+3+Off
+ mul A6, BB
+ add C2, r0
+ adc C3, r1
+ adc Cry4, Null
+
+ mul A5, BB
+ add C1, r0
+ adc C2, r1
+ adc Cry3, Null
- ;; 4 * -> 3:2 (=0)
+ ;; Perform the remaining 11 multiplications in 4 loop iterations:
;; 4 * 3 -> 1:0
+ ;; 3 * 3 -> 0:a
;; 2 * 3 -> a:-
- mov TT0, A4 $ clr TMP
- mov BB, A2 $ ldd Atmp, Z+3+Off
- rcall .Lmul.help.3
-
- ;; 3 * . -> 3:2 (=0)
- ;; 3 * 4 -> 1:0
- ;; 1 * 4 -> a:-
- mov TT0, A3 $ clr TMP
- mov BB, A1 $ ldd Atmp, Z+4+Off
- rcall .Lmul.help.3
-
- ;; . * ? -> 3:2 (=0)
- ;; . * 0 -> 1:0 (=0)
+ ;;
+ ;; 5 * 2 -> 1:0
+ ;; 4 * 2 -> 0:a
+ ;; 3 * 2 -> a:-
+ ;;
+ ;; 6 * 1 -> 1:0
+ ;; 5 * 1 -> 0:a
+ ;; 4 * 1 -> a:-
+ ;;
+ ;; . * 0 -> 1:0 (=0)
+ ;; 6 * 0 -> 0:a
;; 5 * 0 -> a:-
- clr TT0
- mov BB, A5 $ ldd Atmp, Z+0+Off
- rcall .Lmul.help.3
- clr TT3 ;; Asserted by .Lmul.help.2
- ;; 6 * 2 -> 2:1
- ;; 6 * 0 -> 0:a
- $ ldd TMP, Z+2+Off
- mov BB, A6 ;$ ldd Atmp, Z+0+Off
- rcall .Lmul.help.2
+ ;; BB already contains B3, hence let Z point one past B2 so that
+ ;; the LD *, -Z below will pick up B2, B1, B0.
+ adiw r30, 1 + Off+2
- ;; 5 * 3 -> 2:1
- ;; 5 * 1 -> 0:a
- $ ldd TMP, Z+3+Off
- mov BB, A5 $ ldd Atmp, Z+1+Off
- rcall .Lmul.help.2
+ ;; Accumulate carry for C2 in TT2.
+#define Cry2 TT2
+ clr Cry2
- ;; 4 * . -> 2:1 (=0)
- ;; 4 * 2 -> 0:a
- $ clr TMP
- mov BB, A4 $ ldd Atmp, Z+2+Off
- rcall .Lmul.help.2
+ ;; TT3 is the loop counter, iterate over B3...B0.
+ ldi TT3, 4
+ rjmp .Loop_start
- ;; 2 * . -> 2:1 (=0)
- ;; 2 * 4 -> 0:a
- $ clr TMP
- mov BB, A2 $ ldd Atmp, Z+4+Off
- rcall .Lmul.help.2
+.Loop:
+ ;; We use A2...A4 below; so shift bytes of A into place.
+ mov A2, A3
+ mov A3, A4
+ mov A4, A5
+ mov A5, A6
+ clr A6
+ ld BB, -Z
+.Loop_start:
+ mul A3, BB
+ ADD CA, r0 $ adc C0, r1 $ adc C1, Null $ adc Cry2, Null
+ MUL A2, BB
+ mov TT0, r1
+ MUL A4, BB
+ ADD CA, TT0 $ adc C0, r0 $ adc C1, r1 $ adc Cry2, Null
+ dec TT3
+ brne .Loop
+
+ clr ZERO
+ ADD C2, Cry2
+ adc C3, Cry3
+ adc C4, Cry4
+ adc C5, ZERO
+ adc C6, ZERO
;; Finally...
@@ -1032,38 +1070,6 @@ DEFUN mul_mant
do_epilogue_restores 10
-;; TT0 * Tmp -> 3:2
-;; TT0 * Atmp -> 1:0
-;; BB * Atmp -> a:-
-;;
-;; Clobbers : TMP, TT0...TT3.
-;; Sets : ZERO = 0.
-.Lmul.help.3:
- mul TT0, TMP $ movw TT2, r0
- mul TT0, Atmp $ movw TT0, r0
- mul BB, Atmp
-
- ADD CA, r1
- adc C0, TT0 $ adc C1, TT1
- adc C2, TT2
-.Lmul.help.3.C3: $ adc C3, TT3 $ clr ZERO
- adc C4, ZERO $ adc C5, ZERO
- adc C6, ZERO
- ret
-
-;; BB * TMP -> 2:1
-;; BB * Atmp -> 0:a
-;;
-;; Asserts : TT3 = 0
-;; Clobbers : TMP, TT0, TT1.
-;; Sets : ZERO = 0.
-.Lmul.help.2:
- mul BB, TMP $ movw TT0, r0
- mul BB, Atmp
- ADD CA, r0 $ adc C0, r1
- adc C1, TT0 $ adc C2, TT1
- rjmp .Lmul.help.3.C3
-
ENDF mul_mant
#endif /* F7MOD_mul_mant_ && MUL */