[PATCH v4 03/29] Refactor 'ctz' functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/lib1funcs.S: Move __ctzsi2() to
* config/arm/bits/ctz2.S: New file.
---
 libgcc/config/arm/bits/ctz2.S | 65 +++
 libgcc/config/arm/lib1funcs.S | 65 +--
 2 files changed, 66 insertions(+), 64 deletions(-)
 create mode 100644 libgcc/config/arm/bits/ctz2.S

diff --git a/libgcc/config/arm/bits/ctz2.S b/libgcc/config/arm/bits/ctz2.S
new file mode 100644
index 000..f0422d1fbba
--- /dev/null
+++ b/libgcc/config/arm/bits/ctz2.S
@@ -0,0 +1,65 @@
+
+#ifdef L_ctzsi2
+#ifdef NOT_ISA_TARGET_32BIT
+FUNC_START ctzsi2
+   negs    r1, r0
+   ands    r0, r0, r1
+   movs    r1, #28
+   movs    r3, #1
+   lsls    r3, r3, #16
+   cmp     r0, r3 /* 0x10000 */
+   bcc     2f
+   lsrs    r0, r0, #16
+   subs    r1, r1, #16
+2: lsrs    r3, r3, #8
+   cmp     r0, r3 /* #0x100 */
+   bcc     2f
+   lsrs    r0, r0, #8
+   subs    r1, r1, #8
+2: lsrs    r3, r3, #4
+   cmp     r0, r3 /* #0x10 */
+   bcc     2f
+   lsrs    r0, r0, #4
+   subs    r1, r1, #4
+2: adr     r2, 1f
+   ldrb    r0, [r2, r0]
+   subs    r0, r0, r1
+   bx lr
+.align 2
+1:
+.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
+   FUNC_END ctzsi2
+#else
+ARM_FUNC_START ctzsi2
+   rsb r1, r0, #0
+   and r0, r0, r1
+# if defined (__ARM_FEATURE_CLZ)
+   clz r0, r0
+   rsb r0, r0, #31
+   RET
+# else
+   mov r1, #28
+   cmp     r0, #0x10000
+   do_it   cs, t
+   movcs   r0, r0, lsr #16
+   subcs   r1, r1, #16
+   cmp r0, #0x100
+   do_it   cs, t
+   movcs   r0, r0, lsr #8
+   subcs   r1, r1, #8
+   cmp r0, #0x10
+   do_it   cs, t
+   movcs   r0, r0, lsr #4
+   subcs   r1, r1, #4
+   adr r2, 1f
+   ldrb    r0, [r2, r0]
+   sub r0, r0, r1
+   RET
+.align 2
+1:
+.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
+# endif /* !defined (__ARM_FEATURE_CLZ) */
+   FUNC_END ctzsi2
+#endif
+#endif /* L_clzsi2 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index f5aa5505e9d..acafff62448 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1723,70 +1723,7 @@ LSYM(Lover12):
 #endif /* __symbian__ */
 
 #include "bits/clz2.S"
-
-#ifdef L_ctzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START ctzsi2
-   negs    r1, r0
-   ands    r0, r0, r1
-   movs    r1, #28
-   movs    r3, #1
-   lsls    r3, r3, #16
-   cmp     r0, r3 /* 0x10000 */
-   bcc     2f
-   lsrs    r0, r0, #16
-   subs    r1, r1, #16
-2: lsrs    r3, r3, #8
-   cmp     r0, r3 /* #0x100 */
-   bcc     2f
-   lsrs    r0, r0, #8
-   subs    r1, r1, #8
-2: lsrs    r3, r3, #4
-   cmp     r0, r3 /* #0x10 */
-   bcc     2f
-   lsrs    r0, r0, #4
-   subs    r1, r1, #4
-2: adr     r2, 1f
-   ldrb    r0, [r2, r0]
-   subs    r0, r0, r1
-   bx lr
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-   FUNC_END ctzsi2
-#else
-ARM_FUNC_START ctzsi2
-   rsb r1, r0, #0
-   and r0, r0, r1
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   rsb r0, r0, #31
-   RET
-# else
-   mov r1, #28
-   cmp     r0, #0x10000
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrb    r0, [r2, r0]
-   sub r0, r0, r1
-   RET
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-   FUNC_END ctzsi2
-#endif
-#endif /* L_clzsi2 */
+#include "bits/ctz2.S"
 
 /*  */
 /* These next two sections are here despite the fact that they contain Thumb 
-- 
2.25.1



[PATCH v4 08/29] Import new 'clrsb' functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

The new functions provide an efficient tail call to __clzsi2(),
making them rather smaller and faster than the C versions.
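
As a hedged reference (mine, not part of the patch), the identity the assembly
exploits can be written in C as follows, assuming this library's guarantee
that __clzsi2(0) == 32 and an arithmetic right shift on signed int:

    extern int __clzsi2 (unsigned int);

    int clrsbsi2_model (int x)
    {
      /* XOR with the sign fills: negative and positive inputs both
         become "count leading zeros" problems.  */
      unsigned int y = (unsigned int) (x ^ (x >> 31));

      /* The sign bit itself is not redundant, hence the -1
         (the trailing 'subs' before the asm returns).  */
      return __clzsi2 (y) - 1;
    }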

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/clz2.S: Add __clrsbsi2() and __clrsbdi2().
* config/arm/t-elf: Add _clrsbsi2 and _clrsbdi2.
---
 libgcc/config/arm/bits/clz2.S | 108 +-
 libgcc/config/arm/t-elf   |   2 +
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/libgcc/config/arm/bits/clz2.S b/libgcc/config/arm/bits/clz2.S
index d0a1fbec4d0..75e89b45f88 100644
--- a/libgcc/config/arm/bits/clz2.S
+++ b/libgcc/config/arm/bits/clz2.S
@@ -1,4 +1,4 @@
-/* clz2.S: Cortex M0 optimized 'clz' functions
+/* clz2.S: ARM optimized 'clz' and related functions
 
Copyright (C) 2018-2021 Free Software Foundation, Inc.
Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
@@ -23,7 +23,7 @@
.  */
 
 
-#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+#ifdef __ARM_FEATURE_CLZ
 
 #ifdef L_clzdi2
 
@@ -232,3 +232,107 @@ FUNC_END clzdi2
 
 #endif /* !__ARM_FEATURE_CLZ */
 
+
+#ifdef L_clrsbdi2
+
+// int __clrsbdi2(long long)
+// Counts the number of "redundant sign bits" in $r1:$r0.
+// Returns the result in $r0.
+// Uses $r2 and $r3 as scratch space.
+FUNC_START_SECTION clrsbdi2 .text.sorted.libgcc.clz2.clrsbdi2
+CFI_START_FUNCTION
+
+  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+// Invert negative signs to keep counting zeros.
+asrs    r3, xxh, #31
+eors    xxl, r3
+eors    xxh, r3
+
+// Same as __clzdi2(), except that the 'C' flag is pre-calculated.
+// Also add the trailing 'subs', since the last bit is not redundant.
+do_it   eq, et
+clzeq   r0, xxl
+clzne   r0, xxh
+addeq   r0, #32
+subs    r0, #1
+RET
+
+  #else  /* !__ARM_FEATURE_CLZ */
+// Result if all the bits in the argument are zero.
+// Set it here to keep the flags clean after 'eors' below.
+movs    r2, #31
+
+// Invert negative signs to keep counting zeros.
+asrs    r3, xxh, #31
+eors    xxh, r3
+
+#if defined(__ARMEB__) && __ARMEB__
+// If the upper word is non-zero, return '__clzsi2(upper) - 1'.
+bne     SYM(__internal_clzsi2)
+
+// The upper word is zero, prepare the lower word.
+movs    r0, r1
+eors    r0, r3
+
+#else /* !__ARMEB__ */
+// Save the lower word temporarily.
+// This somewhat awkward construction adds one cycle when the
+//  branch is not taken, but prevents a double-branch.
+eors    r3, r0
+
+// If the upper word is non-zero, return '__clzsi2(upper) - 1'.
+movs    r0, r1
+bne     SYM(__internal_clzsi2)
+
+// Restore the lower word.
+movs    r0, r3
+
+#endif /* !__ARMEB__ */
+
+// The upper word is zero, return '31 + __clzsi2(lower)'.
+adds    r2, #32
+b   SYM(__internal_clzsi2)
+
+  #endif /* !__ARM_FEATURE_CLZ */
+
+CFI_END_FUNCTION
+FUNC_END clrsbdi2
+
+#endif /* L_clrsbdi2 */
+
+
+#ifdef L_clrsbsi2
+
+// int __clrsbsi2(int)
+// Counts the number of "redundant sign bits" in $r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+FUNC_START_SECTION clrsbsi2 .text.sorted.libgcc.clz2.clrsbsi2
+CFI_START_FUNCTION
+
+// Invert negative signs to keep counting zeros.
+asrs    r2, r0, #31
+eors    r0, r2
+
+  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+// Count.
+clz     r0, r0
+
+// The result for a positive value will always be >= 1.
+// By definition, the last bit is not redundant.
+subs    r0, #1
+RET
+
+  #else /* !__ARM_FEATURE_CLZ */
+// Result if all the bits in the argument are zero.
+// By definition, the last bit is not redundant.
+movs    r2, #31
+b   SYM(__internal_clzsi2)
+
+  #endif  /* !__ARM_FEATURE_CLZ */
+
+CFI_END_FUNCTION
+FUNC_END clrsbsi2
+
+#endif /* L_clrsbsi2 */
+
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 998169e24c8..88ea869eea7 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -32,6 +32,8 @@ LIB1ASMFUNCS += \
_ashldi3 \
_ashrdi3 \
_lshrdi3 \
+   _clrsbsi2 \
+   _clrsbdi2 \
_clzdi2 \
_ctzdi2 \
_dvmd_tls \
-- 
2.25.1



[PATCH v4 01/29] Add and organize macros.

2021-01-11 Thread gnu
From: Daniel Engel 

These definitions facilitate subsequent patches in this series.

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/t-elf: Organize functions into logical groups.
* config/arm/lib1funcs.S: Add FUNC_START macro variations for
weak functions and manual control of the target section;
rename THUMB_FUNC_START as THUMB_FUNC_ENTRY for consistency;
remove unused macros THUMB_SYNTAX, ARM_SYM_START, SYM_END;
remove redundant syntax directives.
---
 libgcc/config/arm/lib1funcs.S | 114 +++---
 libgcc/config/arm/t-elf   |  55 +---
 2 files changed, 110 insertions(+), 59 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index c2fcfc503ec..b4541bae791 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -69,11 +69,13 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define TYPE(x) .type SYM(x),function
 #define SIZE(x) .size SYM(x), . - SYM(x)
 #define LSYM(x) .x
+#define LLSYM(x) .L##x
 #else
 #define __PLT__
 #define TYPE(x)
 #define SIZE(x)
 #define LSYM(x) x
+#define LLSYM(x) x
 #endif
 
 /* Function end macros.  Variants for interworking.  */
@@ -247,6 +249,14 @@ LSYM(Lend_fde):
 
 #define COND(op1, op2, cond) op1 ## op2 ## cond
 
+#ifdef __ARM_FEATURE_IT
+  #define IT(ins,c) ins##c
+#else
+  // Assume default Thumb-1 flags-affecting suffix 's'.
+  // Almost all instructions require this in unified syntax.
+  #define IT(ins,c) ins##s
+#endif
+
 #ifdef __ARM_EABI__
 .macro ARM_LDIV0 name signed
cmp r0, #0
@@ -280,7 +290,6 @@ LSYM(Lend_fde):
pop {r1, pc}
 
 #elif defined(__thumb2__)
-   .syntax unified
.ifc \signed, unsigned
cbz r0, 1f
mov r0, #0xffffffff
@@ -324,10 +333,6 @@ LSYM(Lend_fde):
 .endm
 #endif
 
-.macro FUNC_END name
-   SIZE (__\name)
-.endm
-
 .macro DIV_FUNC_END name signed
cfi_start   __\name, LSYM(Lend_div0)
 LSYM(Ldiv0):
@@ -340,48 +345,64 @@ LSYM(Ldiv0):
FUNC_END \name
 .endm
 
-.macro THUMB_FUNC_START name
-   .globl  SYM (\name)
-   TYPE(\name)
-   .thumb_func
-SYM (\name):
-.endm
-
 /* Function start macros.  Variants for ARM and Thumb.  */
-
 #ifdef __thumb__
 #define THUMB_FUNC .thumb_func
 #define THUMB_CODE .force_thumb
-# if defined(__thumb2__)
-#define THUMB_SYNTAX
-# else
-#define THUMB_SYNTAX
-# endif
 #else
 #define THUMB_FUNC
 #define THUMB_CODE
-#define THUMB_SYNTAX
 #endif
 
-.macro FUNC_START name
-   .text
+.macro THUMB_FUNC_ENTRY name
+   .globl  SYM (\name)
+   TYPE(\name)
+   .force_thumb
+   .thumb_func
+SYM (\name):
+.endm
+
+/* Strong global export, no section change. */
+.macro FUNC_ENTRY name
.globl SYM (__\name)
TYPE (__\name)
-   .align 0
THUMB_CODE
THUMB_FUNC
-   THUMB_SYNTAX
 SYM (__\name):
 .endm
 
-.macro ARM_SYM_START name
-   TYPE (\name)
-   .align 0
-SYM (\name):
+/* Weak global export, no section change. */
+.macro WEAK_ENTRY name
+   .weak SYM(__\name)
+   FUNC_ENTRY \name
+.endm
+
+/* Strong global export, explicit section. */
+.macro FUNC_START_SECTION name section
+   .section \section,"x"
+   .align 0
+   FUNC_ENTRY \name
 .endm
 
-.macro SYM_END name
-   SIZE (\name)
+/* Weak global export, explicit section. */
+.macro WEAK_START_SECTION name section
+   .weak SYM(__\name)
+   FUNC_START_SECTION \name \section
+.endm
+
+/* Strong global export, default section. */
+.macro FUNC_START name
+   FUNC_START_SECTION \name .text
+.endm
+
+/* Weak global export, default section. */
+.macro WEAK_START name
+   .weak SYM(__\name)
+   FUNC_START_SECTION \name .text
+.endm
+
+.macro FUNC_END name
+   SIZE (__\name)
 .endm
 
 /* Special function that will always be coded in ARM assembly, even if
@@ -392,7 +413,6 @@ SYM (\name):
 /* For Thumb-2 we build everything in thumb mode.  */
 .macro ARM_FUNC_START name
FUNC_START \name
-   .syntax unified
 .endm
 #define EQUIV .thumb_set
 .macro  ARM_CALL name
@@ -447,6 +467,11 @@ SYM (__\name):
 #endif
 .endm
 
+.macro WEAK_ALIAS new old 
+   .weak SYM(__\new)
+   FUNC_ALIAS \new \old
+.endm
+
 #ifndef NOT_ISA_TARGET_32BIT
 .macro ARM_FUNC_ALIAS new old
.globl  SYM (__\new)
@@ -1905,10 +1930,9 @@ ARM_FUNC_START ctzsi2

.text
.align 0
-.force_thumb
 
 .macro call_via register
-   THUMB_FUNC_START _call_via_\register
+   THUMB_FUNC_ENTRY _call_via_\register
 
bx  \register
nop
@@ -1991,7 +2015,7 @@ _arm_return_r11:
 .macro interwork_with_frame frame, register, name, return
.code   16
 
-   THUMB_FUNC_START \name
+   THUMB_FUNC_ENTRY \name
 
bx  pc
nop
@@ -2008,7 +2032,7 @@ _arm_return_r11:
 .macro interwork register
.code   16
 
-   THUMB_FUNC_START _interwork_call_v

[PATCH v4 00/29] libgcc: Thumb-1 Floating-Point Library for Cortex M0

2021-01-11 Thread gnu
From: Daniel Engel 


This patch revision is based on comments received against:

  
  
As one point of comparison, a test program [1] links 916 bytes from libgcc with
the patched toolchain vs 10276 bytes with the gcc-arm-none-eabi-9-2020-q2
toolchain.  That's a 90% size reduction.

I have extensive test vectors [2], and this patch passes all tests on an
STM32F051.  These vectors were derived from UCB [3], Testfloat [4], and
IEEECC754 [5], plus many of my own generation.

There may be some follow-on projects worth discussing:

* The library is currently integrated into the ARM v6m multilib only.  It is
likely that some of the other architectures would benefit from these routines.
However, I have NOT profiled the existing implementations (ieee754-sf.S) to
estimate where improvements may be found.

* GCC currently lacks tests for some functions, such as __aeabi_[u]ldivmod().
There may be useful bits in [1] that can be integrated.

On Cortex M0, the functions have (approximately) the following properties: 

Function(s)                     Size (bytes)        Cycles            Stack   Accuracy
__clzsi2                        50                  20                0       exact
__clzsi2 (OPTIMIZE_SIZE)        22                  51                0       exact
__clzdi2                        8+__clzsi2          4+__clzsi2        0       exact

__clrsbsi2                      8+__clzsi2          6+__clzsi2        0       exact
__clrsbdi2                      18+__clzsi2         (8..10)+__clzsi2  0       exact

__ctzsi2                        52                  21                0       exact
__ctzsi2 (OPTIMIZE_SIZE)        24                  52                0       exact
__ctzdi2                        8+__ctzsi2          5+__ctzsi2        0       exact

__ffssi2                        8                   6..(5+__ctzsi2)   0       exact
__ffsdi2                        14+__ctzsi2         9..(8+__ctzsi2)   0       exact

__popcountsi2                   52                  25                0       exact
__popcountsi2 (OPTIMIZE_SIZE)   14                  9..201            0       exact
__popcountdi2                   34+__popcountsi2    46                0       exact
__popcountdi2 (OPTIMIZE_SIZE)   12+__popcountsi2    17..401           0       exact

__paritysi2                     24                  14                0       exact
__paritysi2 (OPTIMIZE_SIZE)     16                  38                0       exact
__paritydi2                     2+__paritysi2       1+__paritysi2     0       exact

__umulsidi3                     44                  24                0       exact
__mulsidi3                      30+__umulsidi3      24+__umulsidi3    8       exact
__muldi3 (__aeabi_lmul)         10+__umulsidi3      6+__umulsidi3     0       exact
__ashldi3 (__aeabi_llsl)        22                  13                0       exact
__lshrdi3 (__aeabi_llsr)        22                  13                0       exact
__ashrdi3 (__aeabi_lasr)        22                  13                0       exact

__aeabi_lcmp                    20                  13                0       exact
__aeabi_ulcmp                   16                  10                0       exact

__udivsi3 (__aeabi_uidiv)       56                  72..385           0       < 1 lsb
__divsi3 (__aeabi_idiv)         38+__udivsi3        26+__udivsi3      8       < 1 lsb
__udivdi3 (__aeabi_uldiv)       164                 103..1394         16      < 1 lsb
__udivdi3 (OPTIMIZE_SIZE)       142                 120..1392         16      < 1 lsb
__divdi3 (__aeabi_ldiv)         54+__udivdi3        36+__udivdi3      32      < 1 lsb

__shared_float                  178
__shared_float (OPTIMIZE_SIZE)  154

__addsf3 (__aeabi_fadd)         116+__shared_float  31..76            8       <= 0.5 ulp
__addsf3 (OPTIMIZE_SIZE)        112+__shared_float  74                8       <= 0.5 ulp
__subsf3 (__aeabi_fsub)         6+__addsf3          3+__addsf3        8       <= 0.5 ulp
__aeabi_frsub                   8+__addsf3          6+__addsf3        8       <= 0.5 ulp
__mulsf3 (__aeabi_fmul)         112+__shared_float  73..97            8       <= 0.5 ulp
__mulsf3 (OPTIMIZE_SIZE)        96+__shared_float   93                8       <= 0.5 ulp
__divsf3 (__aeabi_fdiv)         132+__shared_float  83..361           8       <= 0.5 ulp
__divsf3 (OPTIMIZE_SIZE)        120+__shared_float  263..359          8       <= 0.5 ulp

__cmpsf2/__lesf2/__ltsf2        72                  33                0       exact
__eqsf2/__nesf2                 4+__cmpsf2          3+__cmpsf2        0       exact
__gesf2/__gtsf2                 4+__cmpsf2          3+__cmpsf2        0       exact
__unordsf2 (__aeabi_fcmpun)

[PATCH v4 09/29] Import new 'ffs' functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

The new functions provide an efficient tail call to __ctzsi2(),
making them rather smaller and faster than the C versions.
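
For reference, the relationship being exploited (my C sketch, not from the
patch; __ctzsi2 here follows this library's convention of returning 32 for a
zero argument):

    extern int __ctzsi2 (unsigned int);

    int ffssi2_model (int x)
    {
      if (x == 0)
        return 0;                 /* the early 'RET' in the asm */

      /* Otherwise ffs(x) == ctz(x) + 1; the asm folds the +1 into
         an offset register before the tail call.  */
      return __ctzsi2 ((unsigned int) x) + 1;
    }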

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/ctz2.S: Add __ffssi2() and __ffsdi2().
* config/arm/t-elf: Add _ffssi2 and _ffsdi2.
---
 libgcc/config/arm/bits/ctz2.S | 77 ++-
 libgcc/config/arm/t-elf   |  2 +
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/libgcc/config/arm/bits/ctz2.S b/libgcc/config/arm/bits/ctz2.S
index 4241fdad283..609b61cb6f3 100644
--- a/libgcc/config/arm/bits/ctz2.S
+++ b/libgcc/config/arm/bits/ctz2.S
@@ -1,4 +1,4 @@
-/* ctz2.S: ARM optimized 'ctz' functions
+/* ctz2.S: ARM optimized 'ctz' and related functions
 
Copyright (C) 2020-2021 Free Software Foundation, Inc.
Contributed by Daniel Engel (g...@danielengel.com)
@@ -228,3 +228,78 @@ FUNC_END ctzdi2
 
 #endif /* L_ctzsi2 || L_ctzdi2 */
 
+
+#ifdef L_ffsdi2
+
+// int __ffsdi2(long long)
+// Return the index of the least significant 1-bit in $r1:r0,
+//  or zero if $r1:r0 is zero.  The least significant bit is index 1.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+// Same section as __ctzsi2() for sake of the tail call branches.
+FUNC_START_SECTION ffsdi2 .text.sorted.libgcc.ctz2.ffsdi2
+CFI_START_FUNCTION
+
+// Simplify branching by assuming a non-zero lower word.
+// For all such, ffssi2(x) == ctzsi2(x) + 1.
+movs    r2, #(33 - CTZ_RESULT_OFFSET)
+
+  #if defined(__ARMEB__) && __ARMEB__
+// HACK: Save the upper word in a scratch register.
+movs    r3, r0
+
+// Test the lower word.
+movs    r0, r1
+bne     SYM(__internal_ctzsi2)
+
+// Test the upper word.
+movs    r2, #(65 - CTZ_RESULT_OFFSET)
+movs    r0, r3
+bne     SYM(__internal_ctzsi2)
+
+  #else /* !__ARMEB__ */
+// Test the lower word.
+cmp     r0, #0
+bne     SYM(__internal_ctzsi2)
+
+// Test the upper word.
+movs    r2, #(65 - CTZ_RESULT_OFFSET)
+movs    r0, r1
+bne     SYM(__internal_ctzsi2)
+
+  #endif /* !__ARMEB__ */
+
+// Upper and lower words are both zero.
+RET
+
+CFI_END_FUNCTION
+FUNC_END ffsdi2
+
+#endif /* L_ffsdi2 */
+
+
+#ifdef L_ffssi2
+
+// int __ffssi2(int)
+// Return the index of the least significant 1-bit in $r0,
+//  or zero if $r0 is zero.  The least significant bit is index 1.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+// Same section as __ctzsi2() for sake of the tail call branches.
+FUNC_START_SECTION ffssi2 .text.sorted.libgcc.ctz2.ffssi2
+CFI_START_FUNCTION
+
+// Simplify branching by assuming a non-zero argument.
+// For all such, ffssi2(x) == ctzsi2(x) + 1.
+movs    r2, #(33 - CTZ_RESULT_OFFSET)
+
+// Test for zero, return unmodified.
+cmp r0, #0
+bne SYM(__internal_ctzsi2)
+RET
+
+CFI_END_FUNCTION
+FUNC_END ffssi2
+
+#endif /* L_ffssi2 */
+
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 88ea869eea7..32de63f4c64 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -36,6 +36,8 @@ LIB1ASMFUNCS += \
_clrsbdi2 \
_clzdi2 \
_ctzdi2 \
+   _ffssi2 \
+   _ffsdi2 \
_dvmd_tls \
_divsi3 \
_modsi3 \
-- 
2.25.1



[PATCH v4 06/29] Import replacement 'ctz' functions from CM0 library

2021-01-11 Thread gnu
From: Daniel Engel 

This version combines __ctzdi2() with __ctzsi2() into a single object with
an efficient tail call.  The former implementation of __ctzdi2() was in C.

On architectures without a clz instruction, this version merges the formerly
separate Thumb and ARM code sequences into a unified instruction sequence.
This change significantly improves the Thumb performance without affecting ARM
performance.  Finally, this version adds a new __OPTIMIZE_SIZE__ build option.

On architectures with a clz instruction, __ctzsi2() now returns 32 instead
of -1 when the argument is 0.  This costs an extra 2 instructions, but remains
branchless.  Although the output of this function is technically undefined
when the argument is 0, this makes the behavior consistent with __clzsi2().

Likewise, __ctzdi2() now returns 64 on a zero argument, instead of 31.
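
For clarity, a hedged C model (mine) of the documented behavior on
architectures with a hardware clz instruction:

    extern int __clzsi2 (unsigned int);

    int ctzsi2_model (unsigned int x)
    {
      if (x == 0)
        return 32;                    /* was -1; now mirrors __clzsi2(0) */

      /* 'x & -x' isolates the least-significant '1' bit, turning the
         trailing-zero count into a leading-zero count.  */
      return 31 - __clzsi2 (x & -x);
    }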

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/ctz2.S: Size-optimized __ctzsi2(), new function
__ctzdi2(); added logic to return '32' for x=0 when using the hardware
clz instruction.
* config/arm/t-elf: Add _ctzdi2; move _clzsi2 to the weak LIB1ASMFUNCS
group.
---
 libgcc/config/arm/bits/ctz2.S | 287 ++
 libgcc/config/arm/t-elf   |   3 +-
 2 files changed, 228 insertions(+), 62 deletions(-)

diff --git a/libgcc/config/arm/bits/ctz2.S b/libgcc/config/arm/bits/ctz2.S
index f0422d1fbba..4241fdad283 100644
--- a/libgcc/config/arm/bits/ctz2.S
+++ b/libgcc/config/arm/bits/ctz2.S
@@ -1,65 +1,230 @@
+/* ctz2.S: ARM optimized 'ctz' functions
 
-#ifdef L_ctzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START ctzsi2
-   negs    r1, r0
-   ands    r0, r0, r1
-   movs    r1, #28
-   movs    r3, #1
-   lsls    r3, r3, #16
-   cmp     r0, r3 /* 0x10000 */
-   bcc     2f
-   lsrs    r0, r0, #16
-   subs    r1, r1, #16
-2: lsrs    r3, r3, #8
-   cmp     r0, r3 /* #0x100 */
-   bcc     2f
-   lsrs    r0, r0, #8
-   subs    r1, r1, #8
-2: lsrs    r3, r3, #4
-   cmp     r0, r3 /* #0x10 */
-   bcc     2f
-   lsrs    r0, r0, #4
-   subs    r1, r1, #4
-2: adr     r2, 1f
-   ldrb    r0, [r2, r0]
-   subs    r0, r0, r1
-   bx lr
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-   FUNC_END ctzsi2
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+// When the hardware 'clz' instruction is available, an efficient version
+//  of __ctzsi2(x) can be created by calculating '31 - __clzsi2(lsb(x))',
+//  where lsb(x) is 'x' with only the least-significant '1' bit set.
+// The following offset applies to all of the functions in this file.
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+  #define CTZ_RESULT_OFFSET 1
 #else
-ARM_FUNC_START ctzsi2
-   rsb r1, r0, #0
-   and r0, r0, r1
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   rsb r0, r0, #31
-   RET
-# else
-   mov r1, #28
-   cmp     r0, #0x10000
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrb    r0, [r2, r0]
-   sub r0, r0, r1
-   RET
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-   FUNC_END ctzsi2
+  #define CTZ_RESULT_OFFSET 0
 #endif
-#endif /* L_clzsi2 */
+
+
+#ifdef L_ctzdi2
+
+// int __ctzdi2(long long)
+// Counts trailing zeros in a 64 bit double word.
+// Expects the argument  in $r1:$r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+FUNC_START_SECTION ctzdi2 .text.sorted.libgcc.ctz2.ctzdi2
+C

[PATCH v4 14/29] Add branchless conditional sequences to the 64-bit comparison functions.

2021-01-11 Thread gnu
From: Daniel Engel 

This effectively merges support for ARM architecture variants into a
common function path with appropriate build conditions.
ARM performance is 1-2 instructions faster/smaller; Thumb-2 performance
improves by about 50%.
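
The merged flow can be modeled in C roughly as below (my sketch; in the
assembly the two conditionals become predicated instructions under IT blocks
rather than branches):

    int lcmp_model (long long x, long long y)
    {
      if (x < y)        /* 'movlt r0, #-1' under 'do_it lt'          */
        return -1;
      if (x == y)       /* the 'orrs' zero test after the subtract   */
        return 0;
      return 1;         /* 'movne r0, #1'                            */
    }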

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi.S: Removed __aeabi_lcmp() and __aeabi_ulcmp().
* config/arm/eabi/lcmp.S: Add branchless paths (__ARM_FEATURE_IT).
* config/arm/lib1funcs.S: Changed #include scope of eabi/lcmp.S.
---
 libgcc/config/arm/bpabi.S | 42 ---
 libgcc/config/arm/eabi/lcmp.S | 47 ++-
 libgcc/config/arm/lib1funcs.S |  2 +-
 3 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index 2cbb67d54ad..4281a2be594 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -34,48 +34,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-#ifdef L_aeabi_lcmp
-
-ARM_FUNC_START aeabi_lcmp
-   cmp xxh, yyh
-   do_it   lt
-   movlt   r0, #-1
-   do_it   gt
-   movgt   r0, #1
-   do_it   ne
-   RETc(ne)
-   subs    r0, xxl, yyl
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   RET
-   FUNC_END aeabi_lcmp
-
-#endif /* L_aeabi_lcmp */
-   
-#ifdef L_aeabi_ulcmp
-
-ARM_FUNC_START aeabi_ulcmp
-   cmp xxh, yyh
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   do_it   ne
-   RETc(ne)
-   cmp xxl, yyl
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   do_it   eq
-   moveq   r0, #0
-   RET
-   FUNC_END aeabi_ulcmp
-
-#endif /* L_aeabi_ulcmp */
-
 .macro test_div_by_zero signed
 /* Tail-call to divide-by-zero handlers which may be overridden by the user,
so unwinding works properly.  */
diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
index 2ac9d178b34..3ec1de5a0b1 100644
--- a/libgcc/config/arm/eabi/lcmp.S
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -46,6 +46,19 @@ FUNC_START_SECTION LCMP_NAME LCMP_SECTION
+subs    xxl, yyl
+sbcs    xxh, yyh
 
+#ifdef __ARM_FEATURE_IT
+do_it   lt,t
+
+  #ifdef L_aeabi_lcmp
+movlt   r0, #-1
+  #else
+movlt   r0, #0
+  #endif
+
+// Early return on '<'.
+RETc(lt)
+
+#else /* !__ARM_FEATURE_IT */
 // With $r2 free, create a known offset value without affecting
 //  the N or Z flags.
 // BUG? The originally unified instruction for v6m was 'mov r2, r3'.
@@ -62,17 +75,27 @@ FUNC_START_SECTION LCMP_NAME LCMP_SECTION
 //  argument is larger, otherwise the offset value remains 0.
+adds    r2, #2
 
+#endif
+
 // Check for zero (equality in 64 bits).
 // It doesn't matter which register was originally "hi".
+orrs    r0, r1
 
+#ifdef __ARM_FEATURE_IT
+// The result is already 0 on equality.
+// -1 already returned, so just force +1.
+do_it   ne
+movne   r0, #1
+
+#else /* !__ARM_FEATURE_IT */
 // The result is already 0 on equality.
 beq LLSYM(__lcmp_return)
 
-LLSYM(__lcmp_lt):
+  LLSYM(__lcmp_lt):
 // Create +1 or -1 from the offset value defined earlier.
+adds    r3, #1
+subs    r0, r2, r3
+#endif
 
 LLSYM(__lcmp_return):
   #ifdef L_cmpdi2
@@ -111,21 +134,43 @@ FUNC_START_SECTION ULCMP_NAME ULCMP_SECTION
+subs    xxl, yyl
+sbcs    xxh, yyh
 
+#ifdef __ARM_FEATURE_IT
+do_it   lo,t
+
+  #ifdef L_aeabi_ulcmp
+movlo   r0, #-1
+  #else
+movlo   r0, #0
+  #endif
+
+// Early return on '<'.
+RETc(lo)
+
+#else
+// Capture the carry flag.
+// $r2 will contain -1 if the first value is smaller,
+//  0 if the first value is larger or equal.
+sbcs    r2, r2
+#endif
 
 // Check for zero (equality in 64 bits).
 // It doesn't matter which register was originally "hi".
+orrs    r0, r1
 
+#ifdef __ARM_FEATURE_IT
+// The result is already 0 on equality.
+// -1 already returned, so just force +1.
+do_it   ne
+movne   r0, #1
+
+#else /* !__ARM_FEATURE_IT */
 // The result is already 0 on equality.
 beq LLSYM(__ulcmp_return)
 
 // Assume +1.  If -1 is correct, $r2 will override.
+movs    r0, #1
+orrs    r0, r2
+#endif
 
 LLSYM(__ulcmp_return):
   #ifdef L_ucmpdi2
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index b4abb4a3365..6aed09f4930 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1925,6 +1925,6 @@ LSYM(Lchange_\register):
 #include "bpabi.S"
#else /* NOT_ISA_TARGET_32BIT */

[PATCH v4 02/29] Refactor 'clz' functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/lib1funcs.S: Move __clzsi2() and __clzdi2() to
* config/arm/bits/clz2.S: New file.
---
 libgcc/config/arm/bits/clz2.S | 124 ++
 libgcc/config/arm/lib1funcs.S | 123 +
 2 files changed, 125 insertions(+), 122 deletions(-)
 create mode 100644 libgcc/config/arm/bits/clz2.S

diff --git a/libgcc/config/arm/bits/clz2.S b/libgcc/config/arm/bits/clz2.S
new file mode 100644
index 000..1c8f10a5b29
--- /dev/null
+++ b/libgcc/config/arm/bits/clz2.S
@@ -0,0 +1,124 @@
+
+#ifdef L_clzsi2
+#ifdef NOT_ISA_TARGET_32BIT
+FUNC_START clzsi2
+   movs    r1, #28
+   movs    r3, #1
+   lsls    r3, r3, #16
+   cmp     r0, r3 /* 0x10000 */
+   bcc     2f
+   lsrs    r0, r0, #16
+   subs    r1, r1, #16
+2: lsrs    r3, r3, #8
+   cmp     r0, r3 /* #0x100 */
+   bcc     2f
+   lsrs    r0, r0, #8
+   subs    r1, r1, #8
+2: lsrs    r3, r3, #4
+   cmp     r0, r3 /* #0x10 */
+   bcc     2f
+   lsrs    r0, r0, #4
+   subs    r1, r1, #4
+2: adr     r2, 1f
+   ldrb    r0, [r2, r0]
+   adds    r0, r0, r1
+   bx lr
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+   FUNC_END clzsi2
+#else
+ARM_FUNC_START clzsi2
+# if defined (__ARM_FEATURE_CLZ)
+   clz r0, r0
+   RET
+# else
+   mov r1, #28
+   cmp     r0, #0x10000
+   do_it   cs, t
+   movcs   r0, r0, lsr #16
+   subcs   r1, r1, #16
+   cmp r0, #0x100
+   do_it   cs, t
+   movcs   r0, r0, lsr #8
+   subcs   r1, r1, #8
+   cmp r0, #0x10
+   do_it   cs, t
+   movcs   r0, r0, lsr #4
+   subcs   r1, r1, #4
+   adr r2, 1f
+   ldrb    r0, [r2, r0]
+   add r0, r0, r1
+   RET
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+# endif /* !defined (__ARM_FEATURE_CLZ) */
+   FUNC_END clzsi2
+#endif
+#endif /* L_clzsi2 */
+
+#ifdef L_clzdi2
+#if !defined (__ARM_FEATURE_CLZ)
+
+# ifdef NOT_ISA_TARGET_32BIT
+FUNC_START clzdi2
+   push    {r4, lr}
+   cmp     xxh, #0
+   bne     1f
+#  ifdef __ARMEB__
+   movs    r0, xxl
+   bl      __clzsi2
+   adds    r0, r0, #32
+   b       2f
+1:
+   bl      __clzsi2
+#  else
+   bl      __clzsi2
+   adds    r0, r0, #32
+   b       2f
+1:
+   movs    r0, xxh
+   bl  __clzsi2
+#  endif
+2:
+   pop {r4, pc}
+# else /* NOT_ISA_TARGET_32BIT */
+ARM_FUNC_START clzdi2
+   do_push {r4, lr}
+   cmp xxh, #0
+   bne 1f
+#  ifdef __ARMEB__
+   mov r0, xxl
+   bl  __clzsi2
+   add r0, r0, #32
+   b 2f
+1:
+   bl  __clzsi2
+#  else
+   bl  __clzsi2
+   add r0, r0, #32
+   b 2f
+1:
+   mov r0, xxh
+   bl  __clzsi2
+#  endif
+2:
+   RETLDM  r4
+   FUNC_END clzdi2
+# endif /* NOT_ISA_TARGET_32BIT */
+
+#else /* defined (__ARM_FEATURE_CLZ) */
+
+ARM_FUNC_START clzdi2
+   cmp xxh, #0
+   do_it   eq, et
+   clzeq   r0, xxl
+   clzne   r0, xxh
+   addeq   r0, r0, #32
+   RET
+   FUNC_END clzdi2
+
+#endif
+#endif /* L_clzdi2 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index b4541bae791..f5aa5505e9d 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1722,128 +1722,7 @@ LSYM(Lover12):
 
 #endif /* __symbian__ */
 
-#ifdef L_clzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzsi2
-   movs    r1, #28
-   movs    r3, #1
-   lsls    r3, r3, #16
-   cmp     r0, r3 /* 0x10000 */
-   bcc     2f
-   lsrs    r0, r0, #16
-   subs    r1, r1, #16
-2: lsrs    r3, r3, #8
-   cmp     r0, r3 /* #0x100 */
-   bcc     2f
-   lsrs    r0, r0, #8
-   subs    r1, r1, #8
-2: lsrs    r3, r3, #4
-   cmp     r0, r3 /* #0x10 */
-   bcc     2f
-   lsrs    r0, r0, #4
-   subs    r1, r1, #4
-2: adr     r2, 1f
-   ldrb    r0, [r2, r0]
-   adds    r0, r0, r1
-   bx lr
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-   FUNC_END clzsi2
-#else
-ARM_FUNC_START clzsi2
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   RET
-# else
-   mov r1, #28
-   cmp     r0, #0x10000
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrb    r0, [r2, r0]
-   add r0, r0, r1
-   RET
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-   FUNC_END clzsi2
-#endif
-#endif /* L_clzsi2 */
-
-#ifdef L_clzdi2
-#if !defined (__ARM_FEATURE

[PATCH v4 10/29] Import new 'parity' functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

The functional overlap between the single- and double-word variants
makes these versions about half the size of the C versions
if both functions are linked together in an application.
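
The shared reduction is the standard XOR fold; a hedged C model (mine, not
from the patch):

    int paritysi2_model (unsigned int x)
    {
      /* Fold halves together; after five steps bit 0 holds the parity.
         The assembly shifts left and collects the result in the MSB,
         the mirror image of this same fold.  */
      x ^= x >> 16;
      x ^= x >> 8;
      x ^= x >> 4;
      x ^= x >> 2;
      x ^= x >> 1;
      return x & 1;
    }

    int paritydi2_model (unsigned long long x)
    {
      /* The double-word version XORs the halves and falls through.  */
      return paritysi2_model ((unsigned int) x ^ (unsigned int) (x >> 32));
    }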

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/parity.S: New file for __paritysi2/di2().
* config/arm/lib1funcs.S: #include bits/parity.S.
* config/arm/t-elf: Add _parity* objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/bits/parity.S | 120 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   2 +
 3 files changed, 123 insertions(+)
 create mode 100644 libgcc/config/arm/bits/parity.S

diff --git a/libgcc/config/arm/bits/parity.S b/libgcc/config/arm/bits/parity.S
new file mode 100644
index 000..f2c657ba927
--- /dev/null
+++ b/libgcc/config/arm/bits/parity.S
@@ -0,0 +1,120 @@
+/* parity.S: ARM optimized parity functions
+
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_paritydi2
+   
+// int __paritydi2(long long)
+// Returns '0' if the number of bits set in $r1:r0 is even, and '1' otherwise.
+// Returns the result in $r0.
+FUNC_START_SECTION paritydi2 .text.sorted.libgcc.paritydi2
+CFI_START_FUNCTION
+
+// Combine the upper and lower words, then fall through. 
+// Byte-endianness does not matter for this function.  
+eors    r0, r1
+
+#endif /* L_paritydi2 */ 
+
+
+// The implementation of __paritydi2() tightly couples with __paritysi2(),
+//  such that instructions must appear consecutively in the same memory
+//  section for proper flow control.  However, this construction inhibits
+//  the ability to discard __paritydi2() when only using __paritysi2().
+// Therefore, this block configures __paritysi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __paritydi2().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_paritysi2' should appear before '_paritydi2' in LIB1ASMFUNCS.
+#if defined(L_paritysi2) || defined(L_paritydi2) 
+
+#ifdef L_paritysi2
+// int __paritysi2(int)
+// Returns '0' if the number of bits set in $r0 is even, and '1' otherwise.
+// Returns the result in $r0.
+// Uses $r2 as scratch space.
+WEAK_START_SECTION paritysi2 .text.sorted.libgcc.paritysi2
+CFI_START_FUNCTION
+
+#else /* L_paritydi2 */
+FUNC_ENTRY paritysi2
+
+#endif
+
+  #if defined(__thumb__) && __thumb__
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+
+// Size optimized: 16 bytes, 40 cycles
+// Speed optimized: 24 bytes, 14 cycles
+movs    r2, #16
+
+LLSYM(__parity_loop):
+// Calculate the parity of successively smaller half-words into the MSB.
+movs    r1, r0
+lsls    r1, r2
+eors    r0, r1
+lsrs    r2, #1
+bne     LLSYM(__parity_loop)
+   
+#else /* !__OPTIMIZE_SIZE__ */
+
+// Unroll the loop.  The 'libgcc' reference C implementation replaces 
+//  the x2 and the x1 shifts with a constant.  However, since it takes 
+//  4 cycles to load, index, and mask the constant result, it doesn't 
+//  cost anything to keep shifting (and saves a few bytes).  
+lsls    r1, r0, #16
+eors    r0, r1
+lsls    r1, r0, #8
+eors    r0, r1
+lsls    r1, r0, #4
+eors    r0, r1
+lsls    r1, r0, #2
+eors    r0, r1
+lsls    r1, r0, #1
+eors    r0, r1
+
+#endif /* !__OPTIMIZE_SIZE__ */
+  #else /* !__thumb__ */
+   
+eors    r0, r0, r0, lsl #16
+eors   

[PATCH v4 11/29] Import new 'popcnt' functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

The functional overlap between the single- and double-word versions
makes this implementation about 30% smaller than C when
both functions are linked together in an application.
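
The speed-optimized path is a conventional SWAR reduction; a hedged C model
(mine; the assembly merges the final fields with adds and shifts instead of
the multiply used here):

    int popcountsi2_model (unsigned int x)
    {
      x -= (x >> 1) & 0x55555555;                     /* 2-bit sums */
      x = (x & 0x33333333) + ((x >> 2) & 0x33333333); /* 4-bit sums */
      x = (x + (x >> 4)) & 0x0f0f0f0f;                /* 8-bit sums */
      return (int) ((x * 0x01010101) >> 24);          /* total      */
    }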

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/popcnt.S: New file for __popcountsi2/di2().
* config/arm/lib1funcs.S: #include bits/popcnt.S.
* config/arm/t-elf: Add _popcount* objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/bits/popcnt.S | 189 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   2 +
 3 files changed, 192 insertions(+)
 create mode 100644 libgcc/config/arm/bits/popcnt.S

diff --git a/libgcc/config/arm/bits/popcnt.S b/libgcc/config/arm/bits/popcnt.S
new file mode 100644
index 000..13642267d64
--- /dev/null
+++ b/libgcc/config/arm/bits/popcnt.S
@@ -0,0 +1,189 @@
+/* popcnt.S: ARM optimized popcount functions
+
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_popcountdi2
+   
+// int __popcountdi2(long long)
+// Returns the number of bits set in $r1:$r0.
+// Returns the result in $r0.
+FUNC_START_SECTION popcountdi2 .text.sorted.libgcc.popcountdi2
+CFI_START_FUNCTION
+
+  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+// Initialize the result.
+// Compensate for the two extra loop iterations (one for each word)
+//  required to detect zero arguments.
+movs    r2, #2
+
+LLSYM(__popcountd_loop):
+// Same as __popcounts_loop below, except for $r1.
+subs    r2, #1
+subs    r3, r1, #1
+ands    r1, r3
+bcs LLSYM(__popcountd_loop)
+
+// Repeat the operation for the second word.  
+b   LLSYM(__popcounts_loop)
+
+  #else /* !__OPTIMIZE_SIZE__ */
+// Load the one-bit alternating mask.
+ldr     r3, =0x55555555
+
+// Reduce the second word.
+lsrs    r2, r1, #1
+ands    r2, r3
+subs    r1, r2
+
+// Reduce the first word.
+lsrs    r2, r0, #1
+ands    r2, r3
+subs    r0, r2
+
+// Load the two-bit alternating mask.
+ldr     r3, =0x33333333
+
+// Reduce the second word.
+lsrs    r2, r1, #2
+ands    r2, r3
+ands    r1, r3
+adds    r1, r2
+
+// Reduce the first word.
+lsrs    r2, r0, #2
+ands    r2, r3
+ands    r0, r3
+adds    r0, r2
+
+// There will be a maximum of 8 bits in each 4-bit field.   
+// Jump into the single word flow to combine and complete.
+b   LLSYM(__popcounts_merge)
+
+  #endif /* !__OPTIMIZE_SIZE__ */
+#endif /* L_popcountdi2 */ 
+
+
+// The implementation of __popcountdi2() tightly couples with __popcountsi2(),
+//  such that instructions must appear consecutively in the same memory
+//  section for proper flow control.  However, this construction inhibits
+//  the ability to discard __popcountdi2() when only using __popcountsi2().
+// Therefore, this block configures __popcountsi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __popcountdi2().  The standalone version 
must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_popcountsi2' should appear before '_popcountdi2' in LIB1ASMFUNCS.
+#if defined(L_popcountsi2) || defined(L_popcountdi2) 
+
+#ifdef L_popcountsi2
+// int __popcountsi2(int)
+// Returns the number of bits set in $r0.
+// Returns the result in $r0.
+// Uses $r2 as scratch space.
+WEAK_START_SECTION popcountsi2 .text.sorted.libgcc.popcountsi2
+CFI_START_FUNCTION
+
+#else /* L_pop

[PATCH v4 16/29] Refactor Thumb-1 64-bit division functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi-v6m.S: Moved __aeabi_ldivmod() and __aeabi_uldivmod() to
* config/arm/eabi/ldiv.S: New file.
* config/arm/lib1funcs.S: #include eabi/ldiv.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S | 81 --
 libgcc/config/arm/eabi/ldiv.S | 82 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 83 insertions(+), 81 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ldiv.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index a051c1530a4..b3dc3bf8f4d 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -34,87 +34,6 @@
 #endif /* __ARM_EABI__ */
 
 
-.macro test_div_by_zero signed
-   cmp yyh, #0
-   bne 7f
-   cmp yyl, #0
-   bne 7f
-   cmp xxh, #0
-   .ifc    \signed, unsigned
-   bne 2f
-   cmp xxl, #0
-2:
-   beq 3f
-   movs    xxh, #0
-   mvns    xxh, xxh    @ 0xffffffff
-   movs    xxl, xxh
-3:
-   .else
-   blt 6f
-   bgt 4f
-   cmp xxl, #0
-   beq 5f
-4: movs    xxl, #0
-   mvns    xxl, xxl    @ 0xffffffff
-   lsrs    xxh, xxl, #1    @ 0x7fffffff
-   b       5f
-6: movs    xxh, #0x80
-   lsls    xxh, xxh, #24   @ 0x80000000
-   movs    xxl, #0
-5:
-   .endif
-   @ tailcalls are tricky on v6-m.
-   push    {r0, r1, r2}
-   ldr     r0, 1f
-   adr     r1, 1f
-   adds    r0, r1
-   str r0, [sp, #8]
-   @ We know we are not on armv4t, so pop pc is safe.
-   pop {r0, r1, pc}
-   .align  2
-1:
-   .word   __aeabi_ldiv0 - 1b
-7:
-.endm
-
-#ifdef L_aeabi_ldivmod
-
-FUNC_START aeabi_ldivmod
-   test_div_by_zero signed
-
-   push    {r0, r1}
-   mov     r0, sp
-   push    {r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__gnu_ldivmod_helper)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_ldivmod
-
-#endif /* L_aeabi_ldivmod */
-
-#ifdef L_aeabi_uldivmod
-
-FUNC_START aeabi_uldivmod
-   test_div_by_zero unsigned
-
-   push    {r0, r1}
-   mov     r0, sp
-   push    {r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__udivmoddi4)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_uldivmod
-   
-#endif /* L_aeabi_uldivmod */
-
 #ifdef L_arm_addsubsf3
 
 FUNC_START aeabi_frsub
diff --git a/libgcc/config/arm/eabi/ldiv.S b/libgcc/config/arm/eabi/ldiv.S
new file mode 100644
index 000..514a3b8c3a3
--- /dev/null
+++ b/libgcc/config/arm/eabi/ldiv.S
@@ -0,0 +1,82 @@
+
+.macro test_div_by_zero signed
+   cmp yyh, #0
+   bne 7f
+   cmp yyl, #0
+   bne 7f
+   cmp xxh, #0
+   .ifc    \signed, unsigned
+   bne 2f
+   cmp xxl, #0
+2:
+   beq 3f
+   movs    xxh, #0
+   mvns    xxh, xxh    @ 0xffffffff
+   movs    xxl, xxh
+3:
+   .else
+   blt 6f
+   bgt 4f
+   cmp xxl, #0
+   beq 5f
+4: movs    xxl, #0
+   mvns    xxl, xxl    @ 0xffffffff
+   lsrs    xxh, xxl, #1    @ 0x7fffffff
+   b       5f
+6: movs    xxh, #0x80
+   lsls    xxh, xxh, #24   @ 0x80000000
+   movs    xxl, #0
+5:
+   .endif
+   @ tailcalls are tricky on v6-m.
+   push    {r0, r1, r2}
+   ldr     r0, 1f
+   adr     r1, 1f
+   adds    r0, r1
+   str r0, [sp, #8]
+   @ We know we are not on armv4t, so pop pc is safe.
+   pop {r0, r1, pc}
+   .align  2
+1:
+   .word   __aeabi_ldiv0 - 1b
+7:
+.endm
+
+#ifdef L_aeabi_ldivmod
+
+FUNC_START aeabi_ldivmod
+   test_div_by_zero signed
+
+   push    {r0, r1}
+   mov     r0, sp
+   push    {r0, lr}
+   ldr r0, [sp, #8]
+   bl  SYM(__gnu_ldivmod_helper)
+   ldr r3, [sp, #4]
+   mov lr, r3
+   add sp, sp, #8
+   pop {r2, r3}
+   RET
+   FUNC_END aeabi_ldivmod
+
+#endif /* L_aeabi_ldivmod */
+
+#ifdef L_aeabi_uldivmod
+
+FUNC_START aeabi_uldivmod
+   test_div_by_zero unsigned
+
+   push    {r0, r1}
+   mov     r0, sp
+   push    {r0, lr}
+   ldr r0, [sp, #8]
+   bl  SYM(__udivmoddi4)
+   ldr r3, [sp, #4]
+   mov lr, r3
+   add sp, sp, #8
+   pop {r2, r3}
+   RET
+   FUNC_END aeabi_uldivmod
+   
+#endif /* L_aeabi_uldivmod */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 6657f3b5749..ce9fa941c05 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1509,6 +1509,7 @@ LSYM(Lover12):
 
 #define PEDANTIC_DIV

[PATCH v4 12/29] Refactor Thumb-1 long int comparison functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi-v6m.S: Moved __aeabi_lcmp/ulcmp() to
* config/arm/eabi/lcmp.S: New file.
* config/arm/lib1funcs.S: #include eabi/lcmp.S.
---
 libgcc/config/arm/bpabi-v6m.S | 46 --
 libgcc/config/arm/eabi/lcmp.S | 47 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 48 insertions(+), 46 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/lcmp.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index 069fcbbf48c..a051c1530a4 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -33,52 +33,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-#ifdef L_aeabi_lcmp
-
-FUNC_START aeabi_lcmp
-   cmp     xxh, yyh
-   beq     1f
-   bgt     2f
-   movs    r0, #1
-   negs    r0, r0
-   RET
-2:
-   movs    r0, #1
-   RET
-1:
-   subs    r0, xxl, yyl
-   beq     1f
-   bhi     2f
-   movs    r0, #1
-   negs    r0, r0
-   RET
-2:
-   movs    r0, #1
-1:
-   RET
-   FUNC_END aeabi_lcmp
-
-#endif /* L_aeabi_lcmp */
-   
-#ifdef L_aeabi_ulcmp
-
-FUNC_START aeabi_ulcmp
-   cmp     xxh, yyh
-   bne     1f
-   subs    r0, xxl, yyl
-   beq     2f
-1:
-   bcs     1f
-   movs    r0, #1
-   negs    r0, r0
-   RET
-1:
-   movs    r0, #1
-2:
-   RET
-   FUNC_END aeabi_ulcmp
-
-#endif /* L_aeabi_ulcmp */
 
 .macro test_div_by_zero signed
cmp yyh, #0
diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
new file mode 100644
index 000..466686fcb41
--- /dev/null
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -0,0 +1,47 @@
+#ifdef L_aeabi_lcmp
+
+FUNC_START aeabi_lcmp
+cmp     xxh, yyh
+beq     1f
+bgt     2f
+movs    r0, #1
+negs    r0, r0
+RET
+2:
+movs    r0, #1
+RET
+1:
+subs    r0, xxl, yyl
+beq     1f
+bhi     2f
+movs    r0, #1
+negs    r0, r0
+RET
+2:
+movs    r0, #1
+1:
+RET
+FUNC_END aeabi_lcmp
+
+#endif /* L_aeabi_lcmp */
+
+#ifdef L_aeabi_ulcmp
+
+FUNC_START aeabi_ulcmp
+cmp     xxh, yyh
+bne     1f
+subs    r0, xxl, yyl
+beq     2f
+1:
+bcs     1f
+movs    r0, #1
+negs    r0, r0
+RET
+1:
+movs    r0, #1
+2:
+RET
+FUNC_END aeabi_ulcmp
+
+#endif /* L_aeabi_ulcmp */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 2323fefa731..b4abb4a3365 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1925,5 +1925,6 @@ LSYM(Lchange_\register):
 #include "bpabi.S"
 #else /* NOT_ISA_TARGET_32BIT */
 #include "bpabi-v6m.S"
+#include "eabi/lcmp.S"
 #endif /* NOT_ISA_TARGET_32BIT */
 #endif /* !__symbian__ */
-- 
2.25.1



[PATCH v4 13/29] Import replacement 64-bit comparison functions from CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

These are 2-5 instructions smaller and just as fast.  Branches are
minimized, which will make them easier to adapt to Thumb-2/ARM mode.
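
The two return conventions served by the shared body differ only by an
offset, roughly as below (my sketch, not from the patch):

    /* __aeabi_lcmp returns { -1, 0, +1 }; __cmpdi2 returns { 0, 1, 2 },
       i.e. the same ordering shifted by +1 (the final 'adds r0, #1').  */
    extern int __aeabi_lcmp (long long, long long);

    int cmpdi2_model (long long x, long long y)
    {
      return __aeabi_lcmp (x, y) + 1;
    }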

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/lcmp.S: Replace with faster versions;
add macro configuration to build __cmpdi2() and __ucmpdi2().
* config/arm/t-elf: Add _cmpdi2 and _ucmpdi2 to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/lcmp.S | 169 ++
 libgcc/config/arm/t-elf   |   2 +
 2 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
index 466686fcb41..2ac9d178b34 100644
--- a/libgcc/config/arm/eabi/lcmp.S
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -1,47 +1,142 @@
+/* lcmp.S: Thumb-1 optimized 64-bit integer comparison
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if defined(L_aeabi_lcmp) || defined(L_cmpdi2)
+
 #ifdef L_aeabi_lcmp
+  #define LCMP_NAME aeabi_lcmp
+  #define LCMP_SECTION .text.sorted.libgcc.lcmp
+#else
+  #define LCMP_NAME cmpdi2
+  #define LCMP_SECTION .text.sorted.libgcc.cmpdi2
+#endif
+
+// int __aeabi_lcmp(long long, long long)
+// int __cmpdi2(long long, long long)
+// Compares the 64 bit signed values in $r1:$r0 and $r3:$r2.
+// lcmp() returns $r0 = { -1, 0, +1 } for orderings { <, ==, > } respectively.
+// cmpdi2() returns $r0 = { 0, 1, 2 } for orderings { <, ==, > } respectively.
+// Object file duplication assumes typical programs follow one runtime ABI.
+FUNC_START_SECTION LCMP_NAME LCMP_SECTION
+CFI_START_FUNCTION
+
+// Calculate the difference $r1:$r0 - $r3:$r2.
+subs    xxl, yyl
+sbcs    xxh, yyh
+
+// With $r2 free, create a known offset value without affecting
+//  the N or Z flags.
+// BUG? The originally unified instruction for v6m was 'mov r2, r3'.
+//  However, this resulted in a compile error with -mthumb:
+//"MOV Rd, Rs with two low registers not permitted".
+// Since unified syntax deprecates the "cpy" instruction, shouldn't
+//  there be a backwards-compatible translation available?
+cpy r2, r3
+
+// Evaluate the comparison result.
+blt LLSYM(__lcmp_lt)
+
+// The reference offset ($r2 - $r3) will be +2 iff the first
+//  argument is larger, otherwise the offset value remains 0.
+adds    r2, #2
+
+// Check for zero (equality in 64 bits).
+// It doesn't matter which register was originally "hi".
+orrs    r0, r1
+
+// The result is already 0 on equality.
+beq LLSYM(__lcmp_return)
+
+LLSYM(__lcmp_lt):
+// Create +1 or -1 from the offset value defined earlier.
+adds    r3, #1
+subs    r0, r2, r3
+
+LLSYM(__lcmp_return):
+  #ifdef L_cmpdi2
+// Offset to the correct output specification.
+adds    r0, #1
+  #endif
 
-FUNC_START aeabi_lcmp
-cmp     xxh, yyh
-beq     1f
-bgt     2f
-movs    r0, #1
-negs    r0, r0
-RET
-2:
-movs    r0, #1
-RET
-1:
-subs    r0, xxl, yyl
-beq     1f
-bhi     2f
-movs    r0, #1
-negs    r0, r0
-RET
-2:
-movs    r0, #1
-1:
 RET
-FUNC_END aeabi_lcmp
 
-#endif /* L_aeabi_lcmp */
+CFI_END_FUNCTION
+FUNC_END LCMP_NAME
+
+#endif /* L_aeabi_lcmp || L_cmpdi2 */
+
+
+#if defined(L_aeabi_ulcmp) || defined(L_ucmpdi2)
 
 #ifdef L_aeabi_ulcmp
+  #define ULCMP_NAME aeabi_ulcmp
+  #define ULCMP_SECTION .text.sorted.libgcc.ulcmp
+#else
+  #define ULCMP_NAME ucmpdi2
+  #define ULCMP_SECTION .text.sorted.libgcc.ucmpdi2
+#endif
+
+// int __aeabi_ulcmp(unsigned long long, unsigned long long)
+// int __ucmpdi2(unsigned long long, unsigned long long)
+// Compares the 64

[PATCH v4 15/29] Import new integer division functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/idiv.S: New file for __udivsi3() and __divsi3().
* config/arm/lib1funcs.S: #include eabi/idiv.S (v6m only).
---
 libgcc/config/arm/eabi/idiv.S | 299 ++
 libgcc/config/arm/lib1funcs.S |  19 ++-
 2 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/arm/eabi/idiv.S

diff --git a/libgcc/config/arm/eabi/idiv.S b/libgcc/config/arm/eabi/idiv.S
new file mode 100644
index 000..7381e8f57a3
--- /dev/null
+++ b/libgcc/config/arm/eabi/idiv.S
@@ -0,0 +1,299 @@
+/* idiv.S: Thumb-1 size-optimized 32-bit integer division
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifndef __GNUC__
+
+// int __aeabi_idiv0(int)
+// Helper function for division by 0.
+WEAK_START_SECTION aeabi_idiv0 .text.sorted.libgcc.idiv.idiv0
+FUNC_ALIAS cm0_idiv0 aeabi_idiv0
+CFI_START_FUNCTION
+
+  #if defined(TRAP_EXCEPTIONS) && TRAP_EXCEPTIONS
+svc #(SVC_DIVISION_BY_ZERO)
+  #endif
+
+RET
+
+CFI_END_FUNCTION
+FUNC_END cm0_idiv0
+FUNC_END aeabi_idiv0
+
+#endif /* !__GNUC__ */
+
+
+#ifdef L_divsi3
+
+// int __aeabi_idiv(int, int)
+// idiv_return __aeabi_idivmod(int, int)
+// Returns signed $r0 after division by $r1.
+// Also returns the signed remainder in $r1.
+// Same parent section as __divsi3() to keep branches within range.
+FUNC_START_SECTION divsi3 .text.sorted.libgcc.idiv.divsi3
+
+#ifndef __symbian__
+  FUNC_ALIAS aeabi_idiv divsi3
+  FUNC_ALIAS aeabi_idivmod divsi3
+#endif
+
+CFI_START_FUNCTION
+
+// Extend signs.
+asrs    r2, r0, #31
+asrs    r3, r1, #31
+
+// Absolute value of the denominator, abort on division by zero.
+eors    r1, r3
+subs    r1, r3
+  #if defined(PEDANTIC_DIV0) && PEDANTIC_DIV0
+beq LLSYM(__idivmod_zero)
+  #else
+beq SYM(__uidivmod_zero)
+  #endif
+
+// Absolute value of the numerator.
+eors    r0, r2
+subs    r0, r2
+
+// Keep the sign of the numerator in bit[31] (for the remainder).
+// Save the XOR of the signs in bits[15:0] (for the quotient).
+push    { rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+lsrs    rT, r3, #16
+eors    rT, r2
+
+// Handle division as unsigned.
+bl  SYM(__uidivmod_nonzero) __PLT__
+
+// Set the sign of the remainder.
+asrs    r2, rT, #31
+eors    r1, r2
+subs    r1, r2
+
+// Set the sign of the quotient.
+sxth    r3, rT
+eors    r0, r3
+subs    r0, r3
+
+LLSYM(__idivmod_return):
+pop { rT, pc }
+.cfi_restore_state
+
+  #if defined(PEDANTIC_DIV0) && PEDANTIC_DIV0
+LLSYM(__idivmod_zero):
+// Set up the *div0() parameter specified in the ARM runtime ABI:
+//  * 0 if the numerator is 0,
+//  * Or, the largest value of the type manipulated by the calling
+// division function if the numerator is positive,
+//  * Or, the least value of the type manipulated by the calling
+// division function if the numerator is negative.
+subs    r1, r0
+orrs    r0, r1
+asrs    r0, #31
+lsrs    r0, #1
+eors    r0, r2
+
+// At least the __aeabi_idiv0() call is common.
+b   SYM(__uidivmod_zero2)
+  #endif /* PEDANTIC_DIV0 */
+
+CFI_END_FUNCTION
+FUNC_END divsi3
+
+#ifndef __symbian__
+  FUNC_END aeabi_idiv
+  FUNC_END aeabi_idivmod
+#endif 
+
+#endif /* L_divsi3 */
+
+
+#ifdef L_udivsi3
+
+// int __aeabi_uidiv(unsigned int, unsigned int)
+// idiv_return __aeabi_uidiv

[PATCH v4 29/29] Remove single-precision functions from the v6m soft-float library.

2021-01-11 Thread gnu
From: Daniel Engel 

With the complete CM0 library integrated, regression testing showed new
failures with the message "compilation failed to produce executable":

gcc.dg/fixed-point/convert-float-1.c
gcc.dg/fixed-point/convert-float-3.c
gcc.dg/fixed-point/convert-sat.c

Investigating, this appears to be caused by the linker.  I can't find a
comprehensive linker specification to claim this is actually a bug, but it
certainly doesn't match my expectations.  Digging deeper, I found issues
with the link order of these symbols:

  * __aeabi_fmul()
  * __aeabi_f2d()
  * __aeabi_f2iz()

Specifically, I expect the linker to import the _first_ definition of any
symbol.  This is the basic behavior that allows the soft-float library to
supply missing symbols on architectures without optimized routines.

Comparing the v6-m multilib with the default, I see symbol exports for all
of the affected symbols:

gcc-obj/gcc/libgcc.a:

// assembly routines

_arm_mulsf3.o:
00000000 W __aeabi_fmul
00000000 W __mulsf3

_arm_addsubdf3.o:
00000368 T __aeabi_f2d
00000368 T __extendsfdf2

_arm_fixsfsi.o:
00000000 T __aeabi_f2iz
00000000 T __fixsfsi

mulsf3.o:


fixsfsi.o:


extendsfdf2.o:


gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a:

// assembly routines

_arm_mulsf3.o:
 T __aeabi_fmul
 U __fp_assemble
 U __fp_exception
 U __fp_infinity
 U __fp_zero
 T __mulsf3
 U __umulsidi3

_arm_fixsfsi.o:
 T __aeabi_f2iz
 T __fixsfsi
0002 T __internal_f2iz

_arm_f2d.o:
 T __aeabi_f2d
 T __extendsfdf2
 U __fp_normalize2

// soft-float library

mulsf3.o:
 T __aeabi_fmul

fixsfsi.o:
 T __aeabi_f2iz

extendsfdf2.o:
 T __aeabi_f2d

Given the order of the archive file, I expect the linker to import the affected
functions from the _arm_* archive elements.

For "convert-sat.c", all is well with -march=armv7-m.
...
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_muldf3.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_mulsf3.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_cmpsf2.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixsfsi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixunssfsi.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_addsubdf3.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_cmpdf2.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixdfsi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixunsdfsi.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_fixsfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixdfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixunssfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixunsdfdi.o
...

However, with -march=armv6s-m, the linker imports these symbols from the soft-
float library.  (NOTE: The CM0 library only implements single-precision float
operations, so imports from muldf3.o, fixdfsi.o, etc are expected.)
...
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)mulsf3.o
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)fixsfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)muldf3.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)fixdfsi.o
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)extendsfdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_clzsi2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_fcmpge.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_fcmple.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixsfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunssfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunssfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_cmpdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunsdfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixdfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunsdfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)eqdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)gedf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)ledf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)subdf3.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)floatunsidf.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_cmpsf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixsfsi.o
...

It seems that the order in which the linker resolves symbols matters.  In the
affected test cases, the linker begins searching for fixed-point function
symbols first: _subQQ

[PATCH v4 05/29] Import replacement 'clz' functions from CM0 library

2021-01-11 Thread gnu
From: Daniel Engel 

On architectures with no clz instruction, this version combines __clzdi2()
with __clzsi2() into a single object with an efficient tail call.  Also, this
version merges the formerly separate Thumb and ARM implementations into a
unified instruction sequence.  This change significantly improves Thumb
performance without affecting ARM performance.  Finally, this version adds
a new __OPTIMIZE_SIZE__ build option (using a loop).

On architectures with a clz instruction, functionality is unchanged.

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/clz2.S: Size-optimized bitwise versions of __clzsi2()
and __clzdi2() (i.e. __ARM_FEATURE_CLZ not available).
* config/arm/lib1funcs.S: Moved CFI_FUNCTION macros, added 
__ARM_FEATURE_IT.
* config/arm/t-elf: Move _clzsi2 to new group of weak LIB1ASMFUNCS.
---
 libgcc/config/arm/bits/clz2.S | 342 ++
 libgcc/config/arm/lib1funcs.S |  25 ++-
 libgcc/config/arm/t-elf   |   8 +-
 3 files changed, 248 insertions(+), 127 deletions(-)

diff --git a/libgcc/config/arm/bits/clz2.S b/libgcc/config/arm/bits/clz2.S
index 1c8f10a5b29..d0a1fbec4d0 100644
--- a/libgcc/config/arm/bits/clz2.S
+++ b/libgcc/config/arm/bits/clz2.S
@@ -1,124 +1,234 @@
+/* clz2.S: Cortex M0 optimized 'clz' functions
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+
+#ifdef L_clzdi2
+
+// int __clzdi2(long long)
+// Counts leading zero bits in $r1:$r0.
+// Returns the result in $r0.
+FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
+CFI_START_FUNCTION
+
+// Moved here from lib1funcs.S
+cmp     xxh, #0
+do_it   eq, et
+clzeq   r0, xxl
+clzne   r0, xxh
+addeq   r0, #32
+RET
+
+CFI_END_FUNCTION
+FUNC_END clzdi2
+
+#endif /* L_clzdi2 */
+
 
 #ifdef L_clzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzsi2
-   movs    r1, #28
-   movs    r3, #1
-   lsls    r3, r3, #16
-   cmp r0, r3 /* 0x10000 */
-   bcc 2f
-   lsrs    r0, r0, #16
-   subs    r1, r1, #16
-2: lsrs    r3, r3, #8
-   cmp r0, r3 /* #0x100 */
-   bcc 2f
-   lsrs    r0, r0, #8
-   subs    r1, r1, #8
-2: lsrs    r3, r3, #4
-   cmp r0, r3 /* #0x10 */
-   bcc 2f
-   lsrs    r0, r0, #4
-   subs    r1, r1, #4
-2: adr r2, 1f
-   ldrb    r0, [r2, r0]
-   adds    r0, r0, r1
-   bx lr
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-   FUNC_END clzsi2
-#else
-ARM_FUNC_START clzsi2
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   RET
-# else
-   mov r1, #28
-   cmp r0, #0x10000
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrb    r0, [r2, r0]
-   add r0, r0, r1
-   RET
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-   FUNC_END clzsi2
-#endif
+
+// int __clzsi2(int)
+// Counts leading zero bits in $r0.
+// Returns the result in $r0.
+FUNC_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2
+CFI_START_FUNCTION
+
+// Moved here from lib1funcs.S
+clz r0, r0
+RET
+
+CFI_END_FUNCTION
+FUNC_END clzsi2
+
 #endif /* L_clzsi2 */
 
+#else /* !__ARM_FEATURE_CLZ */
+
 #ifdef L_clzdi2
-#if !defined (__ARM_FEATURE_CLZ)
-
-# ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzdi2
-   push    {r4, lr}
-   cmp xxh, #0
-   bne 1f
-#  ifdef __ARMEB__
-   movs    r0, xxl
-   bl  __clzsi2
-   adds    r0,
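
For reference, a minimal C model of the tail-call structure described above
(a sketch only, not the assembly; the loop shows the __OPTIMIZE_SIZE__
bitwise variant):

    /* Sketch of __clzsi2()/__clzdi2() semantics.  */
    int clzsi2_model (unsigned int x)
    {
      int n = 0;
      if (x == 0)
        return 32;
      while (!(x & 0x80000000u))   /* size-optimized bitwise loop */
        { x <<= 1; n++; }
      return n;
    }

    int clzdi2_model (unsigned long long x)
    {
      unsigned int hi = (unsigned int)(x >> 32);
      if (hi)
        return clzsi2_model (hi);               /* "tail call" */
      return 32 + clzsi2_model ((unsigned int)x);
    }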

[PATCH v4 07/29] Import replacement 64-bit shift functions from CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

In Thumb mode, the new functions are each 1-2 instructions smaller
and faster, and branchless when the IT instruction is available.
The ARM versions were converted to the "xxl/xxh" big-endian register
convention, but are otherwise unchanged.

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bits/shift.S: Faster thumb versions;
updated big-endian register convention to "xxl/xxh".
---
 libgcc/config/arm/bits/shift.S | 327 +++--
 1 file changed, 233 insertions(+), 94 deletions(-)

diff --git a/libgcc/config/arm/bits/shift.S b/libgcc/config/arm/bits/shift.S
index 94e466ac0d2..16cf2dcef04 100644
--- a/libgcc/config/arm/bits/shift.S
+++ b/libgcc/config/arm/bits/shift.S
@@ -1,102 +1,241 @@
+/* lshift.S: ARM optimized 64-bit integer shift
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
 
 #ifdef L_lshrdi3
 
-   FUNC_START lshrdi3
-   FUNC_ALIAS aeabi_llsr lshrdi3
-   
-#ifdef __thumb__
-   lsrs    al, r2
-   movs    r3, ah
-   lsrs    ah, r2
-   mov ip, r3
-   subs    r2, #32
-   lsrs    r3, r2
-   orrs    al, r3
-   negs    r2, r2
-   mov r3, ip
-   lsls    r3, r2
-   orrs    al, r3
-   RET
-#else
-   subs    r3, r2, #32
-   rsb ip, r2, #32
-   movmi   al, al, lsr r2
-   movpl   al, ah, lsr r3
-   orrmi   al, al, ah, lsl ip
-   mov ah, ah, lsr r2
-   RET
-#endif
-   FUNC_END aeabi_llsr
-   FUNC_END lshrdi3
-
-#endif
-   
+// long long __aeabi_llsr(long long, int)
+// Logical shift right the 64 bit value in $r1:$r0 by the count in $r2.
+// The result is only guaranteed for shifts in the range of '0' to '63'.
+// Uses $r3 as scratch space.
+FUNC_START_SECTION aeabi_llsr .text.sorted.libgcc.lshrdi3
+FUNC_ALIAS lshrdi3 aeabi_llsr
+CFI_START_FUNCTION
+
+  #if defined(__thumb__) && __thumb__
+
+// Save a copy for the remainder.
+movs    r3, xxh
+
+// Assume a simple shift.
+lsrs    xxl, r2
+lsrs    xxh, r2
+
+// Test if the shift distance is larger than 1 word.
+subs    r2, #32
+
+#ifdef __HAVE_FEATURE_IT
+do_it   lo,te
+
+// The remainder is opposite the main shift, (32 - x) bits.
+rsblo   r2, #0
+lsllo   r3, r2
+
+// The remainder shift extends into the hi word.
+lsrhs   r3, r2
+
+#else /* !__HAVE_FEATURE_IT */
+bhs LLSYM(__llsr_large)
+
+// The remainder is opposite the main shift, (32 - x) bits.
+rsbs    r2, #0
+lsls    r3, r2
+
+// Cancel any remaining shift.
+eors    r2, r2
+
+  LLSYM(__llsr_large):
+// Apply any remaining shift to the hi word.
+lsrs    r3, r2
+
+#endif /* !__HAVE_FEATURE_IT */
+
+// Merge remainder and result.
+adds    xxl, r3
+RET
+
+  #else /* !__thumb__ */
+
+subs    r3, r2, #32
+rsb ip, r2, #32
+movmi   xxl, xxl, lsr r2
+movpl   xxl, xxh, lsr r3
+orrmi   xxl, xxl, xxh, lsl ip
+mov xxh, xxh, lsr r2
+RET
+
+  #endif /* !__thumb__ */
+
+
+CFI_END_FUNCTION
+FUNC_END lshrdi3
+FUNC_END aeabi_llsr
+
+#endif /* L_lshrdi3 */
+
+
 #ifdef L_ashrdi3
-   
-   FUNC_START ashrdi3
-   FUNC_ALIAS aeabi_lasr ashrdi3
-   
-#ifdef __thumb__
-   lsrsal, r2
-   movsr3, ah
-   asrsah, r2
-   subsr2, #32
-   @ If r2 is negative at this point the following step would OR
-   @ the sign bit into all of AL.  That's not what we want...
-   bmi 1f
-   mov ip, r3
-   asrsr3, r2
-   orrsal, r3
-   mov r3, ip
-1:
-   negsr2, r2
-   lslsr3, r2
-   orrsal, r3
-   RET
-#else
-   su
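
For reference, a minimal C model of the __aeabi_llsr() algorithm above (a
sketch of the semantics only; the n == 0 guard avoids undefined behavior in
C that the Thumb-1 flag logic handles natively):

    /* Sketch of a 64-bit logical right shift built from 32-bit halves.  */
    unsigned long long llsr_model (unsigned long long v, unsigned int n)
    {
      unsigned int lo = (unsigned int)v;
      unsigned int hi = (unsigned int)(v >> 32);
      if (n == 0)
        return v;
      if (n < 32)       /* merge the (32 - n) remainder bits from hi */
        {
          lo = (lo >> n) | (hi << (32 - n));
          hi >>= n;
        }
      else              /* the shift extends into the hi word */
        {
          lo = hi >> (n - 32);
          hi = 0;
        }
      return ((unsigned long long)hi << 32) | lo;
    }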

[PATCH v4 19/29] Refactor Thumb-1 single precision comparison functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi-v6m.S: Moved __aeabi_fcmp*() functions to
* config/arm/eabi/fcmp.S: New file.
* config/arm/lib1funcs.S: #include eabi/fcmp.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S | 63 --
 libgcc/config/arm/eabi/fcmp.S | 64 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 65 insertions(+), 63 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fcmp.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index b3dc3bf8f4d..7c874f06218 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -49,69 +49,6 @@ FUNC_START aeabi_frsub
 
 #endif /* L_arm_addsubsf3 */
 
-#ifdef L_arm_cmpsf2
-
-FUNC_START aeabi_cfrcmple
-
-   mov ip, r0
-   movs    r0, r1
-   mov r1, ip
-   b   6f
-
-FUNC_START aeabi_cfcmpeq
-FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
-
-   @ The status-returning routines are required to preserve all
-   @ registers except ip, lr, and cpsr.
-6: push    {r0, r1, r2, r3, r4, lr}
-   bl  __lesf2
-   @ Set the Z flag correctly, and the C flag unconditionally.
-   cmp r0, #0
-   @ Clear the C flag if the return value was -1, indicating
-   @ that the first operand was smaller than the second.
-   bmi 1f
-   movs    r1, #0
-   cmn r0, r1
-1:
-   pop {r0, r1, r2, r3, r4, pc}
-
-   FUNC_END aeabi_cfcmple
-   FUNC_END aeabi_cfcmpeq
-   FUNC_END aeabi_cfrcmple
-
-FUNC_START aeabi_fcmpeq
-
-   push    {r4, lr}
-   bl  __eqsf2
-   negs    r0, r0
-   adds    r0, r0, #1
-   pop {r4, pc}
-
-   FUNC_END aeabi_fcmpeq
-
-.macro COMPARISON cond, helper, mode=sf2
-FUNC_START aeabi_fcmp\cond
-
-   push    {r4, lr}
-   bl  __\helper\mode
-   cmp r0, #0
-   b\cond  1f
-   movs    r0, #0
-   pop {r4, pc}
-1:
-   movs    r0, #1
-   pop {r4, pc}
-
-   FUNC_END aeabi_fcmp\cond
-.endm
-
-COMPARISON lt, le
-COMPARISON le, le
-COMPARISON gt, ge
-COMPARISON ge, ge
-
-#endif /* L_arm_cmpsf2 */
-
 #ifdef L_arm_addsubdf3
 
 FUNC_START aeabi_drsub
diff --git a/libgcc/config/arm/eabi/fcmp.S b/libgcc/config/arm/eabi/fcmp.S
new file mode 100644
index 000..3d02e191a43
--- /dev/null
+++ b/libgcc/config/arm/eabi/fcmp.S
@@ -0,0 +1,64 @@
+
+#ifdef L_arm_cmpsf2
+
+FUNC_START aeabi_cfrcmple
+
+   mov ip, r0
+   movs    r0, r1
+   mov r1, ip
+   b   6f
+
+FUNC_START aeabi_cfcmpeq
+FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
+   @ The status-returning routines are required to preserve all
+   @ registers except ip, lr, and cpsr.
+6: push    {r0, r1, r2, r3, r4, lr}
+   bl  __lesf2
+   @ Set the Z flag correctly, and the C flag unconditionally.
+   cmp r0, #0
+   @ Clear the C flag if the return value was -1, indicating
+   @ that the first operand was smaller than the second.
+   bmi 1f
+   movs    r1, #0
+   cmn r0, r1
+1:
+   pop {r0, r1, r2, r3, r4, pc}
+
+   FUNC_END aeabi_cfcmple
+   FUNC_END aeabi_cfcmpeq
+   FUNC_END aeabi_cfrcmple
+
+FUNC_START aeabi_fcmpeq
+
+   push    {r4, lr}
+   bl  __eqsf2
+   negs    r0, r0
+   adds    r0, r0, #1
+   pop {r4, pc}
+
+   FUNC_END aeabi_fcmpeq
+
+.macro COMPARISON cond, helper, mode=sf2
+FUNC_START aeabi_fcmp\cond
+
+   push    {r4, lr}
+   bl  __\helper\mode
+   cmp r0, #0
+   b\cond  1f
+   movs    r0, #0
+   pop {r4, pc}
+1:
+   movs    r0, #1
+   pop {r4, pc}
+
+   FUNC_END aeabi_fcmp\cond
+.endm
+
+COMPARISON lt, le
+COMPARISON le, le
+COMPARISON gt, ge
+COMPARISON ge, ge
+
+#endif /* L_arm_cmpsf2 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index a8afe78a69c..bd41ea79283 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1944,6 +1944,7 @@ LSYM(Lchange_\register):
 #include "bpabi.S"
 #else /* NOT_ISA_TARGET_32BIT */
 #include "bpabi-v6m.S"
+#include "eabi/fcmp.S"
 #endif /* NOT_ISA_TARGET_32BIT */
 #include "eabi/lcmp.S"
 #endif /* !__symbian__ */
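
For reference, a minimal C model of the wrappers generated by the COMPARISON
macro above (a sketch only; the helpers stand in for the libgcc three-way
routines, whose unordered results are noted in the comments):

    /* Each wrapper calls a three-way helper, then tests its condition.  */
    extern int __lesf2 (float, float);   /* <0 / 0 / >0, unordered -> >0 */
    extern int __gesf2 (float, float);   /* <0 / 0 / >0, unordered -> <0 */

    int aeabi_fcmplt_model (float a, float b) { return __lesf2 (a, b) < 0;  }
    int aeabi_fcmple_model (float a, float b) { return __lesf2 (a, b) <= 0; }
    int aeabi_fcmpgt_model (float a, float b) { return __gesf2 (a, b) > 0;  }
    int aeabi_fcmpge_model (float a, float b) { return __gesf2 (a, b) >= 0; }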
-- 
2.25.1



[PATCH v4 20/29] Import single precision comparison functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

These functions are significantly smaller and faster than the wrapper
functions and soft-float implementation they replace.  Using the first
comparison operator (e.g. '<=') costs about 70 bytes initially, but every
additional operator incrementally adds just 4 bytes.

NOTE: It seems that the __aeabi_cfcmp*() routines formerly in bpabi-v6m.S
were not well tested, as they produced the wrong output for the 'C' flag.
The replacement functions have been tested as correct.

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/fcmp.S: New __aeabi_fcmp() function family.
* config/arm/eabi/fplib.h: New file with fcmp-specific constants
and general library configuration macros.
* config/arm/lib1funcs.S: #include eabi/fplib.h (v6m only).
* config/arm/t-elf: Add _fcmp family members to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/fcmp.S  | 660 ++---
 libgcc/config/arm/eabi/fplib.h |  83 +
 libgcc/config/arm/lib1funcs.S  |   1 +
 libgcc/config/arm/t-elf|  18 +
 4 files changed, 702 insertions(+), 60 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fplib.h

diff --git a/libgcc/config/arm/eabi/fcmp.S b/libgcc/config/arm/eabi/fcmp.S
index 3d02e191a43..cada33f4d35 100644
--- a/libgcc/config/arm/eabi/fcmp.S
+++ b/libgcc/config/arm/eabi/fcmp.S
@@ -1,64 +1,604 @@
+/* fcmp.S: Thumb-1 optimized 32-bit float comparison
 
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+// The various compare functions in this file all expect to tail call
+//  __cmpsf2() with flags set for a particular comparison mode.  The
+//  __internal_cmpsf2() symbol itself is unambiguous, but there is a remote
+//  risk that the linker will prefer some other symbol in place of
+//  __cmpsf2().  Importing an archive file that also exports __cmpsf2()
+//  will throw an error in this case.
+// As a workaround, this block configures __cmpsf2() for compilation twice.
+// The first version configures __internal_cmpsf2() as a WEAK standalone
+//  symbol, and the second exports __cmpsf2() and __internal_cmpsf2()
+//  normally.
+// A small bonus: programs not using __cmpsf2() itself will be slightly
+//  smaller.
+// 'L_internal_cmpsf2' should appear before 'L_arm_cmpsf2' in LIB1ASMFUNCS.
+#if defined(L_arm_cmpsf2) || defined(L_internal_cmpsf2)
+
+#define CMPSF2_SECTION .text.sorted.libgcc.fcmp.cmpsf2
+
+// int __cmpsf2(float, float)
+// <https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html>
+// Returns the three-way comparison result of $r0 with $r1:
+//  * +1 if ($r0 > $r1), or either argument is NAN
+//  *  0 if ($r0 == $r1)
+//  * -1 if ($r0 < $r1)
+// Uses $r2, $r3, and $ip as scratch space.
 #ifdef L_arm_cmpsf2
+FUNC_START_SECTION cmpsf2 CMPSF2_SECTION
+FUNC_ALIAS lesf2 cmpsf2
+FUNC_ALIAS ltsf2 cmpsf2
+CFI_START_FUNCTION
+
+// Assumption: The 'libgcc' functions should raise exceptions.
+movs    r2, #(FCMP_UN_POSITIVE + FCMP_RAISE_EXCEPTIONS + FCMP_3WAY)
+
+// int,int __internal_cmpsf2(float, float, int)
+// Internal function expects a set of control flags in $r2.
+// If ordered, returns a comparison type { 0, 1, 2 } in $r3
+FUNC_ENTRY internal_cmpsf2
+
+#else /* L_internal_cmpsf2 */
+WEAK_START_SECTION internal_cmpsf2 CMPSF2_SECTION
+CFI_START_FUNCTION
+
+#endif 
+
+// When operand signs are considered, the comparison result falls
+//  within one of the following quadrants:
+//
+// $r0  $r1  $r0-$r1*  flags  result
+//  +    +      >       C=0     GT
+//  +    +      =       Z=1     EQ
+//  +    +      <       C=1     LT
+//  +    -      >       C=1     GT
+//  +    -      =       C=1     GT
+//  +    -      <       C=1     GT
+//  -    +      >       C=0     LT
+//  -    +      =       C=0     LT
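
For reference, a minimal C model of the three-way contract documented above
(a sketch of the return values only):

    /* __cmpsf2() semantics: +1 for greater-than or NAN, 0 equal, -1 less.  */
    int cmpsf2_model (float a, float b)
    {
      if (a != a || b != b)   /* unordered: either argument is NAN */
        return 1;
      if (a < b)
        return -1;
      return (a > b) ? 1 : 0;
    }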

[PATCH v4 21/29] Refactor Thumb-1 floating point subtraction into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi-v6m.S: Moved __aeabi_frsub() to
* config/arm/eabi/fadd.S: New file.
* config/arm/lib1funcs.S: #include eabi/fadd.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S | 16 
 libgcc/config/arm/eabi/fadd.S | 16 
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 17 insertions(+), 16 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fadd.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index 7c874f06218..c76c3b0568b 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -33,22 +33,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-
-#ifdef L_arm_addsubsf3
-
-FUNC_START aeabi_frsub
-
-  push {r4, lr}
-  movs r4, #1
-  lsls r4, #31
-  eors r0, r0, r4
-  bl   __aeabi_fadd
-  pop  {r4, pc}
-
-  FUNC_END aeabi_frsub
-
-#endif /* L_arm_addsubsf3 */
-
 #ifdef L_arm_addsubdf3
 
 FUNC_START aeabi_drsub
diff --git a/libgcc/config/arm/eabi/fadd.S b/libgcc/config/arm/eabi/fadd.S
new file mode 100644
index 000..223e38f7e50
--- /dev/null
+++ b/libgcc/config/arm/eabi/fadd.S
@@ -0,0 +1,16 @@
+
+#ifdef L_arm_addsubsf3
+
+FUNC_START aeabi_frsub
+
+  push {r4, lr}
+  movs r4, #1
+  lsls r4, #31
+  eors r0, r0, r4
+  bl   __aeabi_fadd
+  pop  {r4, pc}
+
+  FUNC_END aeabi_frsub
+
+#endif /* L_arm_addsubsf3 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index ed0b1eb1041..e439449422f 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1946,6 +1946,7 @@ LSYM(Lchange_\register):
 #include "bpabi-v6m.S"
 #include "eabi/fplib.h"
 #include "eabi/fcmp.S"
+#include "eabi/fadd.S"
 #endif /* NOT_ISA_TARGET_32BIT */
 #include "eabi/lcmp.S"
 #endif /* !__symbian__ */
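
For reference, a minimal C model of the __aeabi_frsub() sequence above (a
sketch only; the sign flip uses the same 0x80000000 constant that the
movs/lsls pair builds in r4):

    #include <stdint.h>
    #include <string.h>

    float frsub_model (float x, float y)
    {
      uint32_t bits;
      memcpy (&bits, &x, sizeof (bits));
      bits ^= 0x80000000u;               /* eors r0, r4 */
      memcpy (&x, &bits, sizeof (bits));
      return x + y;                      /* bl __aeabi_fadd */
    }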
-- 
2.25.1



[PATCH v4 18/29] Import new integer multiplication functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/lmul.S: New file for __muldi3(), __mulsidi3(), and
 __umulsidi3().
* config/arm/lib1funcs.S: #include eabi/lmul.S (v6m only).
* config/arm/t-elf: Add the new objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/lmul.S | 218 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |  13 +-
 3 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/lmul.S

diff --git a/libgcc/config/arm/eabi/lmul.S b/libgcc/config/arm/eabi/lmul.S
new file mode 100644
index 000..9fec4364a26
--- /dev/null
+++ b/libgcc/config/arm/eabi/lmul.S
@@ -0,0 +1,218 @@
+/* lmul.S: Thumb-1 optimized 64-bit integer multiplication
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_muldi3
+
+// long long __aeabi_lmul(long long, long long)
+// Returns the least significant 64 bits of a 64 bit multiplication.
+// Expects the two multiplicands in $r1:$r0 and $r3:$r2.
+// Returns the product in $r1:$r0 (does not distinguish signed types).
+// Uses $r4 and $r5 as scratch space.
+// Same parent section as __umulsidi3() to keep tail call branch within range.
+FUNC_START_SECTION muldi3 .text.sorted.libgcc.lmul.muldi3
+
+#ifndef __symbian__
+  FUNC_ALIAS aeabi_lmul muldi3
+#endif
+
+CFI_START_FUNCTION
+
+// $r1:$r0 = 0xDDDDCCCCBBBBAAAA
+// $r3:$r2 = 0xZZZZYYYYXXXXWWWW
+
+// The following operations that only affect the upper 64 bits
+//  can be safely discarded:
+//    * DDDD * ZZZZ
+//    * DDDD * YYYY
+//    * DDDD * XXXX
+//    * CCCC * ZZZZ
+//    * CCCC * YYYY
+//    * BBBB * ZZZZ
+
+// MAYBE: Test for multiply by ZERO on implementations with a 32-cycle
+//  'muls' instruction, and skip over the operation in that case.
+
+// (0xDDDDCCCC * 0xXXXXWWWW), free $r1
+muls    xxh, yyl
+
+// (0xZZZZYYYY * 0xBBBBAAAA), free $r3
+muls    yyh, xxl
+adds    yyh, xxh
+
+// Put the parameters in the correct form for umulsidi3().
+movs    xxh, yyl
+b   LLSYM(__mul_overflow)
+
+CFI_END_FUNCTION
+FUNC_END muldi3
+
+#ifndef __symbian__
+  FUNC_END aeabi_lmul
+#endif
+
+#endif /* L_muldi3 */
+
+
+// The following implementation of __umulsidi3() integrates with __muldi3()
+//  above to allow the fast tail call while still preserving the extra
+//  hi-shifted bits of the result.  However, these extra bits add a few
+//  instructions not otherwise required when using only __umulsidi3().
+// Therefore, this block configures __umulsidi3() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version adds the hi bits of __muldi3().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols in programs that multiply long doubles.
+// This means '_umulsidi3' should appear before '_muldi3' in LIB1ASMFUNCS.
+#if defined(L_muldi3) || defined(L_umulsidi3)
+
+#ifdef L_umulsidi3
+// unsigned long long __umulsidi3(unsigned int, unsigned int)
+// Returns all 64 bits of a 32 bit multiplication.
+// Expects the two multiplicands in $r0 and $r1.
+// Returns the product in $r1:$r0.
+// Uses $r3, $r4 and $ip as scratch space.
+WEAK_START_SECTION umulsidi3 .text.sorted.libgcc.lmul.umulsidi3
+CFI_START_FUNCTION
+
+#else /* L_muldi3 */
+FUNC_ENTRY umulsidi3
+CFI_START_FUNCTION
+
+// 32x32 multiply with 64 bit result.
+// Expand the multiply into 4 parts, since muls only returns 32 bits.
+// (a16h * b16h / 2^32)
+//   + (a16h * b16l / 2^48) + (a16l * b16h / 2^48)
+//   + (a16l * b16l / 2^64)
+
+// MAYBE: Test for multiply by 0 on implementations with a 32-cycle
+//  'muls&
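
For reference, a minimal C model of the 16-bit partial-product expansion
that __umulsidi3() performs (a sketch; muls only returns the low 32 bits,
hence the four partial products):

    unsigned long long umulsidi3_model (unsigned int a, unsigned int b)
    {
      unsigned int ah = a >> 16, al = a & 0xFFFF;
      unsigned int bh = b >> 16, bl = b & 0xFFFF;

      unsigned long long r = (unsigned long long)(ah * bh) << 32;
      r += (unsigned long long)(ah * bl) << 16;   /* cross terms carry */
      r += (unsigned long long)(al * bh) << 16;   /*  into the high word */
      r += al * bl;
      return r;
    }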

[PATCH v4 28/29] Import float<->__fp16 conversion functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-09 Daniel Engel 

* config/arm/eabi/fcast.S: Add __aeabi_f2h*() and __aeabi_h2f*().
* config/arm/fp16.c: Disable duplicate C routines (v6m only).
* config/arm/t-bpabi: Add _arm_f2h* and _arm_h2f* objects to 
LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/fcast.S | 277 +
 libgcc/config/arm/fp16.c   |   4 +
 libgcc/config/arm/t-bpabi  |   7 +
 3 files changed, 288 insertions(+)

diff --git a/libgcc/config/arm/eabi/fcast.S b/libgcc/config/arm/eabi/fcast.S
index b1184ee1d53..1783a161912 100644
--- a/libgcc/config/arm/eabi/fcast.S
+++ b/libgcc/config/arm/eabi/fcast.S
@@ -254,3 +254,280 @@ FUNC_END D2F_NAME
 
 #endif /* L_arm_d2f || L_arm_truncdfsf2 */
 
+
+#if defined(L_aeabi_h2f_ieee) || defined(L_aeabi_h2f_alt)
+
+#ifdef L_aeabi_h2f_ieee
+  #define H2F_NAME aeabi_h2f
+  #define H2F_ALIAS gnu_h2f_ieee
+#else
+  #define H2F_NAME aeabi_h2f_alt
+  #define H2F_ALIAS gnu_h2f_alternative
+#endif
+
+// float __aeabi_h2f(short hf)
+// float __aeabi_h2f_alt(short hf)
+// Converts a half-precision float in $r0 to single-precision.
+// Rounding, overflow, and underflow conditions are impossible.
+// In IEEE mode, INF, ZERO, and NAN are returned unmodified.
+FUNC_START_SECTION H2F_NAME .text.sorted.libgcc.h2f
+FUNC_ALIAS H2F_ALIAS H2F_NAME
+CFI_START_FUNCTION
+
+// Set up registers for __fp_normalize2().
+push    { rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save the mantissa and exponent.
+lsls    r2, r0, #17
+
+// Isolate the sign.
+lsrs    r0, #15
+lsls    r0, #31
+
+// Align the exponent at bit[24] for normalization.
+// If zero, return the original sign.
+lsrs    r2, #3
+
+  #ifdef __ARM_FEATURE_IT
+do_it   eq
+RETc(eq)
+  #else
+beq LLSYM(__h2f_return)
+  #endif
+
+// Split the exponent and mantissa into separate registers.
+// This is the most efficient way to convert subnormals in the
+//  half-precision form into normals in single-precision.
+// This does add a leading implicit '1' to INF and NAN,
+//  but that will be absorbed when the value is re-assembled.
+bl  SYM(__fp_normalize2) __PLT__
+
+   #ifdef L_aeabi_h2f_ieee
+// Set up the exponent bias.  For INF/NAN values, the bias is 223,
+//  where the last '1' accounts for the implicit '1' in the mantissa.
+adds    r2, #(255 - 31 - 1)
+
+// Test for INF/NAN.
+cmp r2, #254
+
+  #ifdef __ARM_FEATURE_IT
+do_it   ne
+  #else
+beq LLSYM(__h2f_assemble)
+  #endif
+
+// For normal values, the bias should have been 111.
+// However, this offset must be adjusted per the INF check above.
+ IT(sub,ne) r2, #((255 - 31 - 1) - (127 - 15 - 1))
+
+#else /* L_aeabi_h2f_alt */
+// Set up the exponent bias.  All values are normal.
+adds    r2, #(127 - 15 - 1)
+#endif
+
+LLSYM(__h2f_assemble):
+// Combine exponent and sign.
+lsls    r2, #23
+adds    r0, r2
+
+// Combine mantissa.
+lsrs    r3, #8
+add r0, r3
+
+LLSYM(__h2f_return):
+pop { rT, pc }
+.cfi_restore_state
+
+CFI_END_FUNCTION
+FUNC_END H2F_NAME
+FUNC_END H2F_ALIAS
+
+#endif /* L_aeabi_h2f_ieee || L_aeabi_h2f_alt */
+
+
+#if defined(L_aeabi_f2h_ieee) || defined(L_aeabi_f2h_alt)
+
+#ifdef L_aeabi_f2h_ieee
+  #define F2H_NAME aeabi_f2h
+  #define F2H_ALIAS gnu_f2h_ieee
+#else
+  #define F2H_NAME aeabi_f2h_alt
+  #define F2H_ALIAS gnu_f2h_alternative
+#endif
+
+// short __aeabi_f2h(float f)
+// short __aeabi_f2h_alt(float f)
+// Converts a single-precision float in $r0 to half-precision,
+//  rounding to nearest, ties to even.
+// Values out of range are forced to either ZERO or INF.
+// In IEEE mode, the upper 12 bits of a NAN will be preserved.
+FUNC_START_SECTION F2H_NAME .text.sorted.libgcc.f2h
+FUNC_ALIAS F2H_ALIAS F2H_NAME
+CFI_START_FUNCTION
+
+// Set up the sign.
+lsrs    r2, r0, #31
+lsls    r2, #15
+
+// Save the exponent and mantissa.
+// If ZERO, return the original sign.
+lsls    r0, #1
+
+  #ifdef __ARM_FEATURE_IT
+do_it   ne,t
+addne   r0, r2
+RETc(ne)
+  #else
+beq LLSYM(__f2h_return)
+  #endif
+
+// Isolate the exponent.
+lsrs    r1, r0, #24
+
+  #ifdef L_aeabi_f2h_ieee
+// Check for NAN.
+cmp r1, #255
+beq LLSYM(__f2h_indefinite)
+
+// Check for overflow.
+cmp r1, #(127 + 15)
+bhi LLSYM(__f2h_overflow)
+
+  #else /* L_aeabi_f2h_alt 
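
For reference, a minimal C model of the IEEE-mode __aeabi_h2f() bias
arithmetic described above (a sketch; bit layouts per IEEE-754
binary16/binary32):

    #include <stdint.h>

    uint32_t h2f_model (uint16_t h)
    {
      uint32_t sign = (uint32_t)(h & 0x8000) << 16;
      int exp = (h >> 10) & 0x1F;
      uint32_t man = h & 0x3FF;

      if (exp == 0 && man == 0)
        return sign;                                /* signed zero */
      if (exp == 31)                                /* INF/NAN pass through */
        return sign | 0x7F800000u | (man << 13);
      if (exp == 0)                                 /* subnormal: normalize */
        {
          exp = 1;
          while (!(man & 0x400))
            { man <<= 1; exp--; }
          man &= 0x3FF;
        }
      return sign | (uint32_t)(exp + (127 - 15)) << 23 | (man << 13);
    }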

[PATCH v4 27/29] Import float<->double conversion functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/eabi/fcast.S: New file for __aeabi_f2d/__extendsfdf2()
__aeabi_d2f(), __truncdfsf2().
* config/arm/lib1funcs.S: #include eabi/fcast.S (v6m only).
* config/arm/t-elf: Add _arm_d2f and _arm_f2d objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/fcast.S | 256 +
 libgcc/config/arm/lib1funcs.S  |   1 +
 libgcc/config/arm/t-elf|   2 +
 3 files changed, 259 insertions(+)
 create mode 100644 libgcc/config/arm/eabi/fcast.S

diff --git a/libgcc/config/arm/eabi/fcast.S b/libgcc/config/arm/eabi/fcast.S
new file mode 100644
index 000..b1184ee1d53
--- /dev/null
+++ b/libgcc/config/arm/eabi/fcast.S
@@ -0,0 +1,256 @@
+/* fcast.S: Thumb-1 optimized 32- and 64-bit float conversions
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_f2d
+
+// double __aeabi_f2d(float)
+// Converts a single-precision float in $r0 to double-precision in $r1:$r0.
+// Rounding, overflow, and underflow are impossible.
+// INF and ZERO are returned unmodified.
+FUNC_START_SECTION aeabi_f2d .text.sorted.libgcc.fpcore.v.f2d
+FUNC_ALIAS extendsfdf2 aeabi_f2d
+CFI_START_FUNCTION
+
+// Save the sign.
+lsrs    r1, r0, #31
+lsls    r1, #31
+
+// Set up registers for __fp_normalize2().
+push    { rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Test for zero.
+lsls    r0, #1
+beq LLSYM(__f2d_return)
+
+// Split the exponent and mantissa into separate registers.
+// This is the most efficient way to convert subnormals in the
+//  half-precision form into normals in single-precision.
+// This does add a leading implicit '1' to INF and NAN,
+//  but that will be absorbed when the value is re-assembled.
+movs    r2, r0
+bl  SYM(__fp_normalize2) __PLT__
+
+// Set up the exponent bias.  For INF/NAN values, the bias
+//  is 1791 (2047 - 255 - 1), where the last '1' accounts
+//  for the implicit '1' in the mantissa.
+movs    r0, #3
+lsls    r0, #9
+adds    r0, #255
+
+// Test for INF/NAN, promote exponent if necessary
+cmp r2, #255
+beq LLSYM(__f2d_indefinite)
+
+// For normal values, the exponent bias is 895 (1023 - 127 - 1),
+//  which is half of the prepared INF/NAN bias.
+lsrs    r0, #1
+
+LLSYM(__f2d_indefinite):
+// Assemble exponent with bias correction.
+adds    r2, r0
+lsls    r2, #20
+adds    r1, r2
+
+// Assemble the high word of the mantissa.
+lsrs    r0, r3, #11
+add r1, r0
+
+// Remainder of the mantissa in the low word of the result.
+lsls    r0, r3, #21
+
+LLSYM(__f2d_return):
+pop { rT, pc }
+.cfi_restore_state
+
+CFI_END_FUNCTION
+FUNC_END extendsfdf2
+FUNC_END aeabi_f2d
+
+#endif /* L_arm_f2d */
+
+
+#if defined(L_arm_d2f) || defined(L_arm_truncdfsf2)
+
+// HACK: Build two separate implementations:
+//  * __aeabi_d2f() rounds to nearest per traditional IEEE-754 rules.
+//  * __truncdfsf2() rounds towards zero per GCC specification.
+// Presumably, a program will consistently use one ABI or the other,
+//  which means that code size will not be duplicated in practice.
+// Merging two versions with dynamic rounding would be rather hard.
+#ifdef L_arm_truncdfsf2
+  #define D2F_NAME truncdfsf2
+  #define D2F_SECTION .text.sorted.libgcc.fpcore.x.truncdfsf2
+#else
+  #define D2F_NAME aeabi_d2f
+  #define D2F_SECTION .text.sorted.libgcc.fpcore.w.d2f
+#endif
+
+// float __aeabi_d2f(double)
+// Conv
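
For reference, a minimal C model of the __aeabi_f2d() widening above (a
sketch; the 1023 - 127 rebias below differs by one from the assembly's
895 constant because __fp_normalize2() makes the implicit mantissa bit
explicit):

    #include <stdint.h>

    uint64_t f2d_model (uint32_t f)
    {
      uint64_t sign = (uint64_t)(f & 0x80000000u) << 32;
      int exp = (f >> 23) & 0xFF;
      uint64_t man = f & 0x7FFFFFu;

      if (exp == 0 && man == 0)
        return sign;                            /* signed zero */
      if (exp == 255)                           /* INF/NAN pass through */
        return sign | ((uint64_t)0x7FF << 52) | (man << 29);
      if (exp == 0)                             /* subnormal: normalize */
        {
          exp = 1;
          while (!(man & 0x800000))
            { man <<= 1; exp--; }
          man &= 0x7FFFFF;
        }
      return sign | ((uint64_t)(exp + (1023 - 127)) << 52) | (man << 29);
    }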

[PATCH v4 25/29] Import integer-to-float conversion functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/bpabi-lib.h: Remove obsolete RENAME_LIBRARY directives.
* config/arm/eabi/ffloat.S: New file for __float[un]sisf/disf().
* config/arm/lib1funcs.S: #include eabi/ffloat.S (v6m only).
* config/arm/t-elf: Add _float[un]sisf/disf objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/bpabi-lib.h   |   6 -
 libgcc/config/arm/eabi/ffloat.S | 249 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   5 +-
 4 files changed, 254 insertions(+), 7 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ffloat.S

diff --git a/libgcc/config/arm/bpabi-lib.h b/libgcc/config/arm/bpabi-lib.h
index 3cb90b4b345..1e651ead4ac 100644
--- a/libgcc/config/arm/bpabi-lib.h
+++ b/libgcc/config/arm/bpabi-lib.h
@@ -56,9 +56,6 @@
 #ifdef L_floatdidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatdidf, l2d)
 #endif
-#ifdef L_floatdisf
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatdisf, l2f)
-#endif
 
 /* These renames are needed on ARMv6M.  Other targets get them from
assembly routines.  */
@@ -71,9 +68,6 @@
 #ifdef L_floatundidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundidf, ul2d)
 #endif
-#ifdef L_floatundisf
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundisf, ul2f)
-#endif
 
 /* For ARM bpabi, we only want to use a "__gnu_" prefix for the fixed-point
helper functions - not everything in libgcc - in the interests of
diff --git a/libgcc/config/arm/eabi/ffloat.S b/libgcc/config/arm/eabi/ffloat.S
new file mode 100644
index 000..eadc9d8d08e
--- /dev/null
+++ b/libgcc/config/arm/eabi/ffloat.S
@@ -0,0 +1,249 @@
+/* ffloat.S: Thumb-1 optimized integer-to-float conversion
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_floatsisf
+
+// float __aeabi_i2f(int)
+// Converts a signed integer in $r0 to float.
+
+// On little-endian cores (including all Cortex-M), __floatsisf() can be
+//  implemented as below in 5 instructions.  However, it can also be
+//  implemented by prefixing a single instruction to __floatdisf().
+// A memory savings of 4 instructions at a cost of only 2 execution cycles
+//  seems reasonable enough.  Plus, the trade-off only happens in programs
+//  that require both __floatsisf() and __floatdisf().  Programs only using
+//  __floatsisf() always get the smallest version.
+// When the combined version is provided, this standalone version
+//  must be declared WEAK, so that the combined version can supersede it.
+// '_arm_floatsisf' should appear before '_arm_floatdisf' in LIB1ASMFUNCS.
+// Same parent section as __ul2f() to keep tail call branch within range.
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+WEAK_START_SECTION aeabi_i2f .text.sorted.libgcc.fpcore.p.floatsisf
+WEAK_ALIAS floatsisf aeabi_i2f
+CFI_START_FUNCTION
+
+#else /* !__OPTIMIZE_SIZE__ */
+FUNC_START_SECTION aeabi_i2f .text.sorted.libgcc.fpcore.p.floatsisf
+FUNC_ALIAS floatsisf aeabi_i2f
+CFI_START_FUNCTION
+
+#endif /* !__OPTIMIZE_SIZE__ */
+
+// Save the sign.
+asrs    r3, r0, #31
+
+// Absolute value of the input.
+eors    r0, r3
+subs    r0, r3
+
+// Zero-extend the absolute value to unsigned long long.
+eors    r1, r1
+b   SYM(__internal_floatundisf_noswap)
+
+CFI_END_FUNCTION
+FUNC_END floatsisf
+FUNC_END aeabi_i2f
+
+#endif /* L_arm_floatsisf */
+
+
+#ifdef L_arm_floatdisf
+
+// float __aeabi_l2f(long long)
+// Converts a signed 64-bit integer in $r1:$r0 to a float in $r0.
+// See build comments for __floatsisf() above.
+// Same parent section as __ul2f() to keep tail call branch within range.
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+FUNC_START_SECTION aeabi_i2f .text.sorted.libgcc.fpcore.p.floatdisf
+FUNC_ALIAS floatsisf aeabi_i2f
+CFI_START_F
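
For reference, a minimal C model of the sign-folding path described above
(a sketch; the (float) cast stands in for the shared unsigned conversion
core that the assembly tail calls):

    float floatsisf_model (int i)
    {
      unsigned long long mag = (i < 0) ? -(unsigned long long)i
                                       : (unsigned long long)i;
      float f = (float)mag;     /* the __floatundisf() core in the library */
      return (i < 0) ? -f : f;  /* reapply the saved sign */
    }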

[PATCH v4 22/29] Import single precision addition and subtraction from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

Since this is the first phase of the floating point functions, some
common parsing and formatting routines are also included.  These common
routines will be referenced by other functions in subsequent commits.
However, even if the size penalty is accounted entirely to __addsf3(),
the total compiled size is still less than half the size of soft-float.

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/fadd.S: Add new functions __addsf3() and __subsf3().
* config/arm/eabi/fneg.S: Add new file for __negsf2().
* config/arm/eabi/futil.S: Add new file for shared floating point
helper functions (normalization, rounding, etc).
* config/arm/lib1funcs.S: #include eabi/fneg.S and eabi/futil.S (v6m 
only).
* config/arm/t-elf: Add _addsf3, _frsubsf3, and helpers to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/fadd.S  | 324 -
 libgcc/config/arm/eabi/fneg.S  |  76 ++
 libgcc/config/arm/eabi/fplib.h |   3 -
 libgcc/config/arm/eabi/futil.S | 418 +
 libgcc/config/arm/lib1funcs.S  |   2 +
 libgcc/config/arm/t-elf|   6 +
 6 files changed, 818 insertions(+), 11 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fneg.S
 create mode 100644 libgcc/config/arm/eabi/futil.S

diff --git a/libgcc/config/arm/eabi/fadd.S b/libgcc/config/arm/eabi/fadd.S
index 223e38f7e50..77b81d62b3b 100644
--- a/libgcc/config/arm/eabi/fadd.S
+++ b/libgcc/config/arm/eabi/fadd.S
@@ -1,16 +1,324 @@
+/* fadd.S: Thumb-1 optimized 32-bit float addition and subtraction
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_frsubsf3
+
+// float __aeabi_frsub(float, float)
+// Returns the floating point difference of $r1 - $r0 in $r0.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_frsub .text.sorted.libgcc.fpcore.b.frsub
+CFI_START_FUNCTION
+
+  #if defined(STRICT_NANS) && STRICT_NANS
+// Check if $r0 is NAN before modifying.
+lsls    r2, r0, #1
+movs    r3, #255
+lsls    r3, #24
+
+// Let fadd() find the NAN in the normal course of operation,
+//  moving it to $r0 and checking the quiet/signaling bit.
+cmp r2, r3
+bhi SYM(__aeabi_fadd)
+  #endif
+
+// Flip sign and run through fadd().
+movs    r2, #1
+lsls    r2, #31
+adds    r0, r2
+b   SYM(__aeabi_fadd)
+
+CFI_END_FUNCTION
+FUNC_END aeabi_frsub
+
+#endif /* L_arm_frsubsf3 */
+
 
 #ifdef L_arm_addsubsf3
 
-FUNC_START aeabi_frsub
+// float __aeabi_fsub(float, float)
+// Returns the floating point difference of $r0 - $r1 in $r0.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fsub .text.sorted.libgcc.fpcore.c.faddsub
+FUNC_ALIAS subsf3 aeabi_fsub
+CFI_START_FUNCTION
+
+  #if defined(STRICT_NANS) && STRICT_NANS
+// Check if $r1 is NAN before modifying.
+lsls    r2, r1, #1
+movs    r3, #255
+lsls    r3, #24
 
-  push {r4, lr}
-  movs r4, #1
-  lsls r4, #31
-  eors r0, r0, r4
-  bl   __aeabi_fadd
-  pop  {r4, pc}
+// Let fadd() find the NAN in the normal course of operation,
+//  moving it to $r0 and checking the quiet/signaling bit.
+cmp r2, r3
+bhi SYM(__aeabi_fadd)
+  #endif
 
-  FUNC_END aeabi_frsub
+// Flip sign and fall into fadd().
+movs    r2, #1
+lsls    r2, #31
+adds    r1, r2
 
 #endif /* L_arm_addsubsf3 */
 
+
+// The execution of __subsf3() flows directly into __addsf3(), such that
+//  instructions must appear consecutively in the same memory section.
+//  However, this construction inhibits the ability to discard __subsf3()
+//  wh
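
For reference, a minimal C model of the STRICT_NANS guard above (a sketch;
the bhi test passes NANs to __aeabi_fadd() unmodified so a signaling NAN is
not corrupted by the sign flip):

    #include <stdint.h>
    #include <string.h>

    float fsub_model (float x, float y)
    {
      uint32_t b;
      memcpy (&b, &y, sizeof (b));
      if ((b << 1) <= 0xFF000000u)     /* not NAN (the lsls/cmp/bhi above) */
        {
          b ^= 0x80000000u;            /* flip the sign of y */
          memcpy (&y, &b, sizeof (b));
        }
      return x + y;                    /* falls into __aeabi_fadd() */
    }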

[PATCH v4 23/29] Import single precision multiplication from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/eabi/fmul.S: New file for __mulsf3().
* config/arm/lib1funcs.S: #include eabi/fmul.S (v6m only).
* config/arm/t-elf: Move _mulsf3 to global scope in LIB1ASMFUNCS
(this object was formerly blocked on v6m builds).
---
 libgcc/config/arm/eabi/fmul.S | 215 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |   3 +-
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/arm/eabi/fmul.S

diff --git a/libgcc/config/arm/eabi/fmul.S b/libgcc/config/arm/eabi/fmul.S
new file mode 100644
index 000..767de988f0b
--- /dev/null
+++ b/libgcc/config/arm/eabi/fmul.S
@@ -0,0 +1,215 @@
+/* fmul.S: Thumb-1 optimized 32-bit float multiplication
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_mulsf3
+
+// float __aeabi_fmul(float, float)
+// Returns $r0 after multiplication by $r1.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fmul .text.sorted.libgcc.fpcore.m.fmul
+FUNC_ALIAS mulsf3 aeabi_fmul
+CFI_START_FUNCTION
+
+// Standard registers, compatible with exception handling.
+push    { rT, lr }
+.cfi_remember_state
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save the sign of the result.
+movs    rT, r1
+eors    rT, r0
+lsrs    rT, #31
+lsls    rT, #31
+mov ip, rT
+
+// Set up INF for comparison.
+movs    rT, #255
+lsls    rT, #24
+
+// Check for multiplication by zero.
+lsls    r2, r0, #1
+beq LLSYM(__fmul_zero1)
+
+lsls    r3, r1, #1
+beq LLSYM(__fmul_zero2)
+
+// Check for INF/NAN.
+cmp r3, rT
+bhs LLSYM(__fmul_special2)
+
+cmp r2, rT
+bhs LLSYM(__fmul_special1)
+
+// Because neither operand is INF/NAN, the result will be finite.
+// It is now safe to modify the original operand registers.
+lsls    r0, #9
+
+// Isolate the first exponent.  When normal, add back the implicit '1'.
+// The result is always aligned with the MSB in bit [31].
+// Subnormal mantissas remain effectively multiplied by 2x relative to
+//  normals, but this works because the weight of a subnormal is -126.
+lsrs    r2, #24
+beq LLSYM(__fmul_normalize2)
+adds    r0, #1
+rors    r0, r0
+
+LLSYM(__fmul_normalize2):
+// IMPORTANT: exp10i() jumps in here!
+// Repeat for the mantissa of the second operand.
+// Short-circuit when the mantissa is 1.0, as the
+//  first mantissa is already prepared in $r0
+lsls    r1, #9
+
+// When normal, add back the implicit '1'.
+lsrs    r3, #24
+beq LLSYM(__fmul_go)
+adds    r1, #1
+rors    r1, r1
+
+LLSYM(__fmul_go):
+// Calculate the final exponent, relative to bit [30].
+adds    rT, r2, r3
+subs    rT, #127
+
+  #if !defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__
+// Short-circuit on multiplication by powers of 2.
+lsls    r3, r0, #1
+beq LLSYM(__fmul_simple1)
+
+lsls    r3, r1, #1
+beq LLSYM(__fmul_simple2)
+  #endif
+
+// Save $ip across the call.
+// (Alternatively, we could push/pop a separate register, but the four
+//  instructions here are equally fast without imposing on the stack.)
+add rT, ip
+
+// 32x32 unsigned multiplication, 64 bit result.
+ 
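
For reference, the exponent bookkeeping above in C form (a sketch of the
biased arithmetic only; mantissa handling is omitted):

    /* Biased product exponent: exp(a) + exp(b) - 127, as in
       "adds rT, r2, r3" followed by "subs rT, #127".  */
    int fmul_exponent_model (unsigned int a_bits, unsigned int b_bits)
    {
      int e1 = (a_bits >> 23) & 0xFF;   /* subnormals read as 0 here */
      int e2 = (b_bits >> 23) & 0xFF;
      return e1 + e2 - 127;             /* relative to bit [30] */
    }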

[PATCH v4 26/29] Import float-to-integer conversion functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/bpabi-lib.h: Remove obsolete RENAME_LIBRARY directives.
* config/arm/eabi/ffixed.S: New file for __fix[uns]sfsi/sfdi().
* config/arm/lib1funcs.S: #include eabi/ffixed.S (v6m only).
* config/arm/t-elf: Add _fix[uns]sfdi/sfsi objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/bpabi-lib.h   |   6 -
 libgcc/config/arm/eabi/ffixed.S | 414 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   4 +
 4 files changed, 419 insertions(+), 6 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ffixed.S

diff --git a/libgcc/config/arm/bpabi-lib.h b/libgcc/config/arm/bpabi-lib.h
index 1e651ead4ac..a1c631640bb 100644
--- a/libgcc/config/arm/bpabi-lib.h
+++ b/libgcc/config/arm/bpabi-lib.h
@@ -32,9 +32,6 @@
 #ifdef L_muldi3
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (muldi3, lmul)
 #endif
-#ifdef L_muldi3
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (muldi3, lmul)
-#endif
 #ifdef L_fixdfdi
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixdfdi, d2lz) \
   extern DWtype __fixdfdi (DFtype) __attribute__((pcs("aapcs"))); \
@@ -62,9 +59,6 @@
 #ifdef L_fixunsdfsi
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixunsdfsi, d2uiz)
 #endif
-#ifdef L_fixunssfsi
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixunssfsi, f2uiz)
-#endif
 #ifdef L_floatundidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundidf, ul2d)
 #endif
diff --git a/libgcc/config/arm/eabi/ffixed.S b/libgcc/config/arm/eabi/ffixed.S
new file mode 100644
index 000..8ced3a701ff
--- /dev/null
+++ b/libgcc/config/arm/eabi/ffixed.S
@@ -0,0 +1,414 @@
+/* ffixed.S: Thumb-1 optimized float-to-integer conversion
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+// The implementation of __aeabi_f2uiz() expects to tail call __internal_f2iz()
+//  with the flags register set for unsigned conversion.  The __internal_f2iz()
+//  symbol itself is unambiguous, but there is a remote risk that the linker
+//  will prefer some other symbol in place of __aeabi_f2iz().  Importing an
+//  archive file that exports __aeabi_f2iz() will throw an error in this case.
+// As a workaround, this block configures __aeabi_f2iz() for compilation twice.
+// The first version configures __internal_f2iz() as a WEAK standalone symbol,
+//  and the second exports __aeabi_f2iz() and __internal_f2iz() normally.
+// A small bonus: programs only using __aeabi_f2uiz() will be slightly smaller.
+// '_internal_fixsfsi' should appear before '_arm_fixsfsi' in LIB1ASMFUNCS.
+#if defined(L_arm_fixsfsi) || \
+   (defined(L_internal_fixsfsi) && \
+  !(defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__))
+
+// Subsection ordering within fpcore keeps conditional branches within range.
+#define F2IZ_SECTION .text.sorted.libgcc.fpcore.r.fixsfsi
+
+// int __aeabi_f2iz(float)
+// Converts a float in $r0 to signed integer, rounding toward 0.
+// Values out of range are forced to either INT_MAX or INT_MIN.
+// NAN becomes zero.
+#ifdef L_arm_fixsfsi
+FUNC_START_SECTION aeabi_f2iz F2IZ_SECTION
+FUNC_ALIAS fixsfsi aeabi_f2iz
+CFI_START_FUNCTION
+#endif
+
+  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+// Flag for unsigned conversion.
+movs    r1, #33
+b   SYM(__internal_fixsfdi)
+
+  #else /* !__OPTIMIZE_SIZE__ */
+
+#ifdef L_arm_fixsfsi
+// Flag for signed conversion.
+movs    r3, #1
+
+// [unsigned] int internal_f2iz(float, int)
+// Internal function expects a boolean flag in $r1.
+// If the boolean flag is 0, the result is unsigned.
+// If the boolean flag is 1, the result is signed.
+FUNC_ENTRY internal_f2iz
+
+#else /* L_internal_fixsfsi */
+WEAK_START_SECTION internal_f2iz F2IZ_SECTION
+CFI_START_FUNCTION
+
+#endif
+
+// Isolate the sign of the result.
+   
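
For reference, a minimal C model of the __aeabi_f2iz() saturation contract
stated above (round toward zero, saturate out-of-range values, NAN becomes
zero):

    #include <limits.h>

    int f2iz_model (float f)
    {
      if (f != f)
        return 0;                      /* NAN becomes zero */
      if (f >= 2147483648.0f)
        return INT_MAX;                /* forced to INT_MAX */
      if (f < -2147483648.0f)
        return INT_MIN;                /* forced to INT_MIN */
      return (int)f;                   /* C casts truncate toward zero */
    }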

[PATCH v4 24/29] Import single precision division from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/eabi/fdiv.S: New file for __divsf3().
* config/arm/lib1funcs.S: #include eabi/fdiv.S (v6m only).
* config/arm/t-elf: Add _divsf3 and _fp_divloopf3 to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/fdiv.S | 261 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |   2 +
 3 files changed, 264 insertions(+)
 create mode 100644 libgcc/config/arm/eabi/fdiv.S

diff --git a/libgcc/config/arm/eabi/fdiv.S b/libgcc/config/arm/eabi/fdiv.S
new file mode 100644
index 000..118f4e94676
--- /dev/null
+++ b/libgcc/config/arm/eabi/fdiv.S
@@ -0,0 +1,261 @@
+/* fdiv.S: Cortex M0 optimized 32-bit float division
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_divsf3
+
+// float __aeabi_fdiv(float, float)
+// Returns $r0 after division by $r1.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fdiv .text.sorted.libgcc.fpcore.n.fdiv
+FUNC_ALIAS divsf3 aeabi_fdiv
+CFI_START_FUNCTION
+
+// Standard registers, compatible with exception handling.
+push    { rT, lr }
+.cfi_remember_state
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save the sign of the result.
+movs    r3, r1
+eors    r3, r0
+lsrs    rT, r3, #31
+lsls    rT, #31
+mov ip, rT
+
+// Set up INF for comparison.
+movs    rT, #255
+lsls    rT, #24
+
+// Check for divide by 0.  Automatically catches 0/0.
+lsls    r2, r1, #1
+beq LLSYM(__fdiv_by_zero)
+
+// Check for INF/INF, or a number divided by itself.
+lsls    r3, #1
+beq LLSYM(__fdiv_equal)
+
+// Check the numerator for INF/NAN.
+eors r3, r2
+cmp r3, rT
+bhs LLSYM(__fdiv_special1)
+
+// Check the denominator for INF/NAN.
+cmp r2, rT
+bhs LLSYM(__fdiv_special2)
+
+// Check the numerator for zero.
+cmp r3, #0
+beq SYM(__fp_zero)
+
+// No action if the numerator is subnormal.
+//  The mantissa will normalize naturally in the division loop.
+lsls r0, #9
+lsrs r1, r3, #24
+beq LLSYM(__fdiv_denominator)
+
+// Restore the numerator's implicit '1'.
+adds r0, #1
+rors r0, r0
+
+LLSYM(__fdiv_denominator):
+// The denominator must be normalized and left aligned.
+bl  SYM(__fp_normalize2)
+
+// 25 bits of precision will be sufficient.
+movs rT, #64
+
+// Run division.
+bl  SYM(__fp_divloopf)
+b   SYM(__fp_assemble)
+
+LLSYM(__fdiv_equal):
+  #if defined(EXCEPTION_CODES) && EXCEPTION_CODES
+movs r3, #(DIVISION_INF_BY_INF)
+  #endif
+
+// The absolute values of the two operands are equal, but not 0.
+// If both operands are INF, create a new NAN.
+cmp r2, rT
+beq SYM(__fp_exception)
+
+  #if defined(TRAP_NANS) && TRAP_NANS
+// If both operands are NAN, return the NAN in $r0.
+bhi SYM(__fp_check_nan)
+  #else
+bhi LLSYM(__fdiv_return)
+  #endif
+
+// Return 1.0f, with appropriate sign.
+movs r0, #127
+lsls r0, #23
+add r0, ip
+
+LLSYM(__fdiv_return):
+pop { rT, pc }
+.cfi_restore_state
+
+LLSYM(__fdiv_special2):
+// The denominator is either INF or NAN; the numerator is neither.
+// Also, the denominator is not equal to 0.
+
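In outline, the fast path above is textbook long division: the result
sign is the XOR of the operand signs, both mantissas are normalized
with their implicit '1' restored, and the quotient mantissa comes from
a shift-and-subtract loop. A rough C model of that loop follows; it is
a sketch only, and mantissa_div is a hypothetical stand-in for
__fp_divloopf, whose actual register interface and hand-off to
__fp_assemble are not shown:

    #include <stdint.h>

    /* Restoring division on left-aligned mantissas; 'bits' plays
       the role of the precision count loaded into rT above.  */
    static uint32_t
    mantissa_div (uint32_t num, uint32_t den, int bits)
    {
      uint32_t quo = 0;
      while (bits-- > 0)
        {
          quo <<= 1;
          if (num >= den)       /* Subtract when the divisor fits...  */
            {
              num -= den;
              quo |= 1;         /* ...and record a quotient bit.  */
            }
          num <<= 1;
        }
      return quo;               /* Rounded and packed by the caller.  */
    }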

[PATCH v4 17/29] Import replacement 64-bit division functions from the CM0 library.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/bpabi.c: Deleted unused file.
* config/arm/eabi/ldiv.S: Replaced the __aeabi_ldivmod() and
__aeabi_uldivmod() wrapper functions with a full implementation.
* config/arm/t-bpabi: Removed bpabi.c from LIB2ADD_ST.
* config/arm/t-elf: Add _divdi3 and _udivdi3 to LIB1ASMFUNCS.
---
 libgcc/config/arm/bpabi.c |  42 ---
 libgcc/config/arm/eabi/ldiv.S | 571 +-
 libgcc/config/arm/t-bpabi |   3 +-
 libgcc/config/arm/t-elf   |   9 +
 4 files changed, 501 insertions(+), 124 deletions(-)
 delete mode 100644 libgcc/config/arm/bpabi.c
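For orientation: the run-time ABI defines __aeabi_ldivmod as returning
the quotient in r0-r1 and the remainder in r2-r3, which is why the old
wrapper below shuffles the stack around __gnu_ldivmod_helper. On the
caller side (a sketch, assuming a soft-float Thumb-1 target), both
operators map onto the same entry point:

    /* Each of '/' and '%' lowers to a __aeabi_ldivmod call; the
       compiler may combine the two into a single call, since one
       invocation produces both results.  */
    void divmod (long long a, long long b, long long *q, long long *r)
    {
      *q = a / b;
      *r = a % b;
    }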

diff --git a/libgcc/config/arm/bpabi.c b/libgcc/config/arm/bpabi.c
deleted file mode 100644
index bf6ba757964..000
--- a/libgcc/config/arm/bpabi.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Miscellaneous BPABI functions.
-
-   Copyright (C) 2003-2021 Free Software Foundation, Inc.
-   Contributed by CodeSourcery, LLC.
-
-   This file is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by the
-   Free Software Foundation; either version 3, or (at your option) any
-   later version.
-
-   This file is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-extern long long __divdi3 (long long, long long);
-extern unsigned long long __udivdi3 (unsigned long long,
-                                     unsigned long long);
-extern long long __gnu_ldivmod_helper (long long, long long, long long *);
-
-
-long long
-__gnu_ldivmod_helper (long long a,
-                      long long b,
-                      long long *remainder)
-{
-  long long quotient;
-
-  quotient = __divdi3 (a, b);
-  *remainder = a - b * quotient;
-  return quotient;
-}
-
diff --git a/libgcc/config/arm/eabi/ldiv.S b/libgcc/config/arm/eabi/ldiv.S
index 514a3b8c3a3..c225e5973b2 100644
--- a/libgcc/config/arm/eabi/ldiv.S
+++ b/libgcc/config/arm/eabi/ldiv.S
@@ -1,82 +1,493 @@
+/* ldiv.S: Thumb-1 optimized 64-bit integer division
 
-.macro test_div_by_zero signed
-   cmp yyh, #0
-   bne 7f
-   cmp yyl, #0
-   bne 7f
-   cmp xxh, #0
-   .ifc \signed, unsigned
-   bne 2f
-   cmp xxl, #0
-2:
-   beq 3f
-   movs xxh, #0
-   mvns xxh, xxh   @ 0xffffffff
-   movs xxl, xxh
-3:
-   .else
-   blt 6f
-   bgt 4f
-   cmp xxl, #0
-   beq 5f
-4: movs xxl, #0
-   mvns xxl, xxl   @ 0xffffffff
-   lsrs xxh, xxl, #1   @ 0x7fffffff
-   b   5f
-6: movs xxh, #0x80
-   lsls xxh, xxh, #24   @ 0x80000000
-   movs xxl, #0
-5:
-   .endif
-   @ tailcalls are tricky on v6-m.
-   push {r0, r1, r2}
-   ldr r0, 1f
-   adr r1, 1f
-   adds r0, r1
-   str r0, [sp, #8]
-   @ We know we are not on armv4t, so pop pc is safe.
-   pop {r0, r1, pc}
-   .align  2
-1:
-   .word   __aeabi_ldiv0 - 1b
-7:
-.endm
-
-#ifdef L_aeabi_ldivmod
-
-FUNC_START aeabi_ldivmod
-   test_div_by_zero signed
-
-   push {r0, r1}
-   mov r0, sp
-   push {r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__gnu_ldivmod_helper)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_ldivmod
-
-#endif /* L_aeabi_ldivmod */
-
-#ifdef L_aeabi_uldivmod
-
-FUNC_START aeabi_uldivmod
-   test_div_by_zero unsigned
-
-   push {r0, r1}
-   mov r0, sp
-   push {r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__udivmoddi4)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_uldivmod
-   
-#endif /* L_aeabi_uldivmod */
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
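For context, the AEABI divide-by-zero behavior encoded by the old
test_div_by_zero macro above (and preserved by the replacement) can be
modeled in C. A hedged sketch of the signed case, with a hypothetical
helper name:

    #include <stdint.h>

    /* Result of a signed 64-bit division by zero, computed before
       the tail call to __aeabi_ldiv0: saturate toward the sign of
       the numerator, with 0/0 yielding 0, as in the macro above.  */
    static int64_t
    ldiv0_result (int64_t numerator)
    {
      if (numerator > 0)
        return INT64_MAX;
      if (numerator < 0)
        return INT64_MIN;
      return 0;
    }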

[PATCH v4 04/29] Refactor 64-bit shift functions into a new file.

2021-01-11 Thread gnu
From: Daniel Engel 

gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/lib1funcs.S: Move __ashldi3(), __ashrdi3(), __lshldi3() to
* config/arm/bits/shift.S: New file.
---
 libgcc/config/arm/bits/shift.S | 102 
 libgcc/config/arm/lib1funcs.S  | 103 +
 2 files changed, 103 insertions(+), 102 deletions(-)
 create mode 100644 libgcc/config/arm/bits/shift.S

diff --git a/libgcc/config/arm/bits/shift.S b/libgcc/config/arm/bits/shift.S
new file mode 100644
index 000..94e466ac0d2
--- /dev/null
+++ b/libgcc/config/arm/bits/shift.S
@@ -0,0 +1,102 @@
+
+#ifdef L_lshrdi3
+
+   FUNC_START lshrdi3
+   FUNC_ALIAS aeabi_llsr lshrdi3
+   
+#ifdef __thumb__
+   lsrs al, r2
+   movs r3, ah
+   lsrs ah, r2
+   mov ip, r3
+   subs r2, #32
+   lsrs r3, r2
+   orrs al, r3
+   negs r2, r2
+   mov r3, ip
+   lsls r3, r2
+   orrs al, r3
+   RET
+#else
+   subs r3, r2, #32
+   rsb ip, r2, #32
+   movmi   al, al, lsr r2
+   movpl   al, ah, lsr r3
+   orrmi   al, al, ah, lsl ip
+   mov ah, ah, lsr r2
+   RET
+#endif
+   FUNC_END aeabi_llsr
+   FUNC_END lshrdi3
+
+#endif
+   
+#ifdef L_ashrdi3
+   
+   FUNC_START ashrdi3
+   FUNC_ALIAS aeabi_lasr ashrdi3
+   
+#ifdef __thumb__
+   lsrs al, r2
+   movs r3, ah
+   asrs ah, r2
+   subs r2, #32
+   @ If r2 is negative at this point the following step would OR
+   @ the sign bit into all of AL.  That's not what we want...
+   bmi 1f
+   mov ip, r3
+   asrs r3, r2
+   orrs al, r3
+   mov r3, ip
+1:
+   negs r2, r2
+   lsls r3, r2
+   orrs al, r3
+   RET
+#else
+   subs r3, r2, #32
+   rsb ip, r2, #32
+   movmi   al, al, lsr r2
+   movpl   al, ah, asr r3
+   orrmi   al, al, ah, lsl ip
+   mov ah, ah, asr r2
+   RET
+#endif
+
+   FUNC_END aeabi_lasr
+   FUNC_END ashrdi3
+
+#endif
+
+#ifdef L_ashldi3
+
+   FUNC_START ashldi3
+   FUNC_ALIAS aeabi_llsl ashldi3
+   
+#ifdef __thumb__
+   lsls ah, r2
+   movs r3, al
+   lsls al, r2
+   mov ip, r3
+   subs r2, #32
+   lsls r3, r2
+   orrs ah, r3
+   negs r2, r2
+   mov r3, ip
+   lsrs r3, r2
+   orrs ah, r3
+   RET
+#else
+   subs r3, r2, #32
+   rsb ip, r2, #32
+   movmi   ah, ah, lsl r2
+   movpl   ah, al, lsl r3
+   orrmi   ah, ah, al, lsr ip
+   mov al, al, lsl r2
+   RET
+#endif
+   FUNC_END aeabi_llsl
+   FUNC_END ashldi3
+
+#endif
+
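The Thumb-1 sequences above stay branch-free (except for the sign-fill
case in ashrdi3) by relying on register-specified shifts of 32-255
producing zero, so the in-range and cross-word contributions can simply
be ORed together. Below is a C model of the logical right shift, as a
sketch only; the explicit branch exists because C, unlike the hardware,
leaves 32-bit shifts by 32 or more undefined:

    #include <stdint.h>

    /* Model of __lshrdi3 for shift counts 0 < n < 64: the low word
       combines its own shifted bits with bits moved down from the
       high word, exactly as the al/ah register pair is combined.  */
    static uint64_t
    model_lshrdi3 (uint64_t x, unsigned n)
    {
      uint32_t lo = (uint32_t) x;
      uint32_t hi = (uint32_t) (x >> 32);
      if (n < 32)
        {
          lo = (lo >> n) | (hi << (32 - n));
          hi >>= n;
        }
      else
        {
          lo = hi >> (n - 32);
          hi = 0;
        }
      return ((uint64_t) hi << 32) | lo;
    }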
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index acafff62448..c7a3b85bf2b 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1618,108 +1618,7 @@ LSYM(Lover12):
 
 /* Prevent __aeabi double-word shifts from being produced on SymbianOS.  */
 #ifndef __symbian__
-
-#ifdef L_lshrdi3
-
-   FUNC_START lshrdi3
-   FUNC_ALIAS aeabi_llsr lshrdi3
-   
-#ifdef __thumb__
-   lsrs al, r2
-   movs r3, ah
-   lsrs ah, r2
-   mov ip, r3
-   subs r2, #32
-   lsrs r3, r2
-   orrs al, r3
-   negs r2, r2
-   mov r3, ip
-   lsls r3, r2
-   orrs al, r3
-   RET
-#else
-   subs r3, r2, #32
-   rsb ip, r2, #32
-   movmi   al, al, lsr r2
-   movpl   al, ah, lsr r3
-   orrmi   al, al, ah, lsl ip
-   mov ah, ah, lsr r2
-   RET
-#endif
-   FUNC_END aeabi_llsr
-   FUNC_END lshrdi3
-
-#endif
-   
-#ifdef L_ashrdi3
-   
-   FUNC_START ashrdi3
-   FUNC_ALIAS aeabi_lasr ashrdi3
-   
-#ifdef __thumb__
-   lsrs al, r2
-   movs r3, ah
-   asrs ah, r2
-   subs r2, #32
-   @ If r2 is negative at this point the following step would OR
-   @ the sign bit into all of AL.  That's not what we want...
-   bmi 1f
-   mov ip, r3
-   asrs r3, r2
-   orrs al, r3
-   mov r3, ip
-1:
-   negs r2, r2
-   lsls r3, r2
-   orrs al, r3
-   RET
-#else
-   subs r3, r2, #32
-   rsb ip, r2, #32
-   movmi   al, al, lsr r2
-   movpl   al, ah, asr r3
-   orrmi   al, al, ah, lsl ip
-   mov ah, ah, asr r2
-   RET
-#endif
-
-   FUNC_END aeabi_lasr
-   FUNC_END ashrdi3
-
-#endif
-
-#ifdef L_ashldi3
-
-   FUNC_START ashldi3
-   FUNC_ALIAS aeabi_llsl ashldi3
-   
-#ifdef __thumb__
-   lsls ah, r2
-   movs r3, al
-   lsls al, r2
-   mov ip, r3
-   subs r2, #32
-   lsls r3, r2
-   orrs ah, r3
-   negs r2, r2
-   mov r3, ip
-   lsrs r3, r2
-   orrs ah, r3
-   RET
-#else
-   subs r3, r2, #32