From: Daniel Engel <g...@danielengel.com>

On architectures with no clz instruction, this version combines __clzdi2()
with __clzsi2() into a single object with an efficient tail call.  Also, this
version merges the formerly separate Thumb and ARM implementations into a
unified instruction sequence.  This change significantly improves Thumb
performance without affecting ARM performance.  Finally, this version adds a
new __OPTIMIZE_SIZE__ build option (using a loop).
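As a reading aid, the bitwise binary search used below corresponds to the
following C sketch of the size-optimized (loop) variant; the function name
and comments are illustrative only, and the speed-optimized variant unrolls
the same search, replacing the last two iterations with a table lookup:

    /* Hypothetical C model of the looping __clzsi2() below; not the
       actual libgcc source.  */
    unsigned int clzsi2_model (unsigned int x)
    {
      unsigned int count = 32;  /* Assume all bits in the argument are zero. */
      unsigned int shift = 16;  /* Binary search starts at half the width.  */

      do
        {
          unsigned int upper = x >> shift;
          if (upper != 0)       /* The upper 'shift' bits are non-zero,  */
            {
              x = upper;        /* so discard the lower bits             */
              count -= shift;   /* and deduct them from the result.      */
            }
          shift >>= 1;          /* Decrease the shift distance.  */
        }
      while (shift != 0);

      return count - x;         /* x is now 0 or 1; account for the remainder. */
    }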
On architectures with a clz instruction, functionality is unchanged.

gcc/libgcc/ChangeLog:
2021-01-07  Daniel Engel  <g...@danielengel.com>

	* config/arm/bits/clz2.S: Size-optimized bitwise versions of
	__clzsi2() and __clzdi2() (i.e. __ARM_FEATURE_CLZ not available).
	* config/arm/lib1funcs.S: Moved CFI_FUNCTION macros, added
	__ARM_FEATURE_IT.
	* config/arm/t-elf: Move _clzsi2 to new group of weak LIB1ASMFUNCS.
---
 libgcc/config/arm/bits/clz2.S | 342 ++++++++++++++++++++++------------
 libgcc/config/arm/lib1funcs.S |  25 ++-
 libgcc/config/arm/t-elf       |   8 +-
 3 files changed, 248 insertions(+), 127 deletions(-)

diff --git a/libgcc/config/arm/bits/clz2.S b/libgcc/config/arm/bits/clz2.S
index 1c8f10a5b29..d0a1fbec4d0 100644
--- a/libgcc/config/arm/bits/clz2.S
+++ b/libgcc/config/arm/bits/clz2.S
@@ -1,124 +1,234 @@
+/* clz2.S: Cortex M0 optimized 'clz' functions
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+
+#ifdef L_clzdi2
+
+// int __clzdi2(long long)
+// Counts leading zero bits in $r1:$r0.
+// Returns the result in $r0.
+FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
+	CFI_START_FUNCTION
+
+	// Moved here from lib1funcs.S
+	cmp	xxh,	#0
+	do_it	eq,	et
+	clzeq	r0,	xxl
+	clzne	r0,	xxh
+	addeq	r0,	#32
+	RET
+
+	CFI_END_FUNCTION
+FUNC_END clzdi2
+
+#endif /* L_clzdi2 */
+
 #ifdef L_clzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzsi2
-	movs	r1, #28
-	movs	r3, #1
-	lsls	r3, r3, #16
-	cmp	r0, r3 /* 0x10000 */
-	bcc	2f
-	lsrs	r0, r0, #16
-	subs	r1, r1, #16
-2:	lsrs	r3, r3, #8
-	cmp	r0, r3 /* #0x100 */
-	bcc	2f
-	lsrs	r0, r0, #8
-	subs	r1, r1, #8
-2:	lsrs	r3, r3, #4
-	cmp	r0, r3 /* #0x10 */
-	bcc	2f
-	lsrs	r0, r0, #4
-	subs	r1, r1, #4
-2:	adr	r2, 1f
-	ldrb	r0, [r2, r0]
-	adds	r0, r0, r1
-	bx	lr
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-	FUNC_END clzsi2
-#else
-ARM_FUNC_START clzsi2
-# if defined (__ARM_FEATURE_CLZ)
-	clz	r0, r0
-	RET
-# else
-	mov	r1, #28
-	cmp	r0, #0x10000
-	do_it	cs, t
-	movcs	r0, r0, lsr #16
-	subcs	r1, r1, #16
-	cmp	r0, #0x100
-	do_it	cs, t
-	movcs	r0, r0, lsr #8
-	subcs	r1, r1, #8
-	cmp	r0, #0x10
-	do_it	cs, t
-	movcs	r0, r0, lsr #4
-	subcs	r1, r1, #4
-	adr	r2, 1f
-	ldrb	r0, [r2, r0]
-	add	r0, r0, r1
-	RET
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-	FUNC_END clzsi2
-#endif
+
+// int __clzsi2(int)
+// Counts leading zero bits in $r0.
+// Returns the result in $r0.
+FUNC_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2
+	CFI_START_FUNCTION
+
+	// Moved here from lib1funcs.S
+	clz	r0,	r0
+	RET
+
+	CFI_END_FUNCTION
+FUNC_END clzsi2
+
 #endif /* L_clzsi2 */
 
+#else /* !__ARM_FEATURE_CLZ */
+
 #ifdef L_clzdi2
-#if !defined (__ARM_FEATURE_CLZ)
-
-# ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzdi2
-	push	{r4, lr}
-	cmp	xxh, #0
-	bne	1f
-# ifdef __ARMEB__
-	movs	r0, xxl
-	bl	__clzsi2
-	adds	r0, r0, #32
-	b	2f
-1:
-	bl	__clzsi2
-# else
-	bl	__clzsi2
-	adds	r0, r0, #32
-	b	2f
-1:
-	movs	r0, xxh
-	bl	__clzsi2
-# endif
-2:
-	pop	{r4, pc}
-# else /* NOT_ISA_TARGET_32BIT */
-ARM_FUNC_START clzdi2
-	do_push	{r4, lr}
-	cmp	xxh, #0
-	bne	1f
-# ifdef __ARMEB__
-	mov	r0, xxl
-	bl	__clzsi2
-	add	r0, r0, #32
-	b	2f
-1:
-	bl	__clzsi2
-# else
-	bl	__clzsi2
-	add	r0, r0, #32
-	b	2f
-1:
-	mov	r0, xxh
-	bl	__clzsi2
-# endif
-2:
-	RETLDM	r4
-	FUNC_END clzdi2
-# endif /* NOT_ISA_TARGET_32BIT */
-
-#else /* defined (__ARM_FEATURE_CLZ) */
-
-ARM_FUNC_START clzdi2
-	cmp	xxh, #0
-	do_it	eq, et
-	clzeq	r0, xxl
-	clzne	r0, xxh
-	addeq	r0, r0, #32
-	RET
-	FUNC_END clzdi2
-#endif
+// int __clzdi2(long long)
+// Counts leading zero bits in $r1:$r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
+	CFI_START_FUNCTION
+
+  #if defined(__ARMEB__) && __ARMEB__
+	// Check if the upper word is zero.
+	cmp	r0,	#0
+
+	// The upper word is non-zero, so calculate __clzsi2(upper).
+	bne	SYM(__clzsi2)
+
+	// The upper word is zero, so calculate 32 + __clzsi2(lower).
+	movs	r2,	#64
+	movs	r0,	r1
+	b	SYM(__internal_clzsi2)
+
+  #else /* !__ARMEB__ */
+	// Assume all the bits in the argument are zero.
+	movs	r2,	#64
+
+	// Check if the upper word is zero.
+	cmp	r1,	#0
+
+	// The upper word is zero, so calculate 32 + __clzsi2(lower).
+	beq	SYM(__internal_clzsi2)
+
+	// The upper word is non-zero, so set up __clzsi2(upper).
+	// Then fall through.
+	movs	r0,	r1
+
+  #endif /* !__ARMEB__ */
+
 #endif /* L_clzdi2 */
+
+// The bitwise implementation of __clzdi2() tightly couples with __clzsi2(),
+// such that instructions must appear consecutively in the same memory
+// section for proper flow control.  However, this construction inhibits
+// the ability to discard __clzdi2() when only using __clzsi2().
+// Therefore, this block configures __clzsi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+// version is the continuation of __clzdi2().  The standalone version must
+// be declared WEAK, so that the combined version can supersede it and
+// provide both symbols when required.
+// '_clzsi2' should appear before '_clzdi2' in LIB1ASMFUNCS.
+#if defined(L_clzsi2) || defined(L_clzdi2)
+
+#ifdef L_clzsi2
+// int __clzsi2(int)
+// Counts leading zero bits in $r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+WEAK_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2
+	CFI_START_FUNCTION
+
+#else /* L_clzdi2 */
+FUNC_ENTRY clzsi2
+
+#endif
+
+	// Assume all the bits in the argument are zero
+	movs	r2,	#32
+
+#ifdef L_clzsi2
+	WEAK_ENTRY internal_clzsi2
+#else /* L_clzdi2 */
+	FUNC_ENTRY internal_clzsi2
+#endif
+
+	// Size optimized: 22 bytes, 51 cycles
+	// Speed optimized: 50 bytes, 20 cycles
+
+  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+
+	// Binary search starts at half the word width.
+	movs	r3,	#16
+
+    LLSYM(__clz_loop):
+	// Test the upper 'n' bits of the operand for ZERO.
+	movs	r1,	r0
+	lsrs	r1,	r3
+
+	// When the test fails, discard the lower bits of the register,
+	// and deduct the count of discarded bits from the result.
+  #ifdef __ARM_FEATURE_IT
+	do_it	ne,	t
+  #else
+	beq	LLSYM(__clz_skip)
+  #endif
+
+	IT(mov,ne)	r0,	r1
+	IT(sub,ne)	r2,	r3
+
+    LLSYM(__clz_skip):
+	// Decrease the shift distance for the next test.
+	lsrs	r3,	#1
+	bne	LLSYM(__clz_loop)
+
+  #else /* __OPTIMIZE_SIZE__ */
+
+	// Unrolled binary search.
+	lsrs	r1,	r0,	#16
+  #ifdef __ARM_FEATURE_IT
+	do_it	ne,	t
+  #else
+	beq	LLSYM(__clz8)
+  #endif
+
+	IT(mov,ne)	r0,	r1
+	IT(sub,ne)	r2,	#16
+
+    LLSYM(__clz8):
+	lsrs	r1,	r0,	#8
+  #ifdef __ARM_FEATURE_IT
+	do_it	ne,	t
+  #else
+	beq	LLSYM(__clz4)
+  #endif
+
+	IT(mov,ne)	r0,	r1
+	IT(sub,ne)	r2,	#8
+
+    LLSYM(__clz4):
+	lsrs	r1,	r0,	#4
+  #ifdef __ARM_FEATURE_IT
+	do_it	ne,	t
+  #else
+	beq	LLSYM(__clz2)
+  #endif
+
+	IT(mov,ne)	r0,	r1
+	IT(sub,ne)	r2,	#4
+
+    LLSYM(__clz2):
+	// Load the remainder by index
+	adr	r1,	LLSYM(__clz_remainder)
+	ldrb	r0,	[r1, r0]
+
+  #endif /* !__OPTIMIZE_SIZE__ */
+
+	// Account for the remainder.
+	subs	r0,	r2,	r0
+	RET
+
+  #if !defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__
+	.align 2
+    LLSYM(__clz_remainder):
+	.byte 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4
+  #endif
+
+	CFI_END_FUNCTION
+FUNC_END clzsi2
+
+#ifdef L_clzdi2
+FUNC_END clzdi2
+#endif
+
+#endif /* L_clzsi2 || L_clzdi2 */
+
+#endif /* !__ARM_FEATURE_CLZ */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index c7a3b85bf2b..e6cb2570c8d 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -96,6 +96,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define NOT_ISA_TARGET_32BIT 1
 #endif
 
+
 /* How to return from a function call depends on the architecture variant.  */
 
 #if (__ARM_ARCH > 4) || defined(__ARM_ARCH_4T__)
@@ -184,6 +185,16 @@ LSYM(Lend_fde):
 #endif
 .endm
 
+.macro CFI_START_FUNCTION
+	.cfi_startproc
+	.cfi_remember_state
+.endm
+
+.macro CFI_END_FUNCTION
+	.cfi_restore_state
+	.cfi_endproc
+.endm
+
 /* Don't pass dirn, it's there just to get token pasting right.  */
 
 .macro	RETLDM	regs=, cond=, unwind=, dirn=ia
@@ -220,6 +231,7 @@ LSYM(Lend_fde):
    ARM and Thumb-2.  However this is only supported by recent gas, so define
    a set of macros to allow ARM code on older assemblers.  */
 #if defined(__thumb2__)
+#define __ARM_FEATURE_IT
 .macro do_it cond, suffix=""
 	it\suffix	\cond
 .endm
@@ -235,6 +247,9 @@ LSYM(Lend_fde):
 	\name \dest, \src1, \tmp
 .endm
 #else
+#if !defined(__thumb__)
+#define __ARM_FEATURE_IT
+#endif
 .macro do_it cond, suffix=""
 .endm
 .macro shift1 op, arg0, arg1, arg2
@@ -1899,16 +1914,6 @@ LSYM(Lchange_\register):
 
 #endif /* Arch supports thumb.  */
 
-.macro CFI_START_FUNCTION
-	.cfi_startproc
-	.cfi_remember_state
-.endm
-
-.macro CFI_END_FUNCTION
-	.cfi_restore_state
-	.cfi_endproc
-.endm
-
 #ifndef __symbian__
 /* The condition here must match the one in gcc/config/arm/elf.h and
    libgcc/config/arm/t-elf.  */
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 6a31a328073..8f1a4614904 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -19,13 +19,19 @@ endif # !__symbian__
 # implementation.  The soft-fp code is only build for ARMv6M.  This pulls
 # in the asm implementation for other CPUs.
 
+
+# Group 0: WEAK overridable function objects.
+# See respective sources for rationale.
+LIB1ASMFUNCS += \
+	_clzsi2 \
+
+
 # Group 1: Integer functions
 LIB1ASMFUNCS += \
 	_ashldi3 \
 	_ashrdi3 \
 	_lshrdi3 \
 	_clzdi2 \
-	_clzsi2 \
 	_ctzsi2 \
 	_dvmd_tls \
 	_divsi3 \
-- 
2.25.1
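
P.S. The dispatch performed by the combined __clzdi2() above amounts to the
following C sketch (shown for little-endian word order; 'clzsi2_model' refers
to the hypothetical model function sketched earlier, not to a libgcc symbol):

    /* Hypothetical C model of the combined __clzdi2() dispatch; the
       assembly folds the '32 +' term into a start count of 64 before
       falling into the shared binary search.  */
    unsigned int clzdi2_model (unsigned long long x)
    {
      unsigned int upper = (unsigned int) (x >> 32);
      unsigned int lower = (unsigned int) x;

      /* Non-zero upper word: count its leading zeros directly.
         Zero upper word: all 32 upper bits are leading zeros.  */
      return upper ? clzsi2_model (upper) : 32 + clzsi2_model (lower);
    }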