Re: [1/2] PR96463 - aarch64 specific changes

2021-12-27 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > The patch folds:
> > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > into:
> > lhs = vec_perm_expr
> > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> >
> > With patch, for following test:
> > #include <arm_neon.h>
> > #include <arm_sve.h>
> >
> > svint32_t
> > foo (int32x4_t x)
> > {
> >   return svld1rq (svptrue_b8 (), &x[0]);
> > }
> >
> > it generates following code:
> > foo:
> > .LFB4350:
> > dup z0.q, z0.q[0]
> > ret
> >
> > and passes bootstrap+test on aarch64-linux-gnu.
> > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > are correct.
>
> Just in case: I was only using int32x4_t in the PR as an example.
> The same thing should work for all element types.
>
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index 02e42a71e5e..e21bbec360c 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -1207,6 +1207,56 @@ public:
> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >  return e.use_contiguous_load_insn (icode);
> >}
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +tree arg0 = gimple_call_arg (f.call, 0);
> > +tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +/* Transform:
> > +   lhs = svld1rq ({-1, -1, ... }, &v[0])
> > +   into:
> > +   lhs = vec_perm_expr.
> > +   on little endian target.  */
> > +
> > +if (!BYTES_BIG_ENDIAN
> > + && integer_all_onesp (arg0)
> > + && TREE_CODE (arg1) == ADDR_EXPR)
> > +  {
> > + tree t = TREE_OPERAND (arg1, 0);
> > + if (TREE_CODE (t) == ARRAY_REF)
> > +   {
> > + tree index = TREE_OPERAND (t, 1);
> > + t = TREE_OPERAND (t, 0);
> > + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > +   {
> > + t = TREE_OPERAND (t, 0);
> > + tree vectype = TREE_TYPE (t);
> > + if (VECTOR_TYPE_P (vectype)
> > + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > + && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > +   {
>
> Since this is quite a specific pattern match, and since we now lower
> arm_neon.h vld1* to normal gimple accesses, I think we should try the
> “more generally” approach mentioned in the PR and see what the fallout
> is.  That is, keep:
>
> if (!BYTES_BIG_ENDIAN
> && integer_all_onesp (arg0)
>
> If those conditions pass, create an Advanced SIMD access at address arg1,
> using similar code to the handling of:
>
>  BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
>  BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
>  BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
>
> in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> common code to aarch64.c so that both files can use it.)
>
> > + tree lhs = gimple_call_lhs (f.call);
> > + tree lhs_type = TREE_TYPE (lhs);
> > + int source_nelts = TYPE_VECTOR_SUBPARTS 
> > (vectype).to_constant ();
> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
> > source_nelts, 1);
> > + for (int i = 0; i < source_nelts; i++)
> > +   sel.quick_push (i);
> > +
> > + vec_perm_indices indices (sel, 1, source_nelts);
> > + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> > +   return NULL;
>
> I don't think we need to check this: it should always be true.
> Probably worth keeping as a gcc_checking_assert though.
>
> > +
> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, 
> > mask);
> > +   }
> > +   }
> > +   }
> > +  }
> > +
> > +return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index f07330cff4f..af27f550be3 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >
> >machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> >rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > +
> >if (d->one_vector_p)
> > -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, 
> > sel));
> > +{
> > +  bool use_dupq = false;
> > +  /* Check if sel is dup vector with encoded elements {0, 1, 2, ... 
> > nelts}  */
> > +  if (GET_CODE (sel) == CONST_VECTOR
> > +   && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > +   && CONST_VECTOR_DUPLICATE_P (sel))
> > +   {
> > + unsigned nelts = 

Re: [2/2] PR96463 -- changes to type checking vec_perm_expr in middle end

2021-12-27 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 17 Dec 2021 at 16:37, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > The attached patch rearranges order of type-check for vec_perm_expr
> > and relaxes type checking for
> > lhs = vec_perm_expr
> >
> > when:
> > rhs1 == rhs2,
> > lhs is variable length vector,
> > rhs1 is fixed length vector,
> > TREE_TYPE (lhs) == TREE_TYPE (rhs1)
> >
> > I am not sure tho if this check is correct ? My intent was to capture
> > case when vec_perm_expr is used to "extend" fixed length vector to
> > its VLA equivalent.
>
> VLAness isn't really the issue.  We want the same thing to work for
> -msve-vector-bits=256, -msve-vector-bits=512, etc., even though the
> vectors are fixed-length in that case.
>
> The principle is that for:
>
>   A = VEC_PERM_EXPR <B, C, D>;
>
> the requirements are:
>
> - A, B, C and D must be vectors
> - A, B and C must have the same element type
> - D must have an integer element type
> - A and D must have the same number of elements (NA)
> - B and C must have the same number of elements (NB)
>
> The semantics are that we create a joined vector BC (all elements of B
> followed by all elements of C) and that:
>
>   A[i] = BC[D[i] % (NB+NB)]
>
> for 0 ≤ i < NA.
>
> This operation makes sense even if NA != NB.
Thanks for the suggestions, I tried to modify the patch accordingly.
Does it look OK ?
Passes bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh

>
> Thanks,
> Richard
>
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
> > index 672e384ef09..9f91878c468 100644
> > --- a/gcc/tree-cfg.c
> > +++ b/gcc/tree-cfg.c
> > @@ -4325,10 +4325,11 @@ verify_gimple_assign_ternary (gassign *stmt)
> >break;
> >
> >  case VEC_PERM_EXPR:
> > -  if (!useless_type_conversion_p (lhs_type, rhs1_type)
> > -   || !useless_type_conversion_p (lhs_type, rhs2_type))
> > +  if (TREE_CODE (rhs1_type) != VECTOR_TYPE
> > +   || TREE_CODE (rhs2_type) != VECTOR_TYPE
> > +   || TREE_CODE (rhs3_type) != VECTOR_TYPE)
> >   {
> > -   error ("type mismatch in %qs", code_name);
> > +   error ("vector types expected in %qs", code_name);
> > debug_generic_expr (lhs_type);
> > debug_generic_expr (rhs1_type);
> > debug_generic_expr (rhs2_type);
> > @@ -4336,11 +4337,14 @@ verify_gimple_assign_ternary (gassign *stmt)
> > return true;
> >   }
> >
> > -  if (TREE_CODE (rhs1_type) != VECTOR_TYPE
> > -   || TREE_CODE (rhs2_type) != VECTOR_TYPE
> > -   || TREE_CODE (rhs3_type) != VECTOR_TYPE)
> > +  if (TREE_CODE (TREE_TYPE (rhs3_type)) != INTEGER_TYPE
> > +   || (TREE_CODE (rhs3) != VECTOR_CST
> > +   && (GET_MODE_BITSIZE (SCALAR_INT_TYPE_MODE
> > + (TREE_TYPE (rhs3_type)))
> > +   != GET_MODE_BITSIZE (SCALAR_TYPE_MODE
> > +(TREE_TYPE (rhs1_type))
> >   {
> > -   error ("vector types expected in %qs", code_name);
> > +   error ("invalid mask type in %qs", code_name);
> > debug_generic_expr (lhs_type);
> > debug_generic_expr (rhs1_type);
> > debug_generic_expr (rhs2_type);
> > @@ -4348,15 +4352,18 @@ verify_gimple_assign_ternary (gassign *stmt)
> > return true;
> >   }
> >
> > -  if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),
> > - TYPE_VECTOR_SUBPARTS (rhs2_type))
> > -   || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
> > -TYPE_VECTOR_SUBPARTS (rhs3_type))
> > -   || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type),
> > -TYPE_VECTOR_SUBPARTS (lhs_type)))
> > +  /* Accept lhs = vec_perm_expr if lhs is vector length 
> > agnostic,
> > +  and has same element type as v.  */
> > +  if (!TYPE_VECTOR_SUBPARTS (lhs_type).is_constant ()
> > +   && operand_equal_p (rhs1, rhs2, 0)
> > +   && TYPE_VECTOR_SUBPARTS (rhs1_type).is_constant ()
> > +   && TREE_TYPE (lhs_type) == TREE_TYPE (rhs1_type))
> > + return false;
> > +
> > +  if (!useless_type_conversion_p (lhs_type, rhs1_type)
> > +   || !useless_type_conversion_p (lhs_type, rhs2_type))
> >   {
> > -   error ("vectors with different element number found in %qs",
> > -  code_name);
> > +   error ("type mismatch in %qs", code_name);
> > debug_generic_expr (lhs_type);
> > debug_generic_expr (rhs1_type);
> > debug_generic_expr (rhs2_type);
> > @@ -4364,21 +4371,21 @@ verify_gimple_assign_ternary (gassign *stmt)
> > return true;
> >   }
> >
> > -  if (TREE_CODE (TREE_TYPE (rhs3_type)) != INTEGER_TYPE
> > -   || (TREE_CODE (rhs3) != VECTOR_CST
> > -   && (GET_MODE_BITSIZE (SCALAR_INT_TYPE_MODE
> > - (TREE_TYPE (rhs3_type)))
> > -   != GET_MODE_BITSIZE (SCALAR_TYPE_MODE
> > -(TREE_TYPE (rhs1_type))
> > +  if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),

[committed] aarch64: Fix mismatched extern "C" block [PR100985]

2021-12-27 Thread Jonathan Wakely via Gcc-patches
Untested, committed as obvious, fixing the 9.4.0 regression introduced
by r9-8936.


gcc/ChangeLog:

PR target/100985
* config/aarch64/arm_acle.h: Remove unclosed extern "C" block.
---
 gcc/config/aarch64/arm_acle.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 56147352c23..8c622a3985b 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -98,10 +98,6 @@ __rint64x (double __a)
 
 #pragma GCC target ("+nothing+crc")
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 __crc32b (uint32_t __a, uint8_t __b)
 {
-- 
2.31.1



[committed] hppa: Improve atomic store implementation on hppa-linux

2021-12-27 Thread John David Anglin
Atomic stores on hppa-linux must be synthesized using the kernel
light-weight system calls. Instead of using a compare and swap loop,
it is more efficient to use the __sync_lock_test_and_set routines
in libgcc.

Tested on hppa-unknown-linux-gnu. Committed to trunk and gcc-11.

Dave
---

Improve atomic store implementation on hppa-linux.

2021-12-27  John David Anglin  

gcc/ChangeLog:

* config/pa/pa-protos.h: Delete
pa_maybe_emit_compare_and_swap_exchange_loop() declaration.
* config/pa/pa.c (pa_expand_compare_and_swap_loop): Delete.
(pa_maybe_emit_compare_and_swap_exchange_loop): Delete.
* config/pa/pa.md (atomic_storeq): Use __sync_lock_test_and_set
instead of pa_maybe_emit_compare_and_swap_exchange_loop.
(atomic_storehi, atomic_storesi, atomic_storedi): Likewise.

diff --git a/gcc/config/pa/pa-protos.h b/gcc/config/pa/pa-protos.h
index 5bf6fef4968..69377db45c5 100644
--- a/gcc/config/pa/pa-protos.h
+++ b/gcc/config/pa/pa-protos.h
@@ -73,7 +73,6 @@ extern rtx pa_return_addr_rtx (int, rtx);
 
 extern int pa_insn_refs_are_delayed (rtx_insn *);
 extern rtx pa_get_deferred_plabel (rtx);
-extern rtx pa_maybe_emit_compare_and_swap_exchange_loop (rtx, rtx, rtx);
 #endif /* RTX_CODE */
 
 extern int pa_and_mask_p (unsigned HOST_WIDE_INT);
diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c
index 2b10ef34061..895978aea1c 100644
--- a/gcc/config/pa/pa.c
+++ b/gcc/config/pa/pa.c
@@ -11023,82 +11023,6 @@ pa_output_addr_diff_vec (rtx lab, rtx body)
 fputs ("\t.end_brtab\n", asm_out_file);
 }
 
-/* This is a helper function for the other atomic operations.  This function
-   emits a loop that contains SEQ that iterates until a compare-and-swap
-   operation at the end succeeds.  MEM is the memory to be modified.  SEQ is
-   a set of instructions that takes a value from OLD_REG as an input and
-   produces a value in NEW_REG as an output.  Before SEQ, OLD_REG will be
-   set to the current contents of MEM.  After SEQ, a compare-and-swap will
-   attempt to update MEM with NEW_REG.  The function returns true when the
-   loop was generated successfully.  */
-
-static bool
-pa_expand_compare_and_swap_loop (rtx mem, rtx old_reg, rtx new_reg, rtx seq)
-{
-  machine_mode mode = GET_MODE (mem);
-  rtx_code_label *label;
-  rtx cmp_reg, success, oldval;
-
-  /* The loop we want to generate looks like
-
-cmp_reg = mem;
-  label:
-old_reg = cmp_reg;
-seq;
-(success, cmp_reg) = compare-and-swap(mem, old_reg, new_reg)
-if (success)
-  goto label;
-
- Note that we only do the plain load from memory once.  Subsequent
- iterations use the value loaded by the compare-and-swap pattern.  */
-
-  label = gen_label_rtx ();
-  cmp_reg = gen_reg_rtx (mode);
-
-  emit_move_insn (cmp_reg, mem);
-  emit_label (label);
-  emit_move_insn (old_reg, cmp_reg);
-  if (seq)
-emit_insn (seq);
-
-  success = NULL_RTX;
-  oldval = cmp_reg;
-  if (!expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
-   new_reg, false, MEMMODEL_SYNC_SEQ_CST,
-   MEMMODEL_RELAXED))
-return false;
-
-  if (oldval != cmp_reg)
-emit_move_insn (cmp_reg, oldval);
-
-  /* Mark this jump predicted not taken.  */
-  emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
-   GET_MODE (success), 1, label,
-  profile_probability::guessed_never ());
-  return true;
-}
-
-/* This function tries to implement an atomic exchange operation using a 
-   compare_and_swap loop. VAL is written to *MEM.  The previous contents of
-   *MEM are returned, using TARGET if possible.  No memory model is required
-   since a compare_and_swap loop is seq-cst.  */
-
-rtx
-pa_maybe_emit_compare_and_swap_exchange_loop (rtx target, rtx mem, rtx val)
-{
-  machine_mode mode = GET_MODE (mem);
-
-  if (can_compare_and_swap_p (mode, true))
-{
-  if (!target || !register_operand (target, mode))
-target = gen_reg_rtx (mode);
-  if (pa_expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
-return target;
-}
-
-  return NULL_RTX;
-}
-
 /* Implement TARGET_CALLEE_COPIES.  The callee is responsible for copying
arguments passed by hidden reference in the 32-bit HP runtime.  Users
can override this behavior for better compatibility with openmp at the
diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md
index f124c301b7a..af5449a9ea3 100644
--- a/gcc/config/pa/pa.md
+++ b/gcc/config/pa/pa.md
@@ -10366,10 +10366,12 @@ add,l %2,%3,%3\;bv,n %%r0(%3)"
 {
   if (TARGET_SYNC_LIBCALL)
 {
-  rtx mem = operands[0];
-  rtx val = operands[1];
-  if (pa_maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val))
-   DONE;
+  rtx libfunc = init_one_libfunc ("__sync_lock_test_and_set_1");
+
+  emit_library_call (libfunc, LCT_NORMAL, VOIDmode,
+XEXP (operands[0],

[PATCH v6 00/34] libgcc: Thumb-1 Floating-Point Assembly for Cortex M0

2021-12-27 Thread Daniel Engel
Hi Richard, 

I am re-submitting my libgcc patch from last year: 

https://gcc.gnu.org/pipermail/gcc-patches/2021-January/563585.html 

I clearly missed the stage1 window again.  However, since the patch rebased 
cleanly onto gcc-12 with no regressions, and it's not quite stage4 yet, I 
figured submission is worth a chance. 

Regards,
Daniel

---

Changes since v5:

* Rebased and tested with gcc-12

Regressions for -march={armv4t,armv6s-m,armv7-m,armv7-a}, clean master:

# of expected passes            513596
# of unexpected failures        38829
# of unexpected successes       16
# of expected failures          3450
# of unresolved testcases       1108
# of unsupported tests          28224

Patched master:

# of expected passes            513596
# of unexpected failures        38829
# of unexpected successes       16
# of expected failures          3450
# of unresolved testcases       1108
# of unsupported tests          28224

---

This patch series adds an assembly-language implementation of IEEE-754 compliant
single-precision functions designed for the Cortex M0 (v6m) architecture.  There
are improvements to most of the EABI integer functions as well.  This is the
libgcc component of a larger library project originally proposed in 2018:

https://gcc.gnu.org/legacy-ml/gcc/2018-11/msg00043.html

As one point of comparison, a test program [1] links 916 bytes from libgcc with
the patched toolchain vs 10276 bytes with gcc-arm-none-eabi-9-2020-q2 toolchain.
That's a 90% size reduction.

I have extensive test vectors [2], and this patch passes all tests on an
STM32F051.
These vectors were derived from UCB [3], Testfloat [4], and IEEECC754 [5], plus
many of my own generation.

There may be some follow-on projects worth discussing:

* The library is currently integrated into the ARM v6s-m multilib only.  It
is likely that some other architectures would benefit from these routines.
However, I have NOT profiled the existing implementations (ieee754-sf.S) to
estimate where improvements may be found.

* GCC currently lacks tests for some functions, such as __aeabi_[u]ldivmod().
There may be useful bits in [1] that can be integrated.

On Cortex M0, the library has (approximately) the following properties:

Function(s)                     Size (bytes)      Cycles              Stack   Accuracy
__clzsi2                        50                20                  0       exact
__clzsi2 (OPTIMIZE_SIZE)        22                51                  0       exact
__clzdi2                        8+__clzsi2        4+__clzsi2          0       exact

__clrsbsi2                      8+__clzsi2        6+__clzsi2          0       exact
__clrsbdi2                      18+__clzsi2       (8..10)+__clzsi2    0       exact

__ctzsi2                        52                21                  0       exact
__ctzsi2 (OPTIMIZE_SIZE)        24                52                  0       exact
__ctzdi2                        8+__ctzsi2        5+__ctzsi2          0       exact

__ffssi2                        8                 6..(5+__ctzsi2)     0       exact
__ffsdi2                        14+__ctzsi2       9..(8+__ctzsi2)     0       exact

__popcountsi2                   52                25                  0       exact
__popcountsi2 (OPTIMIZE_SIZE)   14                9..201              0       exact
__popcountdi2                   34+__popcountsi2  46                  0       exact
__popcountdi2 (OPTIMIZE_SIZE)   12+__popcountsi2  17..401             0       exact

__paritysi2                     24                14                  0       exact
__paritysi2 (OPTIMIZE_SIZE)     16                38                  0       exact
__paritydi2                     2+__paritysi2     1+__paritysi2       0       exact

__umulsidi3                     44                24                  0       exact
__mulsidi3                      30+__umulsidi3    24+__umulsidi3      8       exact
__muldi3 (__aeabi_lmul)         10+__umulsidi3    6+__umulsidi3       0       exact
__ashldi3 (__aeabi_llsl)        22                13                  0       exact
__lshrdi3 (__aeabi_llsr)        22                13                  0       exact
__ashrdi3 (__aeabi_lasr)        22                13                  0       exact

__aeabi_lcmp                    20                13                  0       exact
__aeabi_ulcmp                   16                10                  0       exact

__udivsi3 (__aeabi_uidiv)       56                72..385             0       < 1 lsb
__divsi3 (__aeabi_idiv)         38+__udivsi3      26+__udivsi3        8       < 1 lsb
__udivdi3 (__aeabi_uldiv)       164               103..1394           16      < 1 lsb
__udivdi3 (OPTIMIZE_SIZE)       142               120..1392           16      < 1 lsb
__divdi3 (__aeabi_ldiv) 54+__udivdi33

[PATCH v6 01/34] Add and restructure function declaration macros

2021-12-27 Thread Daniel Engel
Most of these changes support subsequent patches in this series.
Particularly, the FUNC_START macro becomes part of a new macro chain:

  * FUNC_ENTRY  Common global symbol directives
  * FUNC_START_SECTION  FUNC_ENTRY to start a new <section>
  * FUNC_START  FUNC_START_SECTION <".text">

The effective definition of FUNC_START is unchanged from the previous
version of lib1funcs.  See code comments for detailed usage.

The new names FUNC_ENTRY and FUNC_START_SECTION were chosen specifically
to complement the existing FUNC_START name.  Alternate name patterns are
possible (such as {FUNC_SYMBOL, FUNC_START_SECTION, FUNC_START_TEXT}),
but any change to FUNC_START would require refactoring much of libgcc.

Additionally, a parallel chain of new macros supports weak functions:

  * WEAK_ENTRY
  * WEAK_START_SECTION
  * WEAK_START
  * WEAK_ALIAS

Moving the CFI_* macros earlier in the file scope will increase their
scope for use in additional functions.

gcc/libgcc/ChangeLog:
2021-01-14 Daniel Engel 

* config/arm/lib1funcs.S:
(LLSYM): New macro prefix ".L" for strippable local symbols.
(CFI_START_FUNCTION, CFI_END_FUNCTION): Moved earlier in the file.
(FUNC_ENTRY): New macro for symbols with no ".section" directive.
(WEAK_ENTRY): New macro FUNC_ENTRY + ".weak".
(FUNC_START_SECTION): New macro FUNC_ENTRY with <section> argument.
(WEAK_START_SECTION): New macro FUNC_START_SECTION + ".weak".
(FUNC_START): Redefined in terms of FUNC_START_SECTION <".text">.
(WEAK_START): New macro FUNC_START + ".weak".
(WEAK_ALIAS): New macro FUNC_ALIAS + ".weak".
(FUNC_END): Moved after FUNC_START macro group.
(THUMB_FUNC_START): Moved near the other *FUNC* macros.
(THUMB_SYNTAX, ARM_SYM_START, SYM_END): Deleted unused macros.
---
 libgcc/config/arm/lib1funcs.S | 109 +-
 1 file changed, 69 insertions(+), 40 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index c2fcfc503ec..f14662d7e15 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -69,11 +69,13 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define TYPE(x) .type SYM(x),function
 #define SIZE(x) .size SYM(x), . - SYM(x)
 #define LSYM(x) .x
+#define LLSYM(x) .L##x
 #else
 #define __PLT__
 #define TYPE(x)
 #define SIZE(x)
 #define LSYM(x) x
+#define LLSYM(x) x
 #endif
 
 /* Function end macros.  Variants for interworking.  */
@@ -182,6 +184,16 @@ LSYM(Lend_fde):
 #endif
 .endm
 
+.macro CFI_START_FUNCTION
+   .cfi_startproc
+   .cfi_remember_state
+.endm
+
+.macro CFI_END_FUNCTION
+   .cfi_restore_state
+   .cfi_endproc
+.endm
+
 /* Don't pass dirn, it's there just to get token pasting right.  */
 
 .macro RETLDM  regs=, cond=, unwind=, dirn=ia
@@ -324,10 +336,6 @@ LSYM(Lend_fde):
 .endm
 #endif
 
-.macro FUNC_END name
-   SIZE (__\name)
-.endm
-
 .macro DIV_FUNC_END name signed
cfi_start   __\name, LSYM(Lend_div0)
 LSYM(Ldiv0):
@@ -340,48 +348,76 @@ LSYM(Ldiv0):
FUNC_END \name
 .endm
 
-.macro THUMB_FUNC_START name
-   .globl  SYM (\name)
-   TYPE(\name)
-   .thumb_func
-SYM (\name):
-.endm
-
 /* Function start macros.  Variants for ARM and Thumb.  */
 
 #ifdef __thumb__
 #define THUMB_FUNC .thumb_func
 #define THUMB_CODE .force_thumb
-# if defined(__thumb2__)
-#define THUMB_SYNTAX
-# else
-#define THUMB_SYNTAX
-# endif
 #else
 #define THUMB_FUNC
 #define THUMB_CODE
-#define THUMB_SYNTAX
 #endif
 
+.macro THUMB_FUNC_START name
+   .globl  SYM (\name)
+   TYPE(\name)
+   .thumb_func
+SYM (\name):
+.endm
+
+/* Strong global symbol, ".text" section.
+   The default macro for function declarations. */
 .macro FUNC_START name
-   .text
+   FUNC_START_SECTION \name .text
+.endm
+
+/* Weak global symbol, ".text" section.
+   Use WEAK_* macros to declare a function/object that may be discarded by
+the linker when another library or object exports the same name.
+   Typically, functions declared with WEAK_* macros implement a subset of
+functionality provided by the overriding definition, and are discarded
+when the full functionality is required. */
+.macro WEAK_START name
+   .weak SYM(__\name)
+   FUNC_START_SECTION \name .text
+.endm
+
+/* Strong global symbol, alternate section.
+   Use the *_START_SECTION macros for declarations that the linker should
+place in a non-default section (e.g. ".rodata", ".text.subsection"). */
+.macro FUNC_START_SECTION name section
+   .section \section,"x"
+   .align 0
+   FUNC_ENTRY \name
+.endm
+
+/* Weak global symbol, alternate section. */
+.macro WEAK_START_SECTION name section
+   .weak SYM(__\name)
+   FUNC_START_SECTION \name \section
+.endm
+
+/* Strong global symbol.
+   Use *_ENTRY macros internal to a function/object body to declare a second
+or subsequent entry point wi

[PATCH v6 02/34] Rename THUMB_FUNC_START to THUMB_FUNC_ENTRY

2021-12-27 Thread Daniel Engel
Since THUMB_FUNC_START does not insert the ".text" directive, it aligns
more closely with the new FUNC_ENTRY macro and is renamed accordingly.

THUMB_FUNC_START usage has been universally synonymous with the
".force_thumb" directive, so this is now folded into the definition.
Usage of ".force_thumb" and ".thumb_func" is now tightly coupled
throughout the "arm" subdirectory.

gcc/libgcc/ChangeLog:
2021-01-14 Daniel Engel 

* config/arm/lib1funcs.S: (THUMB_FUNC_START): Renamed to ...
(THUMB_FUNC_ENTRY): for consistency; also added ".force_thumb".
(_call_via_r0): Removed redundant preceding ".force_thumb".
(__gnu_thumb1_case_sqi, __gnu_thumb1_case_uqi, __gnu_thumb1_case_shi,
__gnu_thumb1_case_si): Removed redundant ".force_thumb" and ".syntax".
---
 libgcc/config/arm/lib1funcs.S | 32 +++-
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index f14662d7e15..65d070d8178 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -358,10 +358,11 @@ LSYM(Ldiv0):
 #define THUMB_CODE
 #endif
 
-.macro THUMB_FUNC_START name
+.macro THUMB_FUNC_ENTRY name
.globl  SYM (\name)
TYPE(\name)
.thumb_func
+   .force_thumb
 SYM (\name):
 .endm
 
@@ -1944,10 +1945,9 @@ ARM_FUNC_START ctzsi2

.text
.align 0
-.force_thumb
 
 .macro call_via register
-   THUMB_FUNC_START _call_via_\register
+   THUMB_FUNC_ENTRY _call_via_\register
 
bx  \register
nop
@@ -2030,7 +2030,7 @@ _arm_return_r11:
 .macro interwork_with_frame frame, register, name, return
.code   16
 
-   THUMB_FUNC_START \name
+   THUMB_FUNC_ENTRY \name
 
bx  pc
nop
@@ -2047,7 +2047,7 @@ _arm_return_r11:
 .macro interwork register
.code   16
 
-   THUMB_FUNC_START _interwork_call_via_\register
+   THUMB_FUNC_ENTRY _interwork_call_via_\register
 
bx  pc
nop
@@ -2084,7 +2084,7 @@ LSYM(Lchange_\register):
/* The LR case has to be handled a little differently...  */
.code 16
 
-   THUMB_FUNC_START _interwork_call_via_lr
+   THUMB_FUNC_ENTRY _interwork_call_via_lr
 
bx  pc
nop
@@ -2112,9 +2112,7 @@ LSYM(Lchange_\register):

.text
.align 0
-.force_thumb
-   .syntax unified
-   THUMB_FUNC_START __gnu_thumb1_case_sqi
+   THUMB_FUNC_ENTRY __gnu_thumb1_case_sqi
push{r1}
mov r1, lr
lsrsr1, r1, #1
@@ -2131,9 +2129,7 @@ LSYM(Lchange_\register):

.text
.align 0
-.force_thumb
-   .syntax unified
-   THUMB_FUNC_START __gnu_thumb1_case_uqi
+   THUMB_FUNC_ENTRY __gnu_thumb1_case_uqi
push{r1}
mov r1, lr
lsrsr1, r1, #1
@@ -2150,9 +2146,7 @@ LSYM(Lchange_\register):

.text
.align 0
-.force_thumb
-   .syntax unified
-   THUMB_FUNC_START __gnu_thumb1_case_shi
+   THUMB_FUNC_ENTRY __gnu_thumb1_case_shi
push{r0, r1}
mov r1, lr
lsrsr1, r1, #1
@@ -2170,9 +2164,7 @@ LSYM(Lchange_\register):

.text
.align 0
-.force_thumb
-   .syntax unified
-   THUMB_FUNC_START __gnu_thumb1_case_uhi
+   THUMB_FUNC_ENTRY __gnu_thumb1_case_uhi
push{r0, r1}
mov r1, lr
lsrsr1, r1, #1
@@ -2190,9 +2182,7 @@ LSYM(Lchange_\register):

.text
.align 0
-.force_thumb
-   .syntax unified
-   THUMB_FUNC_START __gnu_thumb1_case_si
+   THUMB_FUNC_ENTRY __gnu_thumb1_case_si
push{r0, r1}
mov r1, lr
adds.n  r1, r1, #2  /* Align to word.  */
-- 
2.25.1



[PATCH v6 03/34] Fix syntax warnings on conditional instructions

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-14 Daniel Engel 

* config/arm/lib1funcs.S (RETLDM, ARM_DIV_BODY, ARM_MOD_BODY,
_interwork_call_via_lr): Moved condition code after the flags
update specifier "s".
(ARM_FUNC_START, THUMB_LDIV0): Removed redundant ".syntax".
---
 libgcc/config/arm/lib1funcs.S | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 65d070d8178..b8693be8e4f 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -204,7 +204,7 @@ LSYM(Lend_fde):
 # if defined(__thumb2__)
pop\cond{\regs, lr}
 # else
-   ldm\cond\dirn   sp!, {\regs, lr}
+   ldm\dirn\cond   sp!, {\regs, lr}
 # endif
.endif
.ifnc "\unwind", ""
@@ -220,7 +220,7 @@ LSYM(Lend_fde):
 # if defined(__thumb2__)
pop\cond{\regs, pc}
 # else
-   ldm\cond\dirn   sp!, {\regs, pc}
+   ldm\dirn\cond   sp!, {\regs, pc}
 # endif
.endif
 #endif
@@ -292,7 +292,6 @@ LSYM(Lend_fde):
pop {r1, pc}
 
 #elif defined(__thumb2__)
-   .syntax unified
.ifc \signed, unsigned
cbz r0, 1f
mov r0, #0x
@@ -429,7 +428,6 @@ SYM (__\name):
 /* For Thumb-2 we build everything in thumb mode.  */
 .macro ARM_FUNC_START name
FUNC_START \name
-   .syntax unified
 .endm
 #define EQUIV .thumb_set
 .macro  ARM_CALL name
@@ -643,7 +641,7 @@ pc  .reqr15
orrhs   \result,   \result,   \curbit,  lsr #3
cmp \dividend, #0   @ Early termination?
do_it   ne, t
-   movnes  \curbit,   \curbit,  lsr #4 @ No, any more bits to do?
+   movsne  \curbit,   \curbit,  lsr #4 @ No, any more bits to do?
movne   \divisor,  \divisor, lsr #4
bne 1b
 
@@ -745,7 +743,7 @@ pc  .reqr15
subhs   \dividend, \dividend, \divisor, lsr #3
cmp \dividend, #1
mov \divisor, \divisor, lsr #4
-   subges  \order, \order, #4
+   subsge  \order, \order, #4
bge 1b
 
tst \order, #3
@@ -2093,7 +2091,7 @@ LSYM(Lchange_\register):
.globl .Lchange_lr
 .Lchange_lr:
tst lr, #1
-   stmeqdb r13!, {lr, pc}
+   stmdbeq r13!, {lr, pc}
mov ip, lr
adreq   lr, _arm_return
bx  ip
-- 
2.25.1



[PATCH v6 04/34] Reorganize LIB1ASMFUNCS object wrapper macros

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-14 Daniel Engel 

* config/arm/t-elf (LIB1ASMFUNCS): Split macros into logical groups.
---
 libgcc/config/arm/t-elf | 66 +
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 9da6cd37054..93ea1cd8f76 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -14,19 +14,59 @@ LIB1ASMFUNCS += _arm_muldf3 _arm_mulsf3
 endif
 endif # !__symbian__
 
-# For most CPUs we have an assembly soft-float implementations.
-# However this is not true for ARMv6M.  Here we want to use the soft-fp C
-# implementation.  The soft-fp code is only build for ARMv6M.  This pulls
-# in the asm implementation for other CPUs.
-LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
-   _call_via_rX _interwork_call_via_rX \
-   _lshrdi3 _ashrdi3 _ashldi3 \
-   _arm_negdf2 _arm_addsubdf3 _arm_muldivdf3 _arm_cmpdf2 _arm_unorddf2 \
-   _arm_fixdfsi _arm_fixunsdfsi \
-   _arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \
-   _arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \
-   _arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \
-   _clzsi2 _clzdi2 _ctzsi2
+# This pulls in the available assembly function implementations.
+# The soft-fp code is only built for ARMv6M, since there is no
+# assembly implementation here for double-precision values.
+
+
+# Group 1: Integer function objects.
+LIB1ASMFUNCS += \
+   _ashldi3 \
+   _ashrdi3 \
+   _lshrdi3 \
+   _clzdi2 \
+   _clzsi2 \
+   _ctzsi2 \
+   _dvmd_tls \
+   _divsi3 \
+   _modsi3 \
+   _udivsi3 \
+   _umodsi3 \
+
+
+# Group 2: Single precision floating point function objects.
+LIB1ASMFUNCS += \
+   _arm_addsubsf3 \
+   _arm_cmpsf2 \
+   _arm_fixsfsi \
+   _arm_fixunssfsi \
+   _arm_floatdisf \
+   _arm_floatundisf \
+   _arm_muldivsf3 \
+   _arm_negsf2 \
+   _arm_unordsf2 \
+
+
+# Group 3: Double precision floating point function objects.
+LIB1ASMFUNCS += \
+   _arm_addsubdf3 \
+   _arm_cmpdf2 \
+   _arm_fixdfsi \
+   _arm_fixunsdfsi \
+   _arm_floatdidf \
+   _arm_floatundidf \
+   _arm_muldivdf3 \
+   _arm_negdf2 \
+   _arm_truncdfsf2 \
+   _arm_unorddf2 \
+
+
+# Group 4: Miscellaneous function objects.
+LIB1ASMFUNCS += \
+   _bb_init_func \
+   _call_via_rX \
+   _interwork_call_via_rX \
+
 
 # Currently there is a bug somewhere in GCC's alias analysis
 # or scheduling code that is breaking _fpmul_parts in fp-bit.c.
-- 
2.25.1



[PATCH v6 05/34] Add the __HAVE_FEATURE_IT and IT() macros

2021-12-27 Thread Daniel Engel
These macros complement and extend the existing do_it() macro.
Together, they streamline the process of optimizing short branchless
conditional sequences to support ARM, Thumb-2, and Thumb-1.

The inherent architecture limitations of Thumb-1 means that writing
assembly code is somewhat more tedious.  And, while such code will run
unmodified in an ARM or Thumb-2 environment, it will lack one of the
key performance optimizations available there.

Initially, the first idea might be to split an instruction sequence
with #ifdef(s): one path for Thumb-1 and the other for ARM/Thumb-2.
This could suffice if conditional execution optimizations were rare.

However, #ifdef(s) break flow of an algorithm and shift focus to the
architectural differences instead of the similarities.  On functions
with a high percentage of conditional execution, it starts to become
attractive to split everything into distinct architecture-specific
function objects -- even when the underlying algorithm is identical.

Additionally, duplicated code and comments (whether an individual
operand, a line, or a larger block) become a future maintenance
liability if the two versions aren't kept in sync.

See code comments for limitations and expected usage.

gcc/libgcc/ChangeLog:
2021-01-14 Daniel Engel 

(__HAVE_FEATURE_IT, IT): New macros.
---
 libgcc/config/arm/lib1funcs.S | 68 +++
 1 file changed, 68 insertions(+)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index b8693be8e4f..1233b8c0992 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -230,6 +230,7 @@ LSYM(Lend_fde):
ARM and Thumb-2.  However this is only supported by recent gas, so define
a set of macros to allow ARM code on older assemblers.  */
 #if defined(__thumb2__)
+#define __HAVE_FEATURE_IT
 .macro do_it cond, suffix=""
it\suffix   \cond
 .endm
@@ -245,6 +246,9 @@ LSYM(Lend_fde):
\name \dest, \src1, \tmp
 .endm
 #else
+#if !defined(__thumb__)
+#define __HAVE_FEATURE_IT
+#endif
 .macro do_it cond, suffix=""
 .endm
 .macro shift1 op, arg0, arg1, arg2
@@ -259,6 +263,70 @@ LSYM(Lend_fde):
 
 #define COND(op1, op2, cond) op1 ## op2 ## cond
 
+
+/* The IT() macro streamlines the construction of short branchless conditional
+sequences that support ARM, Thumb-2, and Thumb-1.  It is intended as an
+extension to the .do_it macro defined above.  Code not written with the
+intent to support Thumb-1 need not use IT().
+
+   IT()'s main advantage is the minimization of syntax differences.  Unified
+functions can support Thumb-1 without imposing an undue performance
+penalty on ARM and Thumb-2.  Writing code without duplicate instructions
+and operands keeps the high level function flow clearer and should reduce
+the incidence of maintenance bugs.
+
+   Where conditional execution is supported by ARM and Thumb-2, the specified
+instruction compiles with the conditional suffix 'c'.
+
+   Where Thumb-1 and v6m do not support IT, the given instruction compiles
+with the standard unified syntax suffix "s", and a preceding branch
+instruction is required to implement conditional behavior.
+
+   (Aside: The Thumb-1 "s"-suffix pattern is somewhat simplistic, since it
+does not support 'cmp' or 'tst' with a non-"s" suffix.  It also appends
+"s" to 'mov' and 'add' with high register operands which are otherwise
+legal on v6m.  Use of IT() will result in a compiler error for all of
+these exceptional cases, and a full #ifdef code split will be required.
+However, it is unlikely that code written with Thumb-1 compatibility
+in mind will use such patterns, so IT() still promises a good value.)
+
+   Typical if/then/else usage is:
+
+#ifdef __HAVE_FEATURE_IT
+// ARM and Thumb-2 'true' condition.
+do_it   c,  tee
+#else
+// Thumb-1 'false' condition.  This must be opposite the
+//  sense of the ARM and Thumb-2 condition, since the
+//  branch is taken to skip the 'true' instruction block.
+b!c else_label
+#endif
+
+// Conditional 'true' execution for all compile modes.
+ IT(ins1,c) op1,op2
+ IT(ins2,c) op1,op2
+
+#ifndef __HAVE_FEATURE_IT
+// Thumb-1 branch to skip the 'else' instruction block.
+// Omitted for if/then usage.
+b   end_label
+#endif
+
+   else_label:
+// Conditional 'false' execution for all compile modes.
+// Omitted for if/then usage.
+ IT(ins3,!c) op1,   op2
+ IT(ins4,!c) op1,   op2
+
+   end_label:
+// Unconditional execution resumes here.
+ */
+#ifdef __HAVE_FEATURE_IT
+  #define IT(ins,c) ins##c
+#else
+  #define IT(ins,c) ins##s
+#endif
+
 #ifdef __ARM_EABI__
 .macro ARM_LDIV0 name signed
cmp r0, #0
-- 
2.25.1



[PATCH v6 06/34] Refactor 'clz' functions into a new file

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/lib1funcs.S (__clzsi2, __clzdi2): Moved to ...
* config/arm/clz2.S: New file.
---
 libgcc/config/arm/clz2.S  | 145 ++
 libgcc/config/arm/lib1funcs.S | 123 +---
 2 files changed, 146 insertions(+), 122 deletions(-)
 create mode 100644 libgcc/config/arm/clz2.S

diff --git a/libgcc/config/arm/clz2.S b/libgcc/config/arm/clz2.S
new file mode 100644
index 000..2ad9a81892c
--- /dev/null
+++ b/libgcc/config/arm/clz2.S
@@ -0,0 +1,145 @@
+/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+
+#ifdef L_clzsi2
+#ifdef NOT_ISA_TARGET_32BIT
+FUNC_START clzsi2
+   movsr1, #28
+   movsr3, #1
+   lslsr3, r3, #16
+   cmp r0, r3 /* 0x1 */
+   bcc 2f
+   lsrsr0, r0, #16
+   subsr1, r1, #16
+2: lsrsr3, r3, #8
+   cmp r0, r3 /* #0x100 */
+   bcc 2f
+   lsrsr0, r0, #8
+   subsr1, r1, #8
+2: lsrsr3, r3, #4
+   cmp r0, r3 /* #0x10 */
+   bcc 2f
+   lsrsr0, r0, #4
+   subsr1, r1, #4
+2: adr r2, 1f
+   ldrbr0, [r2, r0]
+   addsr0, r0, r1
+   bx lr
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+   FUNC_END clzsi2
+#else
+ARM_FUNC_START clzsi2
+# if defined (__ARM_FEATURE_CLZ)
+   clz r0, r0
+   RET
+# else
+   mov r1, #28
+   cmp r0, #0x1
+   do_it   cs, t
+   movcs   r0, r0, lsr #16
+   subcs   r1, r1, #16
+   cmp r0, #0x100
+   do_it   cs, t
+   movcs   r0, r0, lsr #8
+   subcs   r1, r1, #8
+   cmp r0, #0x10
+   do_it   cs, t
+   movcs   r0, r0, lsr #4
+   subcs   r1, r1, #4
+   adr r2, 1f
+   ldrbr0, [r2, r0]
+   add r0, r0, r1
+   RET
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+# endif /* !defined (__ARM_FEATURE_CLZ) */
+   FUNC_END clzsi2
+#endif
+#endif /* L_clzsi2 */
+
+#ifdef L_clzdi2
+#if !defined (__ARM_FEATURE_CLZ)
+
+# ifdef NOT_ISA_TARGET_32BIT
+FUNC_START clzdi2
+   push{r4, lr}
+   cmp xxh, #0
+   bne 1f
+#  ifdef __ARMEB__
+   movsr0, xxl
+   bl  __clzsi2
+   addsr0, r0, #32
+   b 2f
+1:
+   bl  __clzsi2
+#  else
+   bl  __clzsi2
+   addsr0, r0, #32
+   b 2f
+1:
+   movsr0, xxh
+   bl  __clzsi2
+#  endif
+2:
+   pop {r4, pc}
+# else /* NOT_ISA_TARGET_32BIT */
+ARM_FUNC_START clzdi2
+   do_push {r4, lr}
+   cmp xxh, #0
+   bne 1f
+#  ifdef __ARMEB__
+   mov r0, xxl
+   bl  __clzsi2
+   add r0, r0, #32
+   b 2f
+1:
+   bl  __clzsi2
+#  else
+   bl  __clzsi2
+   add r0, r0, #32
+   b 2f
+1:
+   mov r0, xxh
+   bl  __clzsi2
+#  endif
+2:
+   RETLDM  r4
+   FUNC_END clzdi2
+# endif /* NOT_ISA_TARGET_32BIT */
+
+#else /* defined (__ARM_FEATURE_CLZ) */
+
+ARM_FUNC_START clzdi2
+   cmp xxh, #0
+   do_it   eq, et
+   clzeq   r0, xxl
+   clzne   r0, xxh
+   addeq   r0, r0, #32
+   RET
+   FUNC_END clzdi2
+
+#endif
+#endif /* L_clzdi2 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 1233b8c0992..d92f73ba0c9 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1803,128 +1803,7 @@ LSYM(Lover12):
 
 #endif /* __symbian__ */
 
-#ifdef L_clzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzsi2
-   movsr1, #28
-   movsr3, #1
-   lslsr3, r3, #16
-   cmp r0, r3 /* 0x1 */
-   bcc 2f
-   lsrsr0, r0, #16
-   subsr1, r1, #16
-2: lsrsr3, r3, #8
-   cmp r0, r3 /* #0x100 */
-   bcc 2f
-   lsrsr0, r0, #8
-   subsr1, r1, #8
-2: lsrsr3, r3, #4
-   cmp r0, r3 /* #0x10 */
-   bcc 2f
-   lsrsr0,

[PATCH v6 07/34] Refactor 'ctz' functions into a new file

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/lib1funcs.S (__ctzsi2): Moved to ...
* config/arm/ctz2.S: New file.
---
 libgcc/config/arm/ctz2.S  | 86 +++
 libgcc/config/arm/lib1funcs.S | 65 +-
 2 files changed, 87 insertions(+), 64 deletions(-)
 create mode 100644 libgcc/config/arm/ctz2.S

diff --git a/libgcc/config/arm/ctz2.S b/libgcc/config/arm/ctz2.S
new file mode 100644
index 000..8702c9afb94
--- /dev/null
+++ b/libgcc/config/arm/ctz2.S
@@ -0,0 +1,86 @@
+/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+
+#ifdef L_ctzsi2
+#ifdef NOT_ISA_TARGET_32BIT
+FUNC_START ctzsi2
+   negsr1, r0
+   andsr0, r0, r1
+   movsr1, #28
+   movsr3, #1
+   lslsr3, r3, #16
+   cmp r0, r3 /* 0x1 */
+   bcc 2f
+   lsrsr0, r0, #16
+   subsr1, r1, #16
+2: lsrsr3, r3, #8
+   cmp r0, r3 /* #0x100 */
+   bcc 2f
+   lsrsr0, r0, #8
+   subsr1, r1, #8
+2: lsrsr3, r3, #4
+   cmp r0, r3 /* #0x10 */
+   bcc 2f
+   lsrsr0, r0, #4
+   subsr1, r1, #4
+2: adr r2, 1f
+   ldrbr0, [r2, r0]
+   subsr0, r0, r1
+   bx lr
+.align 2
+1:
+.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
+   FUNC_END ctzsi2
+#else
+ARM_FUNC_START ctzsi2
+   rsb r1, r0, #0
+   and r0, r0, r1
+# if defined (__ARM_FEATURE_CLZ)
+   clz r0, r0
+   rsb r0, r0, #31
+   RET
+# else
+   mov r1, #28
+   cmp r0, #0x1
+   do_it   cs, t
+   movcs   r0, r0, lsr #16
+   subcs   r1, r1, #16
+   cmp r0, #0x100
+   do_it   cs, t
+   movcs   r0, r0, lsr #8
+   subcs   r1, r1, #8
+   cmp r0, #0x10
+   do_it   cs, t
+   movcs   r0, r0, lsr #4
+   subcs   r1, r1, #4
+   adr r2, 1f
+   ldrbr0, [r2, r0]
+   sub r0, r0, r1
+   RET
+.align 2
+1:
+.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
+# endif /* !defined (__ARM_FEATURE_CLZ) */
+   FUNC_END ctzsi2
+#endif
+#endif /* L_ctzsi2 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index d92f73ba0c9..b1df00ac597 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1804,70 +1804,7 @@ LSYM(Lover12):
 #endif /* __symbian__ */
 
 #include "clz2.S"
-
-#ifdef L_ctzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START ctzsi2
-   negsr1, r0
-   andsr0, r0, r1
-   movsr1, #28
-   movsr3, #1
-   lslsr3, r3, #16
-   cmp r0, r3 /* 0x1 */
-   bcc 2f
-   lsrsr0, r0, #16
-   subsr1, r1, #16
-2: lsrsr3, r3, #8
-   cmp r0, r3 /* #0x100 */
-   bcc 2f
-   lsrsr0, r0, #8
-   subsr1, r1, #8
-2: lsrsr3, r3, #4
-   cmp r0, r3 /* #0x10 */
-   bcc 2f
-   lsrsr0, r0, #4
-   subsr1, r1, #4
-2: adr r2, 1f
-   ldrbr0, [r2, r0]
-   subsr0, r0, r1
-   bx lr
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-   FUNC_END ctzsi2
-#else
-ARM_FUNC_START ctzsi2
-   rsb r1, r0, #0
-   and r0, r0, r1
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   rsb r0, r0, #31
-   RET
-# else
-   mov r1, #28
-   cmp r0, #0x1
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrbr0, [r2, r0]
-   sub r0, r0, r1
-   RET
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-# endif /* !defined (__ARM_FEATURE_CLZ) */
-   FUNC_

[PATCH v6 08/34] Refactor 64-bit shift functions into a new file

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/lib1funcs.S (__ashldi3, __ashrdi3, __lshrdi3): Moved to ...
* config/arm/eabi/lshift.S: New file.
---
 libgcc/config/arm/eabi/lshift.S | 123 
 libgcc/config/arm/lib1funcs.S   | 103 +-
 2 files changed, 124 insertions(+), 102 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/lshift.S

diff --git a/libgcc/config/arm/eabi/lshift.S b/libgcc/config/arm/eabi/lshift.S
new file mode 100644
index 000..0974a72c377
--- /dev/null
+++ b/libgcc/config/arm/eabi/lshift.S
@@ -0,0 +1,123 @@
+/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+
+#ifdef L_lshrdi3
+
+   FUNC_START lshrdi3
+   FUNC_ALIAS aeabi_llsr lshrdi3
+   
+#ifdef __thumb__
+   lsrsal, r2
+   movsr3, ah
+   lsrsah, r2
+   mov ip, r3
+   subsr2, #32
+   lsrsr3, r2
+   orrsal, r3
+   negsr2, r2
+   mov r3, ip
+   lslsr3, r2
+   orrsal, r3
+   RET
+#else
+   subsr3, r2, #32
+   rsb ip, r2, #32
+   movmi   al, al, lsr r2
+   movpl   al, ah, lsr r3
+   orrmi   al, al, ah, lsl ip
+   mov ah, ah, lsr r2
+   RET
+#endif
+   FUNC_END aeabi_llsr
+   FUNC_END lshrdi3
+
+#endif
+   
+#ifdef L_ashrdi3
+   
+   FUNC_START ashrdi3
+   FUNC_ALIAS aeabi_lasr ashrdi3
+   
+#ifdef __thumb__
+   lsrsal, r2
+   movsr3, ah
+   asrsah, r2
+   subsr2, #32
+   @ If r2 is negative at this point the following step would OR
+   @ the sign bit into all of AL.  That's not what we want...
+   bmi 1f
+   mov ip, r3
+   asrsr3, r2
+   orrsal, r3
+   mov r3, ip
+1:
+   negsr2, r2
+   lslsr3, r2
+   orrsal, r3
+   RET
+#else
+   subsr3, r2, #32
+   rsb ip, r2, #32
+   movmi   al, al, lsr r2
+   movpl   al, ah, asr r3
+   orrmi   al, al, ah, lsl ip
+   mov ah, ah, asr r2
+   RET
+#endif
+
+   FUNC_END aeabi_lasr
+   FUNC_END ashrdi3
+
+#endif
+
+#ifdef L_ashldi3
+
+   FUNC_START ashldi3
+   FUNC_ALIAS aeabi_llsl ashldi3
+   
+#ifdef __thumb__
+   lslsah, r2
+   movsr3, al
+   lslsal, r2
+   mov ip, r3
+   subsr2, #32
+   lslsr3, r2
+   orrsah, r3
+   negsr2, r2
+   mov r3, ip
+   lsrsr3, r2
+   orrsah, r3
+   RET
+#else
+   subsr3, r2, #32
+   rsb ip, r2, #32
+   movmi   ah, ah, lsl r2
+   movpl   ah, al, lsl r3
+   orrmi   ah, ah, al, lsr ip
+   mov al, al, lsl r2
+   RET
+#endif
+   FUNC_END aeabi_llsl
+   FUNC_END ashldi3
+
+#endif
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index b1df00ac597..7ac50230725 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1699,108 +1699,7 @@ LSYM(Lover12):
 
 /* Prevent __aeabi double-word shifts from being produced on SymbianOS.  */
 #ifndef __symbian__
-
-#ifdef L_lshrdi3
-
-   FUNC_START lshrdi3
-   FUNC_ALIAS aeabi_llsr lshrdi3
-   
-#ifdef __thumb__
-   lsrsal, r2
-   movsr3, ah
-   lsrsah, r2
-   mov ip, r3
-   subsr2, #32
-   lsrsr3, r2
-   orrsal, r3
-   negsr2, r2
-   mov r3, ip
-   lslsr3, r2
-   orrsal, r3
-   RET
-#else
-   subsr3, r2, #32
-   rsb ip, r2, #32
-   movmi   al, al, lsr r2
-   movpl   al, ah, lsr r3
-   orrmi   al, al, ah, lsl ip
-   mov ah, ah, lsr r2
-   RET
-#endif
-   FUNC_END aeabi_llsr
-   FUNC_END lshrdi3
-
-#endif
-   
-#ifdef L_ashrdi3
-   
-   FUNC_START ashrdi3
-   FUNC_ALIAS aeabi_lasr ashrdi3
-   
-#ifdef __thumb__
-   lsrsal, r2
-   movsr3, ah
-   asrsah, r2
-   subsr2, #32
-

[PATCH v6 09/34] Import 'clz' functions from the CM0 library

2021-12-27 Thread Daniel Engel
On architectures without __ARM_FEATURE_CLZ, this version combines __clzdi2()
with __clzsi2() into a single object with an efficient tail call.  Also, this
version merges the formerly separate Thumb and ARM code implementations
into a unified instruction sequence.  This change significantly improves
Thumb performance without affecting ARM performance.  Finally, this version
adds a new __OPTIMIZE_SIZE__ build option (binary search loop).

There is no change to the code for architectures with __ARM_FEATURE_CLZ.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bits/clz2.S (__clzsi2, __clzdi2): Reduced code size on
architectures without __ARM_FEATURE_CLZ.
* config/arm/t-elf (LIB1ASMFUNCS): Moved _clzsi2 to new weak group.
---
 libgcc/config/arm/clz2.S | 363 +--
 libgcc/config/arm/t-elf  |   7 +-
 2 files changed, 237 insertions(+), 133 deletions(-)

diff --git a/libgcc/config/arm/clz2.S b/libgcc/config/arm/clz2.S
index 2ad9a81892c..51ee35fbe78 100644
--- a/libgcc/config/arm/clz2.S
+++ b/libgcc/config/arm/clz2.S
@@ -1,145 +1,244 @@
-/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+/* clz2.S: Cortex M0 optimized 'clz' functions
 
-This file is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 3, or (at your option) any
-later version.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
 
-This file is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
 
-Under Section 7 of GPL version 3, you are granted additional
-permissions described in the GCC Runtime Library Exception, version
-3.1, as published by the Free Software Foundation.
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
 
-You should have received a copy of the GNU General Public License and
-a copy of the GCC Runtime Library Exception along with this program;
-see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-.  */
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+
+#ifdef L_clzdi2
+
+// int __clzdi2(long long)
+// Counts leading zero bits in $r1:$r0.
+// Returns the result in $r0.
+FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
+CFI_START_FUNCTION
+
+// Moved here from lib1funcs.S
+cmp xxh,#0
+do_it   eq, et
+clzeq   r0, xxl
+clzne   r0, xxh
+addeq   r0, #32
+RET
+
+CFI_END_FUNCTION
+FUNC_END clzdi2
+
+#endif /* L_clzdi2 */
 
 
 #ifdef L_clzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START clzsi2
-   movsr1, #28
-   movsr3, #1
-   lslsr3, r3, #16
-   cmp r0, r3 /* 0x1 */
-   bcc 2f
-   lsrsr0, r0, #16
-   subsr1, r1, #16
-2: lsrsr3, r3, #8
-   cmp r0, r3 /* #0x100 */
-   bcc 2f
-   lsrsr0, r0, #8
-   subsr1, r1, #8
-2: lsrsr3, r3, #4
-   cmp r0, r3 /* #0x10 */
-   bcc 2f
-   lsrsr0, r0, #4
-   subsr1, r1, #4
-2: adr r2, 1f
-   ldrbr0, [r2, r0]
-   addsr0, r0, r1
-   bx lr
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-   FUNC_END clzsi2
-#else
-ARM_FUNC_START clzsi2
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   RET
-# else
-   mov r1, #28
-   cmp r0, #0x1
-   do_it   cs, t
-   movcs   r0, r0, lsr #16
-   subcs   r1, r1, #16
-   cmp r0, #0x100
-   do_it   cs, t
-   movcs   r0, r0, lsr #8
-   subcs   r1, r1, #8
-   cmp r0, #0x10
-   do_it   cs, t
-   movcs   r0, r0, lsr #4
-   subcs   r1, r1, #4
-   adr r2, 1f
-   ldrbr0, [r2, r0]
-   add r0, r0, r1
-   RET
-.align 2
-1:
-.byte 4, 3, 2, 2, 1, 1, 1

[PATCH v6 10/34] Import 'ctz' functions from the CM0 library

2021-12-27 Thread Daniel Engel
This version combines __ctzdi2() with __ctzsi2() into a single object with
an efficient tail call.  The former implementation of __ctzdi2() was in C.

On architectures without __ARM_FEATURE_CLZ, this version merges the formerly
separate Thumb and ARM code sequences into a unified instruction sequence.
This change significantly improves Thumb performance without affecting ARM
performance.  Finally, this version adds a new __OPTIMIZE_SIZE__ build option.

On architectures with __ARM_FEATURE_CLZ, __ctzsi2(0) now returns 32.  Formerly,
__ctzsi2(0) would return -1.  Architectures without __ARM_FEATURE_CLZ have
always returned 32, so this change makes the return value consistent.
This change costs 2 extra instructions (branchless).

Likewise on architectures with __ARM_FEATURE_CLZ,  __ctzdi2(0) now returns
64 instead of 31.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bits/ctz2.S (__ctzdi2): Added a new function.
(__ctzsi2): Reduced size on architectures without __ARM_FEATURE_CLZ;
changed so __ctzsi2(0)=32 on architectures with __ARM_FEATURE_CLZ.
* config/arm/t-elf (LIB1ASMFUNCS): Added _ctzdi2;
moved _ctzsi2 to the weak function objects group.
---
 libgcc/config/arm/ctz2.S | 308 +--
 libgcc/config/arm/t-elf  |   3 +-
 2 files changed, 233 insertions(+), 78 deletions(-)

diff --git a/libgcc/config/arm/ctz2.S b/libgcc/config/arm/ctz2.S
index 8702c9afb94..dc436af3571 100644
--- a/libgcc/config/arm/ctz2.S
+++ b/libgcc/config/arm/ctz2.S
@@ -1,86 +1,240 @@
-/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+/* ctz2.S: ARM optimized 'ctz' functions
 
-This file is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 3, or (at your option) any
-later version.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
 
-This file is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
 
-Under Section 7 of GPL version 3, you are granted additional
-permissions described in the GCC Runtime Library Exception, version
-3.1, as published by the Free Software Foundation.
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
 
-You should have received a copy of the GNU General Public License and
-a copy of the GCC Runtime Library Exception along with this program;
-see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-.  */
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
 
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
 
-#ifdef L_ctzsi2
-#ifdef NOT_ISA_TARGET_32BIT
-FUNC_START ctzsi2
-   negsr1, r0
-   andsr0, r0, r1
-   movsr1, #28
-   movsr3, #1
-   lslsr3, r3, #16
-   cmp r0, r3 /* 0x1 */
-   bcc 2f
-   lsrsr0, r0, #16
-   subsr1, r1, #16
-2: lsrsr3, r3, #8
-   cmp r0, r3 /* #0x100 */
-   bcc 2f
-   lsrsr0, r0, #8
-   subsr1, r1, #8
-2: lsrsr3, r3, #4
-   cmp r0, r3 /* #0x10 */
-   bcc 2f
-   lsrsr0, r0, #4
-   subsr1, r1, #4
-2: adr r2, 1f
-   ldrbr0, [r2, r0]
-   subsr0, r0, r1
-   bx lr
-.align 2
-1:
-.byte  27, 28, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31
-   FUNC_END ctzsi2
+
+// When the hardware 'clz' function is available, an efficient version
+//  of __ctzsi2(x) can be created by calculating '31 - __clzsi2(lsb(x))',
+//  where lsb(x) is 'x' with only the least-significant '1' bit set.
+// The following offset applies to all of the functions in this file.
+#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+  #define CTZ_RESULT_OFFSET 1
 #else
-ARM_FUNC_START ctzsi2
-   rsb r1, r0, #0
-   and r0, r0, r1
-# if defined (__ARM_FEATURE_CLZ)
-   clz r0, r0
-   rsb r0, r0, #31
-   RET
-# else
-   mov r1, #28
-   cmp 

[PATCH v6 11/34] Import 64-bit shift functions from the CM0 library

2021-12-27 Thread Daniel Engel
The Thumb versions of these functions are each 1-2 instructions smaller
and faster, and branchless when the IT instruction is available.

The ARM versions were converted to the "xxl/xxh" big-endian register
naming convention, but are otherwise unchanged.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bits/shift.S (__ashldi3, __ashrdi3, __lshrdi3):
Reduced code size on Thumb architectures;
updated big-endian register naming convention to "xxl/xxh".
---
 libgcc/config/arm/eabi/lshift.S | 338 +---
 1 file changed, 228 insertions(+), 110 deletions(-)

diff --git a/libgcc/config/arm/eabi/lshift.S b/libgcc/config/arm/eabi/lshift.S
index 0974a72c377..16cf2dcef04 100644
--- a/libgcc/config/arm/eabi/lshift.S
+++ b/libgcc/config/arm/eabi/lshift.S
@@ -1,123 +1,241 @@
-/* Copyright (C) 1995-2021 Free Software Foundation, Inc.
+/* lshift.S: ARM optimized 64-bit integer shift
 
-This file is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 3, or (at your option) any
-later version.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
 
-This file is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
 
-Under Section 7 of GPL version 3, you are granted additional
-permissions described in the GCC Runtime Library Exception, version
-3.1, as published by the Free Software Foundation.
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
 
-You should have received a copy of the GNU General Public License and
-a copy of the GCC Runtime Library Exception along with this program;
-see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-.  */
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
 
 
 #ifdef L_lshrdi3
 
-   FUNC_START lshrdi3
-   FUNC_ALIAS aeabi_llsr lshrdi3
-   
-#ifdef __thumb__
-   lsrsal, r2
-   movsr3, ah
-   lsrsah, r2
-   mov ip, r3
-   subsr2, #32
-   lsrsr3, r2
-   orrsal, r3
-   negsr2, r2
-   mov r3, ip
-   lslsr3, r2
-   orrsal, r3
-   RET
-#else
-   subsr3, r2, #32
-   rsb ip, r2, #32
-   movmi   al, al, lsr r2
-   movpl   al, ah, lsr r3
-   orrmi   al, al, ah, lsl ip
-   mov ah, ah, lsr r2
-   RET
-#endif
-   FUNC_END aeabi_llsr
-   FUNC_END lshrdi3
-
-#endif
-   
+// long long __aeabi_llsr(long long, int)
+// Logical shift right the 64 bit value in $r1:$r0 by the count in $r2.
+// The result is only guaranteed for shifts in the range of '0' to '63'.
+// Uses $r3 as scratch space.
+FUNC_START_SECTION aeabi_llsr .text.sorted.libgcc.lshrdi3
+FUNC_ALIAS lshrdi3 aeabi_llsr
+CFI_START_FUNCTION
+
+  #if defined(__thumb__) && __thumb__
+
+// Save a copy for the remainder.
+movsr3, xxh
+
+// Assume a simple shift.
+lsrsxxl,r2
+lsrsxxh,r2
+
+// Test if the shift distance is larger than 1 word.
+subsr2, #32
+
+#ifdef __HAVE_FEATURE_IT
+do_it   lo,te
+
+// The remainder is opposite the main shift, (32 - x) bits.
+rsblo   r2, #0
+lsllo   r3, r2
+
+// The remainder shift extends into the hi word.
+lsrhs   r3, r2
+
+#else /* !__HAVE_FEATURE_IT */
+bhs LLSYM(__llsr_large)
+
+// The remainder is opposite the main shift, (32 - x) bits.
+rsbsr2, #0
+lslsr3, r2
+
+// Cancel any remaining shift.
+eorsr2, r2
+
+  LLSYM(__llsr_large):
+// Apply any remaining shift to the hi word.
+lsrsr3, r2
+
+#endif /* !__HAVE_FEATURE_IT */
+
+// Merge remainder and result.
+addsxxl,r3
+RET
+
+  #else /* 

[PATCH v6 12/34] Import 'clrsb' functions from the CM0 library

2021-12-27 Thread Daniel Engel
This implementation provides an efficient tail call to __clzsi2(), making the
functions rather smaller and faster than the C versions.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bits/clz2.S (__clrsbsi2, __clrsbdi2):
Added new functions.
* config/arm/t-elf (LIB1ASMFUNCS):
Added new function objects _clrsbsi2 and _clrsbdi2.
---
 libgcc/config/arm/clz2.S | 108 ++-
 libgcc/config/arm/t-elf  |   2 +
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/libgcc/config/arm/clz2.S b/libgcc/config/arm/clz2.S
index 51ee35fbe78..a2de45ff651 100644
--- a/libgcc/config/arm/clz2.S
+++ b/libgcc/config/arm/clz2.S
@@ -1,4 +1,4 @@
-/* clz2.S: Cortex M0 optimized 'clz' functions
+/* clz2.S: ARM optimized 'clz' and related functions
 
Copyright (C) 2018-2021 Free Software Foundation, Inc.
Contributed by Daniel Engel (g...@danielengel.com)
@@ -23,7 +23,7 @@
.  */
 
 
-#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+#ifdef __ARM_FEATURE_CLZ
 
 #ifdef L_clzdi2
 
@@ -242,3 +242,107 @@ FUNC_END clzdi2
 
 #endif /* !__ARM_FEATURE_CLZ */
 
+
+#ifdef L_clrsbdi2
+
+// int __clrsbdi2(long long)
+// Counts the number of "redundant sign bits" in $r1:$r0.
+// Returns the result in $r0.
+// Uses $r2 and $r3 as scratch space.
+FUNC_START_SECTION clrsbdi2 .text.sorted.libgcc.clz2.clrsbdi2
+CFI_START_FUNCTION
+
+  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+// Invert negative signs to keep counting zeros.
+asrsr3, xxh,#31
+eorsxxl,r3
+eorsxxh,r3
+
+// Same as __clzdi2(), except that the 'C' flag is pre-calculated.
+// Also, the trailing 'subs', since the last bit is not redundant.
+do_it   eq, et
+clzeq   r0, xxl
+clzne   r0, xxh
+addeq   r0, #32
+subsr0, #1
+RET
+
+  #else  /* !__ARM_FEATURE_CLZ */
+// Result if all the bits in the argument are zero.
+// Set it here to keep the flags clean after 'eors' below.
+movsr2, #31
+
+// Invert negative signs to keep counting zeros.
+asrsr3, xxh,#31
+eorsxxh,r3
+
+#if defined(__ARMEB__) && __ARMEB__
+// If the upper word is non-zero, return '__clzsi2(upper) - 1'.
+bne SYM(__internal_clzsi2)
+
+// The upper word is zero, prepare the lower word.
+movsr0, r1
+eorsr0, r3
+
+#else /* !__ARMEB__ */
+// Save the lower word temporarily.
+// This somewhat awkward construction adds one cycle when the
+//  branch is not taken, but prevents a double-branch.
+eorsr3, r0
+
+// If the upper word is non-zero, return '__clzsi2(upper) - 1'.
+movsr0, r1
+bneSYM(__internal_clzsi2)
+
+// Restore the lower word.
+movsr0, r3
+
+#endif /* !__ARMEB__ */
+
+// The upper word is zero, return '31 + __clzsi2(lower)'.
+addsr2, #32
+b   SYM(__internal_clzsi2)
+
+  #endif /* !__ARM_FEATURE_CLZ */
+
+CFI_END_FUNCTION
+FUNC_END clrsbdi2
+
+#endif /* L_clrsbdi2 */
+
+
+#ifdef L_clrsbsi2
+
+// int __clrsbsi2(int)
+// Counts the number of "redundant sign bits" in $r0.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+FUNC_START_SECTION clrsbsi2 .text.sorted.libgcc.clz2.clrsbsi2
+CFI_START_FUNCTION
+
+// Invert negative signs to keep counting zeros.
+asrsr2, r0,#31
+eorsr0, r2
+
+  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ
+// Count.
+clz r0, r0
+
+// The result for a positive value will always be >= 1.
+// By definition, the last bit is not redundant.
+subsr0, #1
+RET
+
+  #else /* !__ARM_FEATURE_CLZ */
+// Result if all the bits in the argument are zero.
+// By definition, the last bit is not redundant.
+movsr2, #31
+b   SYM(__internal_clzsi2)
+
+  #endif  /* !__ARM_FEATURE_CLZ */
+
+CFI_END_FUNCTION
+FUNC_END clrsbsi2
+
+#endif /* L_clrsbsi2 */
+
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 33b83ac4adf..89071cebe45 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -31,6 +31,8 @@ LIB1ASMFUNCS += \
_ashldi3 \
_ashrdi3 \
_lshrdi3 \
+   _clrsbsi2 \
+   _clrsbdi2 \
_clzdi2 \
_ctzdi2 \
_dvmd_tls \
-- 
2.25.1



[PATCH v6 13/34] Import 'ffs' functions from the CM0 library

2021-12-27 Thread Daniel Engel
This implementation provides an efficient tail call to __ctzsi2(), making the
functions rather smaller and faster than the C versions.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/ctz2.S (__ffssi2, __ffsdi2): New functions.
* config/arm/t-elf (LIB1ASMFUNCS): Added _ffssi2 and _ffsdi2.
---
 libgcc/config/arm/ctz2.S | 77 +++-
 libgcc/config/arm/t-elf  |  2 ++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/libgcc/config/arm/ctz2.S b/libgcc/config/arm/ctz2.S
index dc436af3571..b9528a061a2 100644
--- a/libgcc/config/arm/ctz2.S
+++ b/libgcc/config/arm/ctz2.S
@@ -1,4 +1,4 @@
-/* ctz2.S: ARM optimized 'ctz' functions
+/* ctz2.S: ARM optimized 'ctz' and related functions
 
Copyright (C) 2020-2021 Free Software Foundation, Inc.
Contributed by Daniel Engel (g...@danielengel.com)
@@ -238,3 +238,78 @@ FUNC_END ctzdi2
 
 #endif /* L_ctzsi2 || L_ctzdi2 */
 
+
+#ifdef L_ffsdi2
+
+// int __ffsdi2(long long)
+// Return the index of the least significant 1-bit in $r1:r0,
+//  or zero if $r1:r0 is zero.  The least significant bit is index 1.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+// Same section as __ctzsi2() for sake of the tail call branches.
+FUNC_START_SECTION ffsdi2 .text.sorted.libgcc.ctz2.ffsdi2
+CFI_START_FUNCTION
+
+// Simplify branching by assuming a non-zero lower word.
+// For all such, ffssi2(x) == ctzsi2(x) + 1.
+movsr2,#(33 - CTZ_RESULT_OFFSET)
+
+  #if defined(__ARMEB__) && __ARMEB__
+// HACK: Save the upper word in a scratch register.
+movsr3, r0
+
+// Test the lower word.
+movsr0, r1
+bne SYM(__internal_ctzsi2)
+
+// Test the upper word.
+movsr2,#(65 - CTZ_RESULT_OFFSET)
+movsr0, r3
+bne SYM(__internal_ctzsi2)
+
+  #else /* !__ARMEB__ */
+// Test the lower word.
+cmp r0, #0
+bne SYM(__internal_ctzsi2)
+
+// Test the upper word.
+movsr2,#(65 - CTZ_RESULT_OFFSET)
+movsr0, r1
+bne SYM(__internal_ctzsi2)
+
+  #endif /* !__ARMEB__ */
+
+// Upper and lower words are both zero.
+RET
+
+CFI_END_FUNCTION
+FUNC_END ffsdi2
+
+#endif /* L_ffsdi2 */
+
+
+#ifdef L_ffssi2
+
+// int __ffssi2(int)
+// Return the index of the least significant 1-bit in $r0,
+//  or zero if $r0 is zero.  The least significant bit is index 1.
+// Returns the result in $r0.
+// Uses $r2 and possibly $r3 as scratch space.
+// Same section as __ctzsi2() for sake of the tail call branches.
+FUNC_START_SECTION ffssi2 .text.sorted.libgcc.ctz2.ffssi2
+CFI_START_FUNCTION
+
+// Simplify branching by assuming a non-zero argument.
+// For all such, ffssi2(x) == ctzsi2(x) + 1.
+movsr2,#(33 - CTZ_RESULT_OFFSET)
+
+// Test for zero, return unmodified.
+cmp r0, #0
+bne SYM(__internal_ctzsi2)
+RET
+
+CFI_END_FUNCTION
+FUNC_END ffssi2
+
+#endif /* L_ffssi2 */
+
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
index 89071cebe45..346fc766f17 100644
--- a/libgcc/config/arm/t-elf
+++ b/libgcc/config/arm/t-elf
@@ -35,6 +35,8 @@ LIB1ASMFUNCS += \
_clrsbdi2 \
_clzdi2 \
_ctzdi2 \
+   _ffssi2 \
+   _ffsdi2 \
_dvmd_tls \
_divsi3 \
_modsi3 \
-- 
2.25.1



[PATCH v6 14/34] Import 'parity' functions from the CM0 library

2021-12-27 Thread Daniel Engel
The functional overlap between the single- and double-word functions makes
this implementation about half the size of the C functions if both functions
are linked in the same application.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/parity.S: New file for __paritysi2/di2().
* config/arm/lib1funcs.S: #include parity.S
* config/arm/t-elf (LIB1ASMFUNCS): Added _paritysi2/di2.
---
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/parity.S| 120 ++
 libgcc/config/arm/t-elf   |   2 +
 3 files changed, 123 insertions(+)
 create mode 100644 libgcc/config/arm/parity.S

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 7ac50230725..600ea2dfdc9 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1704,6 +1704,7 @@ LSYM(Lover12):
 
 #include "clz2.S"
 #include "ctz2.S"
+#include "parity.S"
 
 /*  */
 /* These next two sections are here despite the fact that they contain Thumb 
diff --git a/libgcc/config/arm/parity.S b/libgcc/config/arm/parity.S
new file mode 100644
index 000..45233bc9d8f
--- /dev/null
+++ b/libgcc/config/arm/parity.S
@@ -0,0 +1,120 @@
+/* parity.S: ARM optimized parity functions
+
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+
+#ifdef L_paritydi2
+
+// int __paritydi2(long long)
+// Returns '0' if the number of bits set in $r1:r0 is even, and '1' otherwise.
+// Returns the result in $r0.
+FUNC_START_SECTION paritydi2 .text.sorted.libgcc.paritydi2
+CFI_START_FUNCTION
+
+// Combine the upper and lower words, then fall through.
+// Byte-endianness does not matter for this function.
+eorsr0, r1
+
+#endif /* L_paritydi2 */
+
+
+// The implementation of __paritydi2() tightly couples with __paritysi2(),
+//  such that instructions must appear consecutively in the same memory
+//  section for proper flow control.  However, this construction inhibits
+//  the ability to discard __paritydi2() when only using __paritysi2().
+// Therefore, this block configures __paritysi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __paritydi2().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_paritysi2' should appear before '_paritydi2' in LIB1ASMFUNCS.
+#if defined(L_paritysi2) || defined(L_paritydi2)
+
+#ifdef L_paritysi2
+// int __paritysi2(int)
+// Returns '0' if the number of bits set in $r0 is even, and '1' otherwise.
+// Returns the result in $r0.
+// Uses $r2 as scratch space.
+WEAK_START_SECTION paritysi2 .text.sorted.libgcc.paritysi2
+CFI_START_FUNCTION
+
+#else /* L_paritydi2 */
+FUNC_ENTRY paritysi2
+
+#endif
+
+  #if defined(__thumb__) && __thumb__
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+
+// Size optimized: 16 bytes, 40 cycles
+// Speed optimized: 24 bytes, 14 cycles
+movsr2, #16
+
+LLSYM(__parity_loop):
+// Calculate the parity of successively smaller half-words into the MSB.
+movsr1, r0
+lslsr1, r2
+eorsr0, r1
+lsrsr2, #1
+bne LLSYM(__parity_loop)
+
+#else /* !__OPTIMIZE_SIZE__ */
+
+// Unroll the loop.  The 'libgcc' reference C implementation replaces
+//  the x2 and the x1 shifts with a constant.  However, since it takes
+//  4 cycles to load, index, and mask the constant result, it doesn't
+//  cost anything to keep shifting (and saves a few bytes).
+lslsr1, r0, #16
+eorsr0, r1
+lslsr1, r0, #8
+eorsr0, r1
+lslsr1, r0, #4
+eorsr0, r1
+lslsr1, r0, 

[PATCH v6 15/34] Import 'popcnt' functions from the CM0 library

2021-12-27 Thread Daniel Engel
The functional overlap between the single- and double-word functions
makes this implementation about 30% smaller than the C functions
if both functions are linked together in the same application.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/popcnt.S (__popcountsi2, __popcountdi2): New file.
* config/arm/lib1funcs.S: #include popcnt.S
* config/arm/t-elf (LIB1ASMFUNCS): Add _popcountsi2/di2.
---
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/popcnt.S| 189 ++
 libgcc/config/arm/t-elf   |   2 +
 3 files changed, 192 insertions(+)
 create mode 100644 libgcc/config/arm/popcnt.S

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 600ea2dfdc9..bd84a3e4281 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1705,6 +1705,7 @@ LSYM(Lover12):
 #include "clz2.S"
 #include "ctz2.S"
 #include "parity.S"
+#include "popcnt.S"
 
 /*  */
 /* These next two sections are here despite the fact that they contain Thumb 
diff --git a/libgcc/config/arm/popcnt.S b/libgcc/config/arm/popcnt.S
new file mode 100644
index 000..51b1ed745ee
--- /dev/null
+++ b/libgcc/config/arm/popcnt.S
@@ -0,0 +1,189 @@
+/* popcnt.S: ARM optimized popcount functions
+
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+
+#ifdef L_popcountdi2
+
+// int __popcountdi2(long long)
+// Returns the number of bits set in $r1:$r0.
+// Returns the result in $r0.
+FUNC_START_SECTION popcountdi2 .text.sorted.libgcc.popcountdi2
+CFI_START_FUNCTION
+
+  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+// Initialize the result.
+// Compensate for the two extra loops (one for each word)
+//  required to detect zero arguments.
+movsr2, #2
+
+LLSYM(__popcountd_loop):
+// Same as __popcounts_loop below, except for $r1.
+subsr2, #1
+subsr3, r1, #1
+andsr1, r3
+bcs LLSYM(__popcountd_loop)
+
+// Repeat the operation for the second word.
+b   LLSYM(__popcounts_loop)
+
+  #else /* !__OPTIMIZE_SIZE__ */
+// Load the one-bit alternating mask.
+ldr r3, =0x
+
+// Reduce the second word.
+lsrsr2, r1, #1
+andsr2, r3
+subsr1, r2
+
+// Reduce the first word.
+lsrsr2, r0, #1
+andsr2, r3
+subsr0, r2
+
+// Load the two-bit alternating mask.
+ldr r3, =0x
+
+// Reduce the second word.
+lsrsr2, r1, #2
+andsr2, r3
+andsr1, r3
+addsr1, r2
+
+// Reduce the first word.
+lsrsr2, r0, #2
+andsr2, r3
+andsr0, r3
+addsr0, r2
+
+// There will be a maximum of 8 bits in each 4-bit field.
+// Jump into the single word flow to combine and complete.
+b   LLSYM(__popcounts_merge)
+
+  #endif /* !__OPTIMIZE_SIZE__ */
+#endif /* L_popcountdi2 */
+
+
+// The implementation of __popcountdi2() tightly couples with __popcountsi2(),
+//  such that instructions must appear consecutively in the same memory
+//  section for proper flow control.  However, this construction inhibits
+//  the ability to discard __popcountdi2() when only using __popcountsi2().
+// Therefore, this block configures __popcountsi2() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __popcountdi2().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_popcountsi2' should appear before '_popcountdi2' in LIB1ASMFUNCS.
+#if defined(L_popcountsi2) || def

[PATCH v6 16/34] Refactor Thumb-1 64-bit comparison into a new file

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-v6m.S (__aeabi_lcmp, __aeabi_ulcmp): Moved to ...
* config/arm/eabi/lcmp.S: New file.
* config/arm/lib1funcs.S: #include eabi/lcmp.S.
---
 libgcc/config/arm/bpabi-v6m.S | 46 --
 libgcc/config/arm/eabi/lcmp.S | 73 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 74 insertions(+), 46 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/lcmp.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index 069fcbbf48c..a051c1530a4 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -33,52 +33,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-#ifdef L_aeabi_lcmp
-
-FUNC_START aeabi_lcmp
-   cmp xxh, yyh
-   beq 1f
-   bgt 2f
-   movsr0, #1
-   negsr0, r0
-   RET
-2:
-   movsr0, #1
-   RET
-1:
-   subsr0, xxl, yyl
-   beq 1f
-   bhi 2f
-   movsr0, #1
-   negsr0, r0
-   RET
-2:
-   movsr0, #1
-1:
-   RET
-   FUNC_END aeabi_lcmp
-
-#endif /* L_aeabi_lcmp */
-   
-#ifdef L_aeabi_ulcmp
-
-FUNC_START aeabi_ulcmp
-   cmp xxh, yyh
-   bne 1f
-   subsr0, xxl, yyl
-   beq 2f
-1:
-   bcs 1f
-   movsr0, #1
-   negsr0, r0
-   RET
-1:
-   movsr0, #1
-2:
-   RET
-   FUNC_END aeabi_ulcmp
-
-#endif /* L_aeabi_ulcmp */
 
 .macro test_div_by_zero signed
cmp yyh, #0
diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
new file mode 100644
index 000..336db1d398c
--- /dev/null
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -0,0 +1,73 @@
+/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
+   ARMv6-M and ARMv8-M Baseline like ISA variants.
+
+   Copyright (C) 2006-2020 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+
+#ifdef L_aeabi_lcmp
+
+FUNC_START aeabi_lcmp
+cmp xxh, yyh
+beq 1f
+bgt 2f
+movsr0, #1
+negsr0, r0
+RET
+2:
+movsr0, #1
+RET
+1:
+subsr0, xxl, yyl
+beq 1f
+bhi 2f
+movsr0, #1
+negsr0, r0
+RET
+2:
+movsr0, #1
+1:
+RET
+FUNC_END aeabi_lcmp
+
+#endif /* L_aeabi_lcmp */
+
+#ifdef L_aeabi_ulcmp
+
+FUNC_START aeabi_ulcmp
+cmp xxh, yyh
+bne 1f
+subsr0, xxl, yyl
+beq 2f
+1:
+bcs 1f
+movsr0, #1
+negsr0, r0
+RET
+1:
+movsr0, #1
+2:
+RET
+FUNC_END aeabi_ulcmp
+
+#endif /* L_aeabi_ulcmp */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index bd84a3e4281..5e24d0a6749 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1991,5 +1991,6 @@ LSYM(Lchange_\register):
 #include "bpabi.S"
 #else /* NOT_ISA_TARGET_32BIT */
 #include "bpabi-v6m.S"
+#include "eabi/lcmp.S"
 #endif /* NOT_ISA_TARGET_32BIT */
 #endif /* !__symbian__ */
-- 
2.25.1



[PATCH v6 17/34] Import 64-bit comparison from CM0 library

2021-12-27 Thread Daniel Engel
These are 2-5 instructions smaller and just as fast.  Branches are
minimized, which will allow easier adaptation to Thumb-2/ARM mode.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/lcmp.S (__aeabi_lcmp, __aeabi_ulcmp): Replaced;
add macro configuration to build __cmpdi2() and __ucmpdi2().
* config/arm/t-elf (LIB1ASMFUNCS): Added _cmpdi2 and _ucmpdi2.
---
 libgcc/config/arm/eabi/lcmp.S | 151 +-
 libgcc/config/arm/t-elf   |   2 +
 2 files changed, 112 insertions(+), 41 deletions(-)

diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
index 336db1d398c..2ac9d178b34 100644
--- a/libgcc/config/arm/eabi/lcmp.S
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -1,8 +1,7 @@
-/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
-   ARMv6-M and ARMv8-M Baseline like ISA variants.
+/* lcmp.S: Thumb-1 optimized 64-bit integer comparison
 
-   Copyright (C) 2006-2020 Free Software Foundation, Inc.
-   Contributed by CodeSourcery.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
 
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -24,50 +23,120 @@
.  */
 
 
+#if defined(L_aeabi_lcmp) || defined(L_cmpdi2)
+
 #ifdef L_aeabi_lcmp
+  #define LCMP_NAME aeabi_lcmp
+  #define LCMP_SECTION .text.sorted.libgcc.lcmp
+#else
+  #define LCMP_NAME cmpdi2
+  #define LCMP_SECTION .text.sorted.libgcc.cmpdi2
+#endif
+
+// int __aeabi_lcmp(long long, long long)
+// int __cmpdi2(long long, long long)
+// Compares the 64 bit signed values in $r1:$r0 and $r3:$r2.
+// lcmp() returns $r0 = { -1, 0, +1 } for orderings { <, ==, > } respectively.
+// cmpdi2() returns $r0 = { 0, 1, 2 } for orderings { <, ==, > } respectively.
+// Object file duplication assumes typical programs follow one runtime ABI.
+FUNC_START_SECTION LCMP_NAME LCMP_SECTION
+CFI_START_FUNCTION
+
+// Calculate the difference $r1:$r0 - $r3:$r2.
+subsxxl,yyl
+sbcsxxh,yyh
+
+// With $r2 free, create a known offset value without affecting
+//  the N or Z flags.
+// BUG? The originally unified instruction for v6m was 'mov r2, r3'.
+//  However, this resulted in a compile error with -mthumb:
+//"MOV Rd, Rs with two low registers not permitted".
+// Since unified syntax deprecates the "cpy" instruction, shouldn't
+//  there be a backwards-compatible translation available?
+cpy r2, r3
+
+// Evaluate the comparison result.
+blt LLSYM(__lcmp_lt)
+
+// The reference offset ($r2 - $r3) will be +2 iff the first
+//  argument is larger, otherwise the offset value remains 0.
+addsr2, #2
+
+// Check for zero (equality in 64 bits).
+// It doesn't matter which register was originally "hi".
+orrsr0,r1
+
+// The result is already 0 on equality.
+beq LLSYM(__lcmp_return)
+
+LLSYM(__lcmp_lt):
+// Create +1 or -1 from the offset value defined earlier.
+addsr3, #1
+subsr0, r2, r3
+
+LLSYM(__lcmp_return):
+  #ifdef L_cmpdi2
+// Offset to the correct output specification.
+addsr0, #1
+  #endif
 
-FUNC_START aeabi_lcmp
-cmp xxh, yyh
-beq 1f
-bgt 2f
-movsr0, #1
-negsr0, r0
-RET
-2:
-movsr0, #1
-RET
-1:
-subsr0, xxl, yyl
-beq 1f
-bhi 2f
-movsr0, #1
-negsr0, r0
-RET
-2:
-movsr0, #1
-1:
 RET
-FUNC_END aeabi_lcmp
 
-#endif /* L_aeabi_lcmp */
+CFI_END_FUNCTION
+FUNC_END LCMP_NAME
+
+#endif /* L_aeabi_lcmp || L_cmpdi2 */
+
+
+#if defined(L_aeabi_ulcmp) || defined(L_ucmpdi2)
 
 #ifdef L_aeabi_ulcmp
+  #define ULCMP_NAME aeabi_ulcmp
+  #define ULCMP_SECTION .text.sorted.libgcc.ulcmp
+#else
+  #define ULCMP_NAME ucmpdi2
+  #define ULCMP_SECTION .text.sorted.libgcc.ucmpdi2
+#endif
+
+// int __aeabi_ulcmp(unsigned long long, unsigned long long)
+// int __ucmpdi2(unsigned long long, unsigned long long)
+// Compares the 64 bit unsigned values in $r1:$r0 and $r3:$r2.
+// ulcmp() returns $r0 = { -1, 0, +1 } for orderings { <, ==, > } respectively.
+// ucmpdi2() returns $r0 = { 0, 1, 2 } for orderings { <, ==, > } respectively.
+// Object file duplication assumes typical programs follow one runtime ABI.
+FUNC_START_SECTION ULCMP_NAME ULCMP_SECTION
+CFI_START_FUNCTION
+
+// Calculate the 'C' flag.
+subsxxl,yyl
+sbcsxxh,yyh
+
+// Capture the carry flg.
+// $r2 will contain -1 if the first value is smaller,
+//  0 if the first value is larger or equa

[PATCH v6 18/34] Merge Thumb-2 optimizations for 64-bit comparison

2021-12-27 Thread Daniel Engel
This effectively merges support for all architecture variants into a
common function path with appropriate build conditions.
ARM performance is 1-2 instructions faster; Thumb-2 is about 50% faster.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi.S (__aeabi_lcmp, __aeabi_ulcmp): Removed.
* config/arm/eabi/lcmp.S (__aeabi_lcmp, __aeabi_ulcmp): Added
conditional execution on supported architectures (__ARM_FEATURE_IT).
* config/arm/lib1funcs.S: Moved #include scope of eabi/lcmp.S.
---
 libgcc/config/arm/bpabi.S | 42 ---
 libgcc/config/arm/eabi/lcmp.S | 47 ++-
 libgcc/config/arm/lib1funcs.S |  2 +-
 3 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index 2cbb67d54ad..4281a2be594 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -34,48 +34,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-#ifdef L_aeabi_lcmp
-
-ARM_FUNC_START aeabi_lcmp
-   cmp xxh, yyh
-   do_it   lt
-   movlt   r0, #-1
-   do_it   gt
-   movgt   r0, #1
-   do_it   ne
-   RETc(ne)
-   subsr0, xxl, yyl
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   RET
-   FUNC_END aeabi_lcmp
-
-#endif /* L_aeabi_lcmp */
-   
-#ifdef L_aeabi_ulcmp
-
-ARM_FUNC_START aeabi_ulcmp
-   cmp xxh, yyh
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   do_it   ne
-   RETc(ne)
-   cmp xxl, yyl
-   do_it   lo
-   movlo   r0, #-1
-   do_it   hi
-   movhi   r0, #1
-   do_it   eq
-   moveq   r0, #0
-   RET
-   FUNC_END aeabi_ulcmp
-
-#endif /* L_aeabi_ulcmp */
-
 .macro test_div_by_zero signed
 /* Tail-call to divide-by-zero handlers which may be overridden by the user,
so unwinding works properly.  */
diff --git a/libgcc/config/arm/eabi/lcmp.S b/libgcc/config/arm/eabi/lcmp.S
index 2ac9d178b34..f1a9c3b8fe0 100644
--- a/libgcc/config/arm/eabi/lcmp.S
+++ b/libgcc/config/arm/eabi/lcmp.S
@@ -46,6 +46,19 @@ FUNC_START_SECTION LCMP_NAME LCMP_SECTION
 subsxxl,yyl
 sbcsxxh,yyh
 
+#ifdef __HAVE_FEATURE_IT
+do_it   lt,t
+
+  #ifdef L_aeabi_lcmp
+movlt   r0,#-1
+  #else
+movlt   r0,#0
+  #endif
+
+// Early return on '<'.
+RETc(lt)
+
+#else /* !__HAVE_FEATURE_IT */
 // With $r2 free, create a known offset value without affecting
 //  the N or Z flags.
 // BUG? The originally unified instruction for v6m was 'mov r2, r3'.
@@ -62,17 +75,27 @@ FUNC_START_SECTION LCMP_NAME LCMP_SECTION
 //  argument is larger, otherwise the offset value remains 0.
 addsr2, #2
 
+#endif
+
 // Check for zero (equality in 64 bits).
 // It doesn't matter which register was originally "hi".
 orrsr0,r1
 
+#ifdef __HAVE_FEATURE_IT
+// The result is already 0 on equality.
+// -1 already returned, so just force +1.
+do_it   ne
+movne   r0, #1
+
+#else /* !__HAVE_FEATURE_IT */
 // The result is already 0 on equality.
 beq LLSYM(__lcmp_return)
 
-LLSYM(__lcmp_lt):
+  LLSYM(__lcmp_lt):
 // Create +1 or -1 from the offset value defined earlier.
 addsr3, #1
 subsr0, r2, r3
+#endif
 
 LLSYM(__lcmp_return):
   #ifdef L_cmpdi2
@@ -111,21 +134,43 @@ FUNC_START_SECTION ULCMP_NAME ULCMP_SECTION
 subsxxl,yyl
 sbcsxxh,yyh
 
+#ifdef __HAVE_FEATURE_IT
+do_it   lo,t
+
+  #ifdef L_aeabi_ulcmp
+movlo   r0, -1
+  #else
+movlo   r0, #0
+  #endif
+
+// Early return on '<'.
+RETc(lo)
+
+#else
 // Capture the carry flg.
 // $r2 will contain -1 if the first value is smaller,
 //  0 if the first value is larger or equal.
 sbcsr2, r2
+#endif
 
 // Check for zero (equality in 64 bits).
 // It doesn't matter which register was originally "hi".
 orrsr0, r1
 
+#ifdef __HAVE_FEATURE_IT
+// The result is already 0 on equality.
+// -1 already returned, so just force +1.
+do_it   ne
+movne   r0, #1
+
+#else /* !__HAVE_FEATURE_IT */
 // The result is already 0 on equality.
 beq LLSYM(__ulcmp_return)
 
 // Assume +1.  If -1 is correct, $r2 will override.
 movsr0, #1
 orrsr0, r2
+#endif
 
 LLSYM(__ulcmp_return):
   #ifdef L_ucmpdi2
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 5e24d0a6749..f41354f811e 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -1991,6 +1991,6 @@ LSYM(Lchange_\regis

[PATCH v6 19/34] Import 32-bit division from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/idiv.S: New file for __udivsi3() and __divsi3().
* config/arm/lib1funcs.S: #include eabi/idiv.S (v6m only).
---
 libgcc/config/arm/eabi/idiv.S | 299 ++
 libgcc/config/arm/lib1funcs.S |  19 ++-
 2 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/arm/eabi/idiv.S

diff --git a/libgcc/config/arm/eabi/idiv.S b/libgcc/config/arm/eabi/idiv.S
new file mode 100644
index 000..7381e8f57a3
--- /dev/null
+++ b/libgcc/config/arm/eabi/idiv.S
@@ -0,0 +1,299 @@
+/* idiv.S: Thumb-1 size-optimized 32-bit integer division
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   .  */
+
+
+#ifndef __GNUC__
+
+// int __aeabi_idiv0(int)
+// Helper function for division by 0.
+WEAK_START_SECTION aeabi_idiv0 .text.sorted.libgcc.idiv.idiv0
+FUNC_ALIAS cm0_idiv0 aeabi_idiv0
+CFI_START_FUNCTION
+
+  #if defined(TRAP_EXCEPTIONS) && TRAP_EXCEPTIONS
+svc #(SVC_DIVISION_BY_ZERO)
+  #endif
+
+RET
+
+CFI_END_FUNCTION
+FUNC_END cm0_idiv0
+FUNC_END aeabi_idiv0
+
+#endif /* !__GNUC__ */
+
+
+#ifdef L_divsi3
+
+// int __aeabi_idiv(int, int)
+// idiv_return __aeabi_idivmod(int, int)
+// Returns signed $r0 after division by $r1.
+// Also returns the signed remainder in $r1.
+// Same parent section as __divsi3() to keep branches within range.
+FUNC_START_SECTION divsi3 .text.sorted.libgcc.idiv.divsi3
+
+#ifndef __symbian__
+  FUNC_ALIAS aeabi_idiv divsi3
+  FUNC_ALIAS aeabi_idivmod divsi3
+#endif
+
+CFI_START_FUNCTION
+
+// Extend signs.
+asrsr2, r0, #31
+asrsr3, r1, #31
+
+// Absolute value of the denominator, abort on division by zero.
+eorsr1, r3
+subsr1, r3
+  #if defined(PEDANTIC_DIV0) && PEDANTIC_DIV0
+beq LLSYM(__idivmod_zero)
+  #else
+beq SYM(__uidivmod_zero)
+  #endif
+
+// Absolute value of the numerator.
+eorsr0, r2
+subsr0, r2
+
+// Keep the sign of the numerator in bit[31] (for the remainder).
+// Save the XOR of the signs in bits[15:0] (for the quotient).
+push{ rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+lsrsrT, r3, #16
+eorsrT, r2
+
+// Handle division as unsigned.
+bl  SYM(__uidivmod_nonzero) __PLT__
+
+// Set the sign of the remainder.
+asrsr2, rT, #31
+eorsr1, r2
+subsr1, r2
+
+// Set the sign of the quotient.
+sxthr3, rT
+eorsr0, r3
+subsr0, r3
+
+LLSYM(__idivmod_return):
+pop { rT, pc }
+.cfi_restore_state
+
+  #if defined(PEDANTIC_DIV0) && PEDANTIC_DIV0
+LLSYM(__idivmod_zero):
+// Set up the *div0() parameter specified in the ARM runtime ABI:
+//  * 0 if the numerator is 0,
+//  * Or, the largest value of the type manipulated by the calling
+// division function if the numerator is positive,
+//  * Or, the least value of the type manipulated by the calling
+// division function if the numerator is negative.
+subsr1, r0
+orrsr0, r1
+asrsr0, #31
+lsrsr0, #1
+eorsr0, r2
+
+// At least the __aeabi_idiv0() call is common.
+b   SYM(__uidivmod_zero2)
+  #endif /* PEDANTIC_DIV0 */
+
+CFI_END_FUNCTION
+FUNC_END divsi3
+
+#ifndef __symbian__
+  FUNC_END aeabi_idiv
+  FUNC_END aeabi_idivmod
+#endif 
+
+#endif /* L_divsi3 */
+
+
+#ifdef L_udivsi3
+
+// int __aeabi_uidiv(unsigned int, unsigned int)
+// idiv_return __aeabi_uidivmod(unsigned int, unsigned int)
+// Returns unsigned $r0 a

[PATCH v6 20/34] Refactor Thumb-1 64-bit division into a new file

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-v6m.S (__aeabi_ldivmod, __aeabi_uldivmod): Moved to ...
* config/arm/eabi/ldiv.S: New file.
* config/arm/lib1funcs.S: #include eabi/ldiv.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S |  81 -
 libgcc/config/arm/eabi/ldiv.S | 107 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 3 files changed, 108 insertions(+), 81 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ldiv.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index a051c1530a4..b3dc3bf8f4d 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -34,87 +34,6 @@
 #endif /* __ARM_EABI__ */
 
 
-.macro test_div_by_zero signed
-   cmp yyh, #0
-   bne 7f
-   cmp yyl, #0
-   bne 7f
-   cmp xxh, #0
-   .ifc\signed, unsigned
-   bne 2f
-   cmp xxl, #0
-2:
-   beq 3f
-   movsxxh, #0
-   mvnsxxh, xxh@ 0xffffffff
-   movsxxl, xxh
-3:
-   .else
-   blt 6f
-   bgt 4f
-   cmp xxl, #0
-   beq 5f
-4: movsxxl, #0
-   mvnsxxl, xxl@ 0xffffffff
-   lsrsxxh, xxl, #1@ 0x7fffffff
-   b   5f
-6: movsxxh, #0x80
-   lslsxxh, xxh, #24   @ 0x80000000
-   movsxxl, #0
-5:
-   .endif
-   @ tailcalls are tricky on v6-m.
-   push{r0, r1, r2}
-   ldr r0, 1f
-   adr r1, 1f
-   addsr0, r1
-   str r0, [sp, #8]
-   @ We know we are not on armv4t, so pop pc is safe.
-   pop {r0, r1, pc}
-   .align  2
-1:
-   .word   __aeabi_ldiv0 - 1b
-7:
-.endm
-
-#ifdef L_aeabi_ldivmod
-
-FUNC_START aeabi_ldivmod
-   test_div_by_zero signed
-
-   push{r0, r1}
-   mov r0, sp
-   push{r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__gnu_ldivmod_helper)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_ldivmod
-
-#endif /* L_aeabi_ldivmod */
-
-#ifdef L_aeabi_uldivmod
-
-FUNC_START aeabi_uldivmod
-   test_div_by_zero unsigned
-
-   push{r0, r1}
-   mov r0, sp
-   push{r0, lr}
-   ldr r0, [sp, #8]
-   bl  SYM(__udivmoddi4)
-   ldr r3, [sp, #4]
-   mov lr, r3
-   add sp, sp, #8
-   pop {r2, r3}
-   RET
-   FUNC_END aeabi_uldivmod
-   
-#endif /* L_aeabi_uldivmod */
-
 #ifdef L_arm_addsubsf3
 
 FUNC_START aeabi_frsub
diff --git a/libgcc/config/arm/eabi/ldiv.S b/libgcc/config/arm/eabi/ldiv.S
new file mode 100644
index 000..3c8280ef580
--- /dev/null
+++ b/libgcc/config/arm/eabi/ldiv.S
@@ -0,0 +1,107 @@
+/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
+   ARMv6-M and ARMv8-M Baseline like ISA variants.
+
+   Copyright (C) 2006-2020 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+.macro test_div_by_zero signed
+cmp yyh, #0
+bne 7f
+cmp yyl, #0
+bne 7f
+cmp xxh, #0
+.ifc\signed, unsigned
+bne 2f
+cmp xxl, #0
+2:
+beq 3f
+movsxxh, #0
+mvnsxxh, xxh@ 0xffffffff
+movsxxl, xxh
+3:
+.else
+blt 6f
+bgt 4f
+cmp xxl, #0
+beq 5f
+4:  movsxxl, #0
+mvnsxxl, xxl@ 0xffffffff
+lsrsxxh, xxl, #1@ 0x7fffffff
+b   5f
+6:  movsxxh, #0x80
+lslsxxh, xxh, #24   @ 0x80000000
+movsxxl, #0
+5:
+.endif
+@ tailcalls are tricky on v6-m.
+push{r0, r1, r2}
+ldr r0, 1f
+adr r1, 1f
+addsr0, r1
+str r0, [sp, #8]
+@ We know we are not on armv4t, so pop 

[PATCH v6 21/34] Import 64-bit division from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi.c: Deleted unused file.
* config/arm/eabi/ldiv.S (__aeabi_ldivmod, __aeabi_uldivmod):
Replaced wrapper functions with a complete implementation.
* config/arm/t-bpabi (LIB2ADD_ST): Removed bpabi.c.
* config/arm/t-elf (LIB1ASMFUNCS): Added _divdi3 and _udivdi3.
---
 libgcc/config/arm/bpabi.c |  42 ---
 libgcc/config/arm/eabi/ldiv.S | 542 +-
 libgcc/config/arm/t-bpabi |   3 +-
 libgcc/config/arm/t-elf   |   9 +
 4 files changed, 474 insertions(+), 122 deletions(-)
 delete mode 100644 libgcc/config/arm/bpabi.c

diff --git a/libgcc/config/arm/bpabi.c b/libgcc/config/arm/bpabi.c
deleted file mode 100644
index bf6ba757964..000
--- a/libgcc/config/arm/bpabi.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Miscellaneous BPABI functions.
-
-   Copyright (C) 2003-2021 Free Software Foundation, Inc.
-   Contributed by CodeSourcery, LLC.
-
-   This file is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by the
-   Free Software Foundation; either version 3, or (at your option) any
-   later version.
-
-   This file is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-extern long long __divdi3 (long long, long long);
-extern unsigned long long __udivdi3 (unsigned long long, 
-unsigned long long);
-extern long long __gnu_ldivmod_helper (long long, long long, long long *);
-
-
-long long
-__gnu_ldivmod_helper (long long a, 
- long long b, 
- long long *remainder)
-{
-  long long quotient;
-
-  quotient = __divdi3 (a, b);
-  *remainder = a - b * quotient;
-  return quotient;
-}
-
diff --git a/libgcc/config/arm/eabi/ldiv.S b/libgcc/config/arm/eabi/ldiv.S
index 3c8280ef580..c225e5973b2 100644
--- a/libgcc/config/arm/eabi/ldiv.S
+++ b/libgcc/config/arm/eabi/ldiv.S
@@ -1,8 +1,7 @@
-/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
-   ARMv6-M and ARMv8-M Baseline like ISA variants.
+/* ldiv.S: Thumb-1 optimized 64-bit integer division
 
-   Copyright (C) 2006-2020 Free Software Foundation, Inc.
-   Contributed by CodeSourcery.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
 
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -24,84 +23,471 @@
<http://www.gnu.org/licenses/>.  */
 
 
-.macro test_div_by_zero signed
-cmp yyh, #0
-bne 7f
-cmp yyl, #0
-bne 7f
-cmp xxh, #0
-.ifc\signed, unsigned
-bne 2f
-cmp xxl, #0
-2:
-beq 3f
-movsxxh, #0
-mvnsxxh, xxh@ 0xffffffff
-movsxxl, xxh
-3:
-.else
-blt 6f
-bgt 4f
-cmp xxl, #0
-beq 5f
-4:  movsxxl, #0
-mvnsxxl, xxl@ 0xffffffff
-lsrsxxh, xxl, #1@ 0x7fffffff
-b   5f
-6:  movsxxh, #0x80
-lslsxxh, xxh, #24   @ 0x80000000
-movsxxl, #0
-5:
-.endif
-@ tailcalls are tricky on v6-m.
-push{r0, r1, r2}
-ldr r0, 1f
-adr r1, 1f
-addsr0, r1
-str r0, [sp, #8]
-@ We know we are not on armv4t, so pop pc is safe.
-pop {r0, r1, pc}
-.align  2
-1:
-.word   __aeabi_ldiv0 - 1b
-7:
-.endm
-
-#ifdef L_aeabi_ldivmod
-
-FUNC_START aeabi_ldivmod
-test_div_by_zero signed
-
-push{r0, r1}
-mov r0, sp
-push{r0, lr}
-ldr r0, [sp, #8]
-bl  SYM(__gnu_ldivmod_helper)
-ldr r3, [sp, #4]
-mov lr, r3
-add sp, sp, #8
-pop {r2, r3}
+#ifndef __GNUC__
+
+// long long __aeabi_ldiv0(long long)
+// Helper function for division by 0.
+WEAK_START_SECTION aeabi_ldiv0 .text.sorted.libgcc.ldiv.ldiv0
+CFI_START_FUNCTION
+
+  #if defined(TRAP_EXCEPTIONS) && TRAP_EXCEPTIONS
+svc #(SVC_DIVISION_BY_ZERO)
+  #endif
+
 RET
-FUNC_END aea

[PATCH v6 22/34] Import integer multiplication from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-07 Daniel Engel 

* config/arm/eabi/lmul.S: New file for __muldi3(), __mulsidi3(), and
 __umulsidi3().
* config/arm/lib1funcs.S: #include eabi/lmul.S (v6m only).
* config/arm/t-elf: Add the new objects to LIB1ASMFUNCS.
---
 libgcc/config/arm/eabi/lmul.S | 218 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |  13 +-
 3 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/lmul.S

diff --git a/libgcc/config/arm/eabi/lmul.S b/libgcc/config/arm/eabi/lmul.S
new file mode 100644
index 000..9fec4364a26
--- /dev/null
+++ b/libgcc/config/arm/eabi/lmul.S
@@ -0,0 +1,218 @@
+/* lmul.S: Thumb-1 optimized 64-bit integer multiplication
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_muldi3
+
+// long long __aeabi_lmul(long long, long long)
+// Returns the least significant 64 bits of a 64 bit multiplication.
+// Expects the two multiplicands in $r1:$r0 and $r3:$r2.
+// Returns the product in $r1:$r0 (does not distinguish signed types).
+// Uses $r4 and $r5 as scratch space.
+// Same parent section as __umulsidi3() to keep tail call branch within range.
+FUNC_START_SECTION muldi3 .text.sorted.libgcc.lmul.muldi3
+
+#ifndef __symbian__
+  FUNC_ALIAS aeabi_lmul muldi3
+#endif
+
+CFI_START_FUNCTION
+
+// $r1:$r0 = 0x
+// $r3:$r2 = 0x
+
+// The following operations that only affect the upper 64 bits
+//  can be safely discarded:
+//    * 
+//    * 
+//    * 
+//    * 
+//    * 
+//    * 
+
+// MAYBE: Test for multiply by ZERO on implementations with a 32-cycle
+//  'muls' instruction, and skip over the operation in that case.
+
+// (0x * 0x), free $r1
+mulsxxh,yyl
+
+// (0x * 0x), free $r3
+mulsyyh,xxl
+addsyyh,xxh
+
+// Put the parameters in the correct form for umulsidi3().
+movsxxh,yyl
+b   LLSYM(__mul_overflow)
+
+CFI_END_FUNCTION
+FUNC_END muldi3
+
+#ifndef __symbian__
+  FUNC_END aeabi_lmul
+#endif
+
+#endif /* L_muldi3 */
+
+
+// The following implementation of __umulsidi3() integrates with __muldi3()
+//  above to allow the fast tail call while still preserving the extra
+//  hi-shifted bits of the result.  However, these extra bits add a few
+//  instructions not otherwise required when using only __umulsidi3().
+// Therefore, this block configures __umulsidi3() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version adds the hi bits of __muldi3().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols in programs that multiply 'long long' integers.
+// This means '_umulsidi3' should appear before '_muldi3' in LIB1ASMFUNCS.
+#if defined(L_muldi3) || defined(L_umulsidi3)
+
+#ifdef L_umulsidi3
+// unsigned long long __umulsidi3(unsigned int, unsigned int)
+// Returns all 64 bits of a 32 bit multiplication.
+// Expects the two multiplicands in $r0 and $r1.
+// Returns the product in $r1:$r0.
+// Uses $r3, $r4 and $ip as scratch space.
+WEAK_START_SECTION umulsidi3 .text.sorted.libgcc.lmul.umulsidi3
+CFI_START_FUNCTION
+
+#else /* L_muldi3 */
+FUNC_ENTRY umulsidi3
+CFI_START_FUNCTION
+
+// 32x32 multiply with 64 bit result.
+// Expand the multiply into 4 parts, since muls only returns 32 bits.
+// (a16h * b16h / 2^32)
+//   + (a16h * b16l / 2^48) + (a16l * b16h / 2^48)
+//   + (a16l * b16l / 2^64)
+
+// MAYBE: Test for multiply by 0 on implementations with a 32-cycle
+//  'muls' instruction, and skip over the operation in that case.
+
+ 

[PATCH v6 23/34] Refactor Thumb-1 float comparison into a new file

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-v6m.S (__aeabi_cfcmpeq, __aeabi_cfcmple,
__aeabi_cfrcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple,
__aeabi_fcmpgt, __aeabi_fcmpge): Moved to ...
* config/arm/eabi/fcmp.S: New file.
* config/arm/lib1funcs.S: #include eabi/fcmp.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S | 63 -
 libgcc/config/arm/eabi/fcmp.S | 89 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 90 insertions(+), 63 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fcmp.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index b3dc3bf8f4d..7c874f06218 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -49,69 +49,6 @@ FUNC_START aeabi_frsub
 
 #endif /* L_arm_addsubsf3 */
 
-#ifdef L_arm_cmpsf2
-
-FUNC_START aeabi_cfrcmple
-
-   mov ip, r0
-   movsr0, r1
-   mov r1, ip
-   b   6f
-
-FUNC_START aeabi_cfcmpeq
-FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
-
-   @ The status-returning routines are required to preserve all
-   @ registers except ip, lr, and cpsr.
-6: push{r0, r1, r2, r3, r4, lr}
-   bl  __lesf2
-   @ Set the Z flag correctly, and the C flag unconditionally.
-   cmp r0, #0
-   @ Clear the C flag if the return value was -1, indicating
-   @ that the first operand was smaller than the second.
-   bmi 1f
-   movsr1, #0
-   cmn r0, r1
-1:
-   pop {r0, r1, r2, r3, r4, pc}
-
-   FUNC_END aeabi_cfcmple
-   FUNC_END aeabi_cfcmpeq
-   FUNC_END aeabi_cfrcmple
-
-FUNC_START aeabi_fcmpeq
-
-   push{r4, lr}
-   bl  __eqsf2
-   negsr0, r0
-   addsr0, r0, #1
-   pop {r4, pc}
-
-   FUNC_END aeabi_fcmpeq
-
-.macro COMPARISON cond, helper, mode=sf2
-FUNC_START aeabi_fcmp\cond
-
-   push{r4, lr}
-   bl  __\helper\mode
-   cmp r0, #0
-   b\cond  1f
-   movsr0, #0
-   pop {r4, pc}
-1:
-   movsr0, #1
-   pop {r4, pc}
-
-   FUNC_END aeabi_fcmp\cond
-.endm
-
-COMPARISON lt, le
-COMPARISON le, le
-COMPARISON gt, ge
-COMPARISON ge, ge
-
-#endif /* L_arm_cmpsf2 */
-
 #ifdef L_arm_addsubdf3
 
 FUNC_START aeabi_drsub
diff --git a/libgcc/config/arm/eabi/fcmp.S b/libgcc/config/arm/eabi/fcmp.S
new file mode 100644
index 000..96d627f1fea
--- /dev/null
+++ b/libgcc/config/arm/eabi/fcmp.S
@@ -0,0 +1,89 @@
+/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
+   ARMv6-M and ARMv8-M Baseline like ISA variants.
+
+   Copyright (C) 2006-2020 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_cmpsf2
+
+FUNC_START aeabi_cfrcmple
+
+   mov ip, r0
+   movsr0, r1
+   mov r1, ip
+   b   6f
+
+FUNC_START aeabi_cfcmpeq
+FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
+   @ The status-returning routines are required to preserve all
+   @ registers except ip, lr, and cpsr.
+6: push{r0, r1, r2, r3, r4, lr}
+   bl  __lesf2
+   @ Set the Z flag correctly, and the C flag unconditionally.
+   cmp r0, #0
+   @ Clear the C flag if the return value was -1, indicating
+   @ that the first operand was smaller than the second.
+   bmi 1f
+   movsr1, #0
+   cmn r0, r1
+1:
+   pop {r0, r1, r2, r3, r4, pc}
+
+   FUNC_END aeabi_cfcmple
+   FUNC_END aeabi_cfcmpeq
+   FUNC_END aeabi_cfrcmple
+
+FUNC_START aeabi_fcmpeq
+
+   push{r4, lr}
+   bl  __eqsf2
+   negsr0, r0
+   addsr0, r0, #1
+   pop {r4, pc}
+
+   FUNC_END aeabi_fcmpeq
+
+.macro COMPARISON cond, helper, mode=sf2
+FUNC_START aeabi_fcmp\cond
+
+   push{r4, lr}
+   bl  __\helper\mode
+   cmp r0, #0
+   b\cond  1f
+   movsr0, #0
+   pop {r4, pc}
+1:
+   movsr0, #1
+  

[PATCH v6 24/34] Import float comparison from the CM0 library

2021-12-27 Thread Daniel Engel
These functions are significantly smaller and faster than the wrapper
functions and soft-float implementation they replace.  Using the first
comparison operator (e.g. '<=') in any program costs about 70 bytes
initially, but every additional operator incrementally adds just 4 bytes.

NOTE: It seems that the __aeabi_cfcmp*() routines formerly in bpabi-v6m.S
were not well tested, as they returned wrong results for the 'C' flag.
The replacement functions are fully tested.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/fcmp.S (__cmpsf2, __eqsf2, __gesf2,
__aeabi_fcmpne, __aeabi_fcmpun): Added new functions.
(__aeabi_fcmpeq, __aeabi_fcmpne, __aeabi_fcmplt, __aeabi_fcmple,
 __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_cfcmple, __aeabi_cfcmpeq,
 __aeabi_cfrcmple): Replaced with branches to __internal_cmpsf2().
* config/arm/eabi/fplib.h: New file with fcmp-specific constants
and general build configuration macros.
* config/arm/lib1funcs.S: #include eabi/fplib.h (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _internal_cmpsf2,
_arm_cfcmpeq, _arm_cfcmple, _arm_cfrcmple, _arm_fcmpeq,
_arm_fcmpge, _arm_fcmpgt, _arm_fcmple, _arm_fcmplt, _arm_fcmpne,
_arm_eqsf2, and _arm_gesf2.
---
 libgcc/config/arm/eabi/fcmp.S  | 643 +
 libgcc/config/arm/eabi/fplib.h |  83 +
 libgcc/config/arm/lib1funcs.S  |   1 +
 libgcc/config/arm/t-elf|  18 +
 4 files changed, 681 insertions(+), 64 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fplib.h

diff --git a/libgcc/config/arm/eabi/fcmp.S b/libgcc/config/arm/eabi/fcmp.S
index 96d627f1fea..cada33f4d35 100644
--- a/libgcc/config/arm/eabi/fcmp.S
+++ b/libgcc/config/arm/eabi/fcmp.S
@@ -1,8 +1,7 @@
-/* Miscellaneous BPABI functions.  Thumb-1 implementation, suitable for ARMv4T,
-   ARMv6-M and ARMv8-M Baseline like ISA variants.
+/* fcmp.S: Thumb-1 optimized 32-bit float comparison
 
-   Copyright (C) 2006-2020 Free Software Foundation, Inc.
-   Contributed by CodeSourcery.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
 
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -24,66 +23,582 @@
<http://www.gnu.org/licenses/>.  */
 
 
+// The various compare functions in this file all expect to tail call __cmpsf2()
+//  with flags set for a particular comparison mode.  The __internal_cmpsf2()
+//  symbol itself is unambiguous, but there is a remote risk that the linker
+//  will prefer some other symbol in place of __cmpsf2().  Importing an archive
+//  file that also exports __cmpsf2() will throw an error in this case.
+// As a workaround, this block configures __internal_cmpsf2() for compilation twice.
+// The first version configures __internal_cmpsf2() as a WEAK standalone symbol,
+//  and the second exports __cmpsf2() and __internal_cmpsf2() normally.
+// A small bonus: programs not using __cmpsf2() itself will be slightly smaller.
+// 'L_internal_cmpsf2' should appear before 'L_arm_cmpsf2' in LIB1ASMFUNCS.
+#if defined(L_arm_cmpsf2) || defined(L_internal_cmpsf2)
+
+#define CMPSF2_SECTION .text.sorted.libgcc.fcmp.cmpsf2
+
+// int __cmpsf2(float, float)
+// 
+// Returns the three-way comparison result of $r0 with $r1:
+//  * +1 if ($r0 > $r1), or either argument is NAN
+//  *  0 if ($r0 == $r1)
+//  * -1 if ($r0 < $r1)
+// Uses $r2, $r3, and $ip as scratch space.
+#ifdef L_arm_cmpsf2
+FUNC_START_SECTION cmpsf2 CMPSF2_SECTION
+FUNC_ALIAS lesf2 cmpsf2
+FUNC_ALIAS ltsf2 cmpsf2
+CFI_START_FUNCTION
+
+// Assumption: The 'libgcc' functions should raise exceptions.
+movsr2, #(FCMP_UN_POSITIVE + FCMP_RAISE_EXCEPTIONS + FCMP_3WAY)
+
+// int,int __internal_cmpsf2(float, float, int)
+// Internal function expects a set of control flags in $r2.
+// If ordered, returns a comparison type { 0, 1, 2 } in $r3
+FUNC_ENTRY internal_cmpsf2
+
+#else /* L_internal_cmpsf2 */
+WEAK_START_SECTION internal_cmpsf2 CMPSF2_SECTION
+CFI_START_FUNCTION
+
+#endif 
+
+// When operand signs are considered, the comparison result falls
+//  within one of the following quadrants:
+//
+// $r0  $r1  $r0-$r1* flags  result
+//  ++  >  C=0 GT
+//  ++  =  Z=1 EQ
+//  ++  <  C=1 LT
+//  +-  >  C=1 GT
+//  +-  =  C=1 GT
+//  +-  <  C=1 GT
+//  -+  >  C=0 LT
+//  -+  =  C=0 LT
+//  -+  <  C=0 LT
+//  --  >  C=0 LT
+//  --  =  Z=1 EQ
+//  --  <  C=1 GT
+//
+ 

[PATCH v6 25/34] Refactor Thumb-1 float subtraction into a new file

2021-12-27 Thread Daniel Engel
This will make it easier to isolate changes in subsequent patches.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-v6m.S (__aeabi_frsub): Moved to ...
* config/arm/eabi/fadd.S: New file.
* config/arm/lib1funcs.S: #include eabi/fadd.S (v6m only).
---
 libgcc/config/arm/bpabi-v6m.S | 16 ---
 libgcc/config/arm/eabi/fadd.S | 38 +++
 libgcc/config/arm/lib1funcs.S |  1 +
 3 files changed, 39 insertions(+), 16 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fadd.S

diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
index 7c874f06218..c76c3b0568b 100644
--- a/libgcc/config/arm/bpabi-v6m.S
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -33,22 +33,6 @@
.eabi_attribute 25, 1
 #endif /* __ARM_EABI__ */
 
-
-#ifdef L_arm_addsubsf3
-
-FUNC_START aeabi_frsub
-
-  push {r4, lr}
-  movs r4, #1
-  lsls r4, #31
-  eors r0, r0, r4
-  bl   __aeabi_fadd
-  pop  {r4, pc}
-
-  FUNC_END aeabi_frsub
-
-#endif /* L_arm_addsubsf3 */
-
 #ifdef L_arm_addsubdf3
 
 FUNC_START aeabi_drsub
diff --git a/libgcc/config/arm/eabi/fadd.S b/libgcc/config/arm/eabi/fadd.S
new file mode 100644
index 000..fffbd91d1bc
--- /dev/null
+++ b/libgcc/config/arm/eabi/fadd.S
@@ -0,0 +1,38 @@
+/* Copyright (C) 2006-2021 Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_addsubsf3
+
+FUNC_START aeabi_frsub
+
+  push {r4, lr}
+  movs r4, #1
+  lsls r4, #31
+  eors r0, r0, r4
+  bl   __aeabi_fadd
+  pop  {r4, pc}
+
+  FUNC_END aeabi_frsub
+
+#endif /* L_arm_addsubsf3 */
+
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 236b7a7763f..31132633f32 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -2012,6 +2012,7 @@ LSYM(Lchange_\register):
 #include "bpabi-v6m.S"
 #include "eabi/fplib.h"
 #include "eabi/fcmp.S"
+#include "eabi/fadd.S"
 #endif /* NOT_ISA_TARGET_32BIT */
 #include "eabi/lcmp.S"
 #endif /* !__symbian__ */
-- 
2.25.1



[PATCH v6 26/34] Import float addition and subtraction from the CM0 library

2021-12-27 Thread Daniel Engel
Since this is the first import of single-precision functions, some common
parsing and formatting routines are also included.  These common routines
will be referenced by other functions in subsequent commits.
However, even if the size penalty is accounted entirely to __addsf3(),
the total compiled size is still less than half the size of soft-float.

gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/fadd.S (__addsf3, __subsf3): Added new functions.
* config/arm/eabi/fneg.S (__negsf2): Added new file.
* config/arm/eabi/futil.S (__fp_normalize2, __fp_lalign2, __fp_assemble,
__fp_overflow, __fp_zero, __fp_check_nan): Added new file with shared
helper functions.
* config/arm/lib1funcs.S: #include eabi/fneg.S and eabi/futil.S (v6m 
only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _arm_addsf3, _arm_frsubsf3,
_fp_exceptionf, _fp_checknanf, _fp_assemblef, and _fp_normalizef.
---
 libgcc/config/arm/eabi/fadd.S  | 306 +++-
 libgcc/config/arm/eabi/fneg.S  |  76 ++
 libgcc/config/arm/eabi/fplib.h |   3 -
 libgcc/config/arm/eabi/futil.S | 418 +
 libgcc/config/arm/lib1funcs.S  |   2 +
 libgcc/config/arm/t-elf|   6 +
 6 files changed, 798 insertions(+), 13 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/fneg.S
 create mode 100644 libgcc/config/arm/eabi/futil.S

diff --git a/libgcc/config/arm/eabi/fadd.S b/libgcc/config/arm/eabi/fadd.S
index fffbd91d1bc..77b81d62b3b 100644
--- a/libgcc/config/arm/eabi/fadd.S
+++ b/libgcc/config/arm/eabi/fadd.S
@@ -1,5 +1,7 @@
-/* Copyright (C) 2006-2021 Free Software Foundation, Inc.
-   Contributed by CodeSourcery.
+/* fadd.S: Thumb-1 optimized 32-bit float addition and subtraction
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
 
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -21,18 +23,302 @@
<http://www.gnu.org/licenses/>.  */
 
 
+#ifdef L_arm_frsubsf3
+
+// float __aeabi_frsub(float, float)
+// Returns the floating point difference of $r1 - $r0 in $r0.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_frsub .text.sorted.libgcc.fpcore.b.frsub
+CFI_START_FUNCTION
+
+  #if defined(STRICT_NANS) && STRICT_NANS
+// Check if $r0 is NAN before modifying.
+lslsr2, r0, #1
+movsr3, #255
+lslsr3, #24
+
+// Let fadd() find the NAN in the normal course of operation,
+//  moving it to $r0 and checking the quiet/signaling bit.
+cmp r2, r3
+bhi SYM(__aeabi_fadd)
+  #endif
+
+// Flip sign and run through fadd().
+movsr2, #1
+lslsr2, #31
+addsr0, r2
+b   SYM(__aeabi_fadd)
+
+CFI_END_FUNCTION
+FUNC_END aeabi_frsub
+
+#endif /* L_arm_frsubsf3 */
+
+
 #ifdef L_arm_addsubsf3
 
-FUNC_START aeabi_frsub
+// float __aeabi_fsub(float, float)
+// Returns the floating point difference of $r0 - $r1 in $r0.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fsub .text.sorted.libgcc.fpcore.c.faddsub
+FUNC_ALIAS subsf3 aeabi_fsub
+CFI_START_FUNCTION
 
-  push {r4, lr}
-  movs r4, #1
-  lsls r4, #31
-  eors r0, r0, r4
-  bl   __aeabi_fadd
-  pop  {r4, pc}
+  #if defined(STRICT_NANS) && STRICT_NANS
+// Check if $r1 is NAN before modifying.
+lslsr2, r1, #1
+movsr3, #255
+lslsr3, #24
 
-  FUNC_END aeabi_frsub
+// Let fadd() find the NAN in the normal course of operation,
+//  moving it to $r0 and checking the quiet/signaling bit.
+cmp r2, r3
+bhi SYM(__aeabi_fadd)
+  #endif
+
+// Flip sign and fall into fadd().
+movsr2, #1
+lslsr2, #31
+addsr1, r2
 
 #endif /* L_arm_addsubsf3 */
 
+
+// The execution of __subsf3() flows directly into __addsf3(), such that
+//  instructions must appear consecutively in the same memory section.
+//  However, this construction inhibits the ability to discard __subsf3()
+//  when only using __addsf3().
+// Therefore, this block configures __addsf3() for compilation twice.
+// The first version is a minimal standalone implementation, and the second
+//  version is the continuation of __subsf3().  The standalone version must
+//  be declared WEAK, so that the combined version can supersede it and
+//  provide both symbols when required.
+// '_arm_addsf3' should appear before '_arm_addsubsf3' in LIB1ASMFUNCS.
+#if defined(L_arm_addsf3) || defined(L_arm_addsubsf3)
+
+#ifdef L_arm_addsf3
+// float __aeabi_fadd(float, float)
+// Returns the floating point 

[PATCH v6 27/34] Import float multiplication from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/fmul.S (__mulsf3): New file.
* config/arm/lib1funcs.S: #include eabi/fmul.S (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Moved _mulsf3 to global scope
(this object was previously blocked on v6m builds).
---
 libgcc/config/arm/eabi/fmul.S | 215 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |   3 +-
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 libgcc/config/arm/eabi/fmul.S

diff --git a/libgcc/config/arm/eabi/fmul.S b/libgcc/config/arm/eabi/fmul.S
new file mode 100644
index 000..767de988f0b
--- /dev/null
+++ b/libgcc/config/arm/eabi/fmul.S
@@ -0,0 +1,215 @@
+/* fmul.S: Thumb-1 optimized 32-bit float multiplication
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_mulsf3
+
+// float __aeabi_fmul(float, float)
+// Returns $r0 after multiplication by $r1.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fmul .text.sorted.libgcc.fpcore.m.fmul
+FUNC_ALIAS mulsf3 aeabi_fmul
+CFI_START_FUNCTION
+
+// Standard registers, compatible with exception handling.
+push{ rT, lr }
+.cfi_remember_state
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save the sign of the result.
+movsrT, r1
+eorsrT, r0
+lsrsrT, #31
+lslsrT, #31
+mov ip, rT
+
+// Set up INF for comparison.
+movsrT, #255
+lslsrT, #24
+
+// Check for multiplication by zero.
+lslsr2, r0, #1
+beq LLSYM(__fmul_zero1)
+
+lslsr3, r1, #1
+beq LLSYM(__fmul_zero2)
+
+// Check for INF/NAN.
+cmp r3, rT
+bhs LLSYM(__fmul_special2)
+
+cmp r2, rT
+bhs LLSYM(__fmul_special1)
+
+// Because neither operand is INF/NAN, the result will be finite.
+// It is now safe to modify the original operand registers.
+lslsr0, #9
+
+// Isolate the first exponent.  When normal, add back the implicit '1'.
+// The result is always aligned with the MSB in bit [31].
+// Subnormal mantissas remain effectively multiplied by 2x relative to
+//  normals, but this works because the weight of a subnormal is -126.
+lsrsr2, #24
+beq LLSYM(__fmul_normalize2)
+addsr0, #1
+rorsr0, r0
+
+LLSYM(__fmul_normalize2):
+// IMPORTANT: exp10i() jumps in here!
+// Repeat for the mantissa of the second operand.
+// Short-circuit when the mantissa is 1.0, as the
+//  first mantissa is already prepared in $r0
+lslsr1, #9
+
+// When normal, add back the implicit '1'.
+lsrsr3, #24
+beq LLSYM(__fmul_go)
+addsr1, #1
+rorsr1, r1
+
+LLSYM(__fmul_go):
+// Calculate the final exponent, relative to bit [30].
+addsrT, r2, r3
+subsrT, #127
+
+  #if !defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__
+// Short-circuit on multiplication by powers of 2.
+lslsr3, r0, #1
+beq LLSYM(__fmul_simple1)
+
+lslsr3, r1, #1
+beq LLSYM(__fmul_simple2)
+  #endif
+
+// Save $ip across the call.
+// (Alternatively, could push/pop a separate register,
+//  but the four instructions here are equivalently fast
+//  without imposing on the stack.)
+add rT, ip
+
+// 32x32 unsigned multiplication, 64 bit result.
+bl  SYM(__umulsidi3) __PLT__
+
+// Separ

[PATCH v6 28/34] Import float division from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-08 Daniel Engel 

* config/arm/eabi/fdiv.S (__divsf3, __fp_divloopf): New file.
* config/arm/lib1funcs.S: #include eabi/fdiv.S (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _divsf3 and _fp_divloopf.
---
 libgcc/config/arm/eabi/fdiv.S | 261 ++
 libgcc/config/arm/lib1funcs.S |   1 +
 libgcc/config/arm/t-elf   |   2 +
 3 files changed, 264 insertions(+)
 create mode 100644 libgcc/config/arm/eabi/fdiv.S

diff --git a/libgcc/config/arm/eabi/fdiv.S b/libgcc/config/arm/eabi/fdiv.S
new file mode 100644
index 000..9571f0afec1
--- /dev/null
+++ b/libgcc/config/arm/eabi/fdiv.S
@@ -0,0 +1,261 @@
+/* fdiv.S: Thumb-1 optimized 32-bit float division
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_divsf3
+
+// float __aeabi_fdiv(float, float)
+// Returns $r0 after division by $r1.
+// Subsection ordering within fpcore keeps conditional branches within range.
+FUNC_START_SECTION aeabi_fdiv .text.sorted.libgcc.fpcore.n.fdiv
+FUNC_ALIAS divsf3 aeabi_fdiv
+CFI_START_FUNCTION
+
+// Standard registers, compatible with exception handling.
+push{ rT, lr }
+.cfi_remember_state
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save for the sign of the result.
+movsr3, r1
+eorsr3, r0
+lsrsrT, r3, #31
+lslsrT, #31
+mov ip, rT
+
+// Set up INF for comparison.
+movsrT, #255
+lslsrT, #24
+
+// Check for divide by 0.  Automatically catches 0/0.
+lslsr2, r1, #1
+beq LLSYM(__fdiv_by_zero)
+
+// Check for INF/INF, or a number divided by itself.
+lslsr3, #1
+beq LLSYM(__fdiv_equal)
+
+// Check the numerator for INF/NAN.
+eorsr3, r2
+cmp r3, rT
+bhs LLSYM(__fdiv_special1)
+
+// Check the denominator for INF/NAN.
+cmp r2, rT
+bhs LLSYM(__fdiv_special2)
+
+// Check the numerator for zero.
+cmp r3, #0
+beq SYM(__fp_zero)
+
+// No action if the numerator is subnormal.
+//  The mantissa will normalize naturally in the division loop.
+lslsr0, #9
+lsrsr1, r3, #24
+beq LLSYM(__fdiv_denominator)
+
+// Restore the numerator's implicit '1'.
+addsr0, #1
+rorsr0, r0
+
+LLSYM(__fdiv_denominator):
+// The denominator must be normalized and left aligned.
+bl  SYM(__fp_normalize2)
+
+// 25 bits of precision will be sufficient.
+movsrT, #64
+
+// Run division.
+bl  SYM(__fp_divloopf)
+b   SYM(__fp_assemble)
+
+LLSYM(__fdiv_equal):
+  #if defined(EXCEPTION_CODES) && EXCEPTION_CODES
+movsr3, #(DIVISION_INF_BY_INF)
+  #endif
+
+// The absolute value of both operands are equal, but not 0.
+// If both operands are INF, create a new NAN.
+cmp r2, rT
+beq SYM(__fp_exception)
+
+  #if defined(TRAP_NANS) && TRAP_NANS
+// If both operands are NAN, return the NAN in $r0.
+bhi SYM(__fp_check_nan)
+  #else
+bhi LLSYM(__fdiv_return)
+  #endif
+
+// Return 1.0f, with appropriate sign.
+movsr0, #127
+lslsr0, #23
+add r0, ip
+
+LLSYM(__fdiv_return):
+pop { rT, pc }
+.cfi_restore_state
+
+LLSYM(__fdiv_special2):
+// The denominator is either INF or NAN, numerator is neither.
+// Also, the denominator is not equal to 0.
+// If the denominator is INF, the result goes to 

[PATCH v6 29/34] Import integer-to-float conversion from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-lib.h (__floatdisf, __floatundisf):
Remove obsolete RENAME_LIBRARY directives.
* config/arm/eabi/ffloat.S (__aeabi_i2f, __aeabi_l2f, __aeabi_ui2f,
__aeabi_ul2f): New file.
* config/arm/lib1funcs.S: #include eabi/ffloat.S (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _arm_floatunsisf,
_arm_floatsisf, and _internal_floatundisf.
Moved _arm_floatundisf to the weak function group
---
 libgcc/config/arm/bpabi-lib.h   |   6 -
 libgcc/config/arm/eabi/ffloat.S | 247 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   5 +-
 4 files changed, 252 insertions(+), 7 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ffloat.S

diff --git a/libgcc/config/arm/bpabi-lib.h b/libgcc/config/arm/bpabi-lib.h
index 3cb90b4b345..1e651ead4ac 100644
--- a/libgcc/config/arm/bpabi-lib.h
+++ b/libgcc/config/arm/bpabi-lib.h
@@ -56,9 +56,6 @@
 #ifdef L_floatdidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatdidf, l2d)
 #endif
-#ifdef L_floatdisf
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatdisf, l2f)
-#endif
 
 /* These renames are needed on ARMv6M.  Other targets get them from
assembly routines.  */
@@ -71,9 +68,6 @@
 #ifdef L_floatundidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundidf, ul2d)
 #endif
-#ifdef L_floatundisf
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundisf, ul2f)
-#endif
 
 /* For ARM bpabi, we only want to use a "__gnu_" prefix for the fixed-point
helper functions - not everything in libgcc - in the interests of
diff --git a/libgcc/config/arm/eabi/ffloat.S b/libgcc/config/arm/eabi/ffloat.S
new file mode 100644
index 000..9690ab85081
--- /dev/null
+++ b/libgcc/config/arm/eabi/ffloat.S
@@ -0,0 +1,247 @@
+/* ffloat.S: Thumb-1 optimized integer-to-float conversion
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_floatsisf
+
+// float __aeabi_i2f(int)
+// Converts a signed integer in $r0 to float.
+
+// On little-endian cores (including all Cortex-M), __floatsisf() can be
+//  implemented as below in 5 instructions.  However, it can also be
+//  implemented by prefixing a single instruction to __floatdisf().
+// A memory savings of 4 instructions at a cost of only 2 execution cycles
+//  seems reasonable enough.  Plus, the trade-off only happens in programs
+//  that require both __floatsisf() and __floatdisf().  Programs only using
+//  __floatsisf() always get the smallest version.
+// When the combined version is provided, this standalone version
+//  must be declared WEAK, so that the combined version can supersede it.
+// '_arm_floatsisf' should appear before '_arm_floatdisf' in LIB1ASMFUNCS.
+// Same parent section as __ul2f() to keep tail call branch within range.
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+WEAK_START_SECTION aeabi_i2f .text.sorted.libgcc.fpcore.p.floatsisf
+WEAK_ALIAS floatsisf aeabi_i2f
+CFI_START_FUNCTION
+
+#else /* !__OPTIMIZE_SIZE__ */
+FUNC_START_SECTION aeabi_i2f .text.sorted.libgcc.fpcore.p.floatsisf
+FUNC_ALIAS floatsisf aeabi_i2f
+CFI_START_FUNCTION
+
+#endif /* !__OPTIMIZE_SIZE__ */
+
+// Save the sign as a mask in $r3: 0 if non-negative, -1 if negative.
+asrsr3, r0, #31
+
+// Absolute value of the input, computed as ($r0 ^ mask) - mask.
+eorsr0, r3
+subsr0, r3
+
+// Zero extension to unsigned long long: clear the upper word in $r1.
+eorsr1, r1
+b   SYM(__internal_floatundisf_noswap)
+
+CFI_END_FUNCTION
+FUNC_END floatsisf
+FUNC_END aeabi_i2f
+
+#endif /* L_arm_floatsisf */
+
+
+#ifdef L_arm_floatdisf
+
+// float __aeabi_l2f(long long)
+// Converts a signed 64-bit integer in $r1:$r0 to a float in $r0.
+// See build comments for __floatsisf() above.
+// Same parent section as __ul2f() to keep tail call branch within range.
+#if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+FUNC_START_SECTION aeabi

[PATCH v6 30/34] Import float-to-integer conversion from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/bpabi-lib.h (muldi3): Removed duplicate.
(fixunssfsi): Removed obsolete RENAME_LIBRARY directive.
* config/arm/eabi/ffixed.S (__aeabi_f2iz, __aeabi_f2uiz,
__aeabi_f2lz, __aeabi_f2ulz): New file.
* config/arm/lib1funcs.S: #include eabi/ffixed.S (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _internal_fixsfdi,
_internal_fixsfsi, _arm_fixsfdi, and _arm_fixunssfdi.
---
 libgcc/config/arm/bpabi-lib.h   |   6 -
 libgcc/config/arm/eabi/ffixed.S | 414 
 libgcc/config/arm/lib1funcs.S   |   1 +
 libgcc/config/arm/t-elf |   4 +
 4 files changed, 419 insertions(+), 6 deletions(-)
 create mode 100644 libgcc/config/arm/eabi/ffixed.S

diff --git a/libgcc/config/arm/bpabi-lib.h b/libgcc/config/arm/bpabi-lib.h
index 1e651ead4ac..a1c631640bb 100644
--- a/libgcc/config/arm/bpabi-lib.h
+++ b/libgcc/config/arm/bpabi-lib.h
@@ -32,9 +32,6 @@
 #ifdef L_muldi3
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (muldi3, lmul)
 #endif
-#ifdef L_muldi3
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (muldi3, lmul)
-#endif
 #ifdef L_fixdfdi
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixdfdi, d2lz) \
   extern DWtype __fixdfdi (DFtype) __attribute__((pcs("aapcs"))); \
@@ -62,9 +59,6 @@
 #ifdef L_fixunsdfsi
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixunsdfsi, d2uiz)
 #endif
-#ifdef L_fixunssfsi
-#define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (fixunssfsi, f2uiz)
-#endif
 #ifdef L_floatundidf
 #define DECLARE_LIBRARY_RENAMES RENAME_LIBRARY (floatundidf, ul2d)
 #endif
diff --git a/libgcc/config/arm/eabi/ffixed.S b/libgcc/config/arm/eabi/ffixed.S
new file mode 100644
index 000..8ced3a701ff
--- /dev/null
+++ b/libgcc/config/arm/eabi/ffixed.S
@@ -0,0 +1,414 @@
+/* ffixed.S: Thumb-1 optimized float-to-integer conversion
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+// The implementation of __aeabi_f2uiz() expects to tail call __internal_f2iz()
+//  with the flags register set for unsigned conversion.  The __internal_f2iz()
+//  symbol itself is unambiguous, but there is a remote risk that the linker
+//  will prefer some other symbol in place of __aeabi_f2iz().  Importing an
+//  archive file that exports __aeabi_f2iz() will throw an error in this case.
+// As a workaround, this block configures __aeabi_f2iz() for compilation twice.
+// The first version configures __internal_f2iz() as a WEAK standalone symbol,
+//  and the second exports __aeabi_f2iz() and __internal_f2iz() normally.
+// A small bonus: programs only using __aeabi_f2uiz() will be slightly smaller.
+// '_internal_fixsfsi' should appear before '_arm_fixsfsi' in LIB1ASMFUNCS.
+#if defined(L_arm_fixsfsi) || \
+   (defined(L_internal_fixsfsi) && \
+  !(defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__))
+
+// Subsection ordering within fpcore keeps conditional branches within range.
+#define F2IZ_SECTION .text.sorted.libgcc.fpcore.r.fixsfsi
+
+// int __aeabi_f2iz(float)
+// Converts a float in $r0 to signed integer, rounding toward 0.
+// Values out of range are forced to either INT_MAX or INT_MIN.
+// NAN becomes zero.
+#ifdef L_arm_fixsfsi
+FUNC_START_SECTION aeabi_f2iz F2IZ_SECTION
+FUNC_ALIAS fixsfsi aeabi_f2iz
+CFI_START_FUNCTION
+#endif
+
+  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
+// Flag for unsigned conversion.
+movsr1, #33
+b   SYM(__internal_fixsfdi)
+
+  #else /* !__OPTIMIZE_SIZE__ */
+
+#ifdef L_arm_fixsfsi
+// Flag for signed conversion.
+movsr3, #1
+
+// [unsigned] int internal_f2iz(float, int)
+// Internal function expects a boolean flag in $r1.
+// If the boolean flag is 0, the result is unsigned.
+// If the boolean flag is 1, the result is signed.
+FUNC_ENTRY internal_f2iz
+
+#else /* L_internal_fixsfsi */
+WEAK_START_SECTION internal_f2iz F2IZ_SECTION
+CFI_START_FUNCTION
+

[PATCH v6 31/34] Import float<->double conversion from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/fcast.S (__aeabi_d2f, __aeabi_f2d): New file.
* config/arm/lib1funcs.S: #include eabi/fcast.S (v6m only).
* config/arm/t-elf (LIB1ASMFUNCS): Added _arm_d2f and _arm_f2d.
---
 libgcc/config/arm/eabi/fcast.S | 256 +
 libgcc/config/arm/lib1funcs.S  |   1 +
 libgcc/config/arm/t-elf|   2 +
 3 files changed, 259 insertions(+)
 create mode 100644 libgcc/config/arm/eabi/fcast.S

diff --git a/libgcc/config/arm/eabi/fcast.S b/libgcc/config/arm/eabi/fcast.S
new file mode 100644
index 000..b1184ee1d53
--- /dev/null
+++ b/libgcc/config/arm/eabi/fcast.S
@@ -0,0 +1,256 @@
+/* fcast.S: Thumb-1 optimized 32- and 64-bit float conversions
+
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#ifdef L_arm_f2d
+
+// double __aeabi_f2d(float)
+// Converts a single-precision float in $r0 to double-precision in $r1:$r0.
+// Rounding, overflow, and underflow are impossible.
+// INF and ZERO are returned unmodified.
+FUNC_START_SECTION aeabi_f2d .text.sorted.libgcc.fpcore.v.f2d
+FUNC_ALIAS extendsfdf2 aeabi_f2d
+CFI_START_FUNCTION
+
+// Save the sign in bit [31] of $r1 (the high word of the result).
+lsrsr1, r0, #31
+lslsr1, #31
+
+// Set up registers for __fp_normalize2().
+push{ rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Test for zero (the shift also drops the sign bit from $r0).
+lslsr0, #1
+beq LLSYM(__f2d_return)
+
+// Split the exponent and mantissa into separate registers.
+// This is the most efficient way to convert subnormals in the
+//  single-precision form into normals in double-precision.
+// This does add a leading implicit '1' to INF and NAN,
+//  but that will be absorbed when the value is re-assembled.
+movsr2, r0
+bl  SYM(__fp_normalize2) __PLT__
+
+// Set up the exponent bias.  For INF/NAN values, the bias
+//  is 1791 (2047 - 255 - 1), where the last '1' accounts
+//  for the implicit '1' in the mantissa.
+movsr0, #3
+lslsr0, #9
+addsr0, #255
+
+// Test for INF/NAN, promote exponent if necessary
+cmp r2, #255
+beq LLSYM(__f2d_indefinite)
+
+// For normal values, the exponent bias is 895 (1023 - 127 - 1),
+//  which is half of the prepared INF/NAN bias.
+lsrsr0, #1
+
+LLSYM(__f2d_indefinite):
+// Assemble exponent with bias correction.
+addsr2, r0
+lslsr2, #20
+addsr1, r2
+
+// Assemble the high word of the mantissa.
+lsrsr0, r3, #11
+add r1, r0
+
+// Remainder of the mantissa in the low word of the result.
+lslsr0, r3, #21
+
+LLSYM(__f2d_return):
+pop { rT, pc }
+.cfi_restore_state
+
+CFI_END_FUNCTION
+FUNC_END extendsfdf2
+FUNC_END aeabi_f2d
+
+#endif /* L_arm_f2d */
+
+
+#if defined(L_arm_d2f) || defined(L_arm_truncdfsf2)
+
+// HACK: Build two separate implementations:
+//  * __aeabi_d2f() rounds to nearest per traditional IEEE-754 rules.
+//  * __truncdfsf2() rounds towards zero per GCC specification.
+// Presumably, a program will consistently use one ABI or the other,
+//  which means that code size will not be duplicated in practice.
+// Merging two versions with dynamic rounding would be rather hard.
+#ifdef L_arm_truncdfsf2
+  #define D2F_NAME truncdfsf2
+  #define D2F_SECTION .text.sorted.libgcc.fpcore.x.truncdfsf2
+#else
+  #define D2F_NAME aeabi_d2f
+  #define D2F_SECTION .text.sorted.libgcc.fpcore.w.d2f
+#endif
+
+// float __aeabi_d2f(double)
+// Converts a double-precision float in $r1:$r0 to single-precision in $r0.
+// Values out of range become ZERO or 

[PATCH v6 32/34] Import float<->__fp16 conversion from the CM0 library

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-13 Daniel Engel 

* config/arm/eabi/fcast.S (__aeabi_h2f, __aeabi_f2h): Added functions.
* config/arm/fp16.c (__gnu_f2h_ieee, __gnu_h2f_ieee,
__gnu_f2h_alternative,
__gnu_h2f_alternative): Disable build for v6m multilibs.
* config/arm/t-bpabi (LIB1ASMFUNCS): Added _aeabi_f2h_ieee,
_aeabi_h2f_ieee, _aeabi_f2h_alt, and _aeabi_h2f_alt (v6m only).
---
 libgcc/config/arm/eabi/fcast.S | 277 +
 libgcc/config/arm/fp16.c   |   4 +
 libgcc/config/arm/t-bpabi  |   7 +
 3 files changed, 288 insertions(+)

diff --git a/libgcc/config/arm/eabi/fcast.S b/libgcc/config/arm/eabi/fcast.S
index b1184ee1d53..e5a34d69578 100644
--- a/libgcc/config/arm/eabi/fcast.S
+++ b/libgcc/config/arm/eabi/fcast.S
@@ -254,3 +254,280 @@ FUNC_END D2F_NAME
 
 #endif /* L_arm_d2f || L_arm_truncdfsf2 */
 
+
+#if defined(L_aeabi_h2f_ieee) || defined(L_aeabi_h2f_alt)
+
+#ifdef L_aeabi_h2f_ieee
+  #define H2F_NAME aeabi_h2f
+  #define H2F_ALIAS gnu_h2f_ieee
+#else
+  #define H2F_NAME aeabi_h2f_alt
+  #define H2F_ALIAS gnu_h2f_alternative
+#endif
+
+// float __aeabi_h2f(short hf)
+// float __aeabi_h2f_alt(short hf)
+// Converts a half-precision float in $r0 to single-precision.
+// Rounding, overflow, and underflow conditions are impossible.
+// In IEEE mode, INF, ZERO, and NAN are returned unmodified.
+FUNC_START_SECTION H2F_NAME .text.sorted.libgcc.h2f
+FUNC_ALIAS H2F_ALIAS H2F_NAME
+CFI_START_FUNCTION
+
+// Set up registers for __fp_normalize2().
+push{ rT, lr }
+.cfi_remember_state
+.cfi_adjust_cfa_offset 8
+.cfi_rel_offset rT, 0
+.cfi_rel_offset lr, 4
+
+// Save the mantissa and exponent.
+lslsr2, r0, #17
+
+// Isolate the sign.
+lsrsr0, #15
+lslsr0, #31
+
+// Align the exponent at bit[24] for normalization.
+// If zero, return the original sign.
+// The exit must go through the common epilogue: { rT, lr } are already
+//  on the stack, so a bare conditional return would leave $sp unbalanced.
+lsrsr2, #3
+beq LLSYM(__h2f_return)
+
+// Split the exponent and mantissa into separate registers.
+// This is the most efficient way to convert subnormals in the
+//  half-precision form into normals in single-precision.
+// This does add a leading implicit '1' to INF and NAN,
+//  but that will be absorbed when the value is re-assembled.
+bl  SYM(__fp_normalize2) __PLT__
+
+   #ifdef L_aeabi_h2f_ieee
+// Set up the exponent bias.  For INF/NAN values, the bias is 223,
+//  where the last '1' accounts for the implicit '1' in the mantissa.
+addsr2, #(255 - 31 - 1)
+
+// Test for INF/NAN.
+cmp r2, #254
+
+  #ifdef __HAVE_FEATURE_IT
+do_it   ne
+  #else
+beq LLSYM(__h2f_assemble)
+  #endif
+
+// For normal values, the bias should have been 111.
+// However, this offset must be adjusted per the INF check above.
+ IT(sub,ne) r2, #((255 - 31 - 1) - (127 - 15 - 1))
+
+#else /* L_aeabi_h2f_alt */
+// Set up the exponent bias.  All values are normal.
+addsr2, #(127 - 15 - 1)
+#endif
+
+LLSYM(__h2f_assemble):
+// Combine exponent and sign.
+lslsr2, #23
+addsr0, r2
+
+// Combine mantissa.
+lsrsr3, #8
+add r0, r3
+
+LLSYM(__h2f_return):
+pop { rT, pc }
+.cfi_restore_state
+
+CFI_END_FUNCTION
+FUNC_END H2F_NAME
+FUNC_END H2F_ALIAS
+
+#endif /* L_aeabi_h2f_ieee || L_aeabi_h2f_alt */
+
+
+#if defined(L_aeabi_f2h_ieee) || defined(L_aeabi_f2h_alt)
+
+#ifdef L_aeabi_f2h_ieee
+  #define F2H_NAME aeabi_f2h
+  #define F2H_ALIAS gnu_f2h_ieee
+#else
+  #define F2H_NAME aeabi_f2h_alt
+  #define F2H_ALIAS gnu_f2h_alternative
+#endif
+
+// short __aeabi_f2h(float f)
+// short __aeabi_f2h_alt(float f)
+// Converts a single-precision float in $r0 to half-precision,
+//  rounding to nearest, ties to even.
+// Values out of range are forced to either ZERO or INF.
+// In IEEE mode, the upper 12 bits of a NAN will be preserved.
+FUNC_START_SECTION F2H_NAME .text.sorted.libgcc.f2h
+FUNC_ALIAS F2H_ALIAS F2H_NAME
+CFI_START_FUNCTION
+
+// Set up the sign.
+lsrsr2, r0, #31
+lslsr2, #15
+
+// Save the exponent and mantissa.
+// If ZERO, return the original sign.
+lslsr0, #1
+
+  #ifdef __HAVE_FEATURE_IT
+do_it   ne,t
+addne   r0, r2
+RETc(ne)
+  #else
+beq LLSYM(__f2h_return)
+  #endif
+
+// Isolate the exponent.
+lsrsr1, r0, #24
+
+  #ifdef L_aeabi_f2h_ieee
+// Check for NAN.
+cmp r1, #255
+beq LLSYM(__f2h_indefinite)
+
+// Che

[PATCH v6 33/34] Drop single-precision Thumb-1 soft-float functions

2021-12-27 Thread Daniel Engel
With the complete CM0 library integrated, regression testing showed new
failures with the message "compilation failed to produce executable":

gcc.dg/fixed-point/convert-float-1.c
gcc.dg/fixed-point/convert-float-3.c
gcc.dg/fixed-point/convert-sat.c

Investigating, this appears to be caused by the linker.  I can't find a
comprehensive linker specification to claim this is actually a bug, but it
certainly doesn't match my expectations.  Investigating, I found issues
with the link order of these symbols:

  * __aeabi_fmul()
  * __aeabi_f2d()
  * __aeabi_f2iz()

Specifically, I expect the linker to import the _first_ definition of any
symbol.  This is the basic behavior that allows the soft-float library to
supply missing symbols on architectures without optimized routines.

Comparing the v6-m multilib with the default, I see symbol exports for all
of the affected symbols:

gcc-obj/gcc/libgcc.a:

// assembly routines

_arm_mulsf3.o:
 W __aeabi_fmul
 W __mulsf3

_arm_addsubdf3.o:
0368 T __aeabi_f2d
0368 T __extendsfdf2

_arm_fixsfsi.o:
 T __aeabi_f2iz
 T __fixsfsi

mulsf3.o:


fixsfsi.o:


extendsfdf2.o:


gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a:

// assembly routines

_arm_mulsf3.o:
 T __aeabi_fmul
 U __fp_assemble
 U __fp_exception
 U __fp_infinity
 U __fp_zero
 T __mulsf3
 U __umulsidi3

_arm_fixsfsi.o:
 T __aeabi_f2iz
 T __fixsfsi
0002 T __internal_f2iz

_arm_f2d.o:
 T __aeabi_f2d
 T __extendsfdf2
 U __fp_normalize2

// soft-float library

mulsf3.o:
 T __aeabi_fmul

fixsfsi.o:
 T __aeabi_f2iz

extendsfdf2.o:
 T __aeabi_f2d

Given the order of the archive file, I expect the linker to import the affected
functions from the _arm_* archive elements.

For "convert-sat.c", all is well with -march=armv7-m.
...
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_muldf3.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_mulsf3.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_cmpsf2.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixsfsi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixunssfsi.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_addsubdf3.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_cmpdf2.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixdfsi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_arm_fixunsdfsi.o
OK> (/home/mirdan/gcc-obj/gcc/libgcc.a)_fixsfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixdfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixunssfdi.o
(/home/mirdan/gcc-obj/gcc/libgcc.a)_fixunsdfdi.o
...

However, with -march=armv6s-m, the linker imports these symbols from the soft-
float library.  (NOTE: The CM0 library only implements single-precision float
operations, so imports from muldf3.o, fixdfsi.o, etc are expected.)
...
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)mulsf3.o
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)fixsfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)muldf3.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)fixdfsi.o
??> (/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)extendsfdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_clzsi2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_fcmpge.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_fcmple.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixsfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunssfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunssfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_cmpdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunsdfsi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixdfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixunsdfdi.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)eqdf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)gedf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)ledf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)subdf3.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)floatunsidf.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_arm_cmpsf2.o
(/home/mirdan/gcc-obj/gcc/thumb/v6-m/nofp/libgcc.a)_fixsfsi.o
...

It seems that the order in which the linker resolves symbols matters.  In the
affected test cases, the linker begins searching for fixed-point function
symbols first: _subQQ.o, _cmpQQ.o, etc.  T

[PATCH v6 34/34] Add -mpure-code support to the CM0 functions.

2021-12-27 Thread Daniel Engel
gcc/libgcc/ChangeLog:
2021-01-16 Daniel Engel 

Makefile.in (MPURE_CODE): New macro defines __PURE_CODE__.
(gcc_compile): Appended MPURE_CODE.
lib1funcs.S (FUNC_START_SECTION): Set flags for __PURE_CODE__.
clz2.S (__clzsi2): Added -mpure-code compatible instructions.
ctz2.S (__ctzsi2): Same.
popcnt.S (__popcountsi2, __popcountdi2): Same.
---
 libgcc/Makefile.in|  5 -
 libgcc/config/arm/clz2.S  | 25 ++-
 libgcc/config/arm/ctz2.S  | 38 +--
 libgcc/config/arm/lib1funcs.S |  7 ++-
 libgcc/config/arm/popcnt.S| 33 +-
 5 files changed, 98 insertions(+), 10 deletions(-)

diff --git a/libgcc/Makefile.in b/libgcc/Makefile.in
index 32e329f7764..e6b2ce5c6d7 100644
--- a/libgcc/Makefile.in
+++ b/libgcc/Makefile.in
@@ -307,6 +307,9 @@ CRTSTUFF_CFLAGS = -O2 $(GCC_CFLAGS) $(INCLUDES) 
$(MULTILIB_CFLAGS) -g0 \
 # Extra flags to use when compiling crt{begin,end}.o.
 CRTSTUFF_T_CFLAGS =
 
+# Pass the -mpure-code flag into assembly for conditional compilation.
+MPURE_CODE = $(if $(findstring -mpure-code,$(CFLAGS)), -D__PURE_CODE__)
+
 MULTIDIR := $(shell $(CC) $(CFLAGS) -print-multi-directory)
 MULTIOSDIR := $(shell $(CC) $(CFLAGS) -print-multi-os-directory)
 
@@ -316,7 +319,7 @@ inst_slibdir = $(slibdir)$(MULTIOSSUBDIR)
 
 gcc_compile_bare = $(CC) $(INTERNAL_CFLAGS) $(CFLAGS-$(http://www.gnu.org/licenses/>.  */
 
 
+#if defined(L_popcountdi2) || defined(L_popcountsi2)
+
+.macro ldmask reg, temp, value
+#if defined(__PURE_CODE__) && (__PURE_CODE__)
+  #ifdef NOT_ISA_TARGET_32BIT
+movs\reg,   \value
+lsls\temp,  \reg,   #8
+orrs\reg,   \temp
+lsls\temp,  \reg,   #16
+orrs\reg,   \temp
+  #else
+// Assumption: __PURE_CODE__ only supports M-profile.
+movw\reg((\value) * 0x101)
+movt\reg((\value) * 0x101)
+  #endif
+#else
+ldr \reg,   =((\value) * 0x1010101)
+#endif
+.endm
+
+#endif
+
+
 #ifdef L_popcountdi2
 
 // int __popcountdi2(int)
@@ -49,7 +72,7 @@ FUNC_START_SECTION popcountdi2 .text.sorted.libgcc.popcountdi2
 
   #else /* !__OPTIMIZE_SIZE__ */
 // Load the one-bit alternating mask.
-ldr r3, =0x55555555
+ldmask  r3, r2, 0x55
 
 // Reduce the second word.
 lsrsr2, r1, #1
@@ -62,7 +85,7 @@ FUNC_START_SECTION popcountdi2 .text.sorted.libgcc.popcountdi2
 subsr0, r2
 
 // Load the two-bit alternating mask.
-ldr r3, =0x33333333
+ldmask  r3, r2, 0x33
 
 // Reduce the second word.
 lsrsr2, r1, #2
@@ -140,7 +163,7 @@ FUNC_ENTRY popcountsi2
   #else /* !__OPTIMIZE_SIZE__ */
 
 // Load the one-bit alternating mask.
-ldr r3, =0x55555555
+ldmask  r3, r2, 0x55
 
 // Reduce the word.
 lsrsr1, r0, #1
@@ -148,7 +171,7 @@ FUNC_ENTRY popcountsi2
 subsr0, r1
 
 // Load the two-bit alternating mask.
-ldr r3, =0x33333333
+ldmask  r3, r2, 0x33
 
 // Reduce the word.
 lsrsr1, r0, #2
@@ -158,7 +181,7 @@ FUNC_ENTRY popcountsi2
 addsr0, r1
 
 // Load the four-bit alternating mask.
-ldr r3, =0x0F0F0F0F
+ldmask  r3, r2, 0x0F
 
 // Reduce the word.
 lsrsr1, r0, #4
-- 
2.25.1



Re: [PATCH] Make integer output faster in libgfortran

2021-12-27 Thread FX via Gcc-patches
Follow-up patch committed, after my use of the one-argument variant of 
static_assert() broke bootstrap on Solaris (sorry Rainer!).
The one-arg form is new since C23, while Solaris <assert.h> only supports the 
two-arg form (C11).

I have confirmed that other target libraries use the two-arg form, and 
bootstrapped the attached patch on x86_64-pc-linux-gnu.

FX



static_assert.diff
Description: Binary data


[PATCH] PR fortran/102332 - ICE in select_type_set_tmp, at fortran/match.c:6366

2021-12-27 Thread Harald Anlauf via Gcc-patches
Dear all,

there are a couple of NULL pointer dereferences leading to improper
error recovery when trying to handle Gerhard's testcases involving
SELECT TYPE and invalid uses of CLASS variables.

The fixes look pretty obvious to me, but I'm submitting here to
check if there is more that should be done here.

(I was surprised to see that there are several different places
involved by rather simple variations in the basic test case.)

Regtested on x86_64-pc-linux-gnu.  OK for mainline?

Thanks,
Harald

From 4cda248202ea741bea1dd1ca4531aa15f423801b Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Mon, 27 Dec 2021 23:06:18 +0100
Subject: [PATCH] Fortran: avoid several NULL pointer dereferences during error
 recovery

gcc/fortran/ChangeLog:

	PR fortran/102332
	* expr.c (gfc_get_variable_expr): Avoid NULL pointer dereferences
	during handling of errors with invalid uses of CLASS variables.
	* match.c (select_type_set_tmp): Likewise.
	* primary.c (gfc_match_varspec): Likewise.
	* resolve.c (resolve_variable): Likewise.
	(resolve_select_type): Likewise.

gcc/testsuite/ChangeLog:

	PR fortran/102332
	* gfortran.dg/pr102332.f90: New test.
---
 gcc/fortran/expr.c |  3 +-
 gcc/fortran/match.c|  3 +-
 gcc/fortran/primary.c  |  1 +
 gcc/fortran/resolve.c  |  9 +++-
 gcc/testsuite/gfortran.dg/pr102332.f90 | 69 ++
 5 files changed, 81 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr102332.f90

diff --git a/gcc/fortran/expr.c b/gcc/fortran/expr.c
index b874607db1d..c1258e0eb06 100644
--- a/gcc/fortran/expr.c
+++ b/gcc/fortran/expr.c
@@ -5166,7 +5166,8 @@ gfc_get_variable_expr (gfc_symtree *var)

   if (var->n.sym->attr.flavor != FL_PROCEDURE
   && ((var->n.sym->as != NULL && var->n.sym->ts.type != BT_CLASS)
-	   || (var->n.sym->ts.type == BT_CLASS && CLASS_DATA (var->n.sym)
+	   || (var->n.sym->ts.type == BT_CLASS && var->n.sym->ts.u.derived
+	   && CLASS_DATA (var->n.sym)
 	   && CLASS_DATA (var->n.sym)->as)))
 {
   e->rank = var->n.sym->ts.type == BT_CLASS
diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c
index 617fb35c9cd..41faa53b97a 100644
--- a/gcc/fortran/match.c
+++ b/gcc/fortran/match.c
@@ -6363,7 +6363,8 @@ select_type_set_tmp (gfc_typespec *ts)
   sym = tmp->n.sym;
   gfc_add_type (sym, ts, NULL);

-  if (selector->ts.type == BT_CLASS && selector->attr.class_ok)
+  if (selector->ts.type == BT_CLASS && selector->attr.class_ok
+	  && selector->ts.u.derived && CLASS_DATA (selector))
 	{
 	  sym->attr.pointer
 		= CLASS_DATA (selector)->attr.class_pointer;
diff --git a/gcc/fortran/primary.c b/gcc/fortran/primary.c
index d873264a08e..1f63028d179 100644
--- a/gcc/fortran/primary.c
+++ b/gcc/fortran/primary.c
@@ -2151,6 +2151,7 @@ gfc_match_varspec (gfc_expr *primary, int equiv_flag, bool sub_flag,
 	  && !(gfc_matching_procptr_assignment
 	   && sym->attr.flavor == FL_PROCEDURE))
   || (sym->ts.type == BT_CLASS && sym->attr.class_ok
+	  && sym->ts.u.derived && CLASS_DATA (sym)
 	  && (CLASS_DATA (sym)->attr.dimension
 	  || CLASS_DATA (sym)->attr.codimension)))
 {
diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c
index bff1b35446f..591e8186007 100644
--- a/gcc/fortran/resolve.c
+++ b/gcc/fortran/resolve.c
@@ -5736,6 +5736,8 @@ resolve_variable (gfc_expr *e)
  can't be translated that way.  */
   if (sym->assoc && e->rank == 0 && e->ref && sym->ts.type == BT_CLASS
   && sym->assoc->target && sym->assoc->target->ts.type == BT_CLASS
+  && sym->assoc->target->ts.u.derived
+  && CLASS_DATA (sym->assoc->target)
   && CLASS_DATA (sym->assoc->target)->as)
 {
   gfc_ref *ref = e->ref;
@@ -5799,7 +5801,8 @@ resolve_variable (gfc_expr *e)
   /* Like above, but for class types, where the checking whether an array
  ref is present is more complicated.  Furthermore make sure not to add
  the full array ref to _vptr or _len refs.  */
-  if (sym->assoc && sym->ts.type == BT_CLASS
+  if (sym->assoc && sym->ts.type == BT_CLASS && sym->ts.u.derived
+  && CLASS_DATA (sym)
   && CLASS_DATA (sym)->attr.dimension
   && (e->ts.type != BT_DERIVED || !e->ts.u.derived->attr.vtype))
 {
@@ -9432,6 +9435,7 @@ resolve_select_type (gfc_code *code, gfc_namespace *old_ns)

   /* Check F03:C815.  */
   if ((c->ts.type == BT_DERIVED || c->ts.type == BT_CLASS)
+	  && selector_type
 	  && !selector_type->attr.unlimited_polymorphic
 	  && !gfc_type_is_extensible (c->ts.u.derived))
 	{
@@ -9442,7 +9446,8 @@ resolve_select_type (gfc_code *code, gfc_namespace *old_ns)
 	}

   /* Check F03:C816.  */
-  if (c->ts.type != BT_UNKNOWN && !selector_type->attr.unlimited_polymorphic
+  if (c->ts.type != BT_UNKNOWN
+	  && selector_type && !selector_type->attr.unlimited_polymorphic
 	  && ((c->ts.type != BT_DERIVED && c->ts.type != BT_CLASS)
 	  || !gfc_type_is_extension_of (selector_type, c->ts.u.deri

Re: [PATCH] [i386]Fix tdpbf16ps testcase

2021-12-27 Thread Hongtao Liu via Gcc-patches
On Fri, Dec 24, 2021 at 4:51 PM Haochen Jiang via Gcc-patches
 wrote:
>
> Hi all,
>
> This patch fixes the testcase amxbf16-dpbf16ps-2.c. Previously the type
> conversion had some issues.
>
> Ok for trunk?
Ok.
>
> BRs,
> Haochen
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/amx-check.h (check_float_tile_register):
> New check function for float to prevent precision loss.
> * gcc.target/i386/amxbf16-dpbf16ps-2.c: Correct the type convert
> and byte offset. Use the new check function.
> ---
>  gcc/testsuite/gcc.target/i386/amx-check.h | 23 --
>  .../gcc.target/i386/amxbf16-dpbf16ps-2.c  | 30 ---
>  2 files changed, 41 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h 
> b/gcc/testsuite/gcc.target/i386/amx-check.h
> index 03616ff0b8e..434b0e59703 100644
> --- a/gcc/testsuite/gcc.target/i386/amx-check.h
> +++ b/gcc/testsuite/gcc.target/i386/amx-check.h
> @@ -139,8 +139,27 @@ int check_tile_register (__tile* ref, __tile* target)
>
>for (i = 0; i < rows; i++)
>  for (j = 0; j < colsb; j++)
> -   if (ref->buf[i * colsb + j] != target->buf[i * colsb + j])
> -   return 0;
> +  if (ref->buf[i * colsb + j] != target->buf[i * colsb + j])
> +   return 0;
> +
> +  return 1;
> +}
> +
> +/* Compare float tile register value with __tile variable */
> +int check_float_tile_register (__tile* ref, __tile* target)
> +{
> +  /* Tile register should be stored from tmm to
> + memory and compare with emulation results. */
> +  int rows = target->rows;
> +  int colsb = target->colsb / 4;
> +  int i, j;
> +  uint32_t *ref_buf = (uint32_t *) ref->buf;
> +  uint32_t *target_buf = (uint32_t *) target->buf;
> +
> +  for (i = 0; i < rows; i++)
> +for (j = 0; j < colsb; j++)
> +  if (abs(ref_buf[i * colsb + j] - target_buf[i * colsb + j]) > 1)
> +   return 0;
>
>return 1;
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c 
> b/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
> index f7002ca5ea5..b00bc13ec78 100644
> --- a/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
> +++ b/gcc/testsuite/gcc.target/i386/amxbf16-dpbf16ps-2.c
> @@ -12,15 +12,25 @@ void test_amx_bf16_dpbf16ps ();
>  /* Transformation functions between bf16/float */
>  static uint16_t make_bf16 (float f)
>  {
> -  uint32_t u = (uint32_t)f;
> -  u = (u >> 16) & 0xffff;
> -  return (uint16_t)u;
> +  union
> +  {
> +float f;
> +uint32_t u;
> +  } fu;
> +  fu.f = f;
> +  fu.u = (fu.u >> 16) & 0xffff;
> +  return (uint16_t) fu.u;
>  }
>
>  static float make_f32 (uint16_t bf)
>  {
> -  uint32_t u = (uint32_t)(bf << 16);
> -  return (float)u;
> +  union
> +  {
> +float f;
> +uint32_t u;
> +  } fu;
> +  fu.u = (uint32_t) bf << 16;
> +  return fu.f;
>  }
>
>  /* Init tile buffer with bf16 pairs */
> @@ -54,10 +64,10 @@ void calc_matrix_dpbf16ps (__tile *dst, __tile *src1, 
> __tile *src2)
> for (t = 0; t < 2; t+=2)
>   {
> dst_buf[i * N + k] +=
> - (make_f32(src1_buf[i * 4 * N + 4 * j + t]) *
> - make_f32(src2_buf[j * 4 * K + 4 * k + t])) +
> - (make_f32(src1_buf[i * 4 * N + 4 * j + t + 1]) *
> - make_f32(src2_buf[j * 4 * K + 4 * k + t + 1]));
> + (make_f32(src1_buf[i * 2 * N + 2 * j + t]) *
> + make_f32(src2_buf[j * 2 * K + 2 * k + t])) +
> + (make_f32(src1_buf[i * 2 * N + 2 * j + t + 1]) *
> + make_f32(src2_buf[j * 2 * K + 2 * k + t + 1]));
>   }
>
>  }
> @@ -80,6 +90,6 @@ void test_amx_bf16_dpbf16ps ()
>_tile_dpbf16ps (1, 2, 3);
>_tile_stored (1, dst_ref.buf, _STRIDE);
>
> -  if (!check_tile_register (&dst_ref, &dst))
> +  if (!check_float_tile_register (&dst_ref, &dst))
>  abort();
>  }
> --
> 2.18.1
>


-- 
BR,
Hongtao