[PATCH, AArch64 00/11] LSE atomics out-of-line

2018-09-25 Thread rth7680
From: Richard Henderson 

ARMv8.1 adds a (mandatory) Atomics extension, also known as the
Large System Extension.  Deploying this extension at the OS level
has proved challenging.

The following is the result of a conversation between myself,
Alex Graf of SuSE, and Ramana Radhakrishnan of ARM, at last week's
Linaro Connect in Vancouver.

The current state of the world is that one can distribute two
different copies of a given shared library, placing the LSE-enabled
version in /lib64/atomics/; ld.so will select it over the /lib64/
version when HWCAP_ATOMICS is present.

Alex's main concern with this is that (1) he doesn't want to
distribute two copies of every library, or determine what a
reasonable subset would be, and (2) this solution does not work
for executables, e.g. mysql.

Ramana's main concern was to avoid the overhead of an indirect jump,
especially its effect on branch prediction (or the lack of it) in
the smallest implementations.

Therefore, I've created small out-of-line helpers that are directly
linked into every library or executable that requires them.  There
will be two direct branches, both of which will be well-predicted.
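
The dispatch is driven by a hidden, per-DSO flag that is initialized
at load time.  Abbreviated from patch 8's lse.c (a sketch, not the
exact code):

  #include <sys/auxv.h>

  /* One byte; hidden, so references bind locally within each DSO.  */
  _Bool __aa64_have_atomics __attribute__ ((visibility ("hidden")));

  static void __attribute__ ((constructor))
  init_have_atomics (void)
  {
    __aa64_have_atomics = (getauxval (AT_HWCAP) & HWCAP_ATOMICS) != 0;
  }

The caller makes a direct call to the hidden helper (the first
branch), and the helper tests this flag with a conditional direct
branch (the second); the fall-through path is the LL/SC loop, which
helps the branch predictors of the smallest cpus.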

In the process, I discovered a number of places within the code
where the existing implementation could be improved.  In particular:

 - the LSE patterns didn't use predicates or constraints that
   match the actual instructions, requiring unnecessary splitting.

 - the non-LSE compare-and-swap can use an extending compare to
   avoid requiring the input to have been previously extended.

 - TImode compare-and-swap was missing entirely.  This brings
   aarch64 to parity with x86_64 wrt __sync_val_compare_and_swap.

There is a final patch that enables the new option by default.
I am not necessarily expecting this to be merged upstream; rather,
the operating system distributor should decide what the default
should be.  It might be that this should be a configure option, so
as to make that OS choice easier, but I've only just now thought
of that.  ;-)

I'm going to have to rely on Alex and/or Ramana to perform
testing on a system that supports LSE.


r~


Richard Henderson (11):
  aarch64: Simplify LSE cas generation
  aarch64: Improve cas generation
  aarch64: Improve swp generation
  aarch64: Improve atomic-op lse generation
  aarch64: Emit LSE st instructions
  Add visibility to libfunc constructors
  Link static libgcc after shared libgcc for -shared-libgcc
  aarch64: Add out-of-line functions for LSE atomics
  aarch64: Implement -matomic-ool
  aarch64: Implement TImode compare-and-swap
  Enable -matomic-ool by default

 gcc/config/aarch64/aarch64-protos.h   |  20 +-
 gcc/optabs-libfuncs.h |   2 +
 gcc/common/config/aarch64/aarch64-common.c|   6 +-
 gcc/config/aarch64/aarch64.c  | 480 ++
 gcc/gcc.c |   9 +-
 gcc/optabs-libfuncs.c |  26 +-
 .../atomic-comp-swap-release-acquire.c|   2 +-
 .../gcc.target/aarch64/atomic-inst-ldadd.c|  18 +-
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c|   2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c|   2 +-
 .../gcc.target/aarch64/atomic-op-char.c   |   2 +-
 .../gcc.target/aarch64/atomic-op-consume.c|   2 +-
 .../gcc.target/aarch64/atomic-op-imm.c|   2 +-
 .../gcc.target/aarch64/atomic-op-int.c|   2 +-
 .../gcc.target/aarch64/atomic-op-long.c   |   2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c|   2 +-
 .../gcc.target/aarch64/atomic-op-release.c|   2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c|   2 +-
 .../gcc.target/aarch64/atomic-op-short.c  |   2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
 .../atomic_cmp_exchange_zero_strong_1.c   |   2 +-
 .../gcc.target/aarch64/sync-comp-swap.c   |   2 +-
 .../gcc.target/aarch64/sync-op-acquire.c  |   2 +-
 .../gcc.target/aarch64/sync-op-full.c |   2 +-
 libgcc/config/aarch64/lse.c   | 280 
 gcc/config/aarch64/aarch64.opt|   4 +
 gcc/config/aarch64/atomics.md | 608 ++
 gcc/config/aarch64/iterators.md   |   8 +-
 gcc/config/aarch64/predicates.md  |  12 +
 gcc/doc/invoke.texi   |  14 +-
 libgcc/config.host|   4 +
 libgcc/config/aarch64/t-lse   |  48 ++
 33 files changed, 1050 insertions(+), 577 deletions(-)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

-- 
2.17.1



[PATCH, AArch64 03/11] aarch64: Improve swp generation

2018-09-25 Thread rth7680
From: Richard Henderson 

Allow zero as an input; fix constraints; avoid unnecessary split.
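
As an illustration (approximate output, LSE target assumed), allowing
zero as an input means a swap of constant 0 can use WZR/XZR directly
instead of first loading 0 into a register:

  int
  swap_in_zero (int *p)
  {
    /* Expected to assemble to a single "swpal wzr, w<n>, [x0]".  */
    return __atomic_exchange_n (p, 0, __ATOMIC_SEQ_CST);
  }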

* config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
(aarch64_gen_atomic_ldop): Don't call it.
* config/aarch64/atomics.md (atomic_exchange):
Use aarch64_reg_or_zero.
(aarch64_atomic_exchange): Likewise.
(aarch64_atomic_exchange_lse): Remove split; remove & from
operand 0; use aarch64_reg_or_zero for input; merge ...
(@aarch64_atomic_swp): ... this and remove.
---
 gcc/config/aarch64/aarch64.c  | 13 --
 gcc/config/aarch64/atomics.md | 49 +++
 2 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c0f2d296342..5e9a85be44c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14401,15 +14401,6 @@ aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
   emit_insn (gen (dst, s2, shift_rtx, s1));
 }
 
-/* Emit an atomic swap.  */
-
-static void
-aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
- rtx mem, rtx model)
-{
-  emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
-}
-
 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
location to store the data read from memory.  OUT_RESULT is the location to
store the result of the operation.  MEM is the memory location to read and
@@ -14450,10 +14441,6 @@ aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
  a SET then emit a swap instruction and finish.  */
   switch (code)
 {
-case SET:
-  aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
-  return;
-
 case MINUS:
   /* Negate the value and treat it as a PLUS.  */
   {
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index c00a18675b4..63384f9f99c 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -136,7 +136,7 @@
 (define_expand "atomic_exchange"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
-  (match_operand:ALLI 2 "register_operand" "")
+  (match_operand:ALLI 2 "aarch64_reg_or_zero" "")
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
@@ -156,10 +156,10 @@
 
 (define_insn_and_split "aarch64_atomic_exchange"
   [(set (match_operand:ALLI 0 "register_operand" "=&r");; 
output
-(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
(set (match_dup 1)
 (unspec_volatile:ALLI
-  [(match_operand:ALLI 2 "register_operand" "r")   ;; input
+  [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")   ;; input
(match_operand:SI 3 "const_int_operand" "")];; model
   UNSPECV_ATOMIC_EXCHG))
(clobber (reg:CC CC_REGNUM))
@@ -175,22 +175,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_exchange_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
+(define_insn "aarch64_atomic_exchange_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
 (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
(set (match_dup 1)
 (unspec_volatile:ALLI
-  [(match_operand:ALLI 2 "register_operand" "r")
+  [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
(match_operand:SI 3 "const_int_operand" "")]
   UNSPECV_ATOMIC_EXCHG))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-aarch64_gen_atomic_ldop (SET, operands[0], NULL, operands[1],
-operands[2], operands[3]);
-DONE;
+enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+if (is_mm_relaxed (model))
+  return "swp\t%2, %0, %1";
+else if (is_mm_acquire (model) || is_mm_consume (model))
+  return "swpa\t%2, %0, %1";
+else if (is_mm_release (model))
+  return "swpl\t%2, %0, %1";
+else
+  return "swpal\t%2, %0, %1";
   }
 )
 
@@ -585,28 +588,6 @@
 
 ;; ARMv8.1-A LSE instructions.
 
-;; Atomic swap with memory.
-(define_insn "@aarch64_atomic_swp"
- [(set (match_operand:ALLI 0 "register_operand" "+&r")
-   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
-  (set (match_dup 1)
-   (unspec_volatile:ALLI
-[(match_operand:ALLI 2 "register_operand" "r")
- (match_operand:SI 3 "const_int_operand" "")]
-UNSPECV_ATOMIC_SWP))]
-  "TARGET_LSE && reload_completed"
-  {
-enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-if (is_mm_relaxed (model))
-  return "swp\t%2, %0, %1";
-else if (is_mm_acquire (model) || is_mm_consume (model))
-  return "swpa\t%2, %0, %1";
-else if (is_mm_release (model))
-  return "swpl\t%2, %0, %1";
-else
-  return "swpal\t%2, %0, %1";
-  })
-
 ;; Atomic load-op: Load data, operate, store result, keep data.
 
 

[PATCH, AArch64 05/11] aarch64: Emit LSE st instructions

2018-09-25 Thread rth7680
From: Richard Henderson 

When the result of an operation is not used, we can ignore the
result by storing to XZR.  For two of the memory models (relaxed and
release), using XZR with an LD<op> instruction has a preferred
assembler alias, ST<op>.
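
As an illustration (approximate output, LSE target assumed), an
atomic add whose result is discarded can now use the ST form for the
relaxed and release models:

  void
  add_no_result (int *p, int v)
  {
    /* Expected to assemble to "stadd w1, [x0]", the alias of
       "ldadd w1, wzr, [x0]".  */
    __atomic_fetch_add (p, v, __ATOMIC_RELAXED);
  }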

* config/aarch64/atomics.md (aarch64_atomic__lse):
Use ST for relaxed and release models; load to XZR otherwise;
remove the now unnecessary scratch register.

* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
---
 .../gcc.target/aarch64/atomic-inst-ldadd.c| 18 ---
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  | 54 ---
 gcc/config/aarch64/atomics.md | 15 +++---
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
index 4b2282c6861..db2206186b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
@@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
 TEST (sub_load, SUB_LOAD)
 TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
 
-/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
+/* { dg-final { scan-assembler-times "staddb\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
+/* { dg-final { scan-assembler-times "staddh\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
+/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
 /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
-/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
+/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
 /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
+/* { dg-final { scan-assembler-times "stadd\t" 16} } */
+/* { dg-final { scan-assembler-times "staddl\t" 16} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
index 4879d52b9b4..b8a53e0a676 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
@@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
 
 /* Load-OR.  */
 
-/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
+/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
+/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
+/* { dg-final { scan-assembler-times "stseth\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldset\t" 16} } */
+/* { dg-final { scan-assembler-times "ldset\t" 8} } */
 /* { dg-final { scan-assembler-times "ldseta\t" 32} } */
-/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
+/* { dg-final { scan-assembler-times "stset\t" 8} } */
+/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
 
 /* Load-AND.  */
 
-/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
+/

[PATCH, AArch64 02/11] aarch64: Improve cas generation

2018-09-25 Thread rth7680
From: Richard Henderson 

Do not zero-extend the input to the cas for subword operations;
instead, use the appropriate zero-extending compare insns.
Correct the predicates and constraints for the immediate expected
operand.
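
As an illustration (approximate, LSE target assumed), for a subword
compare-and-swap the loaded result can now be compared against the
expected value with an extending compare (e.g. "cmp ..., uxtb"), so
the expected input no longer has to be zero-extended up front:

  _Bool
  cas_char (unsigned char *p, unsigned char expect, unsigned char desired)
  {
    return __atomic_compare_exchange_n (p, &expect, desired, 0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  }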

* config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): New.
(aarch64_split_compare_and_swap): Use it.
(aarch64_expand_compare_and_swap): Likewise.  Remove convert_modes;
test oldval against the proper predicate.
* config/aarch64/atomics.md (@atomic_compare_and_swap):
Use nonmemory_operand for expected.
(cas_short_expected_pred): New.
(@aarch64_compare_and_swap): Use it; use "rn" not "rI" to match.
(@aarch64_compare_and_swap): Use "rn" not "rI" for expected.
* config/aarch64/predicates.md (aarch64_plushi_immediate): New.
(aarch64_plushi_operand): New.
---
 gcc/config/aarch64/aarch64.c | 85 
 gcc/config/aarch64/atomics.md| 21 
 gcc/config/aarch64/predicates.md | 12 +
 3 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a0ba358c2f1..c0f2d296342 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1613,6 +1613,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
   return cc_reg;
 }
 
+/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
+
+static rtx
+aarch64_gen_compare_reg_maybe_ze(RTX_CODE code, rtx x, rtx y,
+ machine_mode y_mode)
+{
+  if (y_mode == E_QImode || y_mode == E_HImode)
+{
+  if (CONST_INT_P (y))
+   y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+  else
+   {
+ rtx t, cc_reg;
+ machine_mode cc_mode;
+
+ t = gen_rtx_ZERO_EXTEND (SImode, y);
+ t = gen_rtx_COMPARE (CC_SWPmode, t, x);
+ cc_mode = CC_SWPmode;
+ cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ emit_set_insn (cc_reg, t);
+ return cc_reg;
+   }
+}
+
+  return aarch64_gen_compare_reg (code, x, y);
+}
+
 /* Build the SYMBOL_REF for __tls_get_addr.  */
 
 static GTY(()) rtx tls_get_addr_libfunc;
@@ -14138,8 +14165,8 @@ aarch64_emit_unlikely_jump (rtx insn)
 void
 aarch64_expand_compare_and_swap (rtx operands[])
 {
-  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
-  machine_mode mode, cmp_mode;
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
+  machine_mode mode, r_mode;
 
   bval = operands[0];
   rval = operands[1];
@@ -14150,56 +14177,50 @@ aarch64_expand_compare_and_swap (rtx operands[])
   mod_s = operands[6];
   mod_f = operands[7];
   mode = GET_MODE (mem);
-  cmp_mode = mode;
 
   /* Normally the succ memory model must be stronger than fail, but in the
  unlikely event of fail being ACQUIRE and succ being RELEASE we need to
  promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
-
   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
   && is_mm_release (memmodel_from_int (INTVAL (mod_s
 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
 
-  switch (mode)
+  r_mode = mode;
+  if (mode == QImode || mode == HImode)
 {
-case E_QImode:
-case E_HImode:
-  /* For short modes, we're going to perform the comparison in SImode,
-so do the zero-extension now.  */
-  cmp_mode = SImode;
-  rval = gen_reg_rtx (SImode);
-  oldval = convert_modes (SImode, mode, oldval, true);
-  /* Fall through.  */
-
-case E_SImode:
-case E_DImode:
-  /* Force the value into a register if needed.  */
-  if (TARGET_LSE || !aarch64_plus_operand (oldval, mode))
-   oldval = force_reg (cmp_mode, oldval);
-  break;
-
-default:
-  gcc_unreachable ();
+  r_mode = SImode;
+  rval = gen_reg_rtx (r_mode);
 }
 
   if (TARGET_LSE)
 {
+  /* Oldval always requires a register.  We also must not clobber
+ oldval when writing to rval, so that we can compare afterward.  */
+  oldval = force_reg (mode, oldval);
   if (reg_overlap_mentioned_p (rval, oldval))
-rval = gen_reg_rtx (cmp_mode);
+rval = gen_reg_rtx (r_mode);
+
   emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
   newval, mod_s));
-  aarch64_gen_compare_reg (EQ, rval, oldval);
+  cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
 }
   else
-emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
-is_weak, mod_s, mod_f));
+{
+  /* The oldval predicate varies by mode.  Test it and force to reg.  */
+  insn_code code = code_for_aarch64_compare_and_swap (mode);
+  if (!insn_data[code].operand[2].predicate (oldval, mode))
+   oldval = force_reg (mode, oldval);
 
-  if (mode == QImode || mode == HImode)
+  emit_insn (GEN_FCN (code) (rval, m

[PATCH, AArch64 01/11] aarch64: Simplify LSE cas generation

2018-09-25 Thread rth7680
From: Richard Henderson 

The cas insn is a single insn, and if expanded properly need not
be split after reload.  Use the proper inputs for the insn.
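
As an illustration (LSE target assumed), a strong compare-exchange
now expands directly to the cas instruction, and because the
comparison against the expected value is emitted during initial
expansion it can be deleted when the boolean result is not needed:

  void
  cas_result_unused (long *p, long expect, long desired)
  {
    /* Expected to need only a "casal x1, x2, [x0]"; the trailing
       compare is dead and can be removed.  */
    __atomic_compare_exchange_n (p, &expect, desired, 0,
                                 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  }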

* config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
Force value into a register for TARGET_LSE; avoid register
overlap between rval and oldval, in case the compare is needed;
emit the compare during initial expansion so that it may be
deleted if unused.
(aarch64_gen_atomic_cas): Remove.
* config/aarch64/atomics.md (@aarch64_compare_and_swap_lse):
Remove & from operand 0; use matching constraint for operand 2;
remove is_weak and mod_f operands as unused.  Drop the split
and merge with...
(@aarch64_atomic_cas): ... this pattern's output; remove.
(@aarch64_compare_and_swap_lse): Similarly.
(@aarch64_atomic_cas): Similarly.
---
 gcc/config/aarch64/aarch64-protos.h |   1 -
 gcc/config/aarch64/aarch64.c|  41 +++---
 gcc/config/aarch64/atomics.md   | 117 
 3 files changed, 41 insertions(+), 118 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0..3d045cf43be 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -562,7 +562,6 @@ rtx aarch64_load_tp (rtx);
 
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
-void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_atomic_ldop_supported_p (enum rtx_code);
 void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12f7dfe9a75..a0ba358c2f1 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14174,7 +14174,7 @@ aarch64_expand_compare_and_swap (rtx operands[])
 case E_SImode:
 case E_DImode:
   /* Force the value into a register if needed.  */
-  if (!aarch64_plus_operand (oldval, mode))
+  if (TARGET_LSE || !aarch64_plus_operand (oldval, mode))
oldval = force_reg (cmp_mode, oldval);
   break;
 
@@ -14183,16 +14183,20 @@ aarch64_expand_compare_and_swap (rtx operands[])
 }
 
   if (TARGET_LSE)
-emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
-newval, is_weak, mod_s,
-mod_f));
+{
+  if (reg_overlap_mentioned_p (rval, oldval))
+rval = gen_reg_rtx (cmp_mode);
+  emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
+  newval, mod_s));
+  aarch64_gen_compare_reg (EQ, rval, oldval);
+}
   else
 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
 is_weak, mod_s, mod_f));
 
-
   if (mode == QImode || mode == HImode)
-emit_move_insn (operands[1], gen_lowpart (mode, rval));
+rval = gen_lowpart (mode, rval);
+  emit_move_insn (operands[1], rval);
 
   x = gen_rtx_REG (CCmode, CC_REGNUM);
   x = gen_rtx_EQ (SImode, x, const0_rtx);
@@ -14242,31 +14246,6 @@ aarch64_emit_post_barrier (enum memmodel model)
 }
 }
 
-/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
-   for the data in memory.  EXPECTED is the value expected to be in memory.
-   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
-   is the memory ordering to use.  */
-
-void
-aarch64_gen_atomic_cas (rtx rval, rtx mem,
-   rtx expected, rtx desired,
-   rtx model)
-{
-  machine_mode mode;
-
-  mode = GET_MODE (mem);
-
-  /* Move the expected value into the CAS destination register.  */
-  emit_insn (gen_rtx_SET (rval, expected));
-
-  /* Emit the CAS.  */
-  emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
-
-  /* Compare the expected value with the value loaded by the CAS, to establish
- whether the swap was made.  */
-  aarch64_gen_compare_reg (EQ, rval, expected);
-}
-
 /* Split a compare and swap pattern.  */
 
 void
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bba8e9e9c8e..9f00dd3c68e 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -85,56 +85,50 @@
   }
 )
 
-(define_insn_and_split "@aarch64_compare_and_swap_lse"
-  [(set (reg:CC CC_REGNUM) ;; bool out
-(unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:SI 0 "register_operand" "=&r")  ;; val out
+(define_insn "@aarch64_compare_and_swap_lse"
+  [(set (match_operand:SI 0 "register_operand" "=r")   ;; val out
 (zero_extend:SI
-  (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
+ (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memo

[PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc

2018-09-25 Thread rth7680
From: Richard Henderson 

We are about to introduce symbols to libgcc.a that will
not be present in libgcc_s.so.  Most symbols will be
resolved from the shared library first, and only the new
symbols will be pulled from the static library.
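
Illustratively, with -shared-libgcc the driver now passes, in effect,
"-lgcc_s -lgcc" rather than "-lgcc_s" alone: ordinary symbols still
resolve from libgcc_s.so, while the new hidden __aa64_* helpers,
which exist only in libgcc.a, are pulled from the static archive.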

* gcc.c (init_gcc_specs): Include static_name after shared_name.
---
 gcc/gcc.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gcc/gcc.c b/gcc/gcc.c
index 264204d7b37..4a7ca691122 100644
--- a/gcc/gcc.c
+++ b/gcc/gcc.c
@@ -1676,9 +1676,8 @@ init_gcc_specs (struct obstack *obstack, const char *shared_name,
static_name, " " LD_AS_NEEDED_OPTION " ",
shared_name, " " LD_NO_AS_NEEDED_OPTION
"}"
-   "%{shared-libgcc:",
-   shared_name, "%{!shared: ", static_name, "}"
-   "}}"
+   "%{shared-libgcc:", shared_name, " ", static_name, "}"
+   "}"
 #else
   buf = concat ("%{static|static-libgcc:", static_name, " ", eh_name, "}"
"%{!static:%{!static-libgcc:"
@@ -1688,11 +1687,11 @@ init_gcc_specs (struct obstack *obstack, const char *shared_name,
"}"
 #ifdef LINK_EH_SPEC
"%{shared:"
-   "%{shared-libgcc:", shared_name, "}"
+   "%{shared-libgcc:", shared_name, " ", static_name, "}"
"%{!shared-libgcc:", static_name, "}"
"}"
 #else
-   "%{shared:", shared_name, "}"
+   "%{shared:", shared_name, " ", static_name, "}"
 #endif
 #endif
"}}", NULL);
-- 
2.17.1



[PATCH, AArch64 06/11] Add visibility to libfunc constructors

2018-09-25 Thread rth7680
From: Richard Henderson 

* optabs-libfuncs.c (build_libfunc_function_visibility):
New, split out from...
(build_libfunc_function): ... here.
(init_one_libfunc_visibility): New, split out from ...
(init_one_libfunc): ... here.
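
Patch 9 then uses the new entry point roughly like this (the helper
name shown is just one of the generated __aa64_* names):

  /* Create (or reuse) the SYMBOL_REF for an out-of-line helper;
     hidden visibility keeps the eventual call a direct branch.  */
  rtx func = init_one_libfunc_visibility ("__aa64_cas4_acq",
                                          VISIBILITY_HIDDEN);

The existing init_one_libfunc and build_libfunc_function become thin
wrappers that pass VISIBILITY_DEFAULT, so no current caller changes
behavior.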
---
 gcc/optabs-libfuncs.h |  2 ++
 gcc/optabs-libfuncs.c | 26 --
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
index 0669ea1fdd7..cf39da36887 100644
--- a/gcc/optabs-libfuncs.h
+++ b/gcc/optabs-libfuncs.h
@@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
 void gen_satfractuns_conv_libfunc (convert_optab, const char *,
   machine_mode, machine_mode);
 
+tree build_libfunc_function_visibility (const char *, symbol_visibility);
 tree build_libfunc_function (const char *);
+rtx init_one_libfunc_visibility (const char *, symbol_visibility);
 rtx init_one_libfunc (const char *);
 rtx set_user_assembler_libfunc (const char *, const char *);
 
diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
index bd0df8baa37..73a28e9ca7a 100644
--- a/gcc/optabs-libfuncs.c
+++ b/gcc/optabs-libfuncs.c
@@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash
 /* A table of previously-created libfuncs, hashed by name.  */
 static GTY (()) hash_table *libfunc_decls;
 
-/* Build a decl for a libfunc named NAME.  */
+/* Build a decl for a libfunc named NAME with visibility VIS.  */
 
 tree
-build_libfunc_function (const char *name)
+build_libfunc_function_visibility (const char *name, symbol_visibility vis)
 {
   /* ??? We don't have any type information; pretend this is "int foo ()".  */
   tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
@@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
   DECL_EXTERNAL (decl) = 1;
   TREE_PUBLIC (decl) = 1;
   DECL_ARTIFICIAL (decl) = 1;
-  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
+  DECL_VISIBILITY (decl) = vis;
   DECL_VISIBILITY_SPECIFIED (decl) = 1;
   gcc_assert (DECL_ASSEMBLER_NAME (decl));
 
@@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
   return decl;
 }
 
+/* Build a decl for a libfunc named NAME.  */
+
+tree
+build_libfunc_function (const char *name)
+{
+  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Return a libfunc for NAME, creating one if we don't already have one.
-   The returned rtx is a SYMBOL_REF.  */
+   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
 
 rtx
-init_one_libfunc (const char *name)
+init_one_libfunc_visibility (const char *name, symbol_visibility vis)
 {
   tree id, decl;
   hashval_t hash;
@@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
 {
   /* Create a new decl, so that it can be passed to
 targetm.encode_section_info.  */
-  decl = build_libfunc_function (name);
+  decl = build_libfunc_function_visibility (name, vis);
   *slot = decl;
 }
   return XEXP (DECL_RTL (decl), 0);
 }
 
+rtx
+init_one_libfunc (const char *name)
+{
+  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
 
 rtx
-- 
2.17.1



[PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics

2018-09-25 Thread rth7680
From: Richard Henderson 

This is the libgcc part of the interface -- providing the functions.
Rationale is provided at the top of libgcc/config/aarch64/lse.c.

* config/aarch64/lse.c: New file.
* config/aarch64/t-lse: New file.
* config.host: Add t-lse to all aarch64 tuples.
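
For reference, the SIZE/MODEL fragments below compose both the helper
names and the mnemonics.  A worked instantiation (assuming the swp
helper follows the same NAME2 scheme as the other operations):

  /* SIZE == 4, MODEL == 2 (acquire):
       S = ""   W = "w"   A = "a"   L = ""   SUFF = _acq
     so the swap helper is named __aa64_swp4_acq, its LSE path uses
     the "swpa" instruction, and its LL/SC path would use ldaxr/stxr.  */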
---
 libgcc/config/aarch64/lse.c | 258 
 libgcc/config.host  |   4 +
 libgcc/config/aarch64/t-lse |  44 ++
 3 files changed, 306 insertions(+)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
new file mode 100644
index 000..20f4bde741f
--- /dev/null
+++ b/libgcc/config/aarch64/lse.c
@@ -0,0 +1,258 @@
+/* Out-of-line LSE atomics for AArch64 architecture.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+/*
+ * The problem that we are trying to solve is operating system deployment
+ * of ARMv8.1-Atomics, also known as Large System Extensions (LSE).
+ *
+ * There are a number of potential solutions for this problem which have
+ * been proposed and rejected for various reasons.  To recap:
+ *
+ * (1) Multiple builds.  The dynamic linker will examine /lib64/atomics/
+ * if HWCAP_ATOMICS is set, allowing entire libraries to be overwritten.
+ * However, not all Linux distributions are happy with multiple builds,
+ * and anyway it has no effect on main applications.
+ *
+ * (2) IFUNC.  We could put these functions into libgcc_s.so, and have
+ * a single copy of each function for all DSOs.  However, ARM is concerned
+ * that the branch-to-indirect-branch that is implied by using a PLT,
+ * as required by IFUNC, is too much overhead for smaller cpus.
+ *
+ * (3) Statically predicted direct branches.  This is the approach that
+ * is taken here.  These functions are linked into every DSO that uses them.
+ * All of the symbols are hidden, so that the functions are called via a
+ * direct branch.  The choice of LSE vs non-LSE is done via one byte load
+ * followed by a well-predicted direct branch.  The functions are compiled
+ * separately to minimize code size.
+ */
+
+/* Define or declare the symbol gating the LSE implementations.  */
+#ifndef L_have_atomics
+extern
+#endif
+_Bool __aa64_have_atomics __attribute__((visibility("hidden"), nocommon));
+
+/* The branch controlled by this test should be easily predicted, in that
+   it will, after constructors, always branch the same way.  The expectation
+   is that systems that implement ARMv8.1-Atomics are "beefier" than those
+   that omit the extension.  By arranging for the fall-through path to use
+   load-store-exclusive insns, we aid the branch predictor of the
+   smallest cpus.  */
+#define have_atomics  __builtin_expect(__aa64_have_atomics, 0)
+
+#ifdef L_have_atomics
+/* Disable initialization of __aa64_have_atomics during bootstrap.  */
+# ifndef inhibit_libc
+#  include 
+
+static void __attribute__((constructor))
+init_have_atomics(void)
+{
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
+# endif /* inhibit_libc */
+#else
+
+/* Tell the assembler to accept LSE instructions.  */
+asm(".arch armv8-a+lse");
+
+/* Turn size and memory model defines into mnemonic fragments.  */
+#if SIZE == 1
+# define S "b"
+# define MASK  ", uxtb"
+#elif SIZE == 2
+# define S "h"
+# define MASK  ", uxth"
+#elif SIZE == 4 || SIZE == 8
+# define S ""
+# define MASK  ""
+#else
+# error
+#endif
+
+#if SIZE < 8
+# define T  unsigned int
+# define W  "w"
+#else
+# define T  unsigned long long
+# define W  ""
+#endif
+
+#if MODEL == 1
+# define SUFF  _relax
+# define A ""
+# define L ""
+#elif MODEL == 2
+# define SUFF  _acq
+# define A "a"
+# define L ""
+#elif MODEL == 3
+# define SUFF  _rel
+# define A ""
+# define L "l"
+#elif MODEL == 4
+# define SUFF  _acq_rel
+# define A "a"
+# define L "l"
+#else
+# error
+#endif
+
+#define NAME2(B, S, X)  __aa64_ ## B ## S ## X
+#define N

[PATCH, AArch64 11/11] Enable -matomic-ool by default

2018-09-25 Thread rth7680
From: Richard Henderson 

Do Not Merge Upstream.
This is for agraf and his testing within SLES.
---
 gcc/common/config/aarch64/aarch64-common.c | 6 --
 gcc/config/aarch64/aarch64.c   | 6 --
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index 292fb818705..3bd1312a3f8 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -31,9 +31,11 @@
 #include "flags.h"
 #include "diagnostic.h"
 
-#ifdef  TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef  TARGET_HANDLE_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index eca47784730..d2aa9bad5a4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17611,9 +17611,11 @@ aarch64_run_selftests (void)
 #undef TARGET_C_MODE_FOR_SUFFIX
 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
 
-#ifdef TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef TARGET_CLASS_MAX_NREGS
-- 
2.17.1



[PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation

2018-09-25 Thread rth7680
From: Richard Henderson 

Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
logical for ldclr aka bic.
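
The ldclr point is easiest to see from an example (approximate
output, LSE target assumed): ldclr clears the bits set in its source
operand, so an atomic AND is emitted as a complement followed by
ldclr, which the ATOMIC_LDOP iterator models directly:

  int
  fetch_and (int *p, int mask)
  {
    /* Expected to assemble to roughly
         mvn     w1, w1
         ldclral w1, w0, [x0]  */
    return __atomic_fetch_and (p, mask, __ATOMIC_SEQ_CST);
  }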

* config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
(aarch64_atomic_ldop_supported_p): Remove.
(aarch64_gen_atomic_ldop): Remove.
* config/aarch64/atomic.md (atomic_):
Fully expand LSE operations here.
(atomic_fetch_): Likewise.
(atomic__fetch): Likewise.
(aarch64_atomic__lse): Drop atomic_op iterator
and use ATOMIC_LDOP instead; use register_operand for the input;
drop the split and emit insns directly.
(aarch64_atomic_fetch__lse): Likewise.
(aarch64_atomic__fetch_lse): Remove.
(@aarch64_atomic_load): Remove.
---
 gcc/config/aarch64/aarch64-protos.h |   2 -
 gcc/config/aarch64/aarch64.c| 176 -
 gcc/config/aarch64/atomics.md   | 197 +++-
 gcc/config/aarch64/iterators.md |   5 +-
 4 files changed, 108 insertions(+), 272 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3d045cf43be..1d2f8487d1a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -563,8 +563,6 @@ rtx aarch64_load_tp (rtx);
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 
-bool aarch64_atomic_ldop_supported_p (enum rtx_code);
-void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5e9a85be44c..1e00fdc801c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14224,32 +14224,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
   emit_insn (gen_rtx_SET (bval, x));
 }
 
-/* Test whether the target supports using a atomic load-operate instruction.
-   CODE is the operation and AFTER is TRUE if the data in memory after the
-   operation should be returned and FALSE if the data before the operation
-   should be returned.  Returns FALSE if the operation isn't supported by the
-   architecture.  */
-
-bool
-aarch64_atomic_ldop_supported_p (enum rtx_code code)
-{
-  if (!TARGET_LSE)
-return false;
-
-  switch (code)
-{
-case SET:
-case AND:
-case IOR:
-case XOR:
-case MINUS:
-case PLUS:
-  return true;
-default:
-  return false;
-}
-}
-
 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
sequence implementing an atomic operation.  */
 
@@ -14382,156 +14356,6 @@ aarch64_split_compare_and_swap (rtx operands[])
 aarch64_emit_post_barrier (model);
 }
 
-/* Emit a BIC instruction.  */
-
-static void
-aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
-{
-  rtx shift_rtx = GEN_INT (shift);
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
-{
-case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
-case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
-default:
-  gcc_unreachable ();
-}
-
-  emit_insn (gen (dst, s2, shift_rtx, s1));
-}
-
-/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
-   location to store the data read from memory.  OUT_RESULT is the location to
-   store the result of the operation.  MEM is the memory location to read and
-   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
-   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
-   be NULL.  */
-
-void
-aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
-rtx mem, rtx value, rtx model_rtx)
-{
-  machine_mode mode = GET_MODE (mem);
-  machine_mode wmode = (mode == DImode ? DImode : SImode);
-  const bool short_mode = (mode < SImode);
-  int ldop_code;
-  rtx src;
-  rtx x;
-
-  if (out_data)
-out_data = gen_lowpart (mode, out_data);
-
-  if (out_result)
-out_result = gen_lowpart (mode, out_result);
-
-  /* Make sure the value is in a register, putting it into a destination
- register if it needs to be manipulated.  */
-  if (!register_operand (value, mode)
-  || code == AND || code == MINUS)
-{
-  src = out_result ? out_result : out_data;
-  emit_move_insn (src, gen_lowpart (mode, value));
-}
-  else
-src = value;
-  gcc_assert (register_operand (src, mode));
-
-  /* Preprocess the data for the operation as necessary.  If the operation is
- a SET then emit a swap instruction and finish.  */
-  switch (code)
-{
-case MINUS:
-  /* Negate the value and treat it as a PLUS.  */
-  {
-   rtx neg_src;
-
-   /* Resize the value if necessary.  */
-   if (short_mode)
- src = gen_lowpart (wmode

[PATCH, AArch64 09/11] aarch64: Implement -matomic-ool

2018-09-25 Thread rth7680
From: Richard Henderson 

* config/aarch64/aarch64.opt (-matomic-ool): New.
* config/aarch64/aarch64.c (aarch64_atomic_ool_func): New.
(aarch64_ool_cas_names, aarch64_ool_swp_names): New.
(aarch64_ool_ldadd_names, aarch64_ool_ldset_names): New.
(aarch64_ool_ldclr_names, aarch64_ool_ldeor_names): New.
(aarch64_ool_stadd_names, aarch64_ool_stset_names): New.
(aarch64_ool_stclr_names, aarch64_ool_steor_names): New.
(aarch64_expand_compare_and_swap): Honor TARGET_ATOMIC_OOL.
* config/aarch64/atomics.md (atomic_exchange): Likewise.
(atomic_): Likewise.
(atomic_fetch_): Likewise.
(atomic__fetch): Likewise.
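
As an illustration of the resulting code (approximate):

  /* Compiled with -matomic-ool, this is expected to become a direct
     "bl __aa64_ldadd4_relax" to the hidden out-of-line helper,
     instead of an inline LDXR/STXR loop or LDADD.  */
  int
  fetch_add_ool (int *p, int v)
  {
    return __atomic_fetch_add (p, v, __ATOMIC_RELAXED);
  }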
---
 gcc/config/aarch64/aarch64-protos.h   | 17 
 gcc/config/aarch64/aarch64.c  | 95 +++
 .../atomic-comp-swap-release-acquire.c|  2 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c|  2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c|  2 +-
 .../gcc.target/aarch64/atomic-op-char.c   |  2 +-
 .../gcc.target/aarch64/atomic-op-consume.c|  2 +-
 .../gcc.target/aarch64/atomic-op-imm.c|  2 +-
 .../gcc.target/aarch64/atomic-op-int.c|  2 +-
 .../gcc.target/aarch64/atomic-op-long.c   |  2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c|  2 +-
 .../gcc.target/aarch64/atomic-op-release.c|  2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c|  2 +-
 .../gcc.target/aarch64/atomic-op-short.c  |  2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |  2 +-
 .../atomic_cmp_exchange_zero_strong_1.c   |  2 +-
 .../gcc.target/aarch64/sync-comp-swap.c   |  2 +-
 .../gcc.target/aarch64/sync-op-acquire.c  |  2 +-
 .../gcc.target/aarch64/sync-op-full.c |  2 +-
 gcc/config/aarch64/aarch64.opt|  4 +
 gcc/config/aarch64/atomics.md | 94 --
 gcc/doc/invoke.texi   | 14 ++-
 22 files changed, 232 insertions(+), 26 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1d2f8487d1a..c7b96b12bbe 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -624,4 +624,21 @@ poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
 bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
+struct atomic_ool_names
+{
+const char *str[4][4];
+};
+
+rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+   const atomic_ool_names *names);
+extern const atomic_ool_names aarch64_ool_swp_names;
+extern const atomic_ool_names aarch64_ool_stadd_names;
+extern const atomic_ool_names aarch64_ool_stset_names;
+extern const atomic_ool_names aarch64_ool_stclr_names;
+extern const atomic_ool_names aarch64_ool_steor_names;
+extern const atomic_ool_names aarch64_ool_ldadd_names;
+extern const atomic_ool_names aarch64_ool_ldset_names;
+extern const atomic_ool_names aarch64_ool_ldclr_names;
+extern const atomic_ool_names aarch64_ool_ldeor_names;
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1e00fdc801c..78b30d68884 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14160,6 +14160,90 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
+/* We store the names of the various atomic helpers in a 4x4 array.
+   Return the libcall function given MODE, MODEL and NAMES.  */
+
+rtx
+aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+   const atomic_ool_names *names)
+{
+  memmodel model = memmodel_base (INTVAL (model_rtx));
+  int mode_idx, model_idx;
+
+  switch (mode)
+{
+case E_QImode:
+  mode_idx = 0;
+  break;
+case E_HImode:
+  mode_idx = 1;
+  break;
+case E_SImode:
+  mode_idx = 2;
+  break;
+case E_DImode:
+  mode_idx = 3;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  switch (model)
+{
+case MEMMODEL_RELAXED:
+  model_idx = 0;
+  break;
+case MEMMODEL_CONSUME:
+case MEMMODEL_ACQUIRE:
+  model_idx = 1;
+  break;
+case MEMMODEL_RELEASE:
+  model_idx = 2;
+  break;
+case MEMMODEL_ACQ_REL:
+case MEMMODEL_SEQ_CST:
+  model_idx = 3;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
+ VISIBILITY_HIDDEN);
+}
+
+#define DEF0(B, N) \
+  { "__aa64_" #B #N "_relax", \
+"__aa64_" #B #N "_acq", \
+"__aa64_" #B #N "_rel", \
+"__aa64_" #B #N "_acq_rel" }
+
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8)
+
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF4(cas) } };
+const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
+const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
+const atom

[PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap

2018-09-25 Thread rth7680
From: Richard Henderson 

This pattern will only be used with the __sync functions, because
we do not yet have a bare TImode atomic load.

* config/aarch64/aarch64.c (aarch64_gen_compare_reg): Add support   
for NE comparison of TImode values.
(aarch64_print_operand): Extend %R to handle general registers.
(aarch64_emit_load_exclusive): Add support for TImode.
(aarch64_emit_store_exclusive): Likewise.
(aarch64_atomic_ool_func): Likewise.
(aarch64_ool_cas_names): Likewise.
* config/aarch64/atomics.md (@atomic_compare_and_swap):
Change iterator from ALLI to ALLI_TI.
(@atomic_compare_and_swap): New.
(@atomic_compare_and_swap_lse): New.
(aarch64_load_exclusive_pair): New.
(aarch64_store_exclusive_pair): New.
* config/aarch64/iterators.md (JUST_TI): New.

* config/aarch64/lse.c (cas): Add support for SIZE == 16.
* config/aarch64/t-lse (S0, O0): Split out cas.
(LSE_OBJS): Include $(O0).
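
As an illustration (approximate), the __sync form of a 16-byte
compare-and-swap can now be expanded; with LSE it can use the CASP
instruction (through the new 16-byte cas helper when -matomic-ool is
in use), and otherwise an LDXP/STXP loop via the new
load/store-exclusive-pair patterns:

  __int128
  cas_ti (__int128 *p, __int128 oldval, __int128 newval)
  {
    return __sync_val_compare_and_swap (p, oldval, newval);
  }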
---
 gcc/config/aarch64/aarch64-protos.h |  2 +-
 gcc/config/aarch64/aarch64.c| 72 ++-
 libgcc/config/aarch64/lse.c | 48 ++-
 gcc/config/aarch64/atomics.md   | 91 +++--
 gcc/config/aarch64/iterators.md |  3 +
 libgcc/config/aarch64/t-lse | 10 +++-
 6 files changed, 189 insertions(+), 37 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index c7b96b12bbe..f735c4e5ad8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -626,7 +626,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
 struct atomic_ool_names
 {
-const char *str[4][4];
+const char *str[5][4];
 };
 
 rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 78b30d68884..eca47784730 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1606,10 +1606,33 @@ emit_set_insn (rtx x, rtx y)
 rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
-  machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  machine_mode cmp_mode = GET_MODE (x);
+  machine_mode cc_mode;
+  rtx cc_reg;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  if (cmp_mode == E_TImode)
+{
+  gcc_assert (code == NE);
+
+  cc_mode = E_CCmode;
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+
+  rtx x_lo = operand_subword (x, 0, 0, TImode);
+  rtx y_lo = operand_subword (y, 0, 0, TImode);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
+
+  rtx x_hi = operand_subword (x, 1, 0, TImode);
+  rtx y_hi = operand_subword (y, 1, 0, TImode);
+  emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
+gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
+GEN_INT (AARCH64_EQ)));
+}
+  else
+{
+  cc_mode = SELECT_CC_MODE (code, x, y);
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
+}
   return cc_reg;
 }
 
@@ -6689,7 +6712,7 @@ sizetochar (int size)
  'S/T/U/V':Print a FP/SIMD register name for a register 
list.
The register printed is the FP/SIMD register name
of X + 0/1/2/3 for S/T/U/V.
- 'R':  Print a scalar FP/SIMD register name + 1.
+ 'R':  Print a scalar Integer/FP/SIMD register name + 1.
  'X':  Print bottom 16 bits of integer constant in hex.
  'w/x':Print a general register name or the zero register
(32-bit or 64-bit).
@@ -6881,12 +6904,13 @@ aarch64_print_operand (FILE *f, rtx x, int code)
   break;
 
 case 'R':
-  if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
-   {
- output_operand_lossage ("incompatible floating point / vector 
register operand for '%%%c'", code);
- return;
-   }
-  asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
+  else
+   output_operand_lossage ("incompatible register operand for '%%%c'",
+   code);
   break;
 
 case 'X':
@@ -14139,16 +14163,26 @@ static void
 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
 rtx mem, rtx model_rtx)
 {
-  emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
+  if (mode == E_TImode)
+emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
+   gen_highpart (DImode, rval),
+