Some targets only offer 32-bit atomic operations (for example 32-bit
RISC-V).  For these targets, split the 64-bit atomic bitwise-or
operation into two 32-bit atomic bitwise-or operations.

For this test case

int a(int i);
int b(int i);

int f(int i)
{
  if (i) {
    return a(i);
  } else {
    return b(i);
  }
}

with options

-O2 -fprofile-update=atomic -fcondition-coverage

the code generated for 64-bit vs. 32-bit RISC-V looks like:

        addi    a5,a5,%lo(.LANCHOR0)
        beq     a0,zero,.L2
        li      a4,1
-       amoor.d zero,a4,0(a5)
-       addi    a5,a5,8
-       amoor.d zero,zero,0(a5)
+       amoor.w zero,a4,0(a5)
+       addi    a4,a5,4
+       amoor.w zero,zero,0(a4)
+       addi    a4,a5,8
+       amoor.w zero,zero,0(a4)
+       addi    a5,a5,12
+       amoor.w zero,zero,0(a5)
        tail    a
 .L2:
-       amoor.d zero,zero,0(a5)
+       amoor.w zero,zero,0(a5)
+       addi    a4,a5,4
+       amoor.w zero,zero,0(a4)
        li      a4,1
-       addi    a5,a5,8
-       amoor.d zero,a4,0(a5)
+       addi    a3,a5,8
+       amoor.w zero,a4,0(a3)
+       addi    a5,a5,12
+       amoor.w zero,zero,0(a5)
        tail    b

Unrelated to this patch: even with -O2, the compiler generates
atomic operations that are effectively no-ops, like

amoor.d zero,zero,0(a5)

and

amoor.w zero,zero,0(a5)

Would it be possible to filter these out in instrument_decisions()?

gcc/ChangeLog:

        * tree-profile.cc (split_update_decision_counter): New.
        (instrument_decisions): Use counter_update to determine which
        atomic operations are available.  Use
        split_update_decision_counter() if 64-bit atomic operations can
        be split up into two 32-bit atomic operations.
---
 gcc/tree-profile.cc | 73 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-profile.cc b/gcc/tree-profile.cc
index fed218eb60b..0a29ea42f00 100644
--- a/gcc/tree-profile.cc
+++ b/gcc/tree-profile.cc
@@ -1006,6 +1006,57 @@ resolve_counters (vec<counters>& cands)
 
 }
 
+/* At edge E, bitwise-or COUNTER into the decision counter referenced by
+   REF.  Generate two separate 32-bit atomic bitwise-or operations
+   specified by ATOMIC_IOR_32 in the RELAXED memory order.  */
+static void
+split_update_decision_counter (edge e, tree ref, tree counter,
+                              tree atomic_ior_32, tree relaxed)
+{
+    gimple_stmt_iterator gsi = gsi_last (PENDING_STMT (e));
+    ref = unshare_expr (ref);
+
+    /* Get the low and high address of the referenced counter.  */
+    tree addr_low = build_addr (ref);
+    tree addr_high = make_temp_ssa_name (TREE_TYPE (addr_low), NULL,
+                                        "PROF_decision");
+    tree four = build_int_cst (size_type_node, 4);
+    gassign *assign1 = gimple_build_assign (addr_high, POINTER_PLUS_EXPR,
+                                           addr_low, four);
+    gsi_insert_after (&gsi, assign1, GSI_NEW_STMT);
+    if (WORDS_BIG_ENDIAN)
+       std::swap (addr_low, addr_high);
+
+    /* Get the low 32-bit of the counter by truncating it.  */
+    tree counter_low_32 = make_temp_ssa_name (uint32_type_node, NULL,
+                                             "PROF_decision");
+    gassign *assign2 = gimple_build_assign (counter_low_32, NOP_EXPR, counter);
+    gsi_insert_after (&gsi, assign2, GSI_NEW_STMT);
+
+    /* Get the high 32-bit of the counter: shift right, then truncate.  */
+    tree shift_32 = build_int_cst (integer_type_node, 32);
+    tree counter_high_64 = make_temp_ssa_name (gcov_type_node, NULL,
+                                              "PROF_decision");
+    gassign *assign3 = gimple_build_assign (counter_high_64, RSHIFT_EXPR,
+                                           counter, shift_32);
+    gsi_insert_after (&gsi, assign3, GSI_NEW_STMT);
+    tree counter_high_32 = make_temp_ssa_name (uint32_type_node, NULL,
+                                              "PROF_decision");
+    gassign *assign4 = gimple_build_assign (counter_high_32, NOP_EXPR,
+                                           counter_high_64);
+    gsi_insert_after (&gsi, assign4, GSI_NEW_STMT);
+
+    /* Atomically bitwise-or the low 32-bit counter parts.  */
+    gcall *call1 = gimple_build_call (atomic_ior_32, 3, addr_low,
+                                     counter_low_32, relaxed);
+    gsi_insert_after (&gsi, call1, GSI_NEW_STMT);
+
+    /* Atomically bitwise-or the high 32-bit counter parts.  */
+    gcall *call2 = gimple_build_call (atomic_ior_32, 3, addr_high,
+                                     counter_high_32, relaxed);
+    gsi_insert_after (&gsi, call2, GSI_NEW_STMT);
+}
+
 /* Add instrumentation to a decision subgraph.  EXPR should be the
    (topologically sorted) block of nodes returned by cov_blocks, MAPS the
    bitmaps returned by cov_maps, and MASKS the block of bitsets returned by
@@ -1108,11 +1159,16 @@ instrument_decisions (array_slice<basic_block> expr, 
size_t condno,
     gcc_assert (xi == bitmap_count_bits (core));
 
     const tree relaxed = build_int_cst (integer_type_node, MEMMODEL_RELAXED);
-    const bool atomic = flag_profile_update == PROFILE_UPDATE_ATOMIC;
-    const tree atomic_ior = builtin_decl_explicit
-       (TYPE_PRECISION (gcov_type_node) > 32
-        ? BUILT_IN_ATOMIC_FETCH_OR_8
-        : BUILT_IN_ATOMIC_FETCH_OR_4);
+    const bool use_atomic_builtin =
+       counter_update == COUNTER_UPDATE_ATOMIC_BUILTIN;
+    const bool use_atomic_split =
+       counter_update == COUNTER_UPDATE_ATOMIC_SPLIT ||
+       counter_update == COUNTER_UPDATE_ATOMIC_PARTIAL;
+    const tree atomic_ior_32 =
+       builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_4);
+    const tree atomic_ior = TYPE_PRECISION (gcov_type_node) > 32 ?
+       builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_8) :
+       atomic_ior_32;
 
     /* Flush to the gcov accumulators.  */
     for (const basic_block b : expr)
@@ -1149,7 +1205,7 @@ instrument_decisions (array_slice<basic_block> expr, 
size_t condno,
            {
                tree ref = tree_coverage_counter_ref (GCOV_COUNTER_CONDS,
                                                      2*condno + k);
-               if (atomic)
+               if (use_atomic_builtin)
                {
                    ref = unshare_expr (ref);
                    gcall *flush = gimple_build_call (atomic_ior, 3,
@@ -1157,6 +1213,11 @@ instrument_decisions (array_slice<basic_block> expr, 
size_t condno,
                                                      next[k], relaxed);
                    gsi_insert_on_edge (e, flush);
                }
+               else if (use_atomic_split)
+               {
+                   split_update_decision_counter (e, ref, next[k],
+                                                  atomic_ior_32, relaxed);
+               }
                else
                {
                    tree get = emit_assign (e, ref);
-- 
2.43.0

Reply via email to