There are targets which only offer 32-bit atomic operations (for example 32-bit RISC-V). For these targets, split the 64-bit atomic bitwise-or operation into two 32-bit atomic bitwise-or operations.
For this test case int a(int i); int b(int i); int f(int i) { if (i) { return a(i); } else { return b(i); } } with options -O2 -fprofile-update=atomic -fcondition-coverage the code generation to 64-bit vs. 32-bit RISC-V looks like: addi a5,a5,%lo(.LANCHOR0) beq a0,zero,.L2 li a4,1 - amoor.d zero,a4,0(a5) - addi a5,a5,8 - amoor.d zero,zero,0(a5) + amoor.w zero,a4,0(a5) + addi a4,a5,4 + amoor.w zero,zero,0(a4) + addi a4,a5,8 + amoor.w zero,zero,0(a4) + addi a5,a5,12 + amoor.w zero,zero,0(a5) tail a .L2: - amoor.d zero,zero,0(a5) + amoor.w zero,zero,0(a5) + addi a4,a5,4 + amoor.w zero,zero,0(a4) li a4,1 - addi a5,a5,8 - amoor.d zero,a4,0(a5) + addi a3,a5,8 + amoor.w zero,a4,0(a3) + addi a5,a5,12 + amoor.w zero,zero,0(a5) tail b Not related to this patch, even with -O2 the compiler generates no-operations like amoor.d zero,zero,0(a5) and amoor.w zero,zero,0(a5) Would this be possible to filter out in instrument_decisions()? gcc/ChangeLog: * tree-profile.cc (split_update_decision_counter): New. (instrument_decisions): Use counter_update to determine which atomic operations are available. Use split_update_decision_counter() if 64-bit atomic operations can be split up into two 32-bit atomic operations. --- gcc/tree-profile.cc | 73 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/gcc/tree-profile.cc b/gcc/tree-profile.cc index fed218eb60b..0a29ea42f00 100644 --- a/gcc/tree-profile.cc +++ b/gcc/tree-profile.cc @@ -1006,6 +1006,57 @@ resolve_counters (vec<counters>& cands) } +/* At edge E, update the decision counter referenced by REF with the + COUNTER. Generate two separate 32-bit atomic bitwise-or operations + specified by ATOMIC_IOR_32 in the RELAXED memory order. 
*/
+static void
+split_update_decision_counter (edge e, tree ref, tree counter, tree
+			       atomic_ior_32, tree relaxed)
+{
+  gimple_stmt_iterator gsi = gsi_last (PENDING_STMT (e));
+  ref = unshare_expr (ref);
+
+  /* Get the low and high addresses of the referenced 64-bit counter,
+     i.e. the addresses of its two 32-bit halves.  */
+  tree addr_low = build_addr (ref);
+  tree addr_high = make_temp_ssa_name (TREE_TYPE (addr_low), NULL,
+				       "PROF_decision");
+  tree four = build_int_cst (size_type_node, 4);
+  gassign *assign1 = gimple_build_assign (addr_high, POINTER_PLUS_EXPR,
+					  addr_low, four);
+  gsi_insert_after (&gsi, assign1, GSI_NEW_STMT);
+  if (WORDS_BIG_ENDIAN)
+    std::swap (addr_low, addr_high);
+
+  /* Get the low 32 bits of the counter by truncation.  */
+  tree counter_low_32 = make_temp_ssa_name (uint32_type_node, NULL,
+					    "PROF_decision");
+  gassign *assign2 = gimple_build_assign (counter_low_32, NOP_EXPR, counter);
+  gsi_insert_after (&gsi, assign2, GSI_NEW_STMT);
+
+  /* Get the high 32 bits of the counter by shifting them down before
+     truncating.  A left shift would leave the truncated low half always
+     zero and lose the high word entirely.  */
+  tree shift_32 = build_int_cst (integer_type_node, 32);
+  tree counter_high_64 = make_temp_ssa_name (gcov_type_node, NULL,
+					     "PROF_decision");
+  gassign *assign3 = gimple_build_assign (counter_high_64, RSHIFT_EXPR,
+					  counter, shift_32);
+  gsi_insert_after (&gsi, assign3, GSI_NEW_STMT);
+  tree counter_high_32 = make_temp_ssa_name (uint32_type_node, NULL,
+					     "PROF_decision");
+  gassign *assign4 = gimple_build_assign (counter_high_32, NOP_EXPR,
+					  counter_high_64);
+  gsi_insert_after (&gsi, assign4, GSI_NEW_STMT);
+
+  /* Atomically bitwise-or the low 32-bit counter parts.  */
+  gcall *call1 = gimple_build_call (atomic_ior_32, 3, addr_low,
+				    counter_low_32, relaxed);
+  gsi_insert_after (&gsi, call1, GSI_NEW_STMT);
+
+  /* Atomically bitwise-or the high 32-bit counter parts.  */
+  gcall *call2 = gimple_build_call (atomic_ior_32, 3, addr_high,
+				    counter_high_32, relaxed);
+  gsi_insert_after (&gsi, call2, GSI_NEW_STMT);
+}
+
 /* Add instrumentation to a decision subgraph.
EXPR should be the (topologically sorted) block of nodes returned by cov_blocks, MAPS the bitmaps returned by cov_maps, and MASKS the block of bitsets returned by @@ -1108,11 +1159,16 @@ instrument_decisions (array_slice<basic_block> expr, size_t condno, gcc_assert (xi == bitmap_count_bits (core)); const tree relaxed = build_int_cst (integer_type_node, MEMMODEL_RELAXED); - const bool atomic = flag_profile_update == PROFILE_UPDATE_ATOMIC; - const tree atomic_ior = builtin_decl_explicit - (TYPE_PRECISION (gcov_type_node) > 32 - ? BUILT_IN_ATOMIC_FETCH_OR_8 - : BUILT_IN_ATOMIC_FETCH_OR_4); + const bool use_atomic_builtin = + counter_update == COUNTER_UPDATE_ATOMIC_BUILTIN; + const bool use_atomic_split = + counter_update == COUNTER_UPDATE_ATOMIC_SPLIT || + counter_update == COUNTER_UPDATE_ATOMIC_PARTIAL; + const tree atomic_ior_32 = + builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_4); + const tree atomic_ior = TYPE_PRECISION (gcov_type_node) > 32 ? + builtin_decl_explicit (BUILT_IN_ATOMIC_FETCH_OR_8) : + atomic_ior_32; /* Flush to the gcov accumulators. */ for (const basic_block b : expr) @@ -1149,7 +1205,7 @@ instrument_decisions (array_slice<basic_block> expr, size_t condno, { tree ref = tree_coverage_counter_ref (GCOV_COUNTER_CONDS, 2*condno + k); - if (atomic) + if (use_atomic_builtin) { ref = unshare_expr (ref); gcall *flush = gimple_build_call (atomic_ior, 3, @@ -1157,6 +1213,11 @@ instrument_decisions (array_slice<basic_block> expr, size_t condno, next[k], relaxed); gsi_insert_on_edge (e, flush); } + else if (use_atomic_split) + { + split_update_decision_counter (e, ref, next[k], + atomic_ior_32, relaxed); + } else { tree get = emit_assign (e, ref); -- 2.43.0