https://gcc.gnu.org/g:d5fb79718b22207253d62c92dcf8b1c16e1ea370
commit r16-8490-gd5fb79718b22207253d62c92dcf8b1c16e1ea370 Author: Avinash Jayakar <[email protected]> Date: Mon Mar 23 23:29:02 2026 -0500 rs6000: Add new builtin __builtin_ppc_atomic_cas_local This patch adds a new powerpc specific atomic builtin which is similar to the generic __atomic_compare_exchange builtin. bool __builtin_ppc_atomic_cas_local (type *ptr, type *expected, type *desired, bool weak, int success_memorder, int failure_memorder) It behaves like __atomic_compare_exchange(), but it uses an EH value of 1 in the larx (load-and-reserve) instruction. The new builtin helps optimize lock contention on PowerPC by keeping the lock cacheline in the local processor longer, reducing performance penalties from cache coherence protocol traffic. 2026-04-07 Avinash Jayakar <[email protected]> Surya Kumari Jangala <[email protected]> gcc/ChangeLog: * config/rs6000/rs6000-builtin.cc (rs6000_expand_builtin): Add logic to handle __builtin_ppc_atomic_cas_local. * config/rs6000/rs6000-builtins.def: New builtins for __builtin_ppc_atomic_cas_local with types. * config/rs6000/rs6000-c.cc (altivec_build_resolved_builtin): Handle builtins with up to 6 arguments. * config/rs6000/rs6000-overload.def: Overload builtin for signed/unsigned char, short, int, long, __int128. * config/rs6000/rs6000-protos.h (rs6000_expand_atomic_compare_and_swap): Add additional parameter 'local' to the prototype. * config/rs6000/rs6000.cc (emit_load_locked): Add new parameter. Pass new parameter to generate load-locked instruction. (rs6000_expand_atomic_compare_and_swap): Add new parameter. Call emit_load_locked() with additional parameter value of EH bit. (rs6000_expand_atomic_exchange): Pass EH value 0 to emit_load_locked(). (rs6000_expand_atomic_op): Likewise. * config/rs6000/sync.md (load_locked<mode>): Add new operand in RTL template. Specify EH bit in the larx instruction. (load_locked<QHI:mode>_si): Likewise. (load_lockedpti): Likewise. (load_lockedti): Add new operand in RTL template. 
Pass EH bit to gen_load_lockedpti(). (atomic_compare_and_swap<mode>): Pass new parameter 'false' to rs6000_expand_atomic_compare_and_swap. (atomic_compare_and_swap_local<mode>): New define_expand. gcc/testsuite/ChangeLog: * gcc.target/powerpc/acmp-tst.c: New test. Diff: --- gcc/config/rs6000/rs6000-builtin.cc | 102 ++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000-builtins.def | 11 +++ gcc/config/rs6000/rs6000-c.cc | 10 ++- gcc/config/rs6000/rs6000-overload.def | 22 ++++++ gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.cc | 17 ++--- gcc/config/rs6000/sync.md | 37 +++++++--- gcc/testsuite/gcc.target/powerpc/acmp-tst.c | 32 +++++++++ 8 files changed, 214 insertions(+), 19 deletions(-) diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 45c88fe063b1..bbf60de3b1b6 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -3284,6 +3284,108 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, return expand_call (exp, target, ignore); } + if (fcode == RS6000_BIF_PPC_ATOMIC_CAS_QI + || fcode == RS6000_BIF_PPC_ATOMIC_CAS_HI + || fcode == RS6000_BIF_PPC_ATOMIC_CAS_SI + || fcode == RS6000_BIF_PPC_ATOMIC_CAS_DI + || fcode == RS6000_BIF_PPC_ATOMIC_CAS_TI) + { + machine_mode mode; // Get mode based on BIF ID (QImode, SImode, etc.) 
+ + switch (fcode) + { + case RS6000_BIF_PPC_ATOMIC_CAS_QI: + mode = QImode; + icode = CODE_FOR_atomic_compare_and_swap_localqi; + break; + case RS6000_BIF_PPC_ATOMIC_CAS_HI: + mode = HImode; + icode = CODE_FOR_atomic_compare_and_swap_localhi; + break; + case RS6000_BIF_PPC_ATOMIC_CAS_SI: + mode = SImode; + icode = CODE_FOR_atomic_compare_and_swap_localsi; + break; + case RS6000_BIF_PPC_ATOMIC_CAS_DI: + mode = DImode; + icode = CODE_FOR_atomic_compare_and_swap_localdi; + break; + case RS6000_BIF_PPC_ATOMIC_CAS_TI: + mode = TImode; + icode = CODE_FOR_atomic_compare_and_swap_localti; + break; + default: + gcc_unreachable (); + } + + // Arg 0: ptr (pointer to data) + rtx ptr = expand_normal (CALL_EXPR_ARG (exp, 0)); + rtx mem = gen_rtx_MEM (mode, force_reg (Pmode, ptr)); + + // Arg 1: expected (pointer to value) -> WE MUST DEREFERENCE THIS + rtx exp_ptr = expand_normal (CALL_EXPR_ARG (exp, 1)); + rtx expected_val = gen_reg_rtx (mode); + emit_move_insn (expected_val, gen_rtx_MEM (mode, + force_reg (Pmode, exp_ptr))); + + // Arg 2: desired (value), dereference this as well + rtx desired_ptr = expand_normal (CALL_EXPR_ARG (exp, 2)); + rtx desired_val = gen_reg_rtx (mode); + emit_move_insn (desired_val, + gen_rtx_MEM (mode, force_reg (Pmode, desired_ptr))); + + // Args 3, 4, 5: weak, succ, fail (constants) + rtx weak = expand_normal (CALL_EXPR_ARG (exp, 3)); + rtx succ = expand_normal (CALL_EXPR_ARG (exp, 4)); + rtx fail = expand_normal (CALL_EXPR_ARG (exp, 5)); + + // 0: Boolean return (Output); read it back via ops[0].value, not TARGET + struct expand_operand ops[8]; + create_output_operand (&ops[0], target, SImode); + + // 1: Old value return (Output) + rtx old_val = gen_reg_rtx (mode); + create_output_operand (&ops[1], old_val, mode); + + // 2: The Memory (Fixed/Input - it's a MEM rtx) + // We use create_fixed_operand because it's a specific MEM location + create_fixed_operand (&ops[2], mem); + + // 3: Expected Value (Input) + create_input_operand (&ops[3], expected_val, mode); + + // 4: Desired Value (Input) 
+ create_input_operand (&ops[4], desired_val, mode); + + // 5, 6, 7: Weak, Success, Failure (Immediate/Constants) + create_input_operand (&ops[5], weak, SImode); + create_input_operand (&ops[6], succ, SImode); + create_input_operand (&ops[7], fail, SImode); + + // Now call expand_insn with the ops array + if (!maybe_expand_insn (icode, 8, ops)) + { + error ("invalid arguments to builtin"); + return const0_rtx; + } + + rtx done_label = gen_label_rtx (); + + /* Standard Semantics: Update 'expected' ONLY on failure. Test + ops[0].value, not TARGET: expansion may have replaced a null or + unsuitable TARGET with a fresh SImode reg. Nonzero means success. */ + emit_cmp_and_jump_insns (ops[0].value, const0_rtx, NE, NULL_RTX, + SImode, 1, done_label); + + // FAILURE PATH: This code runs only if the boolean result is 0. + rtx expected_mem = gen_rtx_MEM (mode, force_reg (Pmode, exp_ptr)); + emit_move_insn (expected_mem, old_val); + + emit_label (done_label); + + return ops[0].value; + } + if (bif_is_nosoft (*bifaddr) && rs6000_isa_flags & OPTION_MASK_SOFT_FLOAT) { diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 7e5a4fb96e72..577c9d6c8f07 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -246,6 +246,17 @@ const double __builtin_unpack_longdouble (long double, const int<1>); UNPACK_TF unpacktf {ibmld} +; Builtins for ppc specific atomic compare exchange + bool __builtin_ppc_atomic_cas_local_qi (char *, char *, char *, const int, const int, const int); + PPC_ATOMIC_CAS_QI nothing {} + bool __builtin_ppc_atomic_cas_local_hi (short *, short *, short *, const int, const int, const int); + PPC_ATOMIC_CAS_HI nothing {} + bool __builtin_ppc_atomic_cas_local_si (int *, int *, int *, const int, const int, const int); + PPC_ATOMIC_CAS_SI nothing {} + bool __builtin_ppc_atomic_cas_local_di (long *, long *, long *, const int, const int, const int); + PPC_ATOMIC_CAS_DI 
nothing {} + bool __builtin_ppc_atomic_cas_local_ti (vsq *, vsq *, vsq *, const int, const int, const int); + PPC_ATOMIC_CAS_TI nothing {} ; Builtins that have been around just about forever, but not quite. [power5] diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 8c221b71b8d3..b230d9a73450 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -900,7 +900,7 @@ altivec_build_resolved_builtin (tree *args, int n, tree fntype, tree ret_type, /* If the number of arguments to an overloaded function increases, we must expand this switch. */ - gcc_assert (MAX_OVLD_ARGS <= 4); + gcc_assert (MAX_OVLD_ARGS <= 6); tree call; switch (n) @@ -920,6 +920,14 @@ altivec_build_resolved_builtin (tree *args, int n, tree fntype, tree ret_type, case 4: call = build_call_expr (fndecl, 4, args[0], args[1], args[2], args[3]); break; + case 5: + call = build_call_expr (fndecl, 5, args[0], args[1], args[2], args[3], + args[4]); + break; + case 6: + call = build_call_expr (fndecl, 6, args[0], args[1], args[2], args[3], + args[4], args[5]); + break; default: gcc_unreachable (); } diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 5238c81b2144..8f2fa9784755 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -79,6 +79,28 @@ ; a semicolon are also treated as blank lines. 
+[PPC_ATOMIC_CAS, SKIP, __builtin_ppc_atomic_cas_local] + bool __builtin_ppc_atomic_cas_local (signed char *, signed char *, signed char *, const int, const int, const int); + PPC_ATOMIC_CAS_QI PPC_ATOMIC_CAS_SQI + bool __builtin_ppc_atomic_cas_local (unsigned char *, unsigned char *, unsigned char *, const int, const int, const int); + PPC_ATOMIC_CAS_QI PPC_ATOMIC_CAS_UQI + bool __builtin_ppc_atomic_cas_local (signed short *, signed short *, signed short *, const int, const int, const int); + PPC_ATOMIC_CAS_HI PPC_ATOMIC_CAS_SHI + bool __builtin_ppc_atomic_cas_local (unsigned short *, unsigned short *, unsigned short *, const int, const int, const int); + PPC_ATOMIC_CAS_HI PPC_ATOMIC_CAS_UHI + bool __builtin_ppc_atomic_cas_local (signed int *, signed int *, signed int *, const int, const int, const int); + PPC_ATOMIC_CAS_SI PPC_ATOMIC_CAS_SSI + bool __builtin_ppc_atomic_cas_local (unsigned int *, unsigned int *, unsigned int *, const int, const int, const int); + PPC_ATOMIC_CAS_SI PPC_ATOMIC_CAS_USI + bool __builtin_ppc_atomic_cas_local (signed long *, signed long *, signed long *, const int, const int, const int); + PPC_ATOMIC_CAS_DI PPC_ATOMIC_CAS_SDI + bool __builtin_ppc_atomic_cas_local (unsigned long *, unsigned long *, unsigned long *, const int, const int, const int); + PPC_ATOMIC_CAS_DI PPC_ATOMIC_CAS_UDI + bool __builtin_ppc_atomic_cas_local (vsq *, vsq *, vsq *, const int, const int, const int); + PPC_ATOMIC_CAS_TI PPC_ATOMIC_CAS_STI + bool __builtin_ppc_atomic_cas_local (vuq *, vuq *, vuq *, const int, const int, const int); + PPC_ATOMIC_CAS_TI PPC_ATOMIC_CAS_UTI + [BCDADD, __builtin_bcdadd, __builtin_vec_bcdadd] vsq __builtin_vec_bcdadd (vsq, vsq, const int); BCDADD_V1TI diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 09424ebaf970..5efca2d58348 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -127,7 +127,7 @@ extern bool rs6000_emit_set_const (rtx, rtx); extern bool 
rs6000_emit_cmove (rtx, rtx, rtx, rtx); extern bool rs6000_emit_int_cmove (rtx, rtx, rtx, rtx); extern void rs6000_emit_minmax (rtx, enum rtx_code, rtx, rtx); -extern void rs6000_expand_atomic_compare_and_swap (rtx op[]); +extern void rs6000_expand_atomic_compare_and_swap (rtx op[], bool local); extern rtx swap_endian_selector_for_mode (machine_mode mode); extern void rs6000_expand_atomic_exchange (rtx op[]); diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 3838059a7e25..42a4d7bb6224 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -16746,12 +16746,13 @@ emit_unlikely_jump (rtx cond, rtx label) /* A subroutine of the atomic operation splitters. Emit a load-locked instruction in MODE. For QI/HImode, possibly use a pattern than includes - the zero_extend operation. */ + the zero_extend operation. LOCAL indicates the EH bit value for the + load-locked instruction. */ static void -emit_load_locked (machine_mode mode, rtx reg, rtx mem) +emit_load_locked (machine_mode mode, rtx reg, rtx mem, rtx local) { - rtx (*fn) (rtx, rtx) = NULL; + rtx (*fn) (rtx, rtx, rtx) = NULL; switch (mode) { @@ -16778,7 +16779,7 @@ emit_load_locked (machine_mode mode, rtx reg, rtx mem) default: gcc_unreachable (); } - emit_insn (fn (reg, mem)); + emit_insn (fn (reg, mem, local)); } /* A subroutine of the atomic operation splitters. Emit a store-conditional @@ -16948,7 +16949,7 @@ rs6000_finish_atomic_subword (rtx narrow, rtx wide, rtx shift) /* Expand an atomic compare and swap operation. */ void -rs6000_expand_atomic_compare_and_swap (rtx operands[]) +rs6000_expand_atomic_compare_and_swap (rtx operands[], bool local) { rtx boolval, retval, mem, oldval, newval, cond; rtx label1, label2, x, mask, shift; @@ -17011,7 +17012,7 @@ rs6000_expand_atomic_compare_and_swap (rtx operands[]) } label2 = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ()); - emit_load_locked (mode, retval, mem); + emit_load_locked (mode, retval, mem, local ? 
const1_rtx : const0_rtx); x = retval; if (mask) @@ -17109,7 +17110,7 @@ rs6000_expand_atomic_exchange (rtx operands[]) label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ()); emit_label (XEXP (label, 0)); - emit_load_locked (mode, retval, mem); + emit_load_locked (mode, retval, mem, const0_rtx); x = val; if (mask) @@ -17214,7 +17215,7 @@ rs6000_expand_atomic_op (enum rtx_code code, rtx mem, rtx val, if (before == NULL_RTX) before = gen_reg_rtx (mode); - emit_load_locked (mode, before, mem); + emit_load_locked (mode, before, mem, const0_rtx); if (code == NOT) { diff --git a/gcc/config/rs6000/sync.md b/gcc/config/rs6000/sync.md index a4e8344ef114..7087daf7e4c0 100644 --- a/gcc/config/rs6000/sync.md +++ b/gcc/config/rs6000/sync.md @@ -278,17 +278,19 @@ (define_insn "load_locked<mode>" [(set (match_operand:ATOMIC 0 "int_reg_operand" "=r") (unspec_volatile:ATOMIC - [(match_operand:ATOMIC 1 "memory_operand" "Z")] UNSPECV_LL))] + [(match_operand:ATOMIC 1 "memory_operand" "Z") + (match_operand:QI 2 "u1bit_cint_operand" "n")] UNSPECV_LL))] "" - "<larx> %0,%y1" + "<larx> %0,%y1,%2" [(set_attr "type" "load_l")]) (define_insn "load_locked<QHI:mode>_si" [(set (match_operand:SI 0 "int_reg_operand" "=r") (unspec_volatile:SI - [(match_operand:QHI 1 "memory_operand" "Z")] UNSPECV_LL))] + [(match_operand:QHI 1 "memory_operand" "Z") + (match_operand:QI 2 "u1bit_cint_operand" "n")] UNSPECV_LL))] "TARGET_SYNC_HI_QI" - "<QHI:larx> %0,%y1" + "<QHI:larx> %0,%y1,%2" [(set_attr "type" "load_l")]) ;; Use PTImode to get even/odd register pairs. 
@@ -302,7 +304,8 @@ (define_expand "load_lockedti" [(use (match_operand:TI 0 "quad_int_reg_operand")) - (use (match_operand:TI 1 "memory_operand"))] + (use (match_operand:TI 1 "memory_operand")) + (use (match_operand:QI 2 "u1bit_cint_operand"))] "TARGET_SYNC_TI" { rtx op0 = operands[0]; @@ -316,7 +319,7 @@ operands[1] = op1 = change_address (op1, TImode, new_addr); } - emit_insn (gen_load_lockedpti (pti, op1)); + emit_insn (gen_load_lockedpti (pti, op1, operands[2])); if (WORDS_BIG_ENDIAN) emit_move_insn (op0, gen_lowpart (TImode, pti)); else @@ -330,11 +333,12 @@ (define_insn "load_lockedpti" [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r") (unspec_volatile:PTI - [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))] + [(match_operand:TI 1 "indexed_or_indirect_operand" "Z") + (match_operand:QI 2 "u1bit_cint_operand" "n")] UNSPECV_LL))] "TARGET_SYNC_TI && !reg_mentioned_p (operands[0], operands[1]) && quad_int_reg_operand (operands[0], PTImode)" - "lqarx %0,%y1" + "lqarx %0,%y1,%2" [(set_attr "type" "load_l") (set_attr "size" "128")]) @@ -411,7 +415,22 @@ (match_operand:SI 7 "const_int_operand")] ;; model fail "" { - rs6000_expand_atomic_compare_and_swap (operands); + rs6000_expand_atomic_compare_and_swap (operands, false); + DONE; +}) + +(define_expand "atomic_compare_and_swap_local<mode>" + [(match_operand:SI 0 "int_reg_operand") ;; bool out + (match_operand:AINT 1 "int_reg_operand") ;; val out + (match_operand:AINT 2 "memory_operand") ;; memory + (match_operand:AINT 3 "reg_or_short_operand") ;; expected + (match_operand:AINT 4 "int_reg_operand") ;; desired + (match_operand:SI 5 "const_int_operand") ;; is_weak + (match_operand:SI 6 "const_int_operand") ;; model succ + (match_operand:SI 7 "const_int_operand")] ;; model fail + "" +{ + rs6000_expand_atomic_compare_and_swap (operands, true); DONE; }) diff --git a/gcc/testsuite/gcc.target/powerpc/acmp-tst.c b/gcc/testsuite/gcc.target/powerpc/acmp-tst.c new file mode 100644 index 
000000000000..6ebd2ebbc286 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/acmp-tst.c @@ -0,0 +1,32 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O2 -mdejagnu-cpu=power8" } */ +/* Each operand width must use a load-and-reserve with EH = 1; GCC emits lbarx/lharx/lqarx only for power8 and later, and 'long' is 64-bit only under lp64. */ +#define TESTS \ + X(signed char, qi) \ + X(unsigned char, uqi) \ + X(short, hi) \ + X(signed short, shi) \ + X(unsigned short, uhi) \ + X(int, si) \ + X(signed int, ssi) \ + X(unsigned int, usi) \ + X(long, di) \ + X(signed long, sdi) \ + X(unsigned long, udi) \ + X(vector signed __int128, sti) \ + X(vector unsigned __int128, uti) + +#define X(T, name) \ +bool word_exchange_##name (T *ptr, T *expected, T * desired) \ +{ \ + return __builtin_ppc_atomic_cas_local (ptr, expected, desired, 0, \ + __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); \ +} + +TESTS + +/* { dg-final { scan-assembler-times {\mlbarx +[0-9]+,[0-9]+,[0-9]+,1} 2 } } */ +/* { dg-final { scan-assembler-times {\mlharx +[0-9]+,[0-9]+,[0-9]+,1} 3 } } */ +/* { dg-final { scan-assembler-times {\mlwarx +[0-9]+,[0-9]+,[0-9]+,1} 3 } } */ +/* { dg-final { scan-assembler-times {\mldarx +[0-9]+,[0-9]+,[0-9]+,1} 3 } } */ +/* { dg-final { scan-assembler-times {\mlqarx +[0-9]+,[0-9]+,[0-9]+,1} 2 } } */
