This is my fixed patch that correctly swaps the words for atomic quad memory accesses in little endian. It would be nice in the future to eliminate some of the extra moves for things like compare and add.
I have tested this with a bootstrap (using --with-cpu=power8) on a little endian power8 system, and I verified the example program works. I'm running the rest of the test suite right now. Assuming the test suite doesn't show anything wrong, is it ok to commit? [gcc] 2014-01-23 Michael Meissner <meiss...@linux.vnet.ibm.com> PR target/59909 * doc/invoke.texi (RS/6000 and PowerPC Options): Document -mquad-memory-atomic. Update -mquad-memory documentation to say it is only used for non-atomic loads/stores. * config/rs6000/predicates.md (quad_int_reg_operand): Allow either -mquad-memory or -mquad-memory-atomic switches. * config/rs6000/rs6000-cpus.def (ISA_2_7_MASKS_SERVER): Add -mquad-memory-atomic to ISA 2.07 support. * config/rs6000/rs6000.opt (-mquad-memory-atomic): Add new switch to separate support of normal quad word memory operations (lq, stq) from the atomic quad word memory operations. * config/rs6000/rs6000.c (rs6000_option_override_internal): Add support to separate non-atomic quad word operations from atomic quad word operations. Disable non-atomic quad word operations in little endian mode so that we don't have to swap words after the load and before the store. (quad_load_store_p): Add comment about atomic quad word support. (rs6000_opt_masks): Add -mquad-memory-atomic to the list of options printed with -mdebug=reg. * config/rs6000/rs6000.h (TARGET_SYNC_TI): Use -mquad-memory-atomic as the test for whether we have quad word atomic instructions. (TARGET_SYNC_HI_QI): If either -mquad-memory-atomic, -mquad-memory, or -mp8-vector are used, allow byte/half-word atomic operations. * config/rs6000/sync.md (load_lockedti): Ensure that the address is a proper indexed or indirect address for the lqarx instruction. On little endian systems, swap the hi/lo registers after the lqarx instruction. (load_lockedpti): Use indexed_or_indirect_operand predicate to ensure the address is valid for the lqarx instruction. 
(store_conditionalti): Ensure that the address is a proper indexed or indirect address for the stqcrx. instruction. On little endian systems, swap the hi/lo registers before doing the stqcrx. instruction. (store_conditionalpti): Use indexed_or_indirect_operand predicate to ensure the address is valid for the stqcrx. instruction. * config/rs6000/rs6000.md (UNSPEC_TI_PTI_SWAP): New UNSPEC for dealing with quad word atomic instructions in little endian mode. * config/rs6000/rs6000-c.c (rs6000_target_modify_macros): Define __QUAD_MEMORY__ and __QUAD_MEMORY_ATOMIC__ based on what type of quad memory support is available. [gcc/testsuite] 2014-01-23 Michael Meissner <meiss...@linux.vnet.ibm.com> PR target/59909 * gcc.target/powerpc/quad-atomic.c: New file to test power8 quad word atomic functions at runtime. -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/doc/invoke.texi =================================================================== --- gcc/doc/invoke.texi (revision 206895) +++ gcc/doc/invoke.texi (working copy) @@ -919,6 +919,7 @@ See RS/6000 and PowerPC Options. -mpower8-fusion -mno-mpower8-fusion -mpower8-vector -mno-power8-vector @gol -mcrypto -mno-crypto -mdirect-move -mno-direct-move @gol -mquad-memory -mno-quad-memory @gol +-mquad-memory-atomic -mno-quad-memory-atomic @gol -mcompat-align-parm -mno-compat-align-parm} @emph{RX Options} @@ -18853,7 +18854,8 @@ following options: -mpopcntb -mpopcntd -mpowerpc64 @gol -mpowerpc-gpopt -mpowerpc-gfxopt -msingle-float -mdouble-float @gol -msimple-fpu -mstring -mmulhw -mdlmzb -mmfpgpr -mvsx @gol --mcrypto -mdirect-move -mpower8-fusion -mpower8-vector -mquad-memory} +-mcrypto -mdirect-move -mpower8-fusion -mpower8-vector @gol +-mquad-memory -mquad-memory-atomic} The particular options set for any particular CPU varies between compiler versions, depending on what setting seems to produce optimal @@ -19040,10 +19042,18 @@ the vector instructions. @itemx -mno-quad-memory @opindex mquad-memory @opindex mno-quad-memory -Generate code that uses (does not use) the quad word memory +Generate code that uses (does not use) the non-atomic quad word memory instructions. The @option{-mquad-memory} option requires use of 64-bit mode. +@item -mquad-memory-atomic +@itemx -mno-quad-memory-atomic +@opindex mquad-memory-atomic +@opindex mno-quad-memory-atomic +Generate code that uses (does not use) the atomic quad word memory +instructions. The @option{-mquad-memory-atomic} option requires use of +64-bit mode. 
+ @item -mfloat-gprs=@var{yes/single/double/no} @itemx -mfloat-gprs @opindex mfloat-gprs Index: gcc/config/rs6000/predicates.md =================================================================== --- gcc/config/rs6000/predicates.md (revision 206895) +++ gcc/config/rs6000/predicates.md (working copy) @@ -270,7 +270,7 @@ (define_predicate "quad_int_reg_operand" { HOST_WIDE_INT r; - if (!TARGET_QUAD_MEMORY) + if (!TARGET_QUAD_MEMORY && !TARGET_QUAD_MEMORY_ATOMIC) return 0; if (GET_CODE (op) == SUBREG) @@ -624,6 +624,7 @@ (define_predicate "offsettable_mem_opera (match_test "offsettable_nonstrict_memref_p (op)"))) ;; Return 1 if the operand is suitable for load/store quad memory. +;; This predicate only checks for non-atomic loads/stores. (define_predicate "quad_memory_operand" (match_code "mem") { Index: gcc/config/rs6000/rs6000-cpus.def =================================================================== --- gcc/config/rs6000/rs6000-cpus.def (revision 206895) +++ gcc/config/rs6000/rs6000-cpus.def (working copy) @@ -53,7 +53,8 @@ | OPTION_MASK_CRYPTO \ | OPTION_MASK_DIRECT_MOVE \ | OPTION_MASK_HTM \ - | OPTION_MASK_QUAD_MEMORY) + | OPTION_MASK_QUAD_MEMORY \ + | OPTION_MASK_QUAD_MEMORY_ATOMIC) #define POWERPC_7400_MASK (OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC) Index: gcc/config/rs6000/rs6000.opt =================================================================== --- gcc/config/rs6000/rs6000.opt (revision 206895) +++ gcc/config/rs6000/rs6000.opt (working copy) @@ -571,7 +571,11 @@ Use ISA 2.07 transactional memory (HTM) mquad-memory Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags) -Generate the quad word memory instructions (lq/stq/lqarx/stqcx). +Generate the quad word memory instructions (lq/stq). + +mquad-memory-atomic +Target Report Mask(QUAD_MEMORY_ATOMIC) Var(rs6000_isa_flags) +Generate the quad word memory atomic instructions (lqarx/stqcx). 
mcompat-align-parm Target Report Var(rs6000_compat_align_parm) Init(0) Save Index: gcc/config/rs6000/rs6000-c.c =================================================================== --- gcc/config/rs6000/rs6000-c.c (revision 206895) +++ gcc/config/rs6000/rs6000-c.c (working copy) @@ -339,6 +339,10 @@ rs6000_target_modify_macros (bool define rs6000_define_or_undefine_macro (define_p, "__HTM__"); if ((flags & OPTION_MASK_P8_VECTOR) != 0) rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__"); + if ((flags & OPTION_MASK_QUAD_MEMORY) != 0) + rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY__"); + if ((flags & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0) + rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY_ATOMIC__"); if ((flags & OPTION_MASK_CRYPTO) != 0) rs6000_define_or_undefine_macro (define_p, "__CRYPTO__"); Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 206895) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3356,14 +3356,37 @@ rs6000_option_override_internal (bool gl /* The quad memory instructions only works in 64-bit mode. In 32-bit mode, silently turn off quad memory mode. */ - if (TARGET_QUAD_MEMORY && !TARGET_POWERPC64) + if ((TARGET_QUAD_MEMORY || TARGET_QUAD_MEMORY_ATOMIC) && !TARGET_POWERPC64) { if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0) warning (0, N_("-mquad-memory requires 64-bit mode")); + if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0) + warning (0, N_("-mquad-memory-atomic requires 64-bit mode")); + + rs6000_isa_flags &= ~(OPTION_MASK_QUAD_MEMORY + | OPTION_MASK_QUAD_MEMORY_ATOMIC); + } + + /* Non-atomic quad memory load/store are disabled for little endian, since + the words are reversed, but atomic operations can still be done by + swapping the words. 
*/ + if (TARGET_QUAD_MEMORY && !WORDS_BIG_ENDIAN) + { + if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0) + warning (0, N_("-mquad-memory is not available in little endian mode")); + rs6000_isa_flags &= ~OPTION_MASK_QUAD_MEMORY; } + /* Assume if the user asked for normal quad memory instructions, they want + the atomic versions as well, unless they explicity told us not to use quad + word atomic instructions. */ + if (TARGET_QUAD_MEMORY + && !TARGET_QUAD_MEMORY_ATOMIC + && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0)) + rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC; + /* Enable power8 fusion if we are tuning for power8, even if we aren't generating power8 instructions. */ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION)) @@ -5939,7 +5962,8 @@ direct_move_p (rtx op0, rtx op1) return false; } -/* Return true if this is a load or store quad operation. */ +/* Return true if this is a load or store quad operation. This function does + not handle the atomic quad memory instructions. */ bool quad_load_store_p (rtx op0, rtx op1) @@ -30753,6 +30777,7 @@ static struct rs6000_opt_mask const rs60 { "powerpc-gfxopt", OPTION_MASK_PPC_GFXOPT, false, true }, { "powerpc-gpopt", OPTION_MASK_PPC_GPOPT, false, true }, { "quad-memory", OPTION_MASK_QUAD_MEMORY, false, true }, + { "quad-memory-atomic", OPTION_MASK_QUAD_MEMORY_ATOMIC, false, true }, { "recip-precision", OPTION_MASK_RECIP_PRECISION, false, true }, { "string", OPTION_MASK_STRING, false, true }, { "update", OPTION_MASK_NO_UPDATE, true , true }, Index: gcc/config/rs6000/rs6000.h =================================================================== --- gcc/config/rs6000/rs6000.h (revision 206895) +++ gcc/config/rs6000/rs6000.h (working copy) @@ -533,8 +533,11 @@ extern int rs6000_vector_align[]; /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present in power7, so conditionalize them on p8 features. TImode syncs need quad memory support. 
*/ -#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY || TARGET_DIRECT_MOVE) -#define TARGET_SYNC_TI TARGET_QUAD_MEMORY +#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY \ + || TARGET_QUAD_MEMORY_ATOMIC \ + || TARGET_DIRECT_MOVE) + +#define TARGET_SYNC_TI TARGET_QUAD_MEMORY_ATOMIC /* Power7 has both 32-bit load and store integer for the FPRs, so we don't need to allocate the SDmode stack slot to get the value into the proper location Index: gcc/config/rs6000/sync.md =================================================================== --- gcc/config/rs6000/sync.md (revision 206895) +++ gcc/config/rs6000/sync.md (working copy) @@ -204,25 +204,46 @@ (define_insn "load_locked<QHI:mode>_si" "<QHI:larx> %0,%y1" [(set_attr "type" "load_l")]) -;; Use PTImode to get even/odd register pairs +;; Use PTImode to get even/odd register pairs. +;; Use a temporary register to force getting an even register for the +;; lqarx/stqcrx. instructions. Normal optimizations will eliminate this extra +;; copy on big endian systems. + +;; On little endian systems where non-atomic quad word load/store instructions +;; are not used, the address can be register+offset, so make sure the address +;; is indexed or indirect before register allocation. + (define_expand "load_lockedti" [(use (match_operand:TI 0 "quad_int_reg_operand" "")) (use (match_operand:TI 1 "memory_operand" ""))] "TARGET_SYNC_TI" { - /* Use a temporary register to force getting an even register for the - lqarx/stqcrx. instructions. Normal optimizations will eliminate this - extra copy. 
*/ + rtx op0 = operands[0]; + rtx op1 = operands[1]; rtx pti = gen_reg_rtx (PTImode); - emit_insn (gen_load_lockedpti (pti, operands[1])); - emit_move_insn (operands[0], gen_lowpart (TImode, pti)); + + if (!indexed_or_indirect_operand (op1, TImode)) + { + rtx old_addr = XEXP (op1, 0); + rtx new_addr = force_reg (Pmode, old_addr); + operands[1] = op1 = change_address (op1, TImode, new_addr); + } + + emit_insn (gen_load_lockedpti (pti, op1)); + if (WORDS_BIG_ENDIAN) + emit_move_insn (op0, gen_lowpart (TImode, pti)); + else + { + emit_move_insn (gen_lowpart (DImode, op0), gen_highpart (DImode, pti)); + emit_move_insn (gen_highpart (DImode, op0), gen_lowpart (DImode, pti)); + } DONE; }) (define_insn "load_lockedpti" [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r") (unspec_volatile:PTI - [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))] + [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))] "TARGET_SYNC_TI && !reg_mentioned_p (operands[0], operands[1]) && quad_int_reg_operand (operands[0], PTImode)" @@ -238,6 +259,14 @@ (define_insn "store_conditional<mode>" "<stcx> %2,%y1" [(set_attr "type" "store_c")]) +;; Use a temporary register to force getting an even register for the +;; lqarx/stqcrx. instructions. Normal optimizations will eliminate this extra +;; copy on big endian systems. + +;; On little endian systems where non-atomic quad word load/store instructions +;; are not used, the address can be register+offset, so make sure the address +;; is indexed or indirect before register allocation. 
+ (define_expand "store_conditionalti" [(use (match_operand:CC 0 "cc_reg_operand" "")) (use (match_operand:TI 1 "memory_operand" "")) @@ -247,21 +276,36 @@ (define_expand "store_conditionalti" rtx op0 = operands[0]; rtx op1 = operands[1]; rtx op2 = operands[2]; - rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0)); - rtx pti_op2 = gen_reg_rtx (PTImode); + rtx addr = XEXP (op1, 0); + rtx pti_mem; + rtx pti_reg; + + if (!indexed_or_indirect_operand (op1, TImode)) + { + rtx new_addr = force_reg (Pmode, addr); + operands[1] = op1 = change_address (op1, TImode, new_addr); + addr = new_addr; + } + + pti_mem = change_address (op1, PTImode, addr); + pti_reg = gen_reg_rtx (PTImode); + + if (WORDS_BIG_ENDIAN) + emit_move_insn (pti_reg, gen_lowpart (PTImode, op2)); + else + { + emit_move_insn (gen_lowpart (DImode, pti_reg), gen_highpart (DImode, op2)); + emit_move_insn (gen_highpart (DImode, pti_reg), gen_lowpart (DImode, op2)); + } - /* Use a temporary register to force getting an even register for the - lqarx/stqcrx. instructions. Normal optimizations will eliminate this - extra copy. */ - emit_move_insn (pti_op2, gen_lowpart (PTImode, op2)); - emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2)); + emit_insn (gen_store_conditionalpti (op0, pti_mem, pti_reg)); DONE; }) (define_insn "store_conditionalpti" [(set (match_operand:CC 0 "cc_reg_operand" "=x") (unspec_volatile:CC [(const_int 0)] UNSPECV_SC)) - (set (match_operand:PTI 1 "memory_operand" "=Z") + (set (match_operand:PTI 1 "indexed_or_indirect_operand" "=Z") (match_operand:PTI 2 "quad_int_reg_operand" "r"))] "TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)" "stqcx. 
%2,%y1" Index: gcc/config/rs6000/rs6000.md =================================================================== --- gcc/config/rs6000/rs6000.md (revision 206895) +++ gcc/config/rs6000/rs6000.md (working copy) @@ -125,6 +125,7 @@ (define_c_enum "unspec" UNSPEC_P8V_MTVSRD UNSPEC_P8V_XXPERMDI UNSPEC_P8V_RELOAD_FROM_VSX + UNSPEC_TI_PTI_SWAP ]) ;; Index: gcc/testsuite/gcc.target/powerpc/quad-atomic.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/quad-atomic.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/quad-atomic.c (revision 0) @@ -0,0 +1,67 @@ +/* { dg-do run { target { powerpc*-*-linux* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-options "-mcpu=power8 -O2" } */ + +/* Test whether we get the right bits for quad word atomic instructions. */ +#include <stdlib.h> + +static __int128_t quad_fetch_and (__int128_t *, __int128_t value) __attribute__((__noinline__)); +static __int128_t quad_fetch_or (__int128_t *, __int128_t value) __attribute__((__noinline__)); +static __int128_t quad_fetch_add (__int128_t *, __int128_t value) __attribute__((__noinline__)); + +static __int128_t +quad_fetch_and (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE); +} + +static __int128_t +quad_fetch_or (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_or (ptr, value, __ATOMIC_ACQUIRE); +} + +static __int128_t +quad_fetch_add (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_add (ptr, value, __ATOMIC_ACQUIRE); +} + +int +main (void) +{ + __int128_t result; + __int128_t value; + __int128_t and_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t and_value = ((((__int128_t) 0xfffffffffffffff0ULL) << 64) | ((__int128_t) 0xfffffffffffffff0ULL)); + __int128_t 
and_exp = ((((__int128_t) 0x1234567890abcde0ULL) << 64) | ((__int128_t) 0xfedcba0987654320ULL)); + + __int128_t or_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t or_value = ((((__int128_t) 0x0000000000000010ULL) << 64) | ((__int128_t) 0x000000000000000eULL)); + __int128_t or_exp = ((((__int128_t) 0x1234567890abcdffULL) << 64) | ((__int128_t) 0xfedcba098765432fULL)); + + __int128_t add_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t add_value = ((((__int128_t) 0x0000000001000000ULL) << 64) | ((__int128_t) 0x0000001000000000ULL)); + __int128_t add_exp = ((((__int128_t) 0x1234567891abcdefULL) << 64) | ((__int128_t) 0xfedcba1987654321ULL)); + + + value = and_input; + result = quad_fetch_and (&value, and_value); + if (result != and_input || value != and_exp) + abort (); + + value = or_input; + result = quad_fetch_or (&value, or_value); + if (result != or_input || value != or_exp) + abort (); + + value = add_input; + result = quad_fetch_add (&value, add_value); + if (result != add_input || value != add_exp) + abort (); + + return 0; +} +
Index: gcc/doc/invoke.texi =================================================================== --- gcc/doc/invoke.texi (revision 206932) +++ gcc/doc/invoke.texi (working copy) @@ -858,7 +858,8 @@ See RS/6000 and PowerPC Options. -msave-toc-indirect -mno-save-toc-indirect @gol -mpower8-fusion -mno-mpower8-fusion -mpower8-vector -mno-power8-vector @gol -mcrypto -mno-crypto -mdirect-move -mno-direct-move @gol --mquad-memory -mno-quad-memory} +-mquad-memory -mno-quad-memory @gol +-mquad-memory-atomic -mno-quad-memory-atomic} @emph{RX Options} @gccoptlist{-m64bit-doubles -m32bit-doubles -fpu -nofpu@gol @@ -17397,10 +17398,18 @@ the vector instructions. @itemx -mno-quad-memory @opindex mquad-memory @opindex mno-quad-memory -Generate code that uses (does not use) the quad word memory +Generate code that uses (does not use) the non-atomic quad word memory instructions. The @option{-mquad-memory} option requires use of 64-bit mode. +@item -mquad-memory-atomic +@itemx -mno-quad-memory-atomic +@opindex mquad-memory-atomic +@opindex mno-quad-memory-atomic +Generate code that uses (does not use) the atomic quad word memory +instructions. The @option{-mquad-memory-atomic} option requires use of +64-bit mode. + @item -mfloat-gprs=@var{yes/single/double/no} @itemx -mfloat-gprs @opindex mfloat-gprs Index: gcc/config/rs6000/predicates.md =================================================================== --- gcc/config/rs6000/predicates.md (revision 206932) +++ gcc/config/rs6000/predicates.md (working copy) @@ -270,7 +270,7 @@ (define_predicate "quad_int_reg_operand" { HOST_WIDE_INT r; - if (!TARGET_QUAD_MEMORY) + if (!TARGET_QUAD_MEMORY && !TARGET_QUAD_MEMORY_ATOMIC) return 0; if (GET_CODE (op) == SUBREG) @@ -633,6 +633,7 @@ (define_predicate "offsettable_mem_opera (match_test "offsettable_nonstrict_memref_p (op)"))) ;; Return 1 if the operand is suitable for load/store quad memory. +;; This predicate only checks for non-atomic loads/stores. 
(define_predicate "quad_memory_operand" (match_code "mem") { Index: gcc/config/rs6000/rs6000-cpus.def =================================================================== --- gcc/config/rs6000/rs6000-cpus.def (revision 206932) +++ gcc/config/rs6000/rs6000-cpus.def (working copy) @@ -53,7 +53,8 @@ | OPTION_MASK_CRYPTO \ | OPTION_MASK_DIRECT_MOVE \ | OPTION_MASK_HTM \ - | OPTION_MASK_QUAD_MEMORY) + | OPTION_MASK_QUAD_MEMORY \ + | OPTION_MASK_QUAD_MEMORY_ATOMIC) #define POWERPC_7400_MASK (OPTION_MASK_PPC_GFXOPT | OPTION_MASK_ALTIVEC) Index: gcc/config/rs6000/rs6000-c.c =================================================================== --- gcc/config/rs6000/rs6000-c.c (revision 206932) +++ gcc/config/rs6000/rs6000-c.c (working copy) @@ -337,6 +337,10 @@ rs6000_target_modify_macros (bool define rs6000_define_or_undefine_macro (define_p, "__HTM__"); if ((flags & OPTION_MASK_P8_VECTOR) != 0) rs6000_define_or_undefine_macro (define_p, "__POWER8_VECTOR__"); + if ((flags & OPTION_MASK_QUAD_MEMORY) != 0) + rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY__"); + if ((flags & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0) + rs6000_define_or_undefine_macro (define_p, "__QUAD_MEMORY_ATOMIC__"); if ((flags & OPTION_MASK_CRYPTO) != 0) rs6000_define_or_undefine_macro (define_p, "__CRYPTO__"); Index: gcc/config/rs6000/rs6000.opt =================================================================== --- gcc/config/rs6000/rs6000.opt (revision 206932) +++ gcc/config/rs6000/rs6000.opt (working copy) @@ -556,7 +556,11 @@ Use ISA 2.07 transactional memory (HTM) mquad-memory Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags) -Generate the quad word memory instructions (lq/stq/lqarx/stqcx). +Generate the quad word memory instructions (lq/stq). + +mquad-memory-atomic +Target Report Mask(QUAD_MEMORY_ATOMIC) Var(rs6000_isa_flags) +Generate the quad word memory atomic instructions (lqarx/stqcx). 
mcompat-align-parm Target Report Var(rs6000_compat_align_parm) Init(1) Save Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 206932) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3317,14 +3317,37 @@ rs6000_option_override_internal (bool gl /* The quad memory instructions only works in 64-bit mode. In 32-bit mode, silently turn off quad memory mode. */ - if (TARGET_QUAD_MEMORY && !TARGET_POWERPC64) + if ((TARGET_QUAD_MEMORY || TARGET_QUAD_MEMORY_ATOMIC) && !TARGET_POWERPC64) { if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0) warning (0, N_("-mquad-memory requires 64-bit mode")); + if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) != 0) + warning (0, N_("-mquad-memory-atomic requires 64-bit mode")); + + rs6000_isa_flags &= ~(OPTION_MASK_QUAD_MEMORY + | OPTION_MASK_QUAD_MEMORY_ATOMIC); + } + + /* Non-atomic quad memory load/store are disabled for little endian, since + the words are reversed, but atomic operations can still be done by + swapping the words. */ + if (TARGET_QUAD_MEMORY && !WORDS_BIG_ENDIAN) + { + if ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY) != 0) + warning (0, N_("-mquad-memory is not available in little endian mode")); + rs6000_isa_flags &= ~OPTION_MASK_QUAD_MEMORY; } + /* Assume if the user asked for normal quad memory instructions, they want + the atomic versions as well, unless they explicity told us not to use quad + word atomic instructions. */ + if (TARGET_QUAD_MEMORY + && !TARGET_QUAD_MEMORY_ATOMIC + && ((rs6000_isa_flags_explicit & OPTION_MASK_QUAD_MEMORY_ATOMIC) == 0)) + rs6000_isa_flags |= OPTION_MASK_QUAD_MEMORY_ATOMIC; + /* Enable power8 fusion if we are tuning for power8, even if we aren't generating power8 instructions. 
*/ if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION)) @@ -5875,7 +5898,8 @@ direct_move_p (rtx op0, rtx op1) return false; } -/* Return true if this is a load or store quad operation. */ +/* Return true if this is a load or store quad operation. This function does + not handle the atomic quad memory instructions. */ bool quad_load_store_p (rtx op0, rtx op1) @@ -30675,6 +30699,7 @@ static struct rs6000_opt_mask const rs60 { "powerpc-gfxopt", OPTION_MASK_PPC_GFXOPT, false, true }, { "powerpc-gpopt", OPTION_MASK_PPC_GPOPT, false, true }, { "quad-memory", OPTION_MASK_QUAD_MEMORY, false, true }, + { "quad-memory-atomic", OPTION_MASK_QUAD_MEMORY_ATOMIC, false, true }, { "recip-precision", OPTION_MASK_RECIP_PRECISION, false, true }, { "string", OPTION_MASK_STRING, false, true }, { "update", OPTION_MASK_NO_UPDATE, true , true }, Index: gcc/config/rs6000/rs6000.h =================================================================== --- gcc/config/rs6000/rs6000.h (revision 206932) +++ gcc/config/rs6000/rs6000.h (working copy) @@ -524,8 +524,11 @@ extern int rs6000_vector_align[]; /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present in power7, so conditionalize them on p8 features. TImode syncs need quad memory support. */ -#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY || TARGET_DIRECT_MOVE) -#define TARGET_SYNC_TI TARGET_QUAD_MEMORY +#define TARGET_SYNC_HI_QI (TARGET_QUAD_MEMORY \ + || TARGET_QUAD_MEMORY_ATOMIC \ + || TARGET_DIRECT_MOVE) + +#define TARGET_SYNC_TI TARGET_QUAD_MEMORY_ATOMIC /* Power7 has both 32-bit load and store integer for the FPRs, so we don't need to allocate the SDmode stack slot to get the value into the proper location Index: gcc/config/rs6000/sync.md =================================================================== --- gcc/config/rs6000/sync.md (revision 206932) +++ gcc/config/rs6000/sync.md (working copy) @@ -1,5 +1,5 @@ ;; Machine description for PowerPC synchronization instructions. 
-;; Copyright (C) 2005-2013 Free Software Foundation, Inc. +;; Copyright (C) 2005-2014 Free Software Foundation, Inc. ;; Contributed by Geoffrey Keating. ;; This file is part of GCC. @@ -204,25 +204,46 @@ (define_insn "load_locked<QHI:mode>_si" "<QHI:larx> %0,%y1" [(set_attr "type" "load_l")]) -;; Use PTImode to get even/odd register pairs +;; Use PTImode to get even/odd register pairs. +;; Use a temporary register to force getting an even register for the +;; lqarx/stqcrx. instructions. Normal optimizations will eliminate this extra +;; copy on big endian systems. + +;; On little endian systems where non-atomic quad word load/store instructions +;; are not used, the address can be register+offset, so make sure the address +;; is indexed or indirect before register allocation. + (define_expand "load_lockedti" [(use (match_operand:TI 0 "quad_int_reg_operand" "")) (use (match_operand:TI 1 "memory_operand" ""))] "TARGET_SYNC_TI" { - /* Use a temporary register to force getting an even register for the - lqarx/stqcrx. instructions. Normal optimizations will eliminate this - extra copy. 
*/ + rtx op0 = operands[0]; + rtx op1 = operands[1]; rtx pti = gen_reg_rtx (PTImode); - emit_insn (gen_load_lockedpti (pti, operands[1])); - emit_move_insn (operands[0], gen_lowpart (TImode, pti)); + + if (!indexed_or_indirect_operand (op1, TImode)) + { + rtx old_addr = XEXP (op1, 0); + rtx new_addr = force_reg (Pmode, old_addr); + operands[1] = op1 = change_address (op1, TImode, new_addr); + } + + emit_insn (gen_load_lockedpti (pti, op1)); + if (WORDS_BIG_ENDIAN) + emit_move_insn (op0, gen_lowpart (TImode, pti)); + else + { + emit_move_insn (gen_lowpart (DImode, op0), gen_highpart (DImode, pti)); + emit_move_insn (gen_highpart (DImode, op0), gen_lowpart (DImode, pti)); + } DONE; }) (define_insn "load_lockedpti" [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r") (unspec_volatile:PTI - [(match_operand:TI 1 "memory_operand" "Z")] UNSPECV_LL))] + [(match_operand:TI 1 "indexed_or_indirect_operand" "Z")] UNSPECV_LL))] "TARGET_SYNC_TI && !reg_mentioned_p (operands[0], operands[1]) && quad_int_reg_operand (operands[0], PTImode)" @@ -238,6 +259,14 @@ (define_insn "store_conditional<mode>" "<stcx> %2,%y1" [(set_attr "type" "store_c")]) +;; Use a temporary register to force getting an even register for the +;; lqarx/stqcrx. instructions. Normal optimizations will eliminate this extra +;; copy on big endian systems. + +;; On little endian systems where non-atomic quad word load/store instructions +;; are not used, the address can be register+offset, so make sure the address +;; is indexed or indirect before register allocation. 
+ (define_expand "store_conditionalti" [(use (match_operand:CC 0 "cc_reg_operand" "")) (use (match_operand:TI 1 "memory_operand" "")) @@ -247,21 +276,36 @@ (define_expand "store_conditionalti" rtx op0 = operands[0]; rtx op1 = operands[1]; rtx op2 = operands[2]; - rtx pti_op1 = change_address (op1, PTImode, XEXP (op1, 0)); - rtx pti_op2 = gen_reg_rtx (PTImode); + rtx addr = XEXP (op1, 0); + rtx pti_mem; + rtx pti_reg; + + if (!indexed_or_indirect_operand (op1, TImode)) + { + rtx new_addr = force_reg (Pmode, addr); + operands[1] = op1 = change_address (op1, TImode, new_addr); + addr = new_addr; + } + + pti_mem = change_address (op1, PTImode, addr); + pti_reg = gen_reg_rtx (PTImode); + + if (WORDS_BIG_ENDIAN) + emit_move_insn (pti_reg, gen_lowpart (PTImode, op2)); + else + { + emit_move_insn (gen_lowpart (DImode, pti_reg), gen_highpart (DImode, op2)); + emit_move_insn (gen_highpart (DImode, pti_reg), gen_lowpart (DImode, op2)); + } - /* Use a temporary register to force getting an even register for the - lqarx/stqcrx. instructions. Normal optimizations will eliminate this - extra copy. */ - emit_move_insn (pti_op2, gen_lowpart (PTImode, op2)); - emit_insn (gen_store_conditionalpti (op0, pti_op1, pti_op2)); + emit_insn (gen_store_conditionalpti (op0, pti_mem, pti_reg)); DONE; }) (define_insn "store_conditionalpti" [(set (match_operand:CC 0 "cc_reg_operand" "=x") (unspec_volatile:CC [(const_int 0)] UNSPECV_SC)) - (set (match_operand:PTI 1 "memory_operand" "=Z") + (set (match_operand:PTI 1 "indexed_or_indirect_operand" "=Z") (match_operand:PTI 2 "quad_int_reg_operand" "r"))] "TARGET_SYNC_TI && quad_int_reg_operand (operands[2], PTImode)" "stqcx. 
%2,%y1" Index: gcc/testsuite/gcc.target/powerpc/quad-atomic.c =================================================================== --- gcc/testsuite/gcc.target/powerpc/quad-atomic.c (revision 0) +++ gcc/testsuite/gcc.target/powerpc/quad-atomic.c (revision 0) @@ -0,0 +1,67 @@ +/* { dg-do run { target { powerpc*-*-linux* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-options "-mcpu=power8 -O2" } */ + +/* Test whether we get the right bits for quad word atomic instructions. */ +#include <stdlib.h> + +static __int128_t quad_fetch_and (__int128_t *, __int128_t value) __attribute__((__noinline__)); +static __int128_t quad_fetch_or (__int128_t *, __int128_t value) __attribute__((__noinline__)); +static __int128_t quad_fetch_add (__int128_t *, __int128_t value) __attribute__((__noinline__)); + +static __int128_t +quad_fetch_and (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_and (ptr, value, __ATOMIC_ACQUIRE); +} + +static __int128_t +quad_fetch_or (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_or (ptr, value, __ATOMIC_ACQUIRE); +} + +static __int128_t +quad_fetch_add (__int128_t *ptr, __int128_t value) +{ + return __atomic_fetch_add (ptr, value, __ATOMIC_ACQUIRE); +} + +int +main (void) +{ + __int128_t result; + __int128_t value; + __int128_t and_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t and_value = ((((__int128_t) 0xfffffffffffffff0ULL) << 64) | ((__int128_t) 0xfffffffffffffff0ULL)); + __int128_t and_exp = ((((__int128_t) 0x1234567890abcde0ULL) << 64) | ((__int128_t) 0xfedcba0987654320ULL)); + + __int128_t or_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t or_value = ((((__int128_t) 0x0000000000000010ULL) << 64) | ((__int128_t) 0x000000000000000eULL)); + __int128_t 
or_exp = ((((__int128_t) 0x1234567890abcdffULL) << 64) | ((__int128_t) 0xfedcba098765432fULL)); + + __int128_t add_input = ((((__int128_t) 0x1234567890abcdefULL) << 64) | ((__int128_t) 0xfedcba0987654321ULL)); + __int128_t add_value = ((((__int128_t) 0x0000000001000000ULL) << 64) | ((__int128_t) 0x0000001000000000ULL)); + __int128_t add_exp = ((((__int128_t) 0x1234567891abcdefULL) << 64) | ((__int128_t) 0xfedcba1987654321ULL)); + + + value = and_input; + result = quad_fetch_and (&value, and_value); + if (result != and_input || value != and_exp) + abort (); + + value = or_input; + result = quad_fetch_or (&value, or_value); + if (result != or_input || value != or_exp) + abort (); + + value = add_input; + result = quad_fetch_add (&value, add_value); + if (result != add_input || value != add_exp) + abort (); + + return 0; +} +