In a CAS operation, even if expected != *memory we still need to do an atomic load of *memory into output. But I made a mistake in the initial implementation, causing the output to contain junk in this situation.
Like a normal atomic load, the atomic load embedded in the CAS semantic is required to work on read-only page. Thus we cannot rely on sc.q to ensure the atomicity of the load. Use LSX to perform the load instead, and also use LSX to compare the 16B values to keep the ll-sc loop body short. gcc/ChangeLog: * config/loongarch/sync.md (atomic_compare_and_swapti_scq): Require LSX. Change the operands for the output, the memory, and the expected value to LSX vector modes. Add a FCCmode output to indicate if CAS has written the desired value into memory. Use LSX to atomically load both words of the 16B value in memory. (atomic_compare_and_swapti): Pun the modes to satisify the new atomic_compare_and_swapti_scq implementation. Read the bool return value from the FCC instead of performing a comparision. --- gcc/config/loongarch/sync.md | 104 +++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md index 2ee400e2381..66a431ddbf0 100644 --- a/gcc/config/loongarch/sync.md +++ b/gcc/config/loongarch/sync.md @@ -682,21 +682,25 @@ (define_expand "atomic_compare_and_swap<mode>" }) (define_insn "atomic_compare_and_swapti_scq" - [(set (match_operand:TI 0 "register_operand" "=&r") - (match_operand:TI 1 "memory_operand" "+ZB")) + [(set (match_operand:V2DI 0 "register_operand" "=&f") + (match_operand:V2DI 1 "memory_operand" "+m")) (set (match_dup 1) - (unspec_volatile:TI [(match_operand:TI 2 "reg_or_0_operand" "rJ") - (match_operand:TI 3 "reg_or_0_operand" "rJ") - (match_operand:SI 4 "const_int_operand")] ;; mod_f - UNSPEC_COMPARE_AND_SWAP)) - (clobber (match_scratch:DI 5 "=&r"))] - "TARGET_64BIT && ISA_HAS_SCQ" + (unspec_volatile:V2DI + [(match_operand:V2DI 2 "reg_or_0_operand" "fJ") + (match_operand:TI 3 "reg_or_0_operand" "rJ") + (match_operand:SI 4 "const_int_operand")] + UNSPEC_COMPARE_AND_SWAP)) + (set (match_operand:FCC 5 "register_operand" "=z") + (ne:FCC (match_dup 1) (match_dup 3))) + (clobber (match_scratch:V2DI 6 "=&f")) + (clobber (match_scratch:DI 7 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ && ISA_HAS_LSX" { output_asm_insn ("1:", operands); - output_asm_insn ("ll.d\t%0,%1", operands); - /* Compare the low word */ - output_asm_insn ("bne\t%0,%z2,2f", operands); + /* The loaded value in %7 will just be discarded, this instruction is + only intended to raise the LL bit. Should we use %. instead?? */ + output_asm_insn ("ll.d\t%7,%1", operands); /* Don't reorder the load of high word before ll.d. As the TImode must be aligned in the memory, the high and low words must be in @@ -704,23 +708,31 @@ (define_insn "atomic_compare_and_swapti_scq" if (!ISA_HAS_LD_SEQ_SA) output_asm_insn ("dbar\t0x700", operands); - /* Now load the high word. As the high and low words are in the same - cacheline, in case another core has clobbered the high word before the - sc.q instruction is executed, the LL bit for the low word will be - cleared. Thus a normal load is sufficient. */ - output_asm_insn ("ld.d\t%t0,%b1,8", operands); - - /* Compare the high word. */ - output_asm_insn ("bne\t%t0,%t2,2f", operands); + /* Load the word pair altogether. We cannot just load the high word + alone: doing so will need to rely on sc.q to ensure no other threads + have updated the memory between two loads, but issuing an sc.q when + original != expected will cause a page fault with a "valid" (well, at + least the standard implies it's valid) use of + atomic_compare_and_exchange on a const _Atomic object. See + https://gcc.gnu.org/PR80878#c1. */ + output_asm_insn ("vld\t%w0,%1", operands); + + /* Compare the word pair. */ + if (const_0_operand (operands[2], V1TImode)) + operands[7] = operands[0]; + else + output_asm_insn ("vxor.v\t%w6,%w0,%w2", operands); + output_asm_insn ("vseteqz.v\t%5,%w6", operands); + output_asm_insn ("bceqz\t%5,2f", operands); /* Copy the low word of the new value as it'll be clobbered by sc.q. */ - output_asm_insn ("move\t%5,%z3", operands); + output_asm_insn ("move\t%7,%z3", operands); /* Store both words if LL bit is still set. */ - output_asm_insn ("sc.q\t%5,%t3,%1", operands); + output_asm_insn ("sc.q\t%7,%t3,%1", operands); /* Check if sc.q has done the store. */ - output_asm_insn ("beqz\t%5,1b", operands); + output_asm_insn ("beqz\t%7,1b", operands); /* Jump over the mod_f barrier if sc.q has succeeded. */ output_asm_insn ("%T4b\t3f", operands); @@ -732,7 +744,7 @@ (define_insn "atomic_compare_and_swapti_scq" output_asm_insn ("3:", operands); return ""; } - [(set_attr "length" "40")]) + [(set_attr "length" "44")]) (define_expand "atomic_compare_and_swapti" [(match_operand:SI 0 "register_operand" "") ;; bool output @@ -743,30 +755,40 @@ (define_expand "atomic_compare_and_swapti" (match_operand:SI 5 "const_int_operand" "") ;; is_weak (match_operand:SI 6 "const_int_operand" "") ;; mod_s (match_operand:SI 7 "const_int_operand" "")] ;; mod_f - "TARGET_64BIT && ISA_HAS_SCQ" + "TARGET_64BIT && ISA_HAS_SCQ && ISA_HAS_LSX" { - emit_insn (gen_atomic_compare_and_swapti_scq (operands[1], operands[2], - operands[3], operands[4], - operands[7])); - - rtx t[2]; + rtx fcc = gen_reg_rtx (FCCmode); + rtx gpr = gen_reg_rtx (DImode); + rtx vr = gen_reg_rtx (V2DImode); + rtx mem = gen_rtx_MEM (V2DImode, XEXP (operands[2], 0)); - for (int i = 0; i < 2; i++) + if (const_0_operand (operands[3], TImode)) + operands[3] = CONST0_RTX (V2DImode); + else { - rtx compare = loongarch_subword (operands[1], i); - rtx expect = loongarch_subword (operands[3], i); + rtvec v = rtvec_alloc (2); + for (int i = 0; i < 2; i++) + RTVEC_ELT (v, i) = loongarch_subword (operands[3], i); - t[i] = gen_reg_rtx (DImode); - - if (expect != const0_rtx) - emit_insn (gen_xordi3 (t[i], compare, expect)); - else - emit_move_insn (t[i], compare); + operands[3] = gen_reg_rtx (V2DImode); + emit_insn (gen_vec_initv2didi (operands[3], + gen_rtx_PARALLEL (V2DImode, v))); } - emit_insn (gen_iordi3 (t[0], t[0], t[1])); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_EQ (SImode, t[0], const0_rtx))); + emit_insn (gen_atomic_compare_and_swapti_scq (vr, mem, operands[3], + operands[4], operands[7], + fcc)); + + for (int i = 0; i < 2; i++) + emit_insn ( + gen_lsx_vpickve2gr_d (loongarch_subword (operands[1], i), vr, + GEN_INT (i))); + + emit_insn (gen_fcc_to_di (gpr, fcc)); + gpr = gen_lowpart (SImode, gpr); + SUBREG_PROMOTED_VAR_P (gpr) = 1; + SUBREG_PROMOTED_SET (gpr, SRP_SIGNED); + emit_move_insn (operands[0], gpr); DONE; }) -- 2.50.1