https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119607
--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
To me this looks like code in glib2 that is simply not thread-safe.
The important part of the function is just trying to atomically increment the
closure->ref_count bitfield.
In *.optimized dump this is
<bb 8> [local count: 756815742]:
old_int_61 = MEM[(union ClosureInt *)closure_16(D)].vint;
tmp.vint = old_int_61;
_62 = tmp.closure.ref_count;
_63 = (short unsigned int) _62;
_64 = _63 + 1;
_65 = (<unnamed-unsigned:15>) _64;
tmp.closure.ref_count = _65;
new_int_68 = tmp.vint;
new_int.12_69 = (unsigned int) new_int_68;
_70 = &MEM[(union ClosureInt *)closure_16(D)].vint;
_71 = (unsigned int) old_int_61;
_72 = .ATOMIC_COMPARE_EXCHANGE (_70, _71, new_int.12_69, 4, 5, 5);
_73 = IMAGPART_EXPR <_72>;
tmp ={v} {CLOBBER(eos)};
if (_73 == 0)
goto <bb 8>; [0.04%]
else
goto <bb 9>; [99.96%]
Normal thread-safe code would use an __atomic_load to read from &closure->vint.
It can be done with the relaxed memory model, but the important thing is to make
sure the compiler doesn't try to read the value multiple times.
That loop with r15-7894 is
.L336:
movl (%esi), %ebp
movl %ebp, %edx
movl %ebp, 32(%esp)
andw $32767, %dx
incl %edx
movl %edx, %eax
andw $32767, %ax
movw %ax, 20(%esp)
movl %ebp, %eax
andw $-32768, %ax
orw 20(%esp), %ax
movw %ax, 32(%esp)
movl %ebp, %eax
movl 32(%esp), %ebp
lock cmpxchgl %ebp, (%esi)
jne .L336
where %esi is closure, so it reads the value from memory just once, saves a copy
of it in 32(%esp), awkwardly increments the 15-bit ref_count (I don't see the
point in masking before the increment; it could just do that after the
increment) in there and preserves the top bit. In any case, 32(%esp) here is the
tmp.vint 32-bit memory and %ebp holds the original value of closure->vint.
That same loop with r15-7895 is
.L336:
movl (%esi), %eax
movl %eax, 16(%esp)
movzwl (%esi), %eax
andw $32767, %ax
leal 1(%eax), %edx
movl %edx, %eax
andw $32767, %ax
movl %eax, %ecx
movzwl (%esi), %eax
andw $-32768, %ax
orl %ecx, %eax
movw %ax, 16(%esp)
movl (%esi), %eax
movl 16(%esp), %ecx
lock cmpxchgl %ecx, (%esi)
jne .L336
The RA makes completely different decisions here. But the important thing is
that instead of reading from closure->vint once, it actually reads from there 4
times. The location isn't volatile and isn't read with __atomic_load, so IMHO
the compiler is entitled to do that. Except when multiple threads attempt to
increase the ref_count, one has to be lucky that all 4 reads return the same
value. If not, you get what you observe.
Before RA this is (I think in both cases)
(insn 68 67 69 9 (set (reg/v:SI 127 [ old_int ])
(mem/j:SI (reg/v/f:SI 144 [ closure ]) [7 MEM[(union ClosureInt
*)closure_16(D)].vint+0 S4 A32])) "../glib-2.84.0/gobject/gclosure.c":564:124
discrim 3 96 {*movsi_internal}
(nil))
(insn 69 68 72 9 (set (mem/j/c:SI (plus:SI (reg/f:SI 19 frame)
(const_int -16 [0xfffffffffffffff0])) [7 MEM[(union
*)_14].vint+0 S4 A32])
(reg/v:SI 127 [ old_int ])) "../glib-2.84.0/gobject/gclosure.c":564:114
discrim 3 96 {*movsi_internal}
(nil))
(insn 72 69 73 9 (parallel [
(set (reg:HI 161 [ _62 ])
(and:HI (subreg:HI (reg/v:SI 127 [ old_int ]) 0)
(const_int 32767 [0x7fff])))
(clobber (reg:CC 17 flags))
]) "../glib-2.84.0/gobject/gclosure.c":564:181 discrim 3 720 {*andhi_1}
(expr_list:REG_EQUAL (and:HI (mem/c:HI (plus:SI (reg/f:SI 19 frame)
(const_int -16 [0xfffffffffffffff0])) [7 MEM[(union
*)_14]+0 S2 A32])
(const_int 32767 [0x7fff]))
(nil)))
(insn 73 72 75 9 (parallel [
(set (reg:HI 130 [ _64 ])
(plus:HI (reg:HI 161 [ _62 ])
(const_int 1 [0x1])))
(clobber (reg:CC 17 flags))
]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 298 {*addhi_1}
(expr_list:REG_DEAD (reg:HI 161 [ _62 ])
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil))))
(insn 75 73 77 9 (parallel [
(set (reg:HI 164)
(and:HI (reg:HI 130 [ _64 ])
(const_int 32767 [0x7fff])))
(clobber (reg:CC 17 flags))
]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 720 {*andhi_1}
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))
(insn 77 75 78 9 (parallel [
(set (reg:HI 166)
(and:HI (subreg:HI (reg/v:SI 127 [ old_int ]) 0)
(const_int -32768 [0xffffffffffff8000])))
(clobber (reg:CC 17 flags))
]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 720 {*andhi_1}
(nil))
(insn 78 77 79 9 (parallel [
(set (reg:HI 167)
(ior:HI (reg:HI 166)
(reg:HI 164)))
(clobber (reg:CC 17 flags))
]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 824 {*iorhi_1}
(expr_list:REG_DEAD (reg:HI 166)
(expr_list:REG_DEAD (reg:HI 164)
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))))
(insn 79 78 80 9 (set (mem/j/c:HI (plus:SI (reg/f:SI 19 frame)
(const_int -16 [0xfffffffffffffff0])) [7 MEM[(union
*)_14].closure.ref_count+0 S2 A32])
(reg:HI 167)) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 97
{*movhi_internal}
(expr_list:REG_DEAD (reg:HI 167)
(nil)))
(insn 80 79 81 9 (set (reg:SI 170 [ MEM[(union *)_14].vint ])
(mem/j/c:SI (plus:SI (reg/f:SI 19 frame)
(const_int -16 [0xfffffffffffffff0])) [7 MEM[(union
*)_14].vint+0 S4 A32])) "../glib-2.84.0/gobject/gclosure.c":564:472 discrim 3
96 {*movsi_internal}
(nil))
(insn 81 80 82 9 (parallel [
(set (reg:SI 168)
(unspec_volatile:SI [
(mem/v:SI (reg/v/f:SI 144 [ closure ]) [-1 S4 A32])
(reg/v:SI 127 [ old_int ])
(reg:SI 170 [ MEM[(union *)_14].vint ])
(const_int 5 [0x5])
] UNSPECV_CMPXCHG))
(set (mem/v:SI (reg/v/f:SI 144 [ closure ]) [-1 S4 A32])
(unspec_volatile:SI [
(const_int 0 [0])
] UNSPECV_CMPXCHG))
(set (reg:CCZ 17 flags)
(unspec_volatile:CCZ [
(const_int 0 [0])
] UNSPECV_CMPXCHG))
]) "../glib-2.84.0/gobject/gclosure.c":564:472 discrim 3 11031
{atomic_compare_and_swapsi_1}
(expr_list:REG_DEAD (reg:SI 170 [ MEM[(union *)_14].vint ])
(expr_list:REG_DEAD (reg/v:SI 127 [ old_int ])
(expr_list:REG_UNUSED (reg:SI 168)
(nil)))))
or so; it really depends on the RA whether it gives pseudo SI 127 a hard
register, and if not, I think the RA rightfully just rereads the value from
memory.
Note, the loop also doesn't use the value returned by lock cmpxchgl if it needs
to loop; normal code does an __atomic_load before the loop, then performs the
operations on the loaded value, and for the next iteration uses what
__atomic_compare_exchange gives back on failure.
So, I suppose it should be something like
#define ATOMIC_CHANGE_FIELD(_closure, _field, _OP, _value, _must_set,
_SET_OLD, _SET_NEW) \
G_STMT_START {
\
ClosureInt *cunion = (ClosureInt*) _closure;
\
gint new_int, old_int, success;
\
+ old_int = g_atomic_int_get (&cunion->vint);
\
do
\
{
\
ClosureInt tmp;
\
- tmp.vint = old_int = cunion->vint;
\
+ tmp.vint = old_int;
\
_SET_OLD tmp.closure._field;
\
tmp.closure._field _OP _value;
\
_SET_NEW tmp.closure._field;
\
new_int = tmp.vint;
\
- success = g_atomic_int_compare_and_exchange (&cunion->vint, old_int,
new_int); \
+ success = g_atomic_int_compare_and_exchange_full (&cunion->vint,
old_int, new_int,\
+ &old_int);
\
}
\
while (!success && _must_set);
\
} G_STMT_END
Completely untested.