https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119607
--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
To me this just looks like code in glib2 that is not thread-safe. The important part of the function is simply trying to atomically increment the closure->ref_count bitfield. In the *.optimized dump this is:

  <bb 8> [local count: 756815742]:
  old_int_61 = MEM[(union ClosureInt *)closure_16(D)].vint;
  tmp.vint = old_int_61;
  _62 = tmp.closure.ref_count;
  _63 = (short unsigned int) _62;
  _64 = _63 + 1;
  _65 = (<unnamed-unsigned:15>) _64;
  tmp.closure.ref_count = _65;
  new_int_68 = tmp.vint;
  new_int.12_69 = (unsigned int) new_int_68;
  _70 = &MEM[(union ClosureInt *)closure_16(D)].vint;
  _71 = (unsigned int) old_int_61;
  _72 = .ATOMIC_COMPARE_EXCHANGE (_70, _71, new_int.12_69, 4, 5, 5);
  _73 = IMAGPART_EXPR <_72>;
  tmp ={v} {CLOBBER(eos)};
  if (_73 == 0)
    goto <bb 8>; [0.04%]
  else
    goto <bb 9>; [99.96%]

Normal thread-safe code would use __atomic_load to read from &closure->vint; it can be with the relaxed memory model, but the important thing is to make sure the compiler doesn't read it multiple times. That loop with r15-7894 is:

  .L336:
          movl    (%esi), %ebp
          movl    %ebp, %edx
          movl    %ebp, 32(%esp)
          andw    $32767, %dx
          incl    %edx
          movl    %edx, %eax
          andw    $32767, %ax
          movw    %ax, 20(%esp)
          movl    %ebp, %eax
          andw    $-32768, %ax
          orw     20(%esp), %ax
          movw    %ax, 32(%esp)
          movl    %ebp, %eax
          movl    32(%esp), %ebp
          lock cmpxchgl   %ebp, (%esi)
          jne     .L336

where %esi is closure, so it reads the value from memory just once, saves a copy of it in 32(%esp), awkwardly increments the 15-bit ref_count (I don't see the point in masking before the increment; it could just do that after the increment) and preserves the top bit. In any case, 32(%esp) here is the tmp.vint 32-bit memory and %ebp holds the original value of closure->vint.
That same loop with r15-7895 is:

  .L336:
          movl    (%esi), %eax
          movl    %eax, 16(%esp)
          movzwl  (%esi), %eax
          andw    $32767, %ax
          leal    1(%eax), %edx
          movl    %edx, %eax
          andw    $32767, %ax
          movl    %eax, %ecx
          movzwl  (%esi), %eax
          andw    $-32768, %ax
          orl     %ecx, %eax
          movw    %ax, 16(%esp)
          movl    (%esi), %eax
          movl    16(%esp), %ecx
          lock cmpxchgl   %ecx, (%esi)
          jne     .L336

The RA makes completely different decisions here. But the important thing is that instead of reading from closure->vint once, it actually reads from there 4 times. It isn't volatile and doesn't use __atomic_load, so IMHO why not. Except that when multiple threads attempt to increment it, one has to be lucky that all 4 reads return the same value. If not, you get what you observe. Before RA this is (I think in both cases):

  (insn 68 67 69 9 (set (reg/v:SI 127 [ old_int ])
          (mem/j:SI (reg/v/f:SI 144 [ closure ]) [7 MEM[(union ClosureInt *)closure_16(D)].vint+0 S4 A32])) "../glib-2.84.0/gobject/gclosure.c":564:124 discrim 3 96 {*movsi_internal}
       (nil))
  (insn 69 68 72 9 (set (mem/j/c:SI (plus:SI (reg/f:SI 19 frame)
                  (const_int -16 [0xfffffffffffffff0])) [7 MEM[(union *)_14].vint+0 S4 A32])
          (reg/v:SI 127 [ old_int ])) "../glib-2.84.0/gobject/gclosure.c":564:114 discrim 3 96 {*movsi_internal}
       (nil))
  (insn 72 69 73 9 (parallel [
              (set (reg:HI 161 [ _62 ])
                  (and:HI (subreg:HI (reg/v:SI 127 [ old_int ]) 0)
                      (const_int 32767 [0x7fff])))
              (clobber (reg:CC 17 flags))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:181 discrim 3 720 {*andhi_1}
       (expr_list:REG_EQUAL (and:HI (mem/c:HI (plus:SI (reg/f:SI 19 frame)
                      (const_int -16 [0xfffffffffffffff0])) [7 MEM[(union *)_14]+0 S2 A32])
              (const_int 32767 [0x7fff]))
          (nil)))
  (insn 73 72 75 9 (parallel [
              (set (reg:HI 130 [ _64 ])
                  (plus:HI (reg:HI 161 [ _62 ])
                      (const_int 1 [0x1])))
              (clobber (reg:CC 17 flags))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 298 {*addhi_1}
       (expr_list:REG_DEAD (reg:HI 161 [ _62 ])
          (expr_list:REG_UNUSED (reg:CC 17 flags)
              (nil))))
  (insn 75 73 77 9 (parallel [
              (set (reg:HI 164)
                  (and:HI (reg:HI 130 [ _64 ])
                      (const_int 32767 [0x7fff])))
              (clobber (reg:CC 17 flags))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 720 {*andhi_1}
       (expr_list:REG_UNUSED (reg:CC 17 flags)
          (nil)))
  (insn 77 75 78 9 (parallel [
              (set (reg:HI 166)
                  (and:HI (subreg:HI (reg/v:SI 127 [ old_int ]) 0)
                      (const_int -32768 [0xffffffffffff8000])))
              (clobber (reg:CC 17 flags))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 720 {*andhi_1}
       (nil))
  (insn 78 77 79 9 (parallel [
              (set (reg:HI 167)
                  (ior:HI (reg:HI 166)
                      (reg:HI 164)))
              (clobber (reg:CC 17 flags))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 824 {*iorhi_1}
       (expr_list:REG_DEAD (reg:HI 166)
          (expr_list:REG_DEAD (reg:HI 164)
              (expr_list:REG_UNUSED (reg:CC 17 flags)
                  (nil)))))
  (insn 79 78 80 9 (set (mem/j/c:HI (plus:SI (reg/f:SI 19 frame)
                  (const_int -16 [0xfffffffffffffff0])) [7 MEM[(union *)_14].closure.ref_count+0 S2 A32])
          (reg:HI 167)) "../glib-2.84.0/gobject/gclosure.c":564:192 discrim 3 97 {*movhi_internal}
       (expr_list:REG_DEAD (reg:HI 167)
          (nil)))
  (insn 80 79 81 9 (set (reg:SI 170 [ MEM[(union *)_14].vint ])
          (mem/j/c:SI (plus:SI (reg/f:SI 19 frame)
                  (const_int -16 [0xfffffffffffffff0])) [7 MEM[(union *)_14].vint+0 S4 A32])) "../glib-2.84.0/gobject/gclosure.c":564:472 discrim 3 96 {*movsi_internal}
       (nil))
  (insn 81 80 82 9 (parallel [
              (set (reg:SI 168)
                  (unspec_volatile:SI [
                          (mem/v:SI (reg/v/f:SI 144 [ closure ]) [-1 S4 A32])
                          (reg/v:SI 127 [ old_int ])
                          (reg:SI 170 [ MEM[(union *)_14].vint ])
                          (const_int 5 [0x5])
                      ] UNSPECV_CMPXCHG))
              (set (mem/v:SI (reg/v/f:SI 144 [ closure ]) [-1 S4 A32])
                  (unspec_volatile:SI [
                          (const_int 0 [0])
                      ] UNSPECV_CMPXCHG))
              (set (reg:CCZ 17 flags)
                  (unspec_volatile:CCZ [
                          (const_int 0 [0])
                      ] UNSPECV_CMPXCHG))
          ]) "../glib-2.84.0/gobject/gclosure.c":564:472 discrim 3 11031 {atomic_compare_and_swapsi_1}
       (expr_list:REG_DEAD (reg:SI 170 [ MEM[(union *)_14].vint ])
          (expr_list:REG_DEAD (reg/v:SI 127 [ old_int ])
              (expr_list:REG_UNUSED (reg:SI 168)
                  (nil)))))

or so. So it really depends on the RA whether it gives pseudo SI 127 a hard register; if it doesn't, I think the RA rightfully just rereads the value from memory. Note that the loop also doesn't use the value from lock cmpxchgl when it needs to loop; normal code does an __atomic_load before the loop, then performs the operations on that value, and for the next iteration uses what __atomic_compare_exchange hands back on failure. So, I suppose it should be something like:

  #define ATOMIC_CHANGE_FIELD(_closure, _field, _OP, _value, _must_set, _SET_OLD, _SET_NEW) \
  G_STMT_START {                                                                           \
    ClosureInt *cunion = (ClosureInt*) _closure;                                           \
    gint new_int, old_int, success;                                                        \
  + old_int = g_atomic_int_get (&cunion->vint);                                            \
    do                                                                                     \
      {                                                                                    \
        ClosureInt tmp;                                                                    \
  -     tmp.vint = old_int = cunion->vint;                                                 \
  +     tmp.vint = old_int;                                                                \
        _SET_OLD tmp.closure._field;                                                       \
        tmp.closure._field _OP _value;                                                     \
        _SET_NEW tmp.closure._field;                                                       \
        new_int = tmp.vint;                                                                \
  -     success = g_atomic_int_compare_and_exchange (&cunion->vint, old_int, new_int);     \
  +     success = g_atomic_int_compare_and_exchange_full (&cunion->vint, old_int, new_int, \
  +                                                       &old_int);                       \
      }                                                                                    \
    while (!success && _must_set);                                                         \
  } G_STMT_END

Completely untested.
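For clarity, the proposed macro change above can be expanded into a self-contained C function (simplified union layout, and GCC's __atomic_compare_exchange_n standing in for g_atomic_int_compare_and_exchange_full, which likewise stores the observed value back into its last argument on failure — hypothetical names, not the actual glib source):

```c
/* Sketch of the patched pattern: one atomic load before the loop,
   and on CAS failure the next iteration reuses the value the
   hardware observed instead of re-reading shared memory. */
typedef union
{
  struct
  {
    unsigned int ref_count : 15;   /* simplified stand-in for GClosure's bitfields */
    unsigned int in_marshal : 1;
  } closure;
  int vint;                        /* the 32-bit word the CAS operates on */
} ClosureInt;

void
closure_ref_inc (ClosureInt *cunion)
{
  /* stand-in for g_atomic_int_get: the only read of shared memory */
  int old_int = __atomic_load_n (&cunion->vint, __ATOMIC_RELAXED);
  int new_int;
  do
    {
      ClosureInt tmp;
      tmp.vint = old_int;          /* no re-read of cunion->vint here */
      tmp.closure.ref_count += 1;  /* wraps modulo 2^15, other fields kept */
      new_int = tmp.vint;
      /* stand-in for g_atomic_int_compare_and_exchange_full: on
         failure, old_int is updated with the value seen in memory */
    }
  while (!__atomic_compare_exchange_n (&cunion->vint, &old_int, new_int,
                                       0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}
```

The design point is the same as in the macro diff: every retry iteration starts from the value the failed compare-exchange reported, so however the RA spills or rereads pseudos, there is only one genuine load of the shared word per successful update attempt.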