Hi! To again state this in public:
On 2024-03-22T15:54:49+0000, Andrew Stubbs <a...@baylibre.com> wrote: > The RDNA devices have different cache architectures to the CDNA devices, and > the differences go deeper than just the assembler mnemonics, so we > probably need to generate different code to maintain coherency across > the whole device. > > I believe this patch is correct according to the documentation in the LLVM > AMDGPU user guide (the ISA manual is less instructive), but I hadn't observed > any real problems before (or after). > > Committed to mainline. Thanks! This commit does repair a lot of the GCN offloading damage noted in <https://inbox.sourceware.org/878r44l00i....@euler.schwinge.ddns.net> "libgomp GCN gfx1030/gfx1100 offloading status" and thereabouts, that is, this recovers to PASS a lot of twinkling libgomp/OpenMP/GCN execution test cases, and their even more annyoing random timeouts. (The commit doesn't affect GCN target testing.) I still have a number of stabilization hacks applied to my sources -- but I've, for example, not seen any 'HSA_STATUS_ERROR_OUT_OF_RESOURCES' or random timeouts in my current GCN offloading test results. Grüße Thomas > gcc/ChangeLog: > > * config/gcn/gcn.md (*memory_barrier): Split into RDNA and !RDNA. > (atomic_load<mode>): Adjust RDNA cache settings. > (atomic_store<mode>): Likewise. > (atomic_exchange<mode>): Likewise. > --- > gcc/config/gcn/gcn.md | 86 +++++++++++++++++++++++++++---------------- > 1 file changed, 55 insertions(+), 31 deletions(-) > > diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md > index 3b51453aaca..574c2f87e8c 100644 > --- a/gcc/config/gcn/gcn.md > +++ b/gcc/config/gcn/gcn.md > @@ -1960,11 +1960,19 @@ > (define_insn "*memory_barrier" > [(set (match_operand:BLK 0) > (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))] > - "" > - "{buffer_wbinvl1_vol|buffer_gl0_inv}" > + "!TARGET_RDNA2_PLUS" > + "buffer_wbinvl1_vol" > [(set_attr "type" "mubuf") > (set_attr "length" "4")]) > > +(define_insn "*memory_barrier" > + [(set (match_operand:BLK 0) > + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))] > + "TARGET_RDNA2_PLUS" > + "buffer_gl1_inv\;buffer_gl0_inv" > + [(set_attr "type" "mult") > + (set_attr "length" "8")]) > + > ; FIXME: These patterns have been disabled as they do not seem to work > ; reliably - they can cause hangs or incorrect results. > ; TODO: flush caches according to memory model > @@ -2094,9 +2102,13 @@ > case 0: > return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)"; > case 1: > - return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0"; > + return (TARGET_RDNA2 /* Not GFX11. */ > + ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0" > + : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0"); > case 2: > - return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)"; > + return (TARGET_RDNA2 /* Not GFX11. */ > + ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)" > + : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)"); > } > break; > case MEMMODEL_CONSUME: > @@ -2108,15 +2120,21 @@ > return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;" > "s_dcache_wb_vol"; > case 1: > - return (TARGET_RDNA2_PLUS > + return (TARGET_RDNA2 > + ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0\;" > + "buffer_gl1_inv\;buffer_gl0_inv" > + : TARGET_RDNA3 > ? "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;" > - "buffer_gl0_inv" > + "buffer_gl1_inv\;buffer_gl0_inv" > : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;" > "buffer_wbinvl1_vol"); > case 2: > - return (TARGET_RDNA2_PLUS > + return (TARGET_RDNA2 > + ? "global_load%o0\t%0, %A1%O1 glc > dlc\;s_waitcnt\tvmcnt(0)\;" > + "buffer_gl1_inv\;buffer_gl0_inv" > + : TARGET_RDNA3 > ? "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;" > - "buffer_gl0_inv" > + "buffer_gl1_inv\;buffer_gl0_inv" > : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;" > "buffer_wbinvl1_vol"); > } > @@ -2130,15 +2148,21 @@ > return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;" > "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; > case 1: > - return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;" > - "s_waitcnt\t0\;buffer_gl0_inv" > + return (TARGET_RDNA2 > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 > glc dlc\;" > + "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" > + : TARGET_RDNA3 > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 > glc\;" > + "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;" > "s_waitcnt\t0\;buffer_wbinvl1_vol"); > case 2: > - return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;" > - "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv" > + return (TARGET_RDNA2 > + ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, > %A1%O1 glc dlc\;" > + "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" > + : TARGET_RDNA3 > + ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, > %A1%O1 glc\;" > + "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;" > "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); > } > @@ -2147,7 +2171,7 @@ > gcc_unreachable (); > } > [(set_attr "type" "smem,flat,flat") > - (set_attr "length" "20") > + (set_attr "length" "28") > (set_attr "gcn_version" "gcn5,*,gcn5") > (set_attr "rdna" "no,*,*")]) > > @@ -2180,11 +2204,11 @@ > return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc"; > case 1: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, > %1%O0 glc" > : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc"); > case 2: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, > %1%O0 glc" > : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc"); > } > break; > @@ -2198,14 +2222,14 @@ > "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; > case 1: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;" > - "s_waitcnt\t0\;buffer_gl0_inv" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, > %1%O0 glc\;" > + "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;" > "s_waitcnt\t0\;buffer_wbinvl1_vol"); > case 2: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;" > - "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, > %1%O0 glc\;" > + "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;" > "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); > } > @@ -2214,7 +2238,7 @@ > gcc_unreachable (); > } > [(set_attr "type" "smem,flat,flat") > - (set_attr "length" "20") > + (set_attr "length" "28") > (set_attr "gcn_version" "gcn5,*,gcn5") > (set_attr "rdna" "no,*,*")]) > > @@ -2253,13 +2277,13 @@ > case 1: > return (TARGET_RDNA2_PLUS > ? "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;" > - "buffer_gl0_inv" > + "buffer_gl1_inv\;buffer_gl0_inv" > : "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;" > "buffer_wbinvl1_vol"); > case 2: > return (TARGET_RDNA2_PLUS > ? "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;" > - "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv" > + "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" > : "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;" > "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); > } > @@ -2273,13 +2297,13 @@ > "s_waitcnt\tlgkmcnt(0)"; > case 1: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, > %1, %2 glc\;" > "s_waitcnt\t0" > : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 > glc\;" > "s_waitcnt\t0"); > case 2: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;" > "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;" > "s_waitcnt\tvmcnt(0)" > : "buffer_wbinvl1_vol\;" > @@ -2297,15 +2321,15 @@ > "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; > case 1: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;" > - "s_waitcnt\t0\;buffer_gl0_inv" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, > %1, %2 glc\;" > + "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 > glc\;" > "s_waitcnt\t0\;buffer_wbinvl1_vol"); > case 2: > return (TARGET_RDNA2_PLUS > - ? "buffer_gl0_inv\;" > + ? "buffer_gl1_inv\;buffer_gl0_inv\;" > "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;" > - "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv" > + "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" > : "buffer_wbinvl1_vol\;" > "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;" > "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); > @@ -2315,7 +2339,7 @@ > gcc_unreachable (); > } > [(set_attr "type" "smem,flat,flat") > - (set_attr "length" "20") > + (set_attr "length" "28") > (set_attr "gcn_version" "gcn5,*,gcn5") > (set_attr "rdna" "no,*,*")]) > > -- > 2.41.0