On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu <crazy...@gmail.com> wrote: > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > > _tile_loadconfig implemented as > > > > > > extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_loadconfig (const void *__config) > > > { > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > > > } > > > > > > GCC sees: > > > > > > (parallel [ > > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > > > [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > > (const_int -64 [0xffffffffffffffc0])) [1 > > > MEM[(const void * *)&tile_data]+0 S8 A128])] > > > [(asm_input:DI ("m"))] > > > (clobber (reg:CC 17 flags))]) > > > > > > and the memory operand size is 1 byte. As a result, the remaining 511 > > > bytes are ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > > > gcc/ChangeLog: > > > > > > PR target/114098 > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > > __builtin_ia32_ldtilecfg. > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > > * config/i386/i386-builtin.def (BDESC): Add > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > > * config/i386/i386.md (ldtilecfg): New pattern. > > > (sttilecfg): Likewise. > > > > > > gcc/testsuite/ChangeLog: > > > > > > PR target/114098 > > > * gcc.target/i386/amxtile-4.c: New test. 
> > > --- > > > gcc/config/i386/amxtileintrin.h | 4 +- > > > gcc/config/i386/i386-builtin.def | 4 ++ > > > gcc/config/i386/i386-expand.cc | 19 ++++++++ > > > gcc/config/i386/i386.md | 24 ++++++++++ > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++++++++++++++++++++++ > > > 5 files changed, 104 insertions(+), 2 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > > b/gcc/config/i386/amxtileintrin.h > > > index d1a26e0fea5..5081b326498 100644 > > > --- a/gcc/config/i386/amxtileintrin.h > > > +++ b/gcc/config/i386/amxtileintrin.h > > > @@ -39,14 +39,14 @@ extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_loadconfig (const void *__config) > > > { > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > **)__config))); > > > + __builtin_ia32_ldtilecfg (__config); > > > } > > > > > > extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_storeconfig (void *__config) > > > { > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > > > + __builtin_ia32_sttilecfg (__config); > > > } > > > > > > extern __inline void > > > diff --git a/gcc/config/i386/i386-builtin.def > > > b/gcc/config/i386/i386-builtin.def > > > index 729355230b8..88dd7f8857f 100644 > > > --- a/gcc/config/i386/i386-builtin.def > > > +++ b/gcc/config/i386/i386-builtin.def > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | > > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > > +/* LDTILECFG and 
STTILECFG. */ > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, > > > UNKNOWN, (int) VOID_FTYPE_PCVOID) > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, > > > UNKNOWN, (int) VOID_FTYPE_PVOID) > > CODE_FOR_sttilecfg. > > It is unused. I changed both to CODE_FOR_nothing. > > > > + > > > /* SSE */ > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, > > > "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) > > > VOID_FTYPE_PFLOAT_V4SF) > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, > > > "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) > > > VOID_FTYPE_PFLOAT_V4SF) > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc > > > index a4d3369f01b..17993eb837f 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx > > > subtarget, > > > emit_insn (pat); > > > return 0; > > > > > > + case IX86_BUILTIN_LDTILECFG: > > > + case IX86_BUILTIN_STTILECFG: > > > + arg0 = CALL_EXPR_ARG (exp, 0); > > > + op0 = expand_normal (arg0); > > > + > > > + if (!address_operand (op0, VOIDmode)) > > > + { > > > + op0 = convert_memory_address (Pmode, op0); > > > + op0 = copy_addr_to_reg (op0); > > > + } > > > + op0 = gen_rtx_MEM (BLKmode, op0); > > maybe we can just use XImode, and adjust the patterns with XI. > > Changed. 
> > > > + if (fcode == IX86_BUILTIN_LDTILECFG) > > > + icode = CODE_FOR_ldtilecfg; > > > + else > > > + icode = CODE_FOR_sttilecfg; > > > + pat = GEN_FCN (icode) (op0); > > > + emit_insn (pat); > > > + return 0; > > > + > > > case IX86_BUILTIN_LLWPCB: > > > arg0 = CALL_EXPR_ARG (exp, 0); > > > op0 = expand_normal (arg0); > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > > > index 6a26d966a0e..0ede6adac2f 100644 > > > --- a/gcc/config/i386/i386.md > > > +++ b/gcc/config/i386/i386.md > > > @@ -353,6 +353,10 @@ (define_c_enum "unspecv" [ > > > ;; For USER_MSR support > > > UNSPECV_URDMSR > > > UNSPECV_UWRMSR > > > + > > > + ;; For AMX-TILE > > > + UNSPECV_LDTILECFG > > > + UNSPECV_STTILECFG > > > ]) > > > > > > ;; Constants to represent rounding modes in the ROUND instruction > > > @@ -28152,6 +28156,26 @@ (define_insn "uwrmsr" > > > [(set_attr "prefix" "vex") > > > (set_attr "type" "other")]) > > > > > > + > > > +(define_insn "ldtilecfg" > > > + [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "jm")] > > ldtilecfg/sttilecfg can be extended with gpr32, so just use m instead of jm. > > > + UNSPECV_LDTILECFG)] > > > + "TARGET_AMX_TILE" > > > + "ldtilecfg\t%0" > > > > > + [(set_attr "type" "other") > > > + (set_attr "addr" "gpr16") > > Remove this. > > Done. > > > > + (set_attr "prefix" "vex") > > Possibly better with maybe_evex. > > Done. 
> > > > + (set_attr "memory" "load")]) > > > + > > > +(define_insn "sttilecfg" > > > + [(set (match_operand:BLK 0 "memory_operand" "=jm") > > > + (unspec_volatile:BLK [(const_int 0)] UNSPECV_STTILECFG))] > > > + "TARGET_AMX_TILE" > > > + "sttilecfg\t%0" > > > + [(set_attr "type" "other") > > > + (set_attr "addr" "gpr16") > > > + (set_attr "prefix" "vex") > > > + (set_attr "memory" "store")]) > > > (include "mmx.md") > > > (include "sse.md") > > > (include "sync.md") > > > diff --git a/gcc/testsuite/gcc.target/i386/amxtile-4.c > > > b/gcc/testsuite/gcc.target/i386/amxtile-4.c > > > new file mode 100644 > > > index 00000000000..1255af2594e > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/amxtile-4.c > > > @@ -0,0 +1,55 @@ > > > +/* PR target/114098 */ > > > +/* { dg-do compile { target { ! ia32 } } } */ > > > +/* { dg-options "-O2 -mamx-tile" } */ > > > + > > > +#include <stdint.h> > > > +#include <x86intrin.h> > > > + > > > +#define MAX_ROWS 16 > > > +#define MAX_COLS 64 > > > +#define MAX 1024 > > > +#define STRIDE 64 > > > + > > > +typedef struct __tile_config > > > +{ > > > + uint8_t palette_id; > > > + uint8_t start_row; > > > + uint8_t reserved_0[14]; > > > + uint16_t colsb[16]; > > > + uint8_t rows[16]; > > > +} __tilecfg; > > > + > > > + > > > +extern void bar (__tilecfg *tileinfo); > > > + > > > +/* Initialize tile config */ > > > +static void > > > +init_tile_config (__tilecfg *tileinfo) > > > +{ > > > + int i; > > > + tileinfo->palette_id = 1; > > > + tileinfo->start_row = 0; > > > + > > > + for (i = 0; i < 1; ++i) > > > + { > > > + tileinfo->colsb[i] = MAX_ROWS; > > > + tileinfo->rows[i] = MAX_ROWS; > > > + } > > > + > > > + for (i = 1; i < 4; ++i) > > > + { > > > + tileinfo->colsb[i] = MAX_COLS; > > > + tileinfo->rows[i] = MAX_ROWS; > > > + } > > > + > > > + _tile_loadconfig (tileinfo); > > > +} > > > + > > > +void > > > +enable_amx (void) > > > +{ > > > + __tilecfg tile_data = {0}; > > > + init_tile_config (&tile_data); > > > +} > > > + > > 
> +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */ > > > -- > > > 2.43.2 > > > > > > > I am testing this patch now. Ok if it passes the regression test. > > Thanks. > > -- > H.J.
-- BR, Hongtao