On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka <hubi...@ucw.cz> wrote: ... > We originally put CLEAR_RATIO < MOVE_RATIO based on the observation that > mov $0, mem > is longer in encoding than > mov mem, mem > and there was a plan to implement an optimization to avoid long immediates > in moves, but it did not materialize (yet). With SSE this problem > disappears since SSE stores do not have immediates anyway.
Here is a patch to implement it with UNSPEC_STORE_BY_PIECES. How does it look? -- H.J.
From c021053a4fea121a3c4a593b2907701c42a626bc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Mon, 21 Apr 2025 21:12:35 +0800 Subject: [PATCH] RFC: Add TARGET_STORE_BY_PIECES_ICODE Add a target hook to control the instruction to move the memory used by the store by_pieces infrastructure so that a target can choose a specific instruction for shorter encoding. Signed-off-by: H.J. Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386.cc | 27 +++++++++++++++++++++++++++ gcc/config/i386/i386.md | 24 ++++++++++++++++++++++++ gcc/config/i386/x86-tune.def | 6 ++++++ gcc/doc/tm.texi | 5 +++++ gcc/doc/tm.texi.in | 2 ++ gcc/expr.cc | 2 +- gcc/target.def | 7 +++++++ gcc/targhooks.cc | 9 +++++++++ gcc/targhooks.h | 1 + 9 files changed, 82 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 28603c2943e..8d289ad1a53 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26549,6 +26549,31 @@ ix86_redzone_clobber () return NULL_RTX; } +/* Implement TARGET_STORE_BY_PIECES_ICODE. */ + +static insn_code +ix86_store_by_pieces_icode (machine_mode mode) +{ + if (STORE_MAX_PIECES == UNITS_PER_WORD + && ix86_tune_features [X86_TUNE_USE_REGISTER_STORE_BY_PIECES]) + switch (mode) + { + case SImode: + /* Allow 32-bit immediate in 64-bit mode since it will be + used at most twice. */ + if (TARGET_64BIT) + break; + return CODE_FOR_store_by_pieces_movsi; + case DImode: + if (TARGET_64BIT) + return CODE_FOR_store_by_pieces_movdi; + default: + break; + } + + return default_store_by_pieces_icode (mode); +} + /* Target-specific selftests. 
*/ #if CHECKING_P @@ -26994,6 +27019,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true +#undef TARGET_STORE_BY_PIECES_ICODE +#define TARGET_STORE_BY_PIECES_ICODE ix86_store_by_pieces_icode #undef TARGET_FLAGS_REGNUM #define TARGET_FLAGS_REGNUM FLAGS_REG diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index d6b2f2959b2..77761ab5fc3 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -118,6 +118,7 @@ (define_c_enum "unspec" [ UNSPEC_POPFL UNSPEC_OPTCOMX UNSPEC_SETCC_SI_SLP + UNSPEC_STORE_BY_PIECES ;; For SSE/MMX support: UNSPEC_FIX_NOTRUNC @@ -2417,6 +2418,29 @@ (define_expand "mov<mode>" "" "ix86_expand_move (<MODE>mode, operands); DONE;") +;; SI/DI mode register stores used by store by_pieces for shorter +;; encoding. +(define_expand "store_by_pieces_mov<mode>" + [(set (match_operand:SWI48x 0 "memory_operand") + (match_operand:SWI48x 1 "general_operand"))] + "" +{ + operands[1] = force_reg (<MODE>mode, operands[1]); + emit_insn (gen_store_by_pieces_mov<mode>_1 (operands[0], + operands[1])); + DONE; +}) + +(define_insn "store_by_pieces_mov<mode>_1" + [(set (match_operand:SWI48x 0 "memory_operand" "=m") + (unspec:SWI48x + [(match_operand:SWI48x 1 "register_operand" "r")] + UNSPEC_STORE_BY_PIECES))] + "" + "mov\t{%1, %0|%0, %1}" + [(set_attr "type" "imov") + (set_attr "mode" "<MODE>")]) + (define_insn "*mov<mode>_xor" [(set (match_operand:SWI48 0 "register_operand" "=r") (match_operand:SWI48 1 "const0_operand")) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index c3635c71d06..00a1638ad61 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -636,6 +636,12 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces", DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues", m_ZNVER4 | m_ZNVER5) +/* X86_TUNE_USE_REGISTER_STORE_BY_PIECES: Generate 
store_by_pieces with + register store. */ +DEF_TUNE (X86_TUNE_USE_REGISTER_STORE_BY_PIECES, + "use_register_store_by_pieces", + 0) + /*****************************************************************************/ /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index a96700c0d38..9753ebcf9c2 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -7186,6 +7186,11 @@ particular mode from being used for block comparisons by returning a negative number from this hook. @end deftypefn +@deftypefn {Target Hook} insn_code TARGET_STORE_BY_PIECES_ICODE (machine_mode @var{mode}) +This target hook returns insn_code to move the @var{mode} memory used +by the store @code{by_pieces} infrastructure. +@end deftypefn + @defmac MOVE_MAX_PIECES A C expression used by @code{move_by_pieces} to determine the largest unit a load or store used to copy memory is. Defaults to @code{MOVE_MAX}. diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index eccc4d88493..e8cd831ad32 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4639,6 +4639,8 @@ If you don't define this, a reasonable default is used. @hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO +@hook TARGET_STORE_BY_PIECES_ICODE + @defmac MOVE_MAX_PIECES A C expression used by @code{move_by_pieces} to determine the largest unit a load or store used to copy memory is. Defaults to @code{MOVE_MAX}. 
diff --git a/gcc/expr.cc b/gcc/expr.cc index 3815c565e2d..caed7c4dcf7 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -1714,7 +1714,7 @@ class store_by_pieces_d : public op_by_pieces_d bool store_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align) { - insn_code icode = optab_handler (mov_optab, mode); + insn_code icode = targetm.store_by_pieces_icode (mode); m_gen_fun = GEN_FCN (icode); return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode); } diff --git a/gcc/target.def b/gcc/target.def index 6c7cdc8126b..63157dbcde2 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -3909,6 +3909,13 @@ negative number from this hook.", int, (machine_mode mode), default_compare_by_pieces_branch_ratio) +DEFHOOK +(store_by_pieces_icode, + "This target hook returns insn_code to move the @var{mode} memory used\n\ +by the store @code{by_pieces} infrastructure.", + insn_code, (machine_mode mode), + default_store_by_pieces_icode) + DEFHOOK (slow_unaligned_access, "This hook returns true if memory accesses described by the\n\ diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index c79458e374e..0233f331f53 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -2196,6 +2196,15 @@ default_compare_by_pieces_branch_ratio (machine_mode) return 1; } +/* This target hook returns insn_code to move the MODE memory used by the + store by_pieces infrastructure. */ + +insn_code +default_store_by_pieces_icode (machine_mode mode) +{ + return optab_handler (mov_optab, mode); +} + /* Write PATCH_AREA_SIZE NOPs into the asm outfile FILE around a function entry. 
If RECORD_P is true and the target supports named sections, the location of the NOPs will be recorded in a special object section diff --git a/gcc/targhooks.h b/gcc/targhooks.h index f16b58798c2..fb38fc9aec4 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -249,6 +249,7 @@ extern bool default_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT, enum by_pieces_operation, bool); extern int default_compare_by_pieces_branch_ratio (machine_mode); +extern insn_code default_store_by_pieces_icode (machine_mode); extern void default_print_patchable_function_entry (FILE *, unsigned HOST_WIDE_INT, -- 2.49.0