"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > scratch register to avoid stack realignment when expanding memset. > > PR middle-end/90773 > * builtins.c (gen_memset_value_from_prev): New function. > (gen_memset_broadcast): Likewise. > (builtin_memset_read_str): Use gen_memset_value_from_prev > and gen_memset_broadcast. > (builtin_memset_gen_str): Likewise. > * target.def (gen_memset_scratch_rtx): New hook. > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > * doc/tm.texi: Regenerated. > --- > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > gcc/doc/tm.texi | 5 ++ > gcc/doc/tm.texi.in | 2 + > gcc/target.def | 7 +++ > 4 files changed, 116 insertions(+), 21 deletions(-) > > diff --git a/gcc/builtins.c b/gcc/builtins.c > index 39ab139b7e1..c1758ae2efc 100644 > --- a/gcc/builtins.c > +++ b/gcc/builtins.c > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > return NULL_RTX; > } > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > - bytes from constant string DATA + OFFSET and return it as target > - constant. If PREV isn't nullptr, it has the RTL info from the > +/* Return the RTL of a register in MODE generated from PREV in the > previous iteration. */ > > -rtx > -builtin_memset_read_str (void *data, void *prevp, > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > - scalar_int_mode mode) > +static rtx > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > { > + rtx target = nullptr; > by_pieces_prev *prev = (by_pieces_prev *) prevp; > if (prev != nullptr && prev->data != nullptr) > { > /* Use the previous data in the same mode. */ > if (prev->mode == mode) > return prev->data; > + > + rtx prev_rtx = prev->data; > + machine_mode prev_mode = prev->mode; > + unsigned int word_size = GET_MODE_SIZE (word_mode); > + if (word_size < GET_MODE_SIZE (prev->mode) > + && word_size > GET_MODE_SIZE (mode)) > + { > + /* First generate subreg of word mode if the previous mode is > + wider than word mode and word mode is wider than MODE. */ > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > + prev_mode, 0); > + prev_mode = word_mode; > + } > + if (prev_rtx != nullptr) > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > } > + return target; > +} > + > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > + > +static rtx > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > +{ > + /* Skip if regno_reg_rtx isn't initialized. */ > + if (!regno_reg_rtx) > + return nullptr; > + > + rtx target = nullptr; > + > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > + machine_mode vector_mode; > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > + gcc_unreachable ();
Sorry, I realise it's a bit late to be raising this objection now, but I don't think it's a good idea to use scalar integer modes as a proxy for vector modes. In principle there's no reason why a target has to define an integer mode for every vector mode. If we want the mode to be a vector then I think the by-pieces infrastructure should be extended to support vectors directly, rather than assuming that each piece can be represented as a scalar_int_mode. Thanks, Richard > + > + enum insn_code icode = optab_handler (vec_duplicate_optab, > + vector_mode); > + if (icode != CODE_FOR_nothing) > + { > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); > + if (CONST_INT_P (data)) > + { > + /* Use the move expander with CONST_VECTOR. */ > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); > + emit_move_insn (reg, const_vec); > + } > + else > + { > + > + class expand_operand ops[2]; > + create_output_operand (&ops[0], reg, vector_mode); > + create_input_operand (&ops[1], data, QImode); > + expand_insn (icode, 2, ops); > + if (!rtx_equal_p (reg, ops[0].value)) > + emit_move_insn (reg, ops[0].value); > + } > + target = lowpart_subreg (mode, reg, vector_mode); > + } > + > + return target; > +} > + > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > + bytes from constant string DATA + OFFSET and return it as target > + constant. If PREV isn't nullptr, it has the RTL info from the > + previous iteration. */ > > +rtx > +builtin_memset_read_str (void *data, void *prev, > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > + scalar_int_mode mode) > +{ > + rtx target; > const char *c = (const char *) data; > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > + char *p; > + > + /* Don't use the previous value if size is 1. */ > + if (GET_MODE_SIZE (mode) != 1) > + { > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > + memset (p, *c, GET_MODE_SIZE (QImode)); > + rtx src = c_readstr (p, QImode); > + target = gen_memset_broadcast (src, mode); > + if (target != nullptr) > + return target; > + } > + > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > memset (p, *c, GET_MODE_SIZE (mode)); > > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp, > nullptr, it has the RTL info from the previous iteration. */ > > static rtx > -builtin_memset_gen_str (void *data, void *prevp, > +builtin_memset_gen_str (void *data, void *prev, > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > scalar_int_mode mode) > { > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp, > size_t size; > char *p; > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > - if (prev != nullptr && prev->data != nullptr) > - { > - /* Use the previous data in the same mode. */ > - if (prev->mode == mode) > - return prev->data; > - > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > - if (target != nullptr) > - return target; > - } > - > size = GET_MODE_SIZE (mode); > if (size == 1) > return (rtx) data; > > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + target = gen_memset_broadcast ((rtx) data, mode); > + if (target != nullptr) > + return target; > + > p = XALLOCAVEC (char, size); > memset (p, 1, size); > coeff = c_readstr (p, mode); > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > index 2a41ae5fba1..8849711fcf8 100644 > --- a/gcc/doc/tm.texi > +++ b/gcc/doc/tm.texi > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional > comparison within a sequence > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. > @end deftypefn > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode > @var{mode}) > +This hook should return an rtx for scratch register in @var{mode} to > +be used by memset broadcast. The default is @code{gen_reg_rtx}. > +@end deftypefn > + > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned > @var{nunroll}, class loop *@var{loop}) > This target hook returns a new value for the number of times @var{loop} > should be unrolled. The parameter @var{nunroll} is the number of times > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > index f881cdabe9e..a6bbf4f2667 100644 > --- a/gcc/doc/tm.texi.in > +++ b/gcc/doc/tm.texi.in > @@ -7958,6 +7958,8 @@ lists. > > @hook TARGET_GEN_CCMP_NEXT > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > + > @hook TARGET_LOOP_UNROLL_ADJUST > > @defmac POWI_MAX_MULTS > diff --git a/gcc/target.def b/gcc/target.def > index c009671c583..5fb287db3bd 100644 > --- a/gcc/target.def > +++ b/gcc/target.def > @@ -2726,6 +2726,13 @@ DEFHOOK > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree > op0, tree op1, int bit_code), > NULL) > > +DEFHOOK > +(gen_memset_scratch_rtx, > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > + rtx, (machine_mode mode), > + gen_reg_rtx) > + > /* Return a new value for loop unroll size. */ > DEFHOOK > (loop_unroll_adjust,