On Mon, Apr 21, 2025 at 4:30 PM H.J. Lu <[email protected]> wrote:
>
> On Mon, Apr 21, 2025 at 11:29 AM Hongtao Liu <[email protected]> wrote:
> >
> > On Sat, Apr 19, 2025 at 1:25 PM H.J. Lu <[email protected]> wrote:
> > >
> > > On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu <[email protected]> wrote:
> > > >
> > > > For all different modes of all 0s/1s vectors, we can use the single
> > > > widest
> > > > all 0s/1s vector register for all 0s/1s vector uses in the whole
> > > > function.
> > > > Add a pass to generate a single widest all 0s/1s vector set instruction
> > > > at
> > > > entry of the nearest common dominator for basic blocks with all 0s/1s
> > > > vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number
> > > > of vector xor instructions from 4803 to 4714 and pcmpeq instructions
> > > > from
> > > > 144 to 142.
> > > >
> > > > This change causes a regression:
> > > >
> > > > FAIL: gcc.dg/rtl/x86_64/vector_eq.c
> > > >
> > > > without the fix for
> > > >
> > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863
> > > >
> > > > NB: PR target/92080 and PR target/117839 aren't same. PR target/117839
> > > > is for vectors of all 0s and all 1s with different sizes and different
> > > > components. PR target/92080 is for broadcast of the same component to
> > > > different vector sizes. This patch covers only all 0s and all 1s cases
> > > > of PR target/92080.
> > > >
> > > > gcc/
> > > >
> > > > PR target/92080
> > > > PR target/117839
> > > > * config/i386/i386-features.cc (ix86_rrvl_gate): New.
> > > > (ix86_place_single_vector_set): Likewise.
> > > > (ix86_get_vector_load_mode): Likewise.
> > > > (remove_redundant_vector_load): Likewise.
> > > > (pass_data_remove_redundant_vector_load): Likewise.
> > > > (pass_remove_redundant_vector_load): Likewise.
> > > > (make_pass_remove_redundant_vector_load): Likewise.
> > > > * config/i386/i386-passes.def: Add
> > > > pass_remove_redundant_vector_load after
> > > > pass_remove_partial_avx_dependency.
> > > > * config/i386/i386-protos.h
> > > > (make_pass_remove_redundant_vector_load): New.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > PR target/92080
> > > > PR target/117839
> > > > * gcc.target/i386/pr117839-1a.c: New test.
> > > > * gcc.target/i386/pr117839-1b.c: Likewise.
> > > > * gcc.target/i386/pr117839-2.c: Likewise.
> > > > * gcc.target/i386/pr92080-1.c: Likewise.
> > > > * gcc.target/i386/pr92080-2.c: Likewise.
> > > > * gcc.target/i386/pr92080-3.c: Likewise.
> > > >
> > > > Signed-off-by: H.J. Lu <[email protected]>
> > > > ---
> > > > gcc/config/i386/i386-features.cc | 308 ++++++++++++++++++++
> > > > gcc/config/i386/i386-passes.def | 1 +
> > > > gcc/config/i386/i386-protos.h | 2 +
> > > > gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++
> > > > gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 +
> > > > gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++
> > > > gcc/testsuite/gcc.target/i386/pr92080-1.c | 54 ++++
> > > > gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 ++++
> > > > gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 +++
> > > > 9 files changed, 552 insertions(+)
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
> > > >
> > > > diff --git a/gcc/config/i386/i386-features.cc
> > > > b/gcc/config/i386/i386-features.cc
> > > > index 003b003e09c..7d8d260750d 100644
> > > > --- a/gcc/config/i386/i386-features.cc
> > > > +++ b/gcc/config/i386/i386-features.cc
> > > > @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency
> > > > (gcc::context *ctxt)
> > > > return new pass_remove_partial_avx_dependency (ctxt);
> > > > }
> > > >
> > > > +static bool
> > > > +ix86_rrvl_gate ()
> > > > +{
> > > > + return (TARGET_SSE2
> > > > + && optimize
> > > > + && optimize_function_for_speed_p (cfun));
> > > > +}
> > > > +
> > > > +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
> > > > + for basic block map BBS, which is in the fake loop that contains the
> > > > + whole function, so that there is only a single vector set in the
> > > > + whole function. */
> > > > +
> > > > +static void
> > > > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > > > +{
> > > > + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
> > > > bbs);
> > > > + while (bb->loop_father->latch
> > > > + != EXIT_BLOCK_PTR_FOR_FN (cfun))
> > > > + bb = get_immediate_dominator (CDI_DOMINATORS,
> > > > + bb->loop_father->header);
> > > > +
> > > > + rtx set = gen_rtx_SET (dest, src);
> > > > +
> > > > + rtx_insn *insn = BB_HEAD (bb);
> > > > + while (insn && !NONDEBUG_INSN_P (insn))
> > > > + {
> > > > + if (insn == BB_END (bb))
> > > > + {
> > > > + insn = NULL;
> > > > + break;
> > > > + }
> > > > + insn = NEXT_INSN (insn);
> > > > + }
> > > > +
> > > > + rtx_insn *set_insn;
> > > > + if (insn == BB_HEAD (bb))
> > > > + set_insn = emit_insn_before (set, insn);
> > > > + else
> > > > + set_insn = emit_insn_after (set,
> > > > + insn ? PREV_INSN (insn) : BB_END (bb));
> > > > + df_insn_rescan (set_insn);
> > > > +}
> > > > +
> > > > +/* Return a machine mode suitable for vector SIZE. */
> > > > +
> > > > +static machine_mode
> > > > +ix86_get_vector_load_mode (unsigned int size)
> > > > +{
> > > > + machine_mode mode;
> > > > + if (size == 64)
> > > > + mode = V64QImode;
> > > > + else if (size == 32)
> > > > + mode = V32QImode;
> > > > + else
> > > > + mode = V16QImode;
> > > > + return mode;
> > > > +}
> > > > +
> > > > +/* At entry of the nearest common dominator for basic blocks with
> > > > vector
> > > > + CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
> > > > + vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
> > > > + uses.
> > > > +
> > > > + NB: We want to generate only a single widest vector set to cover the
> > > > + whole function. The LCM algorithm isn't appropriate here since it
> > > > + may place a vector set inside the loop. */
> > > > +
> > > > +static unsigned int
> > > > +remove_redundant_vector_load (void)
> > > > +{
> > > > + timevar_push (TV_MACH_DEP);
> > > > +
> > > > + bitmap_obstack_initialize (NULL);
> > > > + bitmap zero_bbs = BITMAP_ALLOC (NULL);
> > > > + bitmap m1_bbs = BITMAP_ALLOC (NULL);
> > > > + bitmap vector_insns = BITMAP_ALLOC (NULL);
> > Use auto_bitmap?
>
> Will fix it.
>
> > > > +
> > > > + basic_block bb;
> > > > + rtx_insn *insn;
> > > > + rtx set;
> > > > + unsigned HOST_WIDE_INT zero_count = 0;
> > > > + unsigned HOST_WIDE_INT m1_count = 0;
> > > > + unsigned int zero_size = 0;
> > > > + unsigned int m1_size = 0;
> > > > +
> > > > + df_set_flags (DF_DEFER_INSN_RESCAN);
> > > > +
> > > > + FOR_EACH_BB_FN (bb, cfun)
> > > > + {
> > > > + FOR_BB_INSNS (bb, insn)
> > > > + {
> > > > + if (!NONDEBUG_INSN_P (insn))
> > > > + continue;
> > > > +
> > > > + set = single_set (insn);
> > > > + if (!set)
> > > > + continue;
> > > > +
> > > > + rtx dest = SET_DEST (set);
> > > > + machine_mode mode = GET_MODE (dest);
> > > > + /* Skip non-vector instruction. */
> > > > + if (!VECTOR_MODE_P (mode))
> > > > + continue;
> > > > +
> > > > + rtx src = SET_SRC (set);
> > > > + if (!REG_P (dest)
> > > > + || (src != CONST0_RTX (mode)
> > > > + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > > > + && src == CONSTM1_RTX (mode))))
> > > > + {
> > > > + /* Record non-CONST0_RTX/CONSTM1_RTX vector instruction.
> > > > */
> > vector_insns only records a single_set, but not all vector_insns which
> > could use constm1/zero.
>
> Will fix it.
>
> > > > + bitmap_set_bit (vector_insns, INSN_UID (insn));
> > > > + continue;
> > > > + }
> > > > +
> > > > + if (src == CONST0_RTX (mode))
> > > > + {
> > > > + /* Record the maximum vector size. */
> > > > + if (zero_size < GET_MODE_SIZE (mode))
> > > > + zero_size = GET_MODE_SIZE (mode);
> > > > +
> > > > + /* Record the basic block with CONST0_RTX. */
> > > > + bitmap_set_bit (zero_bbs, bb->index);
> > > > + zero_count++;
> > > > + }
> > > > + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > > > + && src == CONSTM1_RTX (mode))
> > > > + {
> > > > + /* Record the maximum vector size. */
> > > > + if (m1_size < GET_MODE_SIZE (mode))
> > > > + m1_size = GET_MODE_SIZE (mode);
> > > > +
> > > > + /* Record the basic block with CONSTM1_RTX. */
> > > > + bitmap_set_bit (m1_bbs, bb->index);
> > > > + m1_count++;
> > > > + }
> > > > + }
> > > > + }
> > > > +
> > > > + if (zero_count > 1 || m1_count > 1)
> > > > + {
> > > > + machine_mode zero_mode, m1_mode;
> > > > + rtx vector_const0, vector_constm1;
> > > > + if (zero_count > 1)
> > > > + {
> > > > + zero_mode = ix86_get_vector_load_mode (zero_size);
> > > > + vector_const0 = gen_reg_rtx (zero_mode);
> > > > + }
> > > > + else
> > > > + {
> > > > + zero_mode = VOIDmode;
> > > > + vector_const0 = nullptr;
> > > > + }
> > > > + if (m1_count > 1)
> > > > + {
> > > > + m1_mode = ix86_get_vector_load_mode (m1_size);
> > > > + vector_constm1 = gen_reg_rtx (m1_mode);
> > > > + }
> > > > + else
> > > > + {
> > > > + m1_mode = VOIDmode;
> > > > + vector_constm1 = nullptr;
> > > > + }
> > > > +
> > > > + bool zero_replaced = false;
> > > > + bool m1_replaced = false;
> > > > +
> > > > + bitmap_iterator bi;
> > > > + unsigned id;
> > > > + EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
> > Could we just record those zero/m1 insn, and replace the src of those
> > insn with subreg (...), I think LRA can eliminate those redundant
> > moves?
>
> This is what my patch does:
But it iterates through vector_insns and uses the def-ref chain to find
those insns. I think we can just record the single_set insns whose src
is constm1/zero, and replace the src in those insns directly.
>
> /* Check the single definition of CONST0_RTX and integer
> CONSTM1_RTX. */
> rtx src = SET_SRC (set);
> rtx replace;
> if (vector_const0 && src == CONST0_RTX (mode))
> {
> /* Replace REG with VECTOR_CONST0. */
> if (SUBREG_P (reg) || mode == zero_mode)
> replace = vector_const0;
> else
> replace = gen_rtx_SUBREG (mode, vector_const0, 0);
> *DF_REF_REAL_LOC (ref) = replace;
> replaced = true;
> zero_replaced = true;
> }
>
> It changed the source to a subreg directly.
>
> > Also we also need to change ix86_modes_tieable_p to make sure those
> > inserted subreg can be handled by LRA and other passes?
>
> ix86_modes_tieable_p is OK:
>
> /* If MODE2 is only appropriate for an SSE register, then tie with
> any other mode acceptable to SSE registers. */
> if (GET_MODE_SIZE (mode2) == 64
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> return (GET_MODE_SIZE (mode1) == 64
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> if (GET_MODE_SIZE (mode2) == 32
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> return (GET_MODE_SIZE (mode1) == 32
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> if (GET_MODE_SIZE (mode2) == 16
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> return (GET_MODE_SIZE (mode1) == 16
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
>
It's OK only when the size of mode1 is equal to the size of mode2.
But in the testcase there are vectors of different sizes (32 bytes and
16 bytes). So it would be better as follows: a mode2 of >= 16 bytes can
only be put into SSE_REGS (except for TImode, but TImode can still be
tied to a <= 16-byte mode1 which can be put into SSE_REGS), so if mode1
can also be put into SSE_REGS, then mode2 ties with mode1.
/* If MODE2 is only appropriate for an SSE register, then tie with
any other mode acceptable to SSE registers. */
- if (GET_MODE_SIZE (mode2) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 16
+ if (GET_MODE_SIZE (mode2) >= 16
+ && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
> > > > + {
> > > > + /* Replace CONST0_RTX and integer CONSTM1_RTX with the single
> > > > + CONST0_RTX and integer CONSTM1_RTX register. */
> > > > + df_ref ref, def;
> > > > + insn = DF_INSN_UID_GET (id)->insn;
> > > > + bool replaced = false;
> > > > +
> > > > + for (ref = DF_INSN_UID_USES (id);
> > > > + ref;
> > > > + ref = DF_REF_NEXT_LOC (ref))
> > > > + {
> > > > + if (DF_REF_TYPE (ref) != DF_REF_REG_USE)
> > > > + continue;
> > > > +
> > > > + /* Skip non-vector register. */
> > > > + rtx reg = DF_REF_REG (ref);
> > > > + if (!VECTOR_MODE_P (GET_MODE (reg)))
> > > > + continue;
> > > > +
> > > > + /* Check the single definition. */
> > > > + def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
> > > > + if (!def || DF_REF_NEXT_REG (def) != nullptr)
> > > > + continue;
> > > > +
> > > > + /* Get the single definition. */
> > > > + rtx_insn *def_insn = DF_REF_INSN (def);
> > > > + set = single_set (def_insn);
> > > > + if (!set)
> > > > + continue;
> > > > +
> > > > + /* Check the single definition of vector constant zero.
> > > > */
> > > > + rtx src = SET_SRC (set);
> > > > + rtx replace;
> > > > + if (vector_const0 && src == CONST0_RTX (GET_MODE (src)))
> > > > + {
> > > > + /* Replace REG with VECTOR_CONST0. */
> > > > + if (SUBREG_P (reg) || GET_MODE (reg) == zero_mode)
> > > > + replace = vector_const0;
> > > > + else
> > > > + replace = gen_rtx_SUBREG (GET_MODE (reg),
> > > > + vector_const0, 0);
> > > > + *DF_REF_REAL_LOC (ref) = replace;
> > > > + replaced = true;
> > > > + zero_replaced = true;
> > > > + }
> > > > + else if (vector_constm1
> > > > + && src == CONSTM1_RTX (GET_MODE (src)))
> > > > + {
> > > > + /* Replace REG with VECTOR_CONSTM1. */
> > > > + if (SUBREG_P (reg) || GET_MODE (reg) == m1_mode)
> > > > + replace = vector_constm1;
> > > > + else
> > > > + replace = gen_rtx_SUBREG (GET_MODE (reg),
> > > > + vector_constm1, 0);
> > > > + *DF_REF_REAL_LOC (ref) = replace;
> > > > + replaced = true;
> > > > + m1_replaced = true;
> > > > + }
> > > > + }
> > > > +
> > > > + if (replaced)
> > > > + df_insn_rescan (insn);
> > > > + }
> > > > +
> > > > + /* (Re-)discover loops so that bb->loop_father can be used in the
> > > > + analysis below. */
> > > > + calculate_dominance_info (CDI_DOMINATORS);
> > > > + loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
> > > > +
> > > > + if (zero_replaced)
> > > > + ix86_place_single_vector_set (vector_const0,
> > > > + CONST0_RTX (zero_mode),
> > > > + zero_bbs);
> > > > +
> > > > + if (m1_replaced)
> > > > + ix86_place_single_vector_set (vector_constm1,
> > > > + CONSTM1_RTX (m1_mode),
> > > > + m1_bbs);
> > > > +
> > > > + loop_optimizer_finalize ();
> > > > +
> > > > + df_process_deferred_rescans ();
> > > > + }
> > > > +
> > > > + df_clear_flags (DF_DEFER_INSN_RESCAN);
> > > > +
> > > > + bitmap_obstack_release (NULL);
> > > > + BITMAP_FREE (zero_bbs);
> > > > + BITMAP_FREE (m1_bbs);
> > > > + BITMAP_FREE (vector_insns);
> > > > +
> > > > + timevar_pop (TV_MACH_DEP);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +namespace {
> > > > +
> > > > +const pass_data pass_data_remove_redundant_vector_load =
> > > > +{
> > > > + RTL_PASS, /* type */
> > > > + "rrvl", /* name */
> > > > + OPTGROUP_NONE, /* optinfo_flags */
> > > > + TV_MACH_DEP, /* tv_id */
> > > > + 0, /* properties_required */
> > > > + 0, /* properties_provided */
> > > > + 0, /* properties_destroyed */
> > > > + 0, /* todo_flags_start */
> > > > + 0, /* todo_flags_finish */
> > > > +};
> > > > +
> > > > +class pass_remove_redundant_vector_load : public rtl_opt_pass
> > > > +{
> > > > +public:
> > > > + pass_remove_redundant_vector_load (gcc::context *ctxt)
> > > > + : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
> > > > + {}
> > > > +
> > > > + /* opt_pass methods: */
> > > > + bool gate (function *) final override
> > > > + {
> > > > + return ix86_rrvl_gate ();
> > > > + }
> > > > +
> > > > + unsigned int execute (function *) final override
> > > > + {
> > > > + return remove_redundant_vector_load ();
> > > > + }
> > > > +}; // class pass_remove_redundant_vector_load
> > > > +
> > > > +} // anon namespace
> > > > +
> > > > +rtl_opt_pass *
> > > > +make_pass_remove_redundant_vector_load (gcc::context *ctxt)
> > > > +{
> > > > + return new pass_remove_redundant_vector_load (ctxt);
> > > > +}
> > > > +
> > > > /* Convert legacy instructions that clobbers EFLAGS to APX_NF
> > > > instructions when there are no flag set between a flag
> > > > producer and user. */
> > > > diff --git a/gcc/config/i386/i386-passes.def
> > > > b/gcc/config/i386/i386-passes.def
> > > > index a9d350dcfca..df424cdb9c7 100644
> > > > --- a/gcc/config/i386/i386-passes.def
> > > > +++ b/gcc/config/i386/i386-passes.def
> > > > @@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see
> > > > PR116174. */
> > > > INSERT_PASS_BEFORE (pass_shorten_branches, 1,
> > > > pass_align_tight_loops);
> > > >
> > > > + INSERT_PASS_AFTER (pass_late_combine, 1,
> > > > pass_remove_redundant_vector_load);
> > > > INSERT_PASS_AFTER (pass_late_combine, 1,
> > > > pass_remove_partial_avx_dependency);
> > > > INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
> > > > diff --git a/gcc/config/i386/i386-protos.h
> > > > b/gcc/config/i386/i386-protos.h
> > > > index 78e72c50c6d..4c3a8bd326c 100644
> > > > --- a/gcc/config/i386/i386-protos.h
> > > > +++ b/gcc/config/i386/i386-protos.h
> > > > @@ -426,6 +426,8 @@ extern rtl_opt_pass
> > > > *make_pass_insert_endbr_and_patchable_area
> > > > (gcc::context *);
> > > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
> > > > (gcc::context *);
> > > > +extern rtl_opt_pass *make_pass_remove_redundant_vector_load
> > > > + (gcc::context *);
> > > > extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
> > > > extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
> > > >
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > > > b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > > > new file mode 100644
> > > > index 00000000000..4501cfbcad4
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > > > @@ -0,0 +1,35 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> > > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t
> > > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> > > > +
> > > > +#include <stddef.h>
> > > > +
> > > > +void
> > > > +clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
> > > > +{
> > > > + size_t *d1 = (size_t *) mem1;
> > > > +
> > > > + *(d1 + 0) = 0;
> > > > + *(d1 + 1) = 0;
> > > > + *(d1 + 2) = 0;
> > > > + if (nclears1 > 3)
> > > > + {
> > > > + *(d1 + nclears1 - 4) = 0;
> > > > + *(d1 + nclears1 - 4 + 1) = 0;
> > > > + *(d1 + nclears1 - 4 + 2) = 0;
> > > > + *(d1 + nclears1 - 4 + 3) = 0;
> > > > + }
> > > > +
> > > > + double *d2 = (double *) mem2;
> > > > +
> > > > + *(d2 + 0) = 0;
> > > > + *(d2 + 1) = 0;
> > > > + *(d2 + 2) = 0;
> > > > + if (nclears2 > 3)
> > > > + {
> > > > + *(d2 + nclears2 - 4) = 0;
> > > > + *(d2 + nclears2 - 4 + 1) = 0;
> > > > + *(d2 + nclears2 - 4 + 2) = 0;
> > > > + *(d2 + nclears2 - 4 + 3) = 0;
> > > > + }
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > > > b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > > > new file mode 100644
> > > > index 00000000000..e71b991a207
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > > > @@ -0,0 +1,5 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -march=x86-64-v3" } */
> > > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t
> > > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> > > > +
> > > > +#include "pr117839-1a.c"
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c
> > > > b/gcc/testsuite/gcc.target/i386/pr117839-2.c
> > > > new file mode 100644
> > > > index 00000000000..c76744cf98b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
> > > > @@ -0,0 +1,40 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -march=x86-64-v3" } */
> > > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t
> > > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> > > > +
> > > > +#include <stddef.h>
> > > > +
> > > > +float
> > > > +clear_memory (void *mem, size_t clearsize)
> > > > +{
> > > > + size_t *d = (size_t *) mem;
> > > > + size_t nclears = clearsize / sizeof (size_t);
> > > > +
> > > > + *(d + 0) = 0;
> > > > + *(d + 1) = 0;
> > > > + *(d + 2) = 0;
> > > > + if (nclears > 9)
> > > > + {
> > > > + *(d + 5) = 0;
> > > > + *(d + 5 + 1) = 0;
> > > > + *(d + 5 + 2) = 0;
> > > > + *(d + 5 + 3) = 0;
> > > > + *(d + nclears - 8) = 0;
> > > > + *(d + nclears - 8 + 1) = 0;
> > > > + *(d + nclears - 8 + 2) = 0;
> > > > + *(d + nclears - 8 + 3) = 0;
> > > > + }
> > > > + else
> > > > + {
> > > > + *(d + 1) = 0;
> > > > + *(d + 2) = 0;
> > > > + *(d + 3) = 0;
> > > > + *(d + 4) = 0;
> > > > + *(d + nclears - 4) = 0;
> > > > + *(d + nclears - 4 + 1) = 0;
> > > > + *(d + nclears - 4 + 2) = 0;
> > > > + *(d + nclears - 4 + 3) = 0;
> > > > + }
> > > > +
> > > > + return nclears;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c
> > > > b/gcc/testsuite/gcc.target/i386/pr92080-1.c
> > > > new file mode 100644
> > > > index 00000000000..7059b4514eb
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
> > > > @@ -0,0 +1,54 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -march=x86-64-v3" } */
> > > > +/* { dg-final { scan-assembler-times "vpxor" 2 } } */
> > > > +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
> > > > +
> > > > +typedef int v4si __attribute__((vector_size(16)));
> > > > +typedef int v8si __attribute__((vector_size(32)));
> > > > +typedef short v8hi __attribute__((vector_size(16)));
> > > > +typedef short v16hi __attribute__((vector_size(32)));
> > > > +typedef char v16qi __attribute__((vector_size(16)));
> > > > +typedef char v32qi __attribute__((vector_size(32)));
> > > > +
> > > > +v16qi b1;
> > > > +v8hi h1;
> > > > +v4si s1;
> > > > +v32qi b2;
> > > > +v16hi h2;
> > > > +v8si s2;
> > > > +
> > > > +void
> > > > +foo ()
> > > > +{
> > > > + s1 = __extension__(v4si){0, 0, 0, 0};
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0};
> > > > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0};
> > > > +}
> > > > +
> > > > +void
> > > > +foo1 ()
> > > > +{
> > > > + s1 = __extension__(v4si){-1, -1, -1, -1};
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1};
> > > > +}
> > > > +
> > > > +
> > > > +void
> > > > +foo2 ()
> > > > +{
> > > > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0};
> > > > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0,
> > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> > > > +}
> > > > +
> > > > +void
> > > > +foo3 ()
> > > > +{
> > > > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1};
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c
> > > > b/gcc/testsuite/gcc.target/i386/pr92080-2.c
> > > > new file mode 100644
> > > > index 00000000000..d160d90de53
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
> > > > @@ -0,0 +1,59 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -march=x86-64-v3" } */
> > > > +/* { dg-final { scan-assembler-times "vpxor" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
> > > > +
> > > > +typedef int v4si __attribute__((vector_size(16)));
> > > > +typedef int v8si __attribute__((vector_size(32)));
> > > > +typedef short v8hi __attribute__((vector_size(16)));
> > > > +typedef short v16hi __attribute__((vector_size(32)));
> > > > +typedef char v16qi __attribute__((vector_size(16)));
> > > > +typedef char v32qi __attribute__((vector_size(32)));
> > > > +
> > > > +v16qi b1;
> > > > +v8hi h1;
> > > > +v4si s1;
> > > > +v32qi b2;
> > > > +v16hi h2;
> > > > +v8si s2;
> > > > +
> > > > +void
> > > > +foo (int i, int j)
> > > > +{
> > > > + switch (i)
> > > > + {
> > > > + case 1:
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + s1 = __extension__(v4si){0, 0, 0, 0};
> > > > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + break;
> > > > + case 2:
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + case 3:
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0};
> > > > + break;
> > > > + default:
> > > > + break;
> > > > + }
> > > > +
> > > > + switch (i)
> > > > + {
> > > > + case 1:
> > > > + s1 = __extension__(v4si){-1, -1, -1, -1};
> > > > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0,
> > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> > > > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0};
> > > > + break;
> > > > + case 2:
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0};
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + case 3:
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1, -1};
> > > > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + }
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c
> > > > b/gcc/testsuite/gcc.target/i386/pr92080-3.c
> > > > new file mode 100644
> > > > index 00000000000..2174def4e6d
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
> > > > @@ -0,0 +1,48 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -march=x86-64" } */
> > > > +/* { dg-final { scan-assembler-times "pxor" 1 } } */
> > > > +/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
> > > > +
> > > > +typedef int v4si __attribute__((vector_size(16)));
> > > > +typedef short v8hi __attribute__((vector_size(16)));
> > > > +typedef char v16qi __attribute__((vector_size(16)));
> > > > +
> > > > +v16qi b1;
> > > > +v8hi h1;
> > > > +v4si s1;
> > > > +
> > > > +void
> > > > +foo (int i, int j)
> > > > +{
> > > > + switch (i)
> > > > + {
> > > > + case 1:
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + s1 = __extension__(v4si){0, 0, 0, 0};
> > > > + break;
> > > > + case 2:
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + case 3:
> > > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0};
> > > > + break;
> > > > + default:
> > > > + break;
> > > > + }
> > > > +
> > > > + switch (i)
> > > > + {
> > > > + case 1:
> > > > + s1 = __extension__(v4si){-1, -1, -1, -1};
> > > > + break;
> > > > + case 2:
> > > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> > > > 0, 0, 0};
> > > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + case 3:
> > > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1,
> > > > -1, -1, -1, -1, -1, -1, -1};
> > > > + break;
> > > > + }
> > > > +}
> > > > --
> > > > 2.47.1
> > > >
> > >
> > > OK for master?
> > >
> > > --
> > > H.J.
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> H.J.
--
BR,
Hongtao