On Mon, Apr 28, 2025 at 4:26 PM H.J. Lu <[email protected]> wrote:
>
> > > This is what my patch does:
> > But it iterates through vector_insns, using a def-ref chain to find
> > those insns. I think we can just record those single_set with src as
> > const_m1/zero, and replace src for them.
>
> Will fix it.
Fixed in the v2 patch.
> > >
> > > /* Check the single definition of CONST0_RTX and integer
> > > CONSTM1_RTX. */
> > > rtx src = SET_SRC (set);
> > > rtx replace;
> > > if (vector_const0 && src == CONST0_RTX (mode))
> > > {
> > > /* Replace REG with VECTOR_CONST0. */
> > > if (SUBREG_P (reg) || mode == zero_mode)
> > > replace = vector_const0;
> > > else
> > > replace = gen_rtx_SUBREG (mode, vector_const0, 0);
> > > *DF_REF_REAL_LOC (ref) = replace;
> > > replaced = true;
> > > zero_replaced = true;
> > > }
> > >
> > > It changed the source to a subreg directly.
> > >
> > > > Also, do we need to change ix86_modes_tieable_p to make sure those
> > > > inserted subregs can be handled by LRA and other passes?
> > >
> > > ix86_modes_tieable_p is OK:
> > >
> > > /* If MODE2 is only appropriate for an SSE register, then tie with
> > > any other mode acceptable to SSE registers. */
> > > if (GET_MODE_SIZE (mode2) == 64
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > return (GET_MODE_SIZE (mode1) == 64
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > if (GET_MODE_SIZE (mode2) == 32
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > return (GET_MODE_SIZE (mode1) == 32
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > if (GET_MODE_SIZE (mode2) == 16
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > return (GET_MODE_SIZE (mode1) == 16
> > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > >
> > It's OK only when the size of mode1 is equal to the size of mode2.
> > But in the testcase, there are vectors of different sizes (32 bytes and 16 bytes).
> >
> > So it would be better as follows: a mode2 of >= 16 bytes can only be put
> > into SSE_REGS (except for TImode, but TImode can still be tied to a
> > <= 16-byte mode1 which can be put into SSE_REGS); if mode1 can also be
> > put into SSE_REGS, then mode2 ties with mode1.
> >
> > /* If MODE2 is only appropriate for an SSE register, then tie with
> > any other mode acceptable to SSE registers. */
> > - if (GET_MODE_SIZE (mode2) == 64
> > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > - return (GET_MODE_SIZE (mode1) == 64
> > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > - if (GET_MODE_SIZE (mode2) == 32
> > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > - return (GET_MODE_SIZE (mode1) == 32
> > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > - if (GET_MODE_SIZE (mode2) == 16
> > + if (GET_MODE_SIZE (mode2) >= 16
> > + && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)
> > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > - return (GET_MODE_SIZE (mode1) == 16
> > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > + return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
> >
>
> This caused:
>
> FAIL: gcc.target/i386/pr111267.c scan-assembler-not movd
> FAIL: gcc.target/i386/pr111267.c scan-assembler-not movq
> FAIL: gcc.target/i386/pr82580.c scan-assembler-not \\mmovzb
>
> since GCC thinks it is cheap to get QI/HI/SI/DI from TI in XMM.
> I am testing:
>
> /* If MODE2 is only appropriate for an SSE register, then tie with
> any other mode acceptable to SSE registers, excluding
> (subreg:QI (reg:TI 99) 0)
> (subreg:HI (reg:TI 99) 0)
> (subreg:SI (reg:TI 99) 0)
> (subreg:DI (reg:TI 99) 0)
> to avoid unnecessary move from SSE register to integer register.
> */
> if (GET_MODE_SIZE (mode2) >= 16
> && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
> || (!INTEGRAL_MODE_P (mode1)
> && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
> && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
>
Tested. There is no regression.
Here is the v2 patch. Ok for master?
Thanks.
--
H.J.
From 603161d6bc543f68b4eb0991f5d634a9ce748d99 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Fri, 29 Nov 2024 18:22:14 +0800
Subject: [PATCH v2] x86: Add a pass to remove redundant all 0s/1s vector load
For all different modes of all 0s/1s vectors, we can use the single widest
all 0s/1s vector register for all 0s/1s vector uses in the whole function.
Add a pass to generate a single widest all 0s/1s vector set instruction at
entry of the nearest common dominator for basic blocks with all 0s/1s
vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number
of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
144 to 142.
NB: PR target/92080 and PR target/117839 aren't the same. PR target/117839
is for vectors of all 0s and all 1s with different sizes and different
components. PR target/92080 is for broadcast of the same component to
different vector sizes. This patch covers only all 0s and all 1s cases
of PR target/92080.
gcc/
PR target/92080
PR target/117839
* config/i386/i386-features.cc (ix86_place_single_vector_set):
New.
(ix86_get_vector_load_mode): Likewise.
(replace_vector_const): Likewise.
(remove_redundant_vector_load): Likewise.
(pass_data_remove_redundant_vector_load): Likewise.
(pass_remove_redundant_vector_load): Likewise.
(make_pass_remove_redundant_vector_load): Likewise.
* config/i386/i386-passes.def: Add
pass_remove_redundant_vector_load after
pass_remove_partial_avx_dependency.
* config/i386/i386-protos.h
(make_pass_remove_redundant_vector_load): New.
* config/i386/i386.cc (ix86_modes_tieable_p): Return true for
narrower non-integer modes in SSE registers.
gcc/testsuite/
PR target/92080
PR target/117839
* gcc.target/i386/pr117839-1a.c: New test.
* gcc.target/i386/pr117839-1b.c: Likewise.
* gcc.target/i386/pr117839-2.c: Likewise.
* gcc.target/i386/pr92080-1.c: Likewise.
* gcc.target/i386/pr92080-2.c: Likewise.
* gcc.target/i386/pr92080-3.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-features.cc | 270 ++++++++++++++++++++
gcc/config/i386/i386-passes.def | 1 +
gcc/config/i386/i386-protos.h | 2 +
gcc/config/i386/i386.cc | 24 +-
gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++
gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 +
gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++
gcc/testsuite/gcc.target/i386/pr92080-1.c | 72 ++++++
gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 +++++
gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 ++++
10 files changed, 544 insertions(+), 12 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c35ac24fd8a..a59dd71c2e3 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3288,6 +3288,276 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
return new pass_remove_partial_avx_dependency (ctxt);
}
+/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
+ for basic block map BBS, which is in the fake loop that contains the
+ whole function, so that there is only a single vector set in the
+ whole function. */
+
+static void
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx set = gen_rtx_SET (dest, src);
+
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+ rtx_insn *set_insn;
+ if (insn == BB_HEAD (bb))
+ set_insn = emit_insn_before (set, insn);
+ else
+ set_insn = emit_insn_after (set,
+ insn ? PREV_INSN (insn) : BB_END (bb));
+ df_insn_rescan (set_insn);
+}
+
+/* Return a machine mode suitable for vector SIZE. */
+
+static machine_mode
+ix86_get_vector_load_mode (unsigned int size)
+{
+ machine_mode mode;
+ if (size == 64)
+ mode = V64QImode;
+ else if (size == 32)
+ mode = V32QImode;
+ else
+ mode = V16QImode;
+ return mode;
+}
+
+/* Replace the source operand of instructions in VECTOR_INSNS with
+ VECTOR_CONST in VECTOR_MODE. */
+
+static void
+replace_vector_const (machine_mode vector_mode, rtx vector_const,
+ auto_bitmap &vector_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
+
+ EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* Get the single SET instruction. */
+ rtx set = single_set (insn);
+ rtx dest = SET_SRC (set);
+ machine_mode mode = GET_MODE (dest);
+
+ rtx replace;
+ /* Replace the source operand with VECTOR_CONST. */
+ if (SUBREG_P (dest) || mode == vector_mode)
+ replace = vector_const;
+ else
+ replace = gen_rtx_SUBREG (mode, vector_const, 0);
+
+ /* NB: Don't run recog_memoized here since vector SUBREG may not
+ be valid. Let LRA handle vector SUBREG. */
+ SET_SRC (set) = replace;
+ /* Drop possible dead definitions. */
+ PATTERN (insn) = set;
+ df_insn_rescan (insn);
+ }
+}
+
+/* At entry of the nearest common dominator for basic blocks with vector
+ CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
+ vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
+ uses.
+
+ NB: We want to generate only a single widest vector set to cover the
+ whole function. The LCM algorithm isn't appropriate here since it
+ may place a vector set inside the loop. */
+
+static unsigned int
+remove_redundant_vector_load (void)
+{
+ timevar_push (TV_MACH_DEP);
+
+ auto_bitmap zero_bbs;
+ auto_bitmap m1_bbs;
+ auto_bitmap zero_insns;
+ auto_bitmap m1_insns;
+
+ basic_block bb;
+ rtx_insn *insn;
+ unsigned HOST_WIDE_INT zero_count = 0;
+ unsigned HOST_WIDE_INT m1_count = 0;
+ unsigned int zero_size = 0;
+ unsigned int m1_size = 0;
+
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ rtx set = single_set (insn);
+ if (!set)
+ continue;
+
+ /* Record single set vector instruction with CONST0_RTX and
+ CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
+ CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
+ maximum size of CONST0_RTX and CONSTM1_RTX. */
+
+ rtx dest = SET_DEST (set);
+ machine_mode mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ continue;
+
+ rtx src = SET_SRC (set);
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ continue;
+
+ if (src == CONST0_RTX (mode))
+ {
+ /* Record vector instruction with CONST0_RTX. */
+ bitmap_set_bit (zero_insns, INSN_UID (insn));
+
+ /* Record the maximum vector size. */
+ if (zero_size < GET_MODE_SIZE (mode))
+ zero_size = GET_MODE_SIZE (mode);
+
+ /* Record the basic block with CONST0_RTX. */
+ bitmap_set_bit (zero_bbs, bb->index);
+ zero_count++;
+ }
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && src == CONSTM1_RTX (mode))
+ {
+ /* Record vector instruction with CONSTM1_RTX. */
+ bitmap_set_bit (m1_insns, INSN_UID (insn));
+
+ /* Record the maximum vector size. */
+ if (m1_size < GET_MODE_SIZE (mode))
+ m1_size = GET_MODE_SIZE (mode);
+
+ /* Record the basic block with CONSTM1_RTX. */
+ bitmap_set_bit (m1_bbs, bb->index);
+ m1_count++;
+ }
+ }
+ }
+
+ if (zero_count > 1 || m1_count > 1)
+ {
+ machine_mode zero_mode, m1_mode;
+ rtx vector_const0, vector_constm1;
+
+ if (zero_count > 1)
+ {
+ zero_mode = ix86_get_vector_load_mode (zero_size);
+ vector_const0 = gen_reg_rtx (zero_mode);
+ replace_vector_const (zero_mode, vector_const0, zero_insns);
+ }
+ else
+ {
+ zero_mode = VOIDmode;
+ vector_const0 = nullptr;
+ }
+
+ if (m1_count > 1)
+ {
+ m1_mode = ix86_get_vector_load_mode (m1_size);
+ vector_constm1 = gen_reg_rtx (m1_mode);
+ replace_vector_const (m1_mode, vector_constm1, m1_insns);
+ }
+ else
+ {
+ m1_mode = VOIDmode;
+ vector_constm1 = nullptr;
+ }
+
+ /* (Re-)discover loops so that bb->loop_father can be used in the
+ analysis below. */
+ calculate_dominance_info (CDI_DOMINATORS);
+ loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+ if (vector_const0)
+ ix86_place_single_vector_set (vector_const0,
+ CONST0_RTX (zero_mode),
+ zero_bbs);
+
+ if (vector_constm1)
+ ix86_place_single_vector_set (vector_constm1,
+ CONSTM1_RTX (m1_mode),
+ m1_bbs);
+
+ loop_optimizer_finalize ();
+
+ df_process_deferred_rescans ();
+ }
+
+ df_clear_flags (DF_DEFER_INSN_RESCAN);
+
+ timevar_pop (TV_MACH_DEP);
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_redundant_vector_load =
+{
+ RTL_PASS, /* type */
+ "rrvl", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_remove_redundant_vector_load : public rtl_opt_pass
+{
+public:
+ pass_remove_redundant_vector_load (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return remove_redundant_vector_load ();
+ }
+}; // class pass_remove_redundant_vector_load
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+{
+ return new pass_remove_redundant_vector_load (ctxt);
+}
+
/* Convert legacy instructions that clobbers EFLAGS to APX_NF
instructions when there are no flag set between a flag
producer and user. */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 39f8bc65ddc..06f0288b067 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see
PR116174. */
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bea3fd4b2e2..c59b5a67e3a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -427,6 +427,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
+extern rtl_opt_pass *make_pass_remove_redundant_vector_load
+ (gcc::context *);
extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3171d6e0ad4..97705d78ec6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21340,19 +21340,19 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
return mode1 == SFmode;
/* If MODE2 is only appropriate for an SSE register, then tie with
- any other mode acceptable to SSE registers. */
- if (GET_MODE_SIZE (mode2) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 16
+ any other mode acceptable to SSE registers, excluding
+ (subreg:QI (reg:TI 99) 0)
+ (subreg:HI (reg:TI 99) 0)
+ (subreg:SI (reg:TI 99) 0)
+ (subreg:DI (reg:TI 99) 0)
+ to avoid unnecessary move from SSE register to integer register.
+ */
+ if (GET_MODE_SIZE (mode2) >= 16
+ && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
+ || (!INTEGRAL_MODE_P (mode1)
+ && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
/* If MODE2 is appropriate for an MMX register, then tie
with any other mode acceptable to MMX registers. */
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
new file mode 100644
index 00000000000..4501cfbcad4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+void
+clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
+{
+ size_t *d1 = (size_t *) mem1;
+
+ *(d1 + 0) = 0;
+ *(d1 + 1) = 0;
+ *(d1 + 2) = 0;
+ if (nclears1 > 3)
+ {
+ *(d1 + nclears1 - 4) = 0;
+ *(d1 + nclears1 - 4 + 1) = 0;
+ *(d1 + nclears1 - 4 + 2) = 0;
+ *(d1 + nclears1 - 4 + 3) = 0;
+ }
+
+ double *d2 = (double *) mem2;
+
+ *(d2 + 0) = 0;
+ *(d2 + 1) = 0;
+ *(d2 + 2) = 0;
+ if (nclears2 > 3)
+ {
+ *(d2 + nclears2 - 4) = 0;
+ *(d2 + nclears2 - 4 + 1) = 0;
+ *(d2 + nclears2 - 4 + 2) = 0;
+ *(d2 + nclears2 - 4 + 3) = 0;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
new file mode 100644
index 00000000000..e71b991a207
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include "pr117839-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c
new file mode 100644
index 00000000000..c76744cf98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+float
+clear_memory (void *mem, size_t clearsize)
+{
+ size_t *d = (size_t *) mem;
+ size_t nclears = clearsize / sizeof (size_t);
+
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ if (nclears > 9)
+ {
+ *(d + 5) = 0;
+ *(d + 5 + 1) = 0;
+ *(d + 5 + 2) = 0;
+ *(d + 5 + 3) = 0;
+ *(d + nclears - 8) = 0;
+ *(d + nclears - 8 + 1) = 0;
+ *(d + nclears - 8 + 2) = 0;
+ *(d + nclears - 8 + 3) = 0;
+ }
+ else
+ {
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + 3) = 0;
+ *(d + 4) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ *(d + nclears - 4 + 2) = 0;
+ *(d + nclears - 4 + 3) = 0;
+ }
+
+ return nclears;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c
new file mode 100644
index 00000000000..82d1ffd4e1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v2di l1;
+v4sf f1;
+v2df d1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+v4di l2;
+v8sf f2;
+v4df d2;
+
+void
+foo ()
+{
+ d1 = __extension__(v2df){0, 0};
+ f1 = __extension__(v4sf){0, 0, 0};
+ l1 = __extension__(v2di){0, 0};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+ d2 = __extension__(v4df){0, 0, 0, 0};
+ f2 = __extension__(v8sf){0, 0, 0, 0, 0, 0, 0, 0};
+ l2 = __extension__(v4di){0, 0, 0, 0};
+ s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+ s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c
new file mode 100644
index 00000000000..d160d90de53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo (int i, int j)
+{
+ switch (i)
+ {
+ case 1:
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ case 2:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ default:
+ break;
+ }
+
+ switch (i)
+ {
+ case 1:
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ case 2:
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c
new file mode 100644
index 00000000000..2174def4e6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "pxor" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+
+void
+foo (int i, int j)
+{
+ switch (i)
+ {
+ case 1:
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ break;
+ case 2:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ default:
+ break;
+ }
+
+ switch (i)
+ {
+ case 1:
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ break;
+ case 2:
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ }
+}
--
2.49.0