On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > For all different modes of all 0s/1s vectors, we can use the single widest > all 0s/1s vector register for all 0s/1s vector uses in the whole function. > Add a pass to generate a single widest all 0s/1s vector set instruction at > entry of the nearest common dominator for basic blocks with all 0s/1s > vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number > of vector xor instructions from 4803 to 4714 and pcmpeq instructions from > 144 to 142. > > This change causes a regression: > > FAIL: gcc.dg/rtl/x86_64/vector_eq.c > > without the fix for > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863 > > NB: PR target/92080 and PR target/117839 aren't the same. PR target/117839 > is for vectors of all 0s and all 1s with different sizes and different > components. PR target/92080 is for broadcast of the same component to > different vector sizes. This patch covers only the all 0s and all 1s cases > of PR target/92080. > > gcc/ > > PR target/92080 > PR target/117839 > * config/i386/i386-features.cc (ix86_rrvl_gate): New. > (ix86_place_single_vector_set): Likewise. > (ix86_get_vector_load_mode): Likewise. > (remove_redundant_vector_load): Likewise. > (pass_data_remove_redundant_vector_load): Likewise. > (pass_remove_redundant_vector_load): Likewise. > (make_pass_remove_redundant_vector_load): Likewise. > * config/i386/i386-passes.def: Add > pass_remove_redundant_vector_load after > pass_remove_partial_avx_dependency. > * config/i386/i386-protos.h > (make_pass_remove_redundant_vector_load): New. > > gcc/testsuite/ > > PR target/92080 > PR target/117839 > * gcc.target/i386/pr117839-1a.c: New test. > * gcc.target/i386/pr117839-1b.c: Likewise. > * gcc.target/i386/pr117839-2.c: Likewise. > * gcc.target/i386/pr92080-1.c: Likewise. > * gcc.target/i386/pr92080-2.c: Likewise. > * gcc.target/i386/pr92080-3.c: Likewise. > > Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> > --- > gcc/config/i386/i386-features.cc | 308 ++++++++++++++++++++ > gcc/config/i386/i386-passes.def | 1 + > gcc/config/i386/i386-protos.h | 2 + > gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++ > gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 + > gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++ > gcc/testsuite/gcc.target/i386/pr92080-1.c | 54 ++++ > gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 ++++ > gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 +++ > 9 files changed, 552 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c > > diff --git a/gcc/config/i386/i386-features.cc > b/gcc/config/i386/i386-features.cc > index 003b003e09c..7d8d260750d 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency (gcc::context > *ctxt) > return new pass_remove_partial_avx_dependency (ctxt); > } > > +static bool > +ix86_rrvl_gate () > +{ > + return (TARGET_SSE2 > + && optimize > + && optimize_function_for_speed_p (cfun)); > +} > + > +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator > + for basic block map BBS, which is in the fake loop that contains the > + whole function, so that there is only a single vector set in the > + whole function. 
*/ > + > +static void > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs) > +{ > + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); > + while (bb->loop_father->latch > + != EXIT_BLOCK_PTR_FOR_FN (cfun)) > + bb = get_immediate_dominator (CDI_DOMINATORS, > + bb->loop_father->header); > + > + rtx set = gen_rtx_SET (dest, src); > + > + rtx_insn *insn = BB_HEAD (bb); > + while (insn && !NONDEBUG_INSN_P (insn)) > + { > + if (insn == BB_END (bb)) > + { > + insn = NULL; > + break; > + } > + insn = NEXT_INSN (insn); > + } > + > + rtx_insn *set_insn; > + if (insn == BB_HEAD (bb)) > + set_insn = emit_insn_before (set, insn); > + else > + set_insn = emit_insn_after (set, > + insn ? PREV_INSN (insn) : BB_END (bb)); > + df_insn_rescan (set_insn); > +} > + > +/* Return a machine mode suitable for vector SIZE. */ > + > +static machine_mode > +ix86_get_vector_load_mode (unsigned int size) > +{ > + machine_mode mode; > + if (size == 64) > + mode = V64QImode; > + else if (size == 32) > + mode = V32QImode; > + else > + mode = V16QImode; > + return mode; > +} > + > +/* At entry of the nearest common dominator for basic blocks with vector > + CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest > + vector set instruction for all CONST0_RTX and integer CONSTM1_RTX > + uses. > + > + NB: We want to generate only a single widest vector set to cover the > + whole function. The LCM algorithm isn't appropriate here since it > + may place a vector set inside the loop. 
*/ > + > +static unsigned int > +remove_redundant_vector_load (void) > +{ > + timevar_push (TV_MACH_DEP); > + > + bitmap_obstack_initialize (NULL); > + bitmap zero_bbs = BITMAP_ALLOC (NULL); > + bitmap m1_bbs = BITMAP_ALLOC (NULL); > + bitmap vector_insns = BITMAP_ALLOC (NULL); > + > + basic_block bb; > + rtx_insn *insn; > + rtx set; > + unsigned HOST_WIDE_INT zero_count = 0; > + unsigned HOST_WIDE_INT m1_count = 0; > + unsigned int zero_size = 0; > + unsigned int m1_size = 0; > + > + df_set_flags (DF_DEFER_INSN_RESCAN); > + > + FOR_EACH_BB_FN (bb, cfun) > + { > + FOR_BB_INSNS (bb, insn) > + { > + if (!NONDEBUG_INSN_P (insn)) > + continue; > + > + set = single_set (insn); > + if (!set) > + continue; > + > + rtx dest = SET_DEST (set); > + machine_mode mode = GET_MODE (dest); > + /* Skip non-vector instruction. */ > + if (!VECTOR_MODE_P (mode)) > + continue; > + > + rtx src = SET_SRC (set); > + if (!REG_P (dest) > + || (src != CONST0_RTX (mode) > + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_INT > + && src == CONSTM1_RTX (mode)))) > + { > + /* Record non-CONST0_RTX/CONSTM1_RTX vector instruction. */ > + bitmap_set_bit (vector_insns, INSN_UID (insn)); > + continue; > + } > + > + if (src == CONST0_RTX (mode)) > + { > + /* Record the maximum vector size. */ > + if (zero_size < GET_MODE_SIZE (mode)) > + zero_size = GET_MODE_SIZE (mode); > + > + /* Record the basic block with CONST0_RTX. */ > + bitmap_set_bit (zero_bbs, bb->index); > + zero_count++; > + } > + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT > + && src == CONSTM1_RTX (mode)) > + { > + /* Record the maximum vector size. */ > + if (m1_size < GET_MODE_SIZE (mode)) > + m1_size = GET_MODE_SIZE (mode); > + > + /* Record the basic block with CONSTM1_RTX. 
*/ > + bitmap_set_bit (m1_bbs, bb->index); > + m1_count++; > + } > + } > + } > + > + if (zero_count > 1 || m1_count > 1) > + { > + machine_mode zero_mode, m1_mode; > + rtx vector_const0, vector_constm1; > + if (zero_count > 1) > + { > + zero_mode = ix86_get_vector_load_mode (zero_size); > + vector_const0 = gen_reg_rtx (zero_mode); > + } > + else > + { > + zero_mode = VOIDmode; > + vector_const0 = nullptr; > + } > + if (m1_count > 1) > + { > + m1_mode = ix86_get_vector_load_mode (m1_size); > + vector_constm1 = gen_reg_rtx (m1_mode); > + } > + else > + { > + m1_mode = VOIDmode; > + vector_constm1 = nullptr; > + } > + > + bool zero_replaced = false; > + bool m1_replaced = false; > + > + bitmap_iterator bi; > + unsigned id; > + EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi) > + { > + /* Replace CONST0_RTX and integer CONSTM1_RTX with the single > + CONST0_RTX and integer CONSTM1_RTX register. */ > + df_ref ref, def; > + insn = DF_INSN_UID_GET (id)->insn; > + bool replaced = false; > + > + for (ref = DF_INSN_UID_USES (id); > + ref; > + ref = DF_REF_NEXT_LOC (ref)) > + { > + if (DF_REF_TYPE (ref) != DF_REF_REG_USE) > + continue; > + > + /* Skip non-vector register. */ > + rtx reg = DF_REF_REG (ref); > + if (!VECTOR_MODE_P (GET_MODE (reg))) > + continue; > + > + /* Check the single definition. */ > + def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); > + if (!def || DF_REF_NEXT_REG (def) != nullptr) > + continue; > + > + /* Get the single definition. */ > + rtx_insn *def_insn = DF_REF_INSN (def); > + set = single_set (def_insn); > + if (!set) > + continue; > + > + /* Check the single definition of vector constant zero. */ > + rtx src = SET_SRC (set); > + rtx replace; > + if (vector_const0 && src == CONST0_RTX (GET_MODE (src))) > + { > + /* Replace REG with VECTOR_CONST0. 
*/ > + if (SUBREG_P (reg) || GET_MODE (reg) == zero_mode) > + replace = vector_const0; > + else > + replace = gen_rtx_SUBREG (GET_MODE (reg), > + vector_const0, 0); > + *DF_REF_REAL_LOC (ref) = replace; > + replaced = true; > + zero_replaced = true; > + } > + else if (vector_constm1 > + && src == CONSTM1_RTX (GET_MODE (src))) > + { > + /* Replace REG with VECTOR_CONSTM1. */ > + if (SUBREG_P (reg) || GET_MODE (reg) == m1_mode) > + replace = vector_constm1; > + else > + replace = gen_rtx_SUBREG (GET_MODE (reg), > + vector_constm1, 0); > + *DF_REF_REAL_LOC (ref) = replace; > + replaced = true; > + m1_replaced = true; > + } > + } > + > + if (replaced) > + df_insn_rescan (insn); > + } > + > + /* (Re-)discover loops so that bb->loop_father can be used in the > + analysis below. */ > + calculate_dominance_info (CDI_DOMINATORS); > + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > + > + if (zero_replaced) > + ix86_place_single_vector_set (vector_const0, > + CONST0_RTX (zero_mode), > + zero_bbs); > + > + if (m1_replaced) > + ix86_place_single_vector_set (vector_constm1, > + CONSTM1_RTX (m1_mode), > + m1_bbs); > + > + loop_optimizer_finalize (); > + > + df_process_deferred_rescans (); > + } > + > + df_clear_flags (DF_DEFER_INSN_RESCAN); > + > + bitmap_obstack_release (NULL); > + BITMAP_FREE (zero_bbs); > + BITMAP_FREE (m1_bbs); > + BITMAP_FREE (vector_insns); > + > + timevar_pop (TV_MACH_DEP); > + return 0; > +} > + > +namespace { > + > +const pass_data pass_data_remove_redundant_vector_load = > +{ > + RTL_PASS, /* type */ > + "rrvl", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_MACH_DEP, /* tv_id */ > + 0, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + 0, /* todo_flags_finish */ > +}; > + > +class pass_remove_redundant_vector_load : public rtl_opt_pass > +{ > +public: > + pass_remove_redundant_vector_load (gcc::context *ctxt) > + : rtl_opt_pass 
(pass_data_remove_redundant_vector_load, ctxt) > + {} > + > + /* opt_pass methods: */ > + bool gate (function *) final override > + { > + return ix86_rrvl_gate (); > + } > + > + unsigned int execute (function *) final override > + { > + return remove_redundant_vector_load (); > + } > +}; // class pass_remove_redundant_vector_load > + > +} // anon namespace > + > +rtl_opt_pass * > +make_pass_remove_redundant_vector_load (gcc::context *ctxt) > +{ > + return new pass_remove_redundant_vector_load (ctxt); > +} > + > /* Convert legacy instructions that clobbers EFLAGS to APX_NF > instructions when there are no flag set between a flag > producer and user. */ > diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > index a9d350dcfca..df424cdb9c7 100644 > --- a/gcc/config/i386/i386-passes.def > +++ b/gcc/config/i386/i386-passes.def > @@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see > PR116174. */ > INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > > + INSERT_PASS_AFTER (pass_late_combine, 1, > pass_remove_redundant_vector_load); > INSERT_PASS_AFTER (pass_late_combine, 1, > pass_remove_partial_avx_dependency); > INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 78e72c50c6d..4c3a8bd326c 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -426,6 +426,8 @@ extern rtl_opt_pass > *make_pass_insert_endbr_and_patchable_area > (gcc::context *); > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > (gcc::context *); > +extern rtl_opt_pass *make_pass_remove_redundant_vector_load > + (gcc::context *); > extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); > extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c > b/gcc/testsuite/gcc.target/i386/pr117839-1a.c > new file mode 100644 > index 
00000000000..4501cfbcad4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c > @@ -0,0 +1,35 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > + > +#include <stddef.h> > + > +void > +clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2) > +{ > + size_t *d1 = (size_t *) mem1; > + > + *(d1 + 0) = 0; > + *(d1 + 1) = 0; > + *(d1 + 2) = 0; > + if (nclears1 > 3) > + { > + *(d1 + nclears1 - 4) = 0; > + *(d1 + nclears1 - 4 + 1) = 0; > + *(d1 + nclears1 - 4 + 2) = 0; > + *(d1 + nclears1 - 4 + 3) = 0; > + } > + > + double *d2 = (double *) mem2; > + > + *(d2 + 0) = 0; > + *(d2 + 1) = 0; > + *(d2 + 2) = 0; > + if (nclears2 > 3) > + { > + *(d2 + nclears2 - 4) = 0; > + *(d2 + nclears2 - 4 + 1) = 0; > + *(d2 + nclears2 - 4 + 2) = 0; > + *(d2 + nclears2 - 4 + 3) = 0; > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c > b/gcc/testsuite/gcc.target/i386/pr117839-1b.c > new file mode 100644 > index 00000000000..e71b991a207 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c > @@ -0,0 +1,5 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v3" } */ > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > + > +#include "pr117839-1a.c" > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c > b/gcc/testsuite/gcc.target/i386/pr117839-2.c > new file mode 100644 > index 00000000000..c76744cf98b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c > @@ -0,0 +1,40 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v3" } */ > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > + > +#include <stddef.h> > + > +float > +clear_memory (void *mem, size_t clearsize) > +{ > + size_t *d = (size_t *) mem; > + size_t nclears = clearsize / sizeof (size_t); > + > + *(d + 0) = 
0; > + *(d + 1) = 0; > + *(d + 2) = 0; > + if (nclears > 9) > + { > + *(d + 5) = 0; > + *(d + 5 + 1) = 0; > + *(d + 5 + 2) = 0; > + *(d + 5 + 3) = 0; > + *(d + nclears - 8) = 0; > + *(d + nclears - 8 + 1) = 0; > + *(d + nclears - 8 + 2) = 0; > + *(d + nclears - 8 + 3) = 0; > + } > + else > + { > + *(d + 1) = 0; > + *(d + 2) = 0; > + *(d + 3) = 0; > + *(d + 4) = 0; > + *(d + nclears - 4) = 0; > + *(d + nclears - 4 + 1) = 0; > + *(d + nclears - 4 + 2) = 0; > + *(d + nclears - 4 + 3) = 0; > + } > + > + return nclears; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c > b/gcc/testsuite/gcc.target/i386/pr92080-1.c > new file mode 100644 > index 00000000000..7059b4514eb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c > @@ -0,0 +1,54 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v3" } */ > +/* { dg-final { scan-assembler-times "vpxor" 2 } } */ > +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */ > + > +typedef int v4si __attribute__((vector_size(16))); > +typedef int v8si __attribute__((vector_size(32))); > +typedef short v8hi __attribute__((vector_size(16))); > +typedef short v16hi __attribute__((vector_size(32))); > +typedef char v16qi __attribute__((vector_size(16))); > +typedef char v32qi __attribute__((vector_size(32))); > + > +v16qi b1; > +v8hi h1; > +v4si s1; > +v32qi b2; > +v16hi h2; > +v8si s2; > + > +void > +foo () > +{ > + s1 = __extension__(v4si){0, 0, 0, 0}; > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > +} > + > +void > +foo1 () > +{ > + s1 = __extension__(v4si){-1, -1, -1, -1}; > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1}; > +} > + > + > +void > +foo2 () > +{ > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; > + h1 = 
__extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > +} > + > +void > +foo3 () > +{ > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1}; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c > b/gcc/testsuite/gcc.target/i386/pr92080-2.c > new file mode 100644 > index 00000000000..d160d90de53 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c > @@ -0,0 +1,59 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64-v3" } */ > +/* { dg-final { scan-assembler-times "vpxor" 1 } } */ > +/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */ > + > +typedef int v4si __attribute__((vector_size(16))); > +typedef int v8si __attribute__((vector_size(32))); > +typedef short v8hi __attribute__((vector_size(16))); > +typedef short v16hi __attribute__((vector_size(32))); > +typedef char v16qi __attribute__((vector_size(16))); > +typedef char v32qi __attribute__((vector_size(32))); > + > +v16qi b1; > +v8hi h1; > +v4si s1; > +v32qi b2; > +v16hi h2; > +v8si s2; > + > +void > +foo (int i, int j) > +{ > + switch (i) > + { > + case 1: > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + s1 = __extension__(v4si){0, 0, 0, 0}; > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; > + break; > + case 2: > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1, -1}; > + break; > + case 3: > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0}; > + break; > + default: > + break; > + } > + > + switch (i) > + { > + 
case 1: > + s1 = __extension__(v4si){-1, -1, -1, -1}; > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0}; > + break; > + case 2: > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0}; > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + break; > + case 3: > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1, -1}; > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; > + break; > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c > b/gcc/testsuite/gcc.target/i386/pr92080-3.c > new file mode 100644 > index 00000000000..2174def4e6d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c > @@ -0,0 +1,48 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=x86-64" } */ > +/* { dg-final { scan-assembler-times "pxor" 1 } } */ > +/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */ > + > +typedef int v4si __attribute__((vector_size(16))); > +typedef short v8hi __attribute__((vector_size(16))); > +typedef char v16qi __attribute__((vector_size(16))); > + > +v16qi b1; > +v8hi h1; > +v4si s1; > + > +void > +foo (int i, int j) > +{ > + switch (i) > + { > + case 1: > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + s1 = __extension__(v4si){0, 0, 0, 0}; > + break; > + case 2: > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1, -1}; > + break; > + case 3: > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0}; > + break; > + default: > + break; > + } > + > + switch (i) > + { > + case 1: > + s1 = __extension__(v4si){-1, -1, -1, -1}; > + break; > + case 2: > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, > 0}; > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > + break; > + case 3: > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > -1, -1, -1, -1, -1}; > + break; > + } > +} > -- > 2.47.1 >
OK for master? -- H.J.