On Sat, Apr 19, 2025 at 1:25 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > For all different modes of all 0s/1s vectors, we can use the single widest > > all 0s/1s vector register for all 0s/1s vector uses in the whole function. > > Add a pass to generate a single widest all 0s/1s vector set instruction at > > entry of the nearest common dominator for basic blocks with all 0s/1s > > vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number > > of vector xor instructions from 4803 to 4714 and pcmpeq instructions from > > 144 to 142. > > > > This change causes a regression: > > > > FAIL: gcc.dg/rtl/x86_64/vector_eq.c > > > > without the fix for > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863 > > > > NB: PR target/92080 and PR target/117839 aren't same. PR target/117839 > > is for vectors of all 0s and all 1s with different sizes and different > > components. PR target/92080 is for broadcast of the same component to > > different vector sizes. This patch covers only all 0s and all 1s cases > > of PR target/92080. > > > > gcc/ > > > > PR target/92080 > > PR target/117839 > > * config/i386/i386-features.cc (ix86_rrvl_gate): New. > > (ix86_place_single_vector_set): Likewise. > > (ix86_get_vector_load_mode): Likewise. > > (remove_redundant_vector_load): Likewise. > > (pass_data_remove_redundant_vector_load): Likewise. > > (pass_remove_redundant_vector_load): Likewise. > > (make_pass_remove_redundant_vector_load): Likewise. > > * config/i386/i386-passes.def: Add > > pass_remove_redundant_vector_load after > > pass_remove_partial_avx_dependency. > > * config/i386/i386-protos.h > > (make_pass_remove_redundant_vector_load): New. > > > > gcc/testsuite/ > > > > PR target/92080 > > PR target/117839 > > * gcc.target/i386/pr117839-1a.c: New test. > > * gcc.target/i386/pr117839-1b.c: Likewise. > > * gcc.target/i386/pr117839-2.c: Likewise. > > * gcc.target/i386/pr92080-1.c: Likewise. 
> > * gcc.target/i386/pr92080-2.c: Likewise. > > * gcc.target/i386/pr92080-3.c: Likewise. > > > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com> > > --- > > gcc/config/i386/i386-features.cc | 308 ++++++++++++++++++++ > > gcc/config/i386/i386-passes.def | 1 + > > gcc/config/i386/i386-protos.h | 2 + > > gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++ > > gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 + > > gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++ > > gcc/testsuite/gcc.target/i386/pr92080-1.c | 54 ++++ > > gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 ++++ > > gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 +++ > > 9 files changed, 552 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c > > > > diff --git a/gcc/config/i386/i386-features.cc > > b/gcc/config/i386/i386-features.cc > > index 003b003e09c..7d8d260750d 100644 > > --- a/gcc/config/i386/i386-features.cc > > +++ b/gcc/config/i386/i386-features.cc > > @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency > > (gcc::context *ctxt) > > return new pass_remove_partial_avx_dependency (ctxt); > > } > > > > +static bool > > +ix86_rrvl_gate () > > +{ > > + return (TARGET_SSE2 > > + && optimize > > + && optimize_function_for_speed_p (cfun)); > > +} > > + > > +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator > > + for basic block map BBS, which is in the fake loop that contains the > > + whole function, so that there is only a single vector set in the > > + whole function. 
*/ > > + > > +static void > > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs) > > +{ > > + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); > > + while (bb->loop_father->latch > > + != EXIT_BLOCK_PTR_FOR_FN (cfun)) > > + bb = get_immediate_dominator (CDI_DOMINATORS, > > + bb->loop_father->header); > > + > > + rtx set = gen_rtx_SET (dest, src); > > + > > + rtx_insn *insn = BB_HEAD (bb); > > + while (insn && !NONDEBUG_INSN_P (insn)) > > + { > > + if (insn == BB_END (bb)) > > + { > > + insn = NULL; > > + break; > > + } > > + insn = NEXT_INSN (insn); > > + } > > + > > + rtx_insn *set_insn; > > + if (insn == BB_HEAD (bb)) > > + set_insn = emit_insn_before (set, insn); > > + else > > + set_insn = emit_insn_after (set, > > + insn ? PREV_INSN (insn) : BB_END (bb)); > > + df_insn_rescan (set_insn); > > +} > > + > > +/* Return a machine mode suitable for vector SIZE. */ > > + > > +static machine_mode > > +ix86_get_vector_load_mode (unsigned int size) > > +{ > > + machine_mode mode; > > + if (size == 64) > > + mode = V64QImode; > > + else if (size == 32) > > + mode = V32QImode; > > + else > > + mode = V16QImode; > > + return mode; > > +} > > + > > +/* At entry of the nearest common dominator for basic blocks with vector > > + CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest > > + vector set instruction for all CONST0_RTX and integer CONSTM1_RTX > > + uses. > > + > > + NB: We want to generate only a single widest vector set to cover the > > + whole function. The LCM algorithm isn't appropriate here since it > > + may place a vector set inside the loop. */ > > + > > +static unsigned int > > +remove_redundant_vector_load (void) > > +{ > > + timevar_push (TV_MACH_DEP); > > + > > + bitmap_obstack_initialize (NULL); > > + bitmap zero_bbs = BITMAP_ALLOC (NULL); > > + bitmap m1_bbs = BITMAP_ALLOC (NULL); > > + bitmap vector_insns = BITMAP_ALLOC (NULL); Use auto_bitmap? 
> > + > > + basic_block bb; > > + rtx_insn *insn; > > + rtx set; > > + unsigned HOST_WIDE_INT zero_count = 0; > > + unsigned HOST_WIDE_INT m1_count = 0; > > + unsigned int zero_size = 0; > > + unsigned int m1_size = 0; > > + > > + df_set_flags (DF_DEFER_INSN_RESCAN); > > + > > + FOR_EACH_BB_FN (bb, cfun) > > + { > > + FOR_BB_INSNS (bb, insn) > > + { > > + if (!NONDEBUG_INSN_P (insn)) > > + continue; > > + > > + set = single_set (insn); > > + if (!set) > > + continue; > > + > > + rtx dest = SET_DEST (set); > > + machine_mode mode = GET_MODE (dest); > > + /* Skip non-vector instruction. */ > > + if (!VECTOR_MODE_P (mode)) > > + continue; > > + > > + rtx src = SET_SRC (set); > > + if (!REG_P (dest) > > + || (src != CONST0_RTX (mode) > > + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > + && src == CONSTM1_RTX (mode)))) > > + { > > + /* Record non-CONST0_RTX/CONSTM1_RTX vector instruction. */ vector_insns only records single_set insns, so it does not cover every vector insn that could use constm1/zero. > > + bitmap_set_bit (vector_insns, INSN_UID (insn)); > > + continue; > > + } > > + > > + if (src == CONST0_RTX (mode)) > > + { > > + /* Record the maximum vector size. */ > > + if (zero_size < GET_MODE_SIZE (mode)) > > + zero_size = GET_MODE_SIZE (mode); > > + > > + /* Record the basic block with CONST0_RTX. */ > > + bitmap_set_bit (zero_bbs, bb->index); > > + zero_count++; > > + } > > + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > + && src == CONSTM1_RTX (mode)) > > + { > > + /* Record the maximum vector size. */ > > + if (m1_size < GET_MODE_SIZE (mode)) > > + m1_size = GET_MODE_SIZE (mode); > > + > > + /* Record the basic block with CONSTM1_RTX. 
*/ > > + bitmap_set_bit (m1_bbs, bb->index); > > + m1_count++; > > + } > > + } > > + } > > + > > + if (zero_count > 1 || m1_count > 1) > > + { > > + machine_mode zero_mode, m1_mode; > > + rtx vector_const0, vector_constm1; > > + if (zero_count > 1) > > + { > > + zero_mode = ix86_get_vector_load_mode (zero_size); > > + vector_const0 = gen_reg_rtx (zero_mode); > > + } > > + else > > + { > > + zero_mode = VOIDmode; > > + vector_const0 = nullptr; > > + } > > + if (m1_count > 1) > > + { > > + m1_mode = ix86_get_vector_load_mode (m1_size); > > + vector_constm1 = gen_reg_rtx (m1_mode); > > + } > > + else > > + { > > + m1_mode = VOIDmode; > > + vector_constm1 = nullptr; > > + } > > + > > + bool zero_replaced = false; > > + bool m1_replaced = false; > > + > > + bitmap_iterator bi; > > + unsigned id; > > + EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi) Could we just record those zero/m1 insns and replace the src of those insns with subreg (...)? I think LRA can eliminate those redundant moves. Also, would we need to change ix86_modes_tieable_p to make sure those inserted subregs can be handled by LRA and other passes? > > + { > > + /* Replace CONST0_RTX and integer CONSTM1_RTX with the single > > + CONST0_RTX and integer CONSTM1_RTX register. */ > > + df_ref ref, def; > > + insn = DF_INSN_UID_GET (id)->insn; > > + bool replaced = false; > > + > > + for (ref = DF_INSN_UID_USES (id); > > + ref; > > + ref = DF_REF_NEXT_LOC (ref)) > > + { > > + if (DF_REF_TYPE (ref) != DF_REF_REG_USE) > > + continue; > > + > > + /* Skip non-vector register. */ > > + rtx reg = DF_REF_REG (ref); > > + if (!VECTOR_MODE_P (GET_MODE (reg))) > > + continue; > > + > > + /* Check the single definition. */ > > + def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); > > + if (!def || DF_REF_NEXT_REG (def) != nullptr) > > + continue; > > + > > + /* Get the single definition. 
*/ > > + rtx_insn *def_insn = DF_REF_INSN (def); > > + set = single_set (def_insn); > > + if (!set) > > + continue; > > + > > + /* Check the single definition of vector constant zero. */ > > + rtx src = SET_SRC (set); > > + rtx replace; > > + if (vector_const0 && src == CONST0_RTX (GET_MODE (src))) > > + { > > + /* Replace REG with VECTOR_CONST0. */ > > + if (SUBREG_P (reg) || GET_MODE (reg) == zero_mode) > > + replace = vector_const0; > > + else > > + replace = gen_rtx_SUBREG (GET_MODE (reg), > > + vector_const0, 0); > > + *DF_REF_REAL_LOC (ref) = replace; > > + replaced = true; > > + zero_replaced = true; > > + } > > + else if (vector_constm1 > > + && src == CONSTM1_RTX (GET_MODE (src))) > > + { > > + /* Replace REG with VECTOR_CONSTM1. */ > > + if (SUBREG_P (reg) || GET_MODE (reg) == m1_mode) > > + replace = vector_constm1; > > + else > > + replace = gen_rtx_SUBREG (GET_MODE (reg), > > + vector_constm1, 0); > > + *DF_REF_REAL_LOC (ref) = replace; > > + replaced = true; > > + m1_replaced = true; > > + } > > + } > > + > > + if (replaced) > > + df_insn_rescan (insn); > > + } > > + > > + /* (Re-)discover loops so that bb->loop_father can be used in the > > + analysis below. 
*/ > > + calculate_dominance_info (CDI_DOMINATORS); > > + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > > + > > + if (zero_replaced) > > + ix86_place_single_vector_set (vector_const0, > > + CONST0_RTX (zero_mode), > > + zero_bbs); > > + > > + if (m1_replaced) > > + ix86_place_single_vector_set (vector_constm1, > > + CONSTM1_RTX (m1_mode), > > + m1_bbs); > > + > > + loop_optimizer_finalize (); > > + > > + df_process_deferred_rescans (); > > + } > > + > > + df_clear_flags (DF_DEFER_INSN_RESCAN); > > + > > + bitmap_obstack_release (NULL); > > + BITMAP_FREE (zero_bbs); > > + BITMAP_FREE (m1_bbs); > > + BITMAP_FREE (vector_insns); > > + > > + timevar_pop (TV_MACH_DEP); > > + return 0; > > +} > > + > > +namespace { > > + > > +const pass_data pass_data_remove_redundant_vector_load = > > +{ > > + RTL_PASS, /* type */ > > + "rrvl", /* name */ > > + OPTGROUP_NONE, /* optinfo_flags */ > > + TV_MACH_DEP, /* tv_id */ > > + 0, /* properties_required */ > > + 0, /* properties_provided */ > > + 0, /* properties_destroyed */ > > + 0, /* todo_flags_start */ > > + 0, /* todo_flags_finish */ > > +}; > > + > > +class pass_remove_redundant_vector_load : public rtl_opt_pass > > +{ > > +public: > > + pass_remove_redundant_vector_load (gcc::context *ctxt) > > + : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) > > + {} > > + > > + /* opt_pass methods: */ > > + bool gate (function *) final override > > + { > > + return ix86_rrvl_gate (); > > + } > > + > > + unsigned int execute (function *) final override > > + { > > + return remove_redundant_vector_load (); > > + } > > +}; // class pass_remove_redundant_vector_load > > + > > +} // anon namespace > > + > > +rtl_opt_pass * > > +make_pass_remove_redundant_vector_load (gcc::context *ctxt) > > +{ > > + return new pass_remove_redundant_vector_load (ctxt); > > +} > > + > > /* Convert legacy instructions that clobbers EFLAGS to APX_NF > > instructions when there are no flag set between a flag > > producer and user. 
*/ > > diff --git a/gcc/config/i386/i386-passes.def > > b/gcc/config/i386/i386-passes.def > > index a9d350dcfca..df424cdb9c7 100644 > > --- a/gcc/config/i386/i386-passes.def > > +++ b/gcc/config/i386/i386-passes.def > > @@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see > > PR116174. */ > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > > > > + INSERT_PASS_AFTER (pass_late_combine, 1, > > pass_remove_redundant_vector_load); > > INSERT_PASS_AFTER (pass_late_combine, 1, > > pass_remove_partial_avx_dependency); > > INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > index 78e72c50c6d..4c3a8bd326c 100644 > > --- a/gcc/config/i386/i386-protos.h > > +++ b/gcc/config/i386/i386-protos.h > > @@ -426,6 +426,8 @@ extern rtl_opt_pass > > *make_pass_insert_endbr_and_patchable_area > > (gcc::context *); > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > > (gcc::context *); > > +extern rtl_opt_pass *make_pass_remove_redundant_vector_load > > + (gcc::context *); > > extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); > > extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c > > b/gcc/testsuite/gcc.target/i386/pr117839-1a.c > > new file mode 100644 > > index 00000000000..4501cfbcad4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c > > @@ -0,0 +1,35 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > > + > > +#include <stddef.h> > > + > > +void > > +clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2) > > +{ > > + size_t *d1 = (size_t *) mem1; > > + > > + *(d1 + 0) = 0; > > + *(d1 + 1) = 0; > > + *(d1 + 2) = 0; > > + if (nclears1 > 3) > > + { > > + *(d1 + nclears1 - 
4) = 0; > > + *(d1 + nclears1 - 4 + 1) = 0; > > + *(d1 + nclears1 - 4 + 2) = 0; > > + *(d1 + nclears1 - 4 + 3) = 0; > > + } > > + > > + double *d2 = (double *) mem2; > > + > > + *(d2 + 0) = 0; > > + *(d2 + 1) = 0; > > + *(d2 + 2) = 0; > > + if (nclears2 > 3) > > + { > > + *(d2 + nclears2 - 4) = 0; > > + *(d2 + nclears2 - 4 + 1) = 0; > > + *(d2 + nclears2 - 4 + 2) = 0; > > + *(d2 + nclears2 - 4 + 3) = 0; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c > > b/gcc/testsuite/gcc.target/i386/pr117839-1b.c > > new file mode 100644 > > index 00000000000..e71b991a207 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c > > @@ -0,0 +1,5 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v3" } */ > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > > + > > +#include "pr117839-1a.c" > > diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c > > b/gcc/testsuite/gcc.target/i386/pr117839-2.c > > new file mode 100644 > > index 00000000000..c76744cf98b > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c > > @@ -0,0 +1,40 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v3" } */ > > +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t > > \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ > > + > > +#include <stddef.h> > > + > > +float > > +clear_memory (void *mem, size_t clearsize) > > +{ > > + size_t *d = (size_t *) mem; > > + size_t nclears = clearsize / sizeof (size_t); > > + > > + *(d + 0) = 0; > > + *(d + 1) = 0; > > + *(d + 2) = 0; > > + if (nclears > 9) > > + { > > + *(d + 5) = 0; > > + *(d + 5 + 1) = 0; > > + *(d + 5 + 2) = 0; > > + *(d + 5 + 3) = 0; > > + *(d + nclears - 8) = 0; > > + *(d + nclears - 8 + 1) = 0; > > + *(d + nclears - 8 + 2) = 0; > > + *(d + nclears - 8 + 3) = 0; > > + } > > + else > > + { > > + *(d + 1) = 0; > > + *(d + 2) = 0; > > + *(d + 3) = 0; > > + *(d + 4) = 0; > > + *(d + nclears - 4) = 0; > > + *(d + 
nclears - 4 + 1) = 0; > > + *(d + nclears - 4 + 2) = 0; > > + *(d + nclears - 4 + 3) = 0; > > + } > > + > > + return nclears; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c > > b/gcc/testsuite/gcc.target/i386/pr92080-1.c > > new file mode 100644 > > index 00000000000..7059b4514eb > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c > > @@ -0,0 +1,54 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v3" } */ > > +/* { dg-final { scan-assembler-times "vpxor" 2 } } */ > > +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef short v8hi __attribute__((vector_size(16))); > > +typedef short v16hi __attribute__((vector_size(32))); > > +typedef char v16qi __attribute__((vector_size(16))); > > +typedef char v32qi __attribute__((vector_size(32))); > > + > > +v16qi b1; > > +v8hi h1; > > +v4si s1; > > +v32qi b2; > > +v16hi h2; > > +v8si s2; > > + > > +void > > +foo () > > +{ > > + s1 = __extension__(v4si){0, 0, 0, 0}; > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0}; > > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0}; > > +} > > + > > +void > > +foo1 () > > +{ > > + s1 = __extension__(v4si){-1, -1, -1, -1}; > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1}; > > +} > > + > > + > > +void > > +foo2 () > > +{ > > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0}; > > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > > +} > > + > > +void > 
> +foo3 () > > +{ > > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1}; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c > > b/gcc/testsuite/gcc.target/i386/pr92080-2.c > > new file mode 100644 > > index 00000000000..d160d90de53 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c > > @@ -0,0 +1,59 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v3" } */ > > +/* { dg-final { scan-assembler-times "vpxor" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef short v8hi __attribute__((vector_size(16))); > > +typedef short v16hi __attribute__((vector_size(32))); > > +typedef char v16qi __attribute__((vector_size(16))); > > +typedef char v32qi __attribute__((vector_size(32))); > > + > > +v16qi b1; > > +v8hi h1; > > +v4si s1; > > +v32qi b2; > > +v16hi h2; > > +v8si s2; > > + > > +void > > +foo (int i, int j) > > +{ > > + switch (i) > > + { > > + case 1: > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + s1 = __extension__(v4si){0, 0, 0, 0}; > > + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; > > + break; > > + case 2: > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1, -1}; > > + break; > > + case 3: > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0}; > > + break; > > + default: > > + break; > > + } > > + > > + switch (i) > > + { > > + case 1: > > + s1 = __extension__(v4si){-1, -1, -1, -1}; > > + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0, > > + 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; > > + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0}; > > + break; > > + case 2: > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0}; > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + break; > > + case 3: > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1, -1}; > > + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; > > + break; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c > > b/gcc/testsuite/gcc.target/i386/pr92080-3.c > > new file mode 100644 > > index 00000000000..2174def4e6d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c > > @@ -0,0 +1,48 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64" } */ > > +/* { dg-final { scan-assembler-times "pxor" 1 } } */ > > +/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef short v8hi __attribute__((vector_size(16))); > > +typedef char v16qi __attribute__((vector_size(16))); > > + > > +v16qi b1; > > +v8hi h1; > > +v4si s1; > > + > > +void > > +foo (int i, int j) > > +{ > > + switch (i) > > + { > > + case 1: > > + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + s1 = __extension__(v4si){0, 0, 0, 0}; > > + break; > > + case 2: > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1, -1}; > > + break; > > + case 3: > > + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0}; > > + break; > > + default: > > + break; > > + } > > + > > + switch (i) > > + { > > + case 1: > > + s1 = __extension__(v4si){-1, -1, -1, -1}; > > + break; > > + case 2: > > + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > 0, 0}; > > 
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; > > + break; > > + case 3: > > + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, > > -1, -1, -1, -1, -1, -1}; > > + break; > > + } > > +} > > -- > > 2.47.1 > > > > OK for master? > > -- > H.J.
-- BR, Hongtao