For all different modes of all 0s/1s vectors, we can use a single widest all 0s/1s vector register for all 0s/1s vector uses in the whole function. Add a pass to generate a single widest all 0s/1s vector set instruction at the entry of the nearest common dominator for basic blocks with all 0s/1s vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number of vector xor instructions from 4803 to 4714 and the number of pcmpeq instructions from 144 to 142.
This change causes a regression (FAIL: gcc.dg/rtl/x86_64/vector_eq.c) without the fix for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863. NB: PR target/92080 and PR target/117839 aren't the same. PR target/117839 is for vectors of all 0s and all 1s with different sizes and different components. PR target/92080 is for broadcast of the same component to different vector sizes. This patch covers only the all 0s and all 1s cases of PR target/92080. gcc/ PR target/92080 PR target/117839 * config/i386/i386-features.cc (ix86_rrvl_gate): New. (ix86_place_single_vector_set): Likewise. (ix86_get_vector_load_mode): Likewise. (remove_redundant_vector_load): Likewise. (pass_data_remove_redundant_vector_load): Likewise. (pass_remove_redundant_vector_load): Likewise. (make_pass_remove_redundant_vector_load): Likewise. * config/i386/i386-passes.def: Add pass_remove_redundant_vector_load after pass_remove_partial_avx_dependency. * config/i386/i386-protos.h (make_pass_remove_redundant_vector_load): New. gcc/testsuite/ PR target/92080 PR target/117839 * gcc.target/i386/pr117839-1a.c: New test. * gcc.target/i386/pr117839-1b.c: Likewise. * gcc.target/i386/pr117839-2.c: Likewise. * gcc.target/i386/pr92080-1.c: Likewise. * gcc.target/i386/pr92080-2.c: Likewise. * gcc.target/i386/pr92080-3.c: Likewise. Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386-features.cc | 308 ++++++++++++++++++++ gcc/config/i386/i386-passes.def | 1 + gcc/config/i386/i386-protos.h | 2 + gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++ gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 + gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++ gcc/testsuite/gcc.target/i386/pr92080-1.c | 54 ++++ gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 ++++ gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 +++ 9 files changed, 552 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 003b003e09c..7d8d260750d 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } +static bool +ix86_rrvl_gate () +{ + return (TARGET_SSE2 + && optimize + && optimize_function_for_speed_p (cfun)); +} + +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator + for basic block map BBS, which is in the fake loop that contains the + whole function, so that there is only a single vector set in the + whole function. 
*/ + +static void +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs) +{ + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + rtx set = gen_rtx_SET (dest, src); + + rtx_insn *insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + + rtx_insn *set_insn; + if (insn == BB_HEAD (bb)) + set_insn = emit_insn_before (set, insn); + else + set_insn = emit_insn_after (set, + insn ? PREV_INSN (insn) : BB_END (bb)); + df_insn_rescan (set_insn); +} + +/* Return a machine mode suitable for vector SIZE. */ + +static machine_mode +ix86_get_vector_load_mode (unsigned int size) +{ + machine_mode mode; + if (size == 64) + mode = V64QImode; + else if (size == 32) + mode = V32QImode; + else + mode = V16QImode; + return mode; +} + +/* At entry of the nearest common dominator for basic blocks with vector + CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest + vector set instruction for all CONST0_RTX and integer CONSTM1_RTX + uses. + + NB: We want to generate only a single widest vector set to cover the + whole function. The LCM algorithm isn't appropriate here since it + may place a vector set inside the loop. 
*/ + +static unsigned int +remove_redundant_vector_load (void) +{ + timevar_push (TV_MACH_DEP); + + bitmap_obstack_initialize (NULL); + bitmap zero_bbs = BITMAP_ALLOC (NULL); + bitmap m1_bbs = BITMAP_ALLOC (NULL); + bitmap vector_insns = BITMAP_ALLOC (NULL); + + basic_block bb; + rtx_insn *insn; + rtx set; + unsigned HOST_WIDE_INT zero_count = 0; + unsigned HOST_WIDE_INT m1_count = 0; + unsigned int zero_size = 0; + unsigned int m1_size = 0; + + df_set_flags (DF_DEFER_INSN_RESCAN); + + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + set = single_set (insn); + if (!set) + continue; + + rtx dest = SET_DEST (set); + machine_mode mode = GET_MODE (dest); + /* Skip non-vector instruction. */ + if (!VECTOR_MODE_P (mode)) + continue; + + rtx src = SET_SRC (set); + if (!REG_P (dest) + || (src != CONST0_RTX (mode) + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && src == CONSTM1_RTX (mode)))) + { + /* Record non-CONST0_RTX/CONSTM1_RTX vector instruction. */ + bitmap_set_bit (vector_insns, INSN_UID (insn)); + continue; + } + + if (src == CONST0_RTX (mode)) + { + /* Record the maximum vector size. */ + if (zero_size < GET_MODE_SIZE (mode)) + zero_size = GET_MODE_SIZE (mode); + + /* Record the basic block with CONST0_RTX. */ + bitmap_set_bit (zero_bbs, bb->index); + zero_count++; + } + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && src == CONSTM1_RTX (mode)) + { + /* Record the maximum vector size. */ + if (m1_size < GET_MODE_SIZE (mode)) + m1_size = GET_MODE_SIZE (mode); + + /* Record the basic block with CONSTM1_RTX. 
*/ + bitmap_set_bit (m1_bbs, bb->index); + m1_count++; + } + } + } + + if (zero_count > 1 || m1_count > 1) + { + machine_mode zero_mode, m1_mode; + rtx vector_const0, vector_constm1; + if (zero_count > 1) + { + zero_mode = ix86_get_vector_load_mode (zero_size); + vector_const0 = gen_reg_rtx (zero_mode); + } + else + { + zero_mode = VOIDmode; + vector_const0 = nullptr; + } + if (m1_count > 1) + { + m1_mode = ix86_get_vector_load_mode (m1_size); + vector_constm1 = gen_reg_rtx (m1_mode); + } + else + { + m1_mode = VOIDmode; + vector_constm1 = nullptr; + } + + bool zero_replaced = false; + bool m1_replaced = false; + + bitmap_iterator bi; + unsigned id; + EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi) + { + /* Replace CONST0_RTX and integer CONSTM1_RTX with the single + CONST0_RTX and integer CONSTM1_RTX register. */ + df_ref ref, def; + insn = DF_INSN_UID_GET (id)->insn; + bool replaced = false; + + for (ref = DF_INSN_UID_USES (id); + ref; + ref = DF_REF_NEXT_LOC (ref)) + { + if (DF_REF_TYPE (ref) != DF_REF_REG_USE) + continue; + + /* Skip non-vector register. */ + rtx reg = DF_REF_REG (ref); + if (!VECTOR_MODE_P (GET_MODE (reg))) + continue; + + /* Check the single definition. */ + def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); + if (!def || DF_REF_NEXT_REG (def) != nullptr) + continue; + + /* Get the single definition. */ + rtx_insn *def_insn = DF_REF_INSN (def); + set = single_set (def_insn); + if (!set) + continue; + + /* Check the single definition of vector constant zero. */ + rtx src = SET_SRC (set); + rtx replace; + if (vector_const0 && src == CONST0_RTX (GET_MODE (src))) + { + /* Replace REG with VECTOR_CONST0. */ + if (SUBREG_P (reg) || GET_MODE (reg) == zero_mode) + replace = vector_const0; + else + replace = gen_rtx_SUBREG (GET_MODE (reg), + vector_const0, 0); + *DF_REF_REAL_LOC (ref) = replace; + replaced = true; + zero_replaced = true; + } + else if (vector_constm1 + && src == CONSTM1_RTX (GET_MODE (src))) + { + /* Replace REG with VECTOR_CONSTM1. 
*/ + if (SUBREG_P (reg) || GET_MODE (reg) == m1_mode) + replace = vector_constm1; + else + replace = gen_rtx_SUBREG (GET_MODE (reg), + vector_constm1, 0); + *DF_REF_REAL_LOC (ref) = replace; + replaced = true; + m1_replaced = true; + } + } + + if (replaced) + df_insn_rescan (insn); + } + + /* (Re-)discover loops so that bb->loop_father can be used in the + analysis below. */ + calculate_dominance_info (CDI_DOMINATORS); + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + + if (zero_replaced) + ix86_place_single_vector_set (vector_const0, + CONST0_RTX (zero_mode), + zero_bbs); + + if (m1_replaced) + ix86_place_single_vector_set (vector_constm1, + CONSTM1_RTX (m1_mode), + m1_bbs); + + loop_optimizer_finalize (); + + df_process_deferred_rescans (); + } + + df_clear_flags (DF_DEFER_INSN_RESCAN); + + bitmap_obstack_release (NULL); + BITMAP_FREE (zero_bbs); + BITMAP_FREE (m1_bbs); + BITMAP_FREE (vector_insns); + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_remove_redundant_vector_load = +{ + RTL_PASS, /* type */ + "rrvl", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_remove_redundant_vector_load : public rtl_opt_pass +{ +public: + pass_remove_redundant_vector_load (gcc::context *ctxt) + : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *) final override + { + return ix86_rrvl_gate (); + } + + unsigned int execute (function *) final override + { + return remove_redundant_vector_load (); + } +}; // class pass_remove_redundant_vector_load + +} // anon namespace + +rtl_opt_pass * +make_pass_remove_redundant_vector_load (gcc::context *ctxt) +{ + return new pass_remove_redundant_vector_load (ctxt); +} + /* Convert legacy instructions that clobbers EFLAGS to APX_NF instructions 
when there are no flag set between a flag producer and user. */ diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index a9d350dcfca..df424cdb9c7 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see PR116174. */ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load); INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 78e72c50c6d..4c3a8bd326c 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -426,6 +426,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); +extern rtl_opt_pass *make_pass_remove_redundant_vector_load + (gcc::context *); extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c new file mode 100644 index 00000000000..4501cfbcad4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ + +#include <stddef.h> + +void +clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2) +{ + size_t *d1 = (size_t *) mem1; + + *(d1 + 0) = 0; + *(d1 + 1) = 0; + *(d1 + 2) = 0; + if (nclears1 > 3) + { + *(d1 + nclears1 - 4) = 0; + *(d1 + nclears1 - 4 + 1) = 0; + *(d1 + nclears1 - 4 + 2) = 0; + *(d1 + nclears1 - 4 + 3) = 0; + } + + double *d2 = (double *) mem2; + + *(d2 
+ 0) = 0; + *(d2 + 1) = 0; + *(d2 + 2) = 0; + if (nclears2 > 3) + { + *(d2 + nclears2 - 4) = 0; + *(d2 + nclears2 - 4 + 1) = 0; + *(d2 + nclears2 - 4 + 2) = 0; + *(d2 + nclears2 - 4 + 3) = 0; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c new file mode 100644 index 00000000000..e71b991a207 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3" } */ +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ + +#include "pr117839-1a.c" diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c new file mode 100644 index 00000000000..c76744cf98b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3" } */ +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */ + +#include <stddef.h> + +float +clear_memory (void *mem, size_t clearsize) +{ + size_t *d = (size_t *) mem; + size_t nclears = clearsize / sizeof (size_t); + + *(d + 0) = 0; + *(d + 1) = 0; + *(d + 2) = 0; + if (nclears > 9) + { + *(d + 5) = 0; + *(d + 5 + 1) = 0; + *(d + 5 + 2) = 0; + *(d + 5 + 3) = 0; + *(d + nclears - 8) = 0; + *(d + nclears - 8 + 1) = 0; + *(d + nclears - 8 + 2) = 0; + *(d + nclears - 8 + 3) = 0; + } + else + { + *(d + 1) = 0; + *(d + 2) = 0; + *(d + 3) = 0; + *(d + 4) = 0; + *(d + nclears - 4) = 0; + *(d + nclears - 4 + 1) = 0; + *(d + nclears - 4 + 2) = 0; + *(d + nclears - 4 + 3) = 0; + } + + return nclears; +} diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c new file mode 100644 index 00000000000..7059b4514eb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3" } */ +/* { dg-final { scan-assembler-times 
"vpxor" 2 } } */ +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v8si __attribute__((vector_size(32))); +typedef short v8hi __attribute__((vector_size(16))); +typedef short v16hi __attribute__((vector_size(32))); +typedef char v16qi __attribute__((vector_size(16))); +typedef char v32qi __attribute__((vector_size(32))); + +v16qi b1; +v8hi h1; +v4si s1; +v32qi b2; +v16hi h2; +v8si s2; + +void +foo () +{ + s1 = __extension__(v4si){0, 0, 0, 0}; + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +} + +void +foo1 () +{ + s1 = __extension__(v4si){-1, -1, -1, -1}; + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; +} + + +void +foo2 () +{ + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +} + +void +foo3 () +{ + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; +} diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c new file mode 100644 index 00000000000..d160d90de53 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3" } */ +/* { dg-final { scan-assembler-times "vpxor" 1 } } */ +/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int 
v8si __attribute__((vector_size(32))); +typedef short v8hi __attribute__((vector_size(16))); +typedef short v16hi __attribute__((vector_size(32))); +typedef char v16qi __attribute__((vector_size(16))); +typedef char v32qi __attribute__((vector_size(32))); + +v16qi b1; +v8hi h1; +v4si s1; +v32qi b2; +v16hi h2; +v8si s2; + +void +foo (int i, int j) +{ + switch (i) + { + case 1: + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + s1 = __extension__(v4si){0, 0, 0, 0}; + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; + break; + case 2: + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + break; + case 3: + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + break; + default: + break; + } + + switch (i) + { + case 1: + s1 = __extension__(v4si){-1, -1, -1, -1}; + b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + break; + case 2: + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + break; + case 3: + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; + break; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c new file mode 100644 index 00000000000..2174def4e6d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64" } */ +/* { dg-final { scan-assembler-times "pxor" 1 } } */ +/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef short v8hi 
__attribute__((vector_size(16))); +typedef char v16qi __attribute__((vector_size(16))); + +v16qi b1; +v8hi h1; +v4si s1; + +void +foo (int i, int j) +{ + switch (i) + { + case 1: + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + s1 = __extension__(v4si){0, 0, 0, 0}; + break; + case 2: + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + break; + case 3: + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + break; + default: + break; + } + + switch (i) + { + case 1: + s1 = __extension__(v4si){-1, -1, -1, -1}; + break; + case 2: + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + break; + case 3: + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + break; + } +} -- 2.47.1