On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> All-0s and all-1s vectors of different modes can share one register: the
> single widest all-0s/all-1s vector register can serve every all-0s/all-1s
> vector use in the whole function.  Add a pass to generate a single widest
> all-0s/all-1s vector set instruction at the entry of the nearest common
> dominator of the basic blocks with all-0s/all-1s vector uses.  On
> Linux/x86-64, in cc1plus, this patch reduces the number of vector xor
> instructions from 4803 to 4714 and pcmpeq instructions from 144 to 142.
>
> This change causes a regression:
>
> FAIL: gcc.dg/rtl/x86_64/vector_eq.c
>
> without the fix for
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863
>
> NB: PR target/92080 and PR target/117839 aren't the same.  PR target/117839
> is for vectors of all 0s and all 1s with different sizes and different
> components.  PR target/92080 is for broadcast of the same component to
> different vector sizes.  This patch covers only all 0s and all 1s cases
> of PR target/92080.
>
> gcc/
>
>         PR target/92080
>         PR target/117839
>         * config/i386/i386-features.cc (ix86_rrvl_gate): New.
>         (ix86_place_single_vector_set): Likewise.
>         (ix86_get_vector_load_mode): Likewise.
>         (remove_redundant_vector_load): Likewise.
>         (pass_data_remove_redundant_vector_load): Likewise.
>         (pass_remove_redundant_vector_load): Likewise.
>         (make_pass_remove_redundant_vector_load): Likewise.
>         * config/i386/i386-passes.def: Add
>         pass_remove_redundant_vector_load after
>         pass_remove_partial_avx_dependency.
>         * config/i386/i386-protos.h
>         (make_pass_remove_redundant_vector_load): New.
>
> gcc/testsuite/
>
>         PR target/92080
>         PR target/117839
>         * gcc.target/i386/pr117839-1a.c: New test.
>         * gcc.target/i386/pr117839-1b.c: Likewise.
>         * gcc.target/i386/pr117839-2.c: Likewise.
>         * gcc.target/i386/pr92080-1.c: Likewise.
>         * gcc.target/i386/pr92080-2.c: Likewise.
>         * gcc.target/i386/pr92080-3.c: Likewise.
>
> Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> ---
>  gcc/config/i386/i386-features.cc            | 308 ++++++++++++++++++++
>  gcc/config/i386/i386-passes.def             |   1 +
>  gcc/config/i386/i386-protos.h               |   2 +
>  gcc/testsuite/gcc.target/i386/pr117839-1a.c |  35 +++
>  gcc/testsuite/gcc.target/i386/pr117839-1b.c |   5 +
>  gcc/testsuite/gcc.target/i386/pr117839-2.c  |  40 +++
>  gcc/testsuite/gcc.target/i386/pr92080-1.c   |  54 ++++
>  gcc/testsuite/gcc.target/i386/pr92080-2.c   |  59 ++++
>  gcc/testsuite/gcc.target/i386/pr92080-3.c   |  48 +++
>  9 files changed, 552 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
>
> diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
> index 003b003e09c..7d8d260750d 100644
> --- a/gcc/config/i386/i386-features.cc
> +++ b/gcc/config/i386/i386-features.cc
> @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
>    return new pass_remove_partial_avx_dependency (ctxt);
>  }
>
> +static bool
> +ix86_rrvl_gate ()
> +{
> +  return (TARGET_SSE2
> +         && optimize
> +         && optimize_function_for_speed_p (cfun));
> +}
> +
> +/* Generate a vector set, DEST = SRC, at the entry of the nearest common
> +   dominator of the basic blocks in bitmap BBS that is in the fake loop
> +   containing the whole function, so that there is only a single vector
> +   set in the whole function.  */
> +
> +static void
> +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> +{
> +  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
> +  while (bb->loop_father->latch
> +        != EXIT_BLOCK_PTR_FOR_FN (cfun))
> +    bb = get_immediate_dominator (CDI_DOMINATORS,
> +                                 bb->loop_father->header);
> +
> +  rtx set = gen_rtx_SET (dest, src);
> +
> +  rtx_insn *insn = BB_HEAD (bb);
> +  while (insn && !NONDEBUG_INSN_P (insn))
> +    {
> +      if (insn == BB_END (bb))
> +       {
> +         insn = NULL;
> +         break;
> +       }
> +      insn = NEXT_INSN (insn);
> +    }
> +
> +  rtx_insn *set_insn;
> +  if (insn == BB_HEAD (bb))
> +    set_insn = emit_insn_before (set, insn);
> +  else
> +    set_insn = emit_insn_after (set,
> +                               insn ? PREV_INSN (insn) : BB_END (bb));
> +  df_insn_rescan (set_insn);
> +}
> +
> +/* Return a machine mode suitable for vector SIZE.  */
> +
> +static machine_mode
> +ix86_get_vector_load_mode (unsigned int size)
> +{
> +  machine_mode mode;
> +  if (size == 64)
> +    mode = V64QImode;
> +  else if (size == 32)
> +    mode = V32QImode;
> +  else
> +    mode = V16QImode;
> +  return mode;
> +}
> +
> +/* At entry of the nearest common dominator for basic blocks with vector
> +   CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
> +   vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
> +   uses.
> +
> +   NB: We want to generate only a single widest vector set to cover the
> +   whole function.  The LCM algorithm isn't appropriate here since it
> +   may place a vector set inside the loop.  */
> +
> +static unsigned int
> +remove_redundant_vector_load (void)
> +{
> +  timevar_push (TV_MACH_DEP);
> +
> +  bitmap_obstack_initialize (NULL);
> +  bitmap zero_bbs = BITMAP_ALLOC (NULL);
> +  bitmap m1_bbs = BITMAP_ALLOC (NULL);
> +  bitmap vector_insns = BITMAP_ALLOC (NULL);
> +
> +  basic_block bb;
> +  rtx_insn *insn;
> +  rtx set;
> +  unsigned HOST_WIDE_INT zero_count = 0;
> +  unsigned HOST_WIDE_INT m1_count = 0;
> +  unsigned int zero_size = 0;
> +  unsigned int m1_size = 0;
> +
> +  df_set_flags (DF_DEFER_INSN_RESCAN);
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      FOR_BB_INSNS (bb, insn)
> +       {
> +         if (!NONDEBUG_INSN_P (insn))
> +           continue;
> +
> +         set = single_set (insn);
> +         if (!set)
> +           continue;
> +
> +         rtx dest = SET_DEST (set);
> +         machine_mode mode = GET_MODE (dest);
> +         /* Skip non-vector instruction.  */
> +         if (!VECTOR_MODE_P (mode))
> +           continue;
> +
> +         rtx src = SET_SRC (set);
> +         if (!REG_P (dest)
> +             || (src != CONST0_RTX (mode)
> +                 && !(GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> +                      && src == CONSTM1_RTX (mode))))
> +           {
> +             /* Record non-CONST0_RTX/CONSTM1_RTX vector instruction.  */
> +             bitmap_set_bit (vector_insns, INSN_UID (insn));
> +             continue;
> +           }
> +
> +         if (src == CONST0_RTX (mode))
> +           {
> +             /* Record the maximum vector size.  */
> +             if (zero_size < GET_MODE_SIZE (mode))
> +               zero_size = GET_MODE_SIZE (mode);
> +
> +             /* Record the basic block with CONST0_RTX.  */
> +             bitmap_set_bit (zero_bbs, bb->index);
> +             zero_count++;
> +           }
> +         else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> +                  && src == CONSTM1_RTX (mode))
> +           {
> +             /* Record the maximum vector size.  */
> +             if (m1_size < GET_MODE_SIZE (mode))
> +               m1_size = GET_MODE_SIZE (mode);
> +
> +             /* Record the basic block with CONSTM1_RTX.  */
> +             bitmap_set_bit (m1_bbs, bb->index);
> +             m1_count++;
> +           }
> +       }
> +    }
> +
> +  if (zero_count > 1 || m1_count > 1)
> +    {
> +      machine_mode zero_mode, m1_mode;
> +      rtx vector_const0, vector_constm1;
> +      if (zero_count > 1)
> +       {
> +         zero_mode = ix86_get_vector_load_mode (zero_size);
> +         vector_const0 = gen_reg_rtx (zero_mode);
> +       }
> +      else
> +       {
> +         zero_mode = VOIDmode;
> +         vector_const0 = nullptr;
> +       }
> +      if (m1_count > 1)
> +       {
> +         m1_mode = ix86_get_vector_load_mode (m1_size);
> +         vector_constm1 = gen_reg_rtx (m1_mode);
> +       }
> +      else
> +       {
> +         m1_mode = VOIDmode;
> +         vector_constm1 = nullptr;
> +       }
> +
> +      bool zero_replaced = false;
> +      bool m1_replaced = false;
> +
> +      bitmap_iterator bi;
> +      unsigned id;
> +      EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
> +       {
> +         /* Replace CONST0_RTX and integer CONSTM1_RTX with the single
> +            CONST0_RTX and integer CONSTM1_RTX register.  */
> +         df_ref ref, def;
> +         insn = DF_INSN_UID_GET (id)->insn;
> +         bool replaced = false;
> +
> +         for (ref = DF_INSN_UID_USES (id);
> +              ref;
> +              ref = DF_REF_NEXT_LOC (ref))
> +           {
> +             if (DF_REF_TYPE (ref) != DF_REF_REG_USE)
> +               continue;
> +
> +             /* Skip non-vector register.  */
> +             rtx reg = DF_REF_REG (ref);
> +             if (!VECTOR_MODE_P (GET_MODE (reg)))
> +               continue;
> +
> +             /* Check the single definition.  */
> +             def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
> +             if (!def || DF_REF_NEXT_REG (def) != nullptr)
> +               continue;
> +
> +             /* Get the single definition.  */
> +             rtx_insn *def_insn = DF_REF_INSN (def);
> +             set = single_set (def_insn);
> +             if (!set)
> +               continue;
> +
> +             /* Check the single definition of vector constant zero.  */
> +             rtx src = SET_SRC (set);
> +             rtx replace;
> +             if (vector_const0 && src == CONST0_RTX (GET_MODE (src)))
> +               {
> +                 /* Replace REG with VECTOR_CONST0.  */
> +                 if (SUBREG_P (reg) || GET_MODE (reg) == zero_mode)
> +                   replace = vector_const0;
> +                 else
> +                   replace = gen_rtx_SUBREG (GET_MODE (reg),
> +                                             vector_const0, 0);
> +                 *DF_REF_REAL_LOC (ref) = replace;
> +                 replaced = true;
> +                 zero_replaced = true;
> +               }
> +             else if (vector_constm1
> +                      && src == CONSTM1_RTX (GET_MODE (src)))
> +               {
> +                 /* Replace REG with VECTOR_CONSTM1.  */
> +                 if (SUBREG_P (reg) || GET_MODE (reg) == m1_mode)
> +                   replace = vector_constm1;
> +                 else
> +                   replace = gen_rtx_SUBREG (GET_MODE (reg),
> +                                             vector_constm1, 0);
> +                 *DF_REF_REAL_LOC (ref) = replace;
> +                 replaced = true;
> +                 m1_replaced = true;
> +               }
> +           }
> +
> +         if (replaced)
> +           df_insn_rescan (insn);
> +       }
> +
> +      /* (Re-)discover loops so that bb->loop_father can be used in the
> +        analysis below.  */
> +      calculate_dominance_info (CDI_DOMINATORS);
> +      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
> +
> +      if (zero_replaced)
> +       ix86_place_single_vector_set (vector_const0,
> +                                     CONST0_RTX (zero_mode),
> +                                     zero_bbs);
> +
> +      if (m1_replaced)
> +       ix86_place_single_vector_set (vector_constm1,
> +                                     CONSTM1_RTX (m1_mode),
> +                                     m1_bbs);
> +
> +      loop_optimizer_finalize ();
> +
> +      df_process_deferred_rescans ();
> +    }
> +
> +  df_clear_flags (DF_DEFER_INSN_RESCAN);
> +
> +  bitmap_obstack_release (NULL);
> +  BITMAP_FREE (zero_bbs);
> +  BITMAP_FREE (m1_bbs);
> +  BITMAP_FREE (vector_insns);
> +
> +  timevar_pop (TV_MACH_DEP);
> +  return 0;
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_remove_redundant_vector_load =
> +{
> +  RTL_PASS, /* type */
> +  "rrvl", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_MACH_DEP, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  0, /* todo_flags_finish */
> +};
> +
> +class pass_remove_redundant_vector_load : public rtl_opt_pass
> +{
> +public:
> +  pass_remove_redundant_vector_load (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  bool gate (function *) final override
> +    {
> +      return ix86_rrvl_gate ();
> +    }
> +
> +  unsigned int execute (function *) final override
> +    {
> +      return remove_redundant_vector_load ();
> +    }
> +}; // class pass_remove_redundant_vector_load
> +
> +} // anon namespace
> +
> +rtl_opt_pass *
> +make_pass_remove_redundant_vector_load (gcc::context *ctxt)
> +{
> +  return new pass_remove_redundant_vector_load (ctxt);
> +}
> +
>  /* Convert legacy instructions that clobbers EFLAGS to APX_NF
>     instructions when there are no flag set between a flag
>     producer and user.  */
> diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
> index a9d350dcfca..df424cdb9c7 100644
> --- a/gcc/config/i386/i386-passes.def
> +++ b/gcc/config/i386/i386-passes.def
> @@ -35,5 +35,6 @@ along with GCC; see the file COPYING3.  If not see
>       PR116174.  */
>    INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
>
> +  INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
>    INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
>    INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 78e72c50c6d..4c3a8bd326c 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -426,6 +426,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
>    (gcc::context *);
>  extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
>    (gcc::context *);
> +extern rtl_opt_pass *make_pass_remove_redundant_vector_load
> +  (gcc::context *);
>  extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
>  extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
> new file mode 100644
> index 00000000000..4501cfbcad4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
> +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> +
> +#include <stddef.h>
> +
> +void
> +clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
> +{
> +  size_t *d1 = (size_t *) mem1;
> +
> +  *(d1 + 0) = 0;
> +  *(d1 + 1) = 0;
> +  *(d1 + 2) = 0;
> +  if (nclears1 > 3)
> +    {
> +      *(d1 + nclears1 - 4) = 0;
> +      *(d1 + nclears1 - 4 + 1) = 0;
> +      *(d1 + nclears1 - 4 + 2) = 0;
> +      *(d1 + nclears1 - 4 + 3) = 0;
> +    }
> +
> +  double *d2 = (double *) mem2;
> +
> +  *(d2 + 0) = 0;
> +  *(d2 + 1) = 0;
> +  *(d2 + 2) = 0;
> +  if (nclears2 > 3)
> +    {
> +      *(d2 + nclears2 - 4) = 0;
> +      *(d2 + nclears2 - 4 + 1) = 0;
> +      *(d2 + nclears2 - 4 + 2) = 0;
> +      *(d2 + nclears2 - 4 + 3) = 0;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
> new file mode 100644
> index 00000000000..e71b991a207
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
> @@ -0,0 +1,5 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64-v3" } */
> +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> +
> +#include "pr117839-1a.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c
> new file mode 100644
> index 00000000000..c76744cf98b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64-v3" } */
> +/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
> +
> +#include <stddef.h>
> +
> +float
> +clear_memory (void *mem, size_t clearsize)
> +{
> +  size_t *d = (size_t *) mem;
> +  size_t nclears = clearsize / sizeof (size_t);
> +
> +  *(d + 0) = 0;
> +  *(d + 1) = 0;
> +  *(d + 2) = 0;
> +  if (nclears > 9)
> +    {
> +      *(d + 5) = 0;
> +      *(d + 5 + 1) = 0;
> +      *(d + 5 + 2) = 0;
> +      *(d + 5 + 3) = 0;
> +      *(d + nclears - 8) = 0;
> +      *(d + nclears - 8 + 1) = 0;
> +      *(d + nclears - 8 + 2) = 0;
> +      *(d + nclears - 8 + 3) = 0;
> +    }
> +  else
> +    {
> +      *(d + 1) = 0;
> +      *(d + 2) = 0;
> +      *(d + 3) = 0;
> +      *(d + 4) = 0;
> +      *(d + nclears - 4) = 0;
> +      *(d + nclears - 4 + 1) = 0;
> +      *(d + nclears - 4 + 2) = 0;
> +      *(d + nclears - 4 + 3) = 0;
> +    }
> +
> +  return nclears;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c
> new file mode 100644
> index 00000000000..7059b4514eb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64-v3" } */
> +/* { dg-final { scan-assembler-times "vpxor" 2 } } */
> +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef short v16hi __attribute__((vector_size(32)));
> +typedef char v16qi __attribute__((vector_size(16)));
> +typedef char v32qi __attribute__((vector_size(32)));
> +
> +v16qi b1;
> +v8hi h1;
> +v4si s1;
> +v32qi b2;
> +v16hi h2;
> +v8si s2;
> +
> +void
> +foo ()
> +{
> +  s1 = __extension__(v4si){0, 0, 0, 0};
> +  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +  h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +}
> +
> +void
> +foo1 ()
> +{
> +  s1 = __extension__(v4si){-1, -1, -1, -1};
> +  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1};
> +}
> +
> +
> +void
> +foo2 ()
> +{
> +  s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
> +  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +  b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +}
> +
> +void
> +foo3 ()
> +{
> +  s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
> +  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1};
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c
> new file mode 100644
> index 00000000000..d160d90de53
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
> @@ -0,0 +1,59 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64-v3" } */
> +/* { dg-final { scan-assembler-times "vpxor" 1 } } */
> +/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef short v16hi __attribute__((vector_size(32)));
> +typedef char v16qi __attribute__((vector_size(16)));
> +typedef char v32qi __attribute__((vector_size(32)));
> +
> +v16qi b1;
> +v8hi h1;
> +v4si s1;
> +v32qi b2;
> +v16hi h2;
> +v8si s2;
> +
> +void
> +foo (int i, int j)
> +{
> +  switch (i)
> +    {
> +    case 1:
> +      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +      s1 = __extension__(v4si){0, 0, 0, 0};
> +      s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
> +      break;
> +    case 2:
> +      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1, -1};
> +      break;
> +    case 3:
> +      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0};
> +      break;
> +    default:
> +      break;
> +    }
> +
> +  switch (i)
> +    {
> +    case 1:
> +      s1 = __extension__(v4si){-1, -1, -1, -1};
> +      b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0,
> +      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
> +      h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0};
> +      break;
> +    case 2:
> +      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0};
> +      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +      break;
> +    case 3:
> +      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1, -1};
> +      s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
> +      break;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c
> new file mode 100644
> index 00000000000..2174def4e6d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64" } */
> +/* { dg-final { scan-assembler-times "pxor" 1 } } */
> +/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef char v16qi __attribute__((vector_size(16)));
> +
> +v16qi b1;
> +v8hi h1;
> +v4si s1;
> +
> +void
> +foo (int i, int j)
> +{
> +  switch (i)
> +    {
> +    case 1:
> +      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +      s1 = __extension__(v4si){0, 0, 0, 0};
> +      break;
> +    case 2:
> +      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1, -1};
> +      break;
> +    case 3:
> +      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
> +      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0};
> +      break;
> +    default:
> +      break;
> +    }
> +
> +  switch (i)
> +    {
> +    case 1:
> +      s1 = __extension__(v4si){-1, -1, -1, -1};
> +      break;
> +    case 2:
> +      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0};
> +      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
> +      break;
> +    case 3:
> +      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
> -1, -1, -1, -1, -1};
> +      break;
> +    }
> +}
> --
> 2.47.1
>

OK for master?

-- 
H.J.

Reply via email to