[PATCH v3] x86: Add a pass to remove redundant all 0s/1s vector load

H.J. Lu Tue, 29 Apr 2025 00:00:52 -0700

On Tue, Apr 29, 2025 at 11:27 AM H.J. Lu <[email protected]> wrote:
>
> On Tue, Apr 29, 2025 at 10:08 AM Hongtao Liu <[email protected]> wrote:
> >
> > On Mon, Apr 28, 2025 at 5:07 PM H.J. Lu <[email protected]> wrote:
> > >
> > > On Mon, Apr 28, 2025 at 4:26 PM H.J. Lu <[email protected]> wrote:
> > > >
> > >
> > > > > > This is what my patch does:
> > > > > But it iterates through vector_insns, using a def-ref chain to find
> > > > > those insns. I think we can just record those single_set with src as
> > > > > const_m1/zero, and replace src for them.
> > > >
> > > > Will fix it.
> > >
> > > Fixed in the v2 patch.
> > >
> > > > > >
> > > > > >              /* Check the single definition of CONST0_RTX and 
> > > > > > integer
> > > > > >                  CONSTM1_RTX.  */
> > > > > >               rtx src = SET_SRC (set);
> > > > > >               rtx replace;
> > > > > >               if (vector_const0 && src == CONST0_RTX (mode))
> > > > > >                 {
> > > > > >                   /* Replace REG with VECTOR_CONST0.  */
> > > > > >                   if (SUBREG_P (reg) || mode == zero_mode)
> > > > > >                     replace = vector_const0;
> > > > > >                   else
> > > > > >                     replace = gen_rtx_SUBREG (mode, vector_const0, 
> > > > > > 0);
> > > > > >                   *DF_REF_REAL_LOC (ref) = replace;
> > > > > >                   replaced = true;
> > > > > >                   zero_replaced = true;
> > > > > >                 }
> > > > > >
> > > > > > It changed the source to a subreg directly.
> > > > > >
> > > > > > > > Do we also need to change ix86_modes_tieable_p to make sure those
> > > > > > > > inserted subregs can be handled by LRA and other passes?
> > > > > >
> > > > > > ix86_modes_tieable_p is OK:
> > > > > >
> > > > > >  /* If MODE2 is only appropriate for an SSE register, then tie with
> > > > > >      any other mode acceptable to SSE registers.  */
> > > > > >   if (GET_MODE_SIZE (mode2) == 64
> > > > > >       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > >     return (GET_MODE_SIZE (mode1) == 64
> > > > > >             && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > >   if (GET_MODE_SIZE (mode2) == 32
> > > > > >       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > >     return (GET_MODE_SIZE (mode1) == 32
> > > > > >             && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > >   if (GET_MODE_SIZE (mode2) == 16
> > > > > >       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > >     return (GET_MODE_SIZE (mode1) == 16
> > > > > >             && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > >
> > > > > > It's OK only when the size of mode1 is equal to the size of mode2.
> > > > > > But in the testcase, there are vectors of different sizes (32 bytes
> > > > > > and 16 bytes).
> > > > >
> > > > > So it would be better as, for mode2 >= 16 bytes, it can only be put
> > > > > into SSE_REGS(except for TImode, but TImode still can be tied to
> > > > > <=16bytes mode1 which can be put into SSE_REGS) , if mode1 can also be
> > > > > put into SSE_REGS, then mode2 tie with mode1.
> > > > >
> > > > >    /* If MODE2 is only appropriate for an SSE register, then tie with
> > > > >       any other mode acceptable to SSE registers.  */
> > > > > -  if (GET_MODE_SIZE (mode2) == 64
> > > > > -      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > -    return (GET_MODE_SIZE (mode1) == 64
> > > > > -           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > -  if (GET_MODE_SIZE (mode2) == 32
> > > > > -      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > -    return (GET_MODE_SIZE (mode1) == 32
> > > > > -           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > -  if (GET_MODE_SIZE (mode2) == 16
> > > > > +  if (GET_MODE_SIZE (mode2) >= 16
> > > > > +      && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)
> > > > >        && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > -    return (GET_MODE_SIZE (mode1) == 16
> > > > > -           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > +    return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
> > > > >
> > > >
> > > > This caused:
> > > >
> > > > FAIL: gcc.target/i386/pr111267.c scan-assembler-not movd
> > > > FAIL: gcc.target/i386/pr111267.c scan-assembler-not movq
> > > > FAIL: gcc.target/i386/pr82580.c scan-assembler-not \\mmovzb
> > > >
> > > > since GCC thinks it is cheap to get  QI/HI/SI/DI from TI in XMM.
> > > > I am testing:
> > > >
> > > >  /* If MODE2 is only appropriate for an SSE register, then tie with
> > > >      any other mode acceptable to SSE registers, excluding
> > > >         (subreg:QI (reg:TI 99) 0))
> > > >         (subreg:HI (reg:TI 99) 0))
> > > >         (subreg:SI (reg:TI 99) 0))
> > > >         (subreg:DI (reg:TI 99) 0))
> > > >      to avoid unnecessary move from SSE register to integer register.
> > > >    */
> > > >   if (GET_MODE_SIZE (mode2) >= 16
> > > >       && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
> > > >           || (!INTEGRAL_MODE_P (mode1)
> > > >               && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
> > > >       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > It looks like only scalar_int_mode_p should be excluded, not vector_mode?
> > so, how about
> >    if (GET_MODE_SIZE (mode2) >= 16
> >        && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
> >            || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
> >                && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
> >        && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> >
> > Only allow vector mode or scalar floating point in mode1 to be tied to
> > mode2 in SSE_REGS?
>
> Will fix.


Fixed in v3.

> >
> > >+      /* NB: Don't run recog_memoized here since vector SUBREG may not
> > >+ be valid.  Let LRA handle vector SUBREG.  */
> > It's tricky.  We can insert a move pattern with a dest of the same size as
> > SET_SRC (set), but with the same component mode as constm1/const0, so it
> > should be something like
> > mov v8qi, const0(v32qimode)
> > mov v2sf, subreg:v2sf (v8qi)
>
> It is exactly what my patch generates:
>
> (insn 31 2 5 2 (set (reg:V32QI 110)
>         (const_vector:V32QI [
>                 (const_int 0 [0]) repeated x32
>             ])) -1
>      (nil))
> (insn 5 31 6 2 (set (reg:V2DF 98)
>         (subreg:V2DF (reg:V32QI 110) 0)) "z2.c":35:6 2421 {movv2df_internal}
>      (nil))
> (insn 6 5 7 2 (set (mem/c:V2DF (symbol_ref:DI ("d1") [flags 0x2]
> <var_decl 0x7fd11d6341c8 d1>) [1 d1+0 S16 A128])
>         (reg:V2DF 98)) "z2.c":35:6 2421 {movv2df_internal}
>      (expr_list:REG_DEAD (reg:V2DF 98)
>         (nil)))
> (insn 7 6 8 2 (set (reg:V4SF 99)
>         (subreg:V4SF (reg:V32QI 110) 0)) "z2.c":36:6 2418 {movv4sf_internal}
>      (nil))
> (insn 8 7 9 2 (set (mem/c:V4SF (symbol_ref:DI ("f1") [flags 0x2]
> <var_decl 0x7fd11d634130 f1>) [2 f1+0 S16 A128])
>         (reg:V4SF 99)) "z2.c":36:6 2418 {movv4sf_internal}
>      (expr_list:REG_DEAD (reg:V4SF 99)
>         (nil)))
> ...
>
> and LRA does eliminate the redundant moves.
>
> > I think LRA could eliminate the redundant move.
> >
> > >+static void
> > >+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > >+{
> > For ease of maintenance, can we also replace the corresponding
> > code in the remove_partial_avx_dependency function with a call to this
> > ix86_place_single_vector_set function?
>
> Will fix.

Fixed in v3.

Here is the v3 patch.  OK for master?

Thanks.

-- 
H.J.
---
For all different modes of all 0s/1s vectors, we can use the single widest
all 0s/1s vector register for all 0s/1s vector uses in the whole function.
Add a pass to generate a single widest all 0s/1s vector set instruction at
entry of the nearest common dominator for basic blocks with all 0s/1s
vector uses.  On Linux/x86-64, in cc1plus, this patch reduces the number
of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
144 to 142.

NB: PR target/92080 and PR target/117839 aren't the same.  PR target/117839
is for vectors of all 0s and all 1s with different sizes and different
components.  PR target/92080 is for broadcast of the same component to
different vector sizes.  This patch covers only all 0s and all 1s cases
of PR target/92080.

gcc/

PR target/92080
PR target/117839
* config/i386/i386-features.cc (ix86_place_single_vector_set):
New function.
(remove_partial_avx_dependency): Use it.
(ix86_get_vector_load_mode): New function.
(replace_vector_const): Likewise.
(remove_redundant_vector_load): Likewise.
(pass_data_remove_redundant_vector_load): Likewise.
(pass_remove_redundant_vector_load): Likewise.
(make_pass_remove_redundant_vector_load): Likewise.
* config/i386/i386-passes.def: Add
pass_remove_redundant_vector_load after
pass_remove_partial_avx_dependency.
* config/i386/i386-protos.h
(make_pass_remove_redundant_vector_load): New.
* config/i386/i386.cc (ix86_modes_tieable_p): Return true for
narrower non-scalar-integer modes in SSE registers.

gcc/testsuite/

PR target/92080
PR target/117839
* gcc.target/i386/pr117839-1a.c: New test.
* gcc.target/i386/pr117839-1b.c: Likewise.
* gcc.target/i386/pr117839-2.c: Likewise.
* gcc.target/i386/pr92080-1.c: Likewise.
* gcc.target/i386/pr92080-2.c: Likewise.
* gcc.target/i386/pr92080-3.c: Likewise.

From 0aaff253015852c353bc5b567b71b79f6c677b00 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Fri, 29 Nov 2024 18:22:14 +0800
Subject: [PATCH v3] x86: Add a pass to remove redundant all 0s/1s vector load

For all different modes of all 0s/1s vectors, we can use the single widest
all 0s/1s vector register for all 0s/1s vector uses in the whole function.
Add a pass to generate a single widest all 0s/1s vector set instruction at
entry of the nearest common dominator for basic blocks with all 0s/1s
vector uses.  On Linux/x86-64, in cc1plus, this patch reduces the number
of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
144 to 142.

NB: PR target/92080 and PR target/117839 aren't the same.  PR target/117839
is for vectors of all 0s and all 1s with different sizes and different
components.  PR target/92080 is for broadcast of the same component to
different vector sizes.  This patch covers only all 0s and all 1s cases
of PR target/92080.

gcc/

	PR target/92080
	PR target/117839
	* config/i386/i386-features.cc (ix86_place_single_vector_set):
	New function.
	(remove_partial_avx_dependency): Use it.
	(ix86_get_vector_load_mode): New function.
	(replace_vector_const): Likewise.
	(remove_redundant_vector_load): Likewise.
	(pass_data_remove_redundant_vector_load): Likewise.
	(pass_remove_redundant_vector_load): Likewise.
	(make_pass_remove_redundant_vector_load): Likewise.
	* config/i386/i386-passes.def: Add
	pass_remove_redundant_vector_load after
	pass_remove_partial_avx_dependency.
	* config/i386/i386-protos.h
	(make_pass_remove_redundant_vector_load): New.
	* config/i386/i386.cc (ix86_modes_tieable_p): Return true for
	narrower non-scalar-integer modes in SSE registers.

gcc/testsuite/

	PR target/92080
	PR target/117839
	* gcc.target/i386/pr117839-1a.c: New test.
	* gcc.target/i386/pr117839-1b.c: Likewise.
	* gcc.target/i386/pr117839-2.c: Likewise.
	* gcc.target/i386/pr92080-1.c: Likewise.
	* gcc.target/i386/pr92080-2.c: Likewise.
	* gcc.target/i386/pr92080-3.c: Likewise.

Signed-off-by: H.J. Lu <[email protected]>
---
 gcc/config/i386/i386-features.cc            | 303 ++++++++++++++++++--
 gcc/config/i386/i386-passes.def             |   1 +
 gcc/config/i386/i386-protos.h               |   2 +
 gcc/config/i386/i386.cc                     |  25 +-
 gcc/testsuite/gcc.target/i386/pr117839-1a.c |  35 +++
 gcc/testsuite/gcc.target/i386/pr117839-1b.c |   5 +
 gcc/testsuite/gcc.target/i386/pr117839-2.c  |  40 +++
 gcc/testsuite/gcc.target/i386/pr92080-1.c   |  72 +++++
 gcc/testsuite/gcc.target/i386/pr92080-2.c   |  59 ++++
 gcc/testsuite/gcc.target/i386/pr92080-3.c   |  48 ++++
 10 files changed, 549 insertions(+), 41 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c35ac24fd8a..31f3ee2ef17 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3034,6 +3034,42 @@ ix86_rpad_gate ()
 	  && optimize_function_for_speed_p (cfun));
 }
 
+/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
+   for basic block map BBS, which is in the fake loop that contains the
+   whole function, so that there is only a single vector set in the
+   whole function.   */
+
+static void
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+{
+  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+  while (bb->loop_father->latch
+	 != EXIT_BLOCK_PTR_FOR_FN (cfun))
+    bb = get_immediate_dominator (CDI_DOMINATORS,
+				  bb->loop_father->header);
+
+  rtx set = gen_rtx_SET (dest, src);
+
+  rtx_insn *insn = BB_HEAD (bb);
+  while (insn && !NONDEBUG_INSN_P (insn))
+    {
+      if (insn == BB_END (bb))
+	{
+	  insn = NULL;
+	  break;
+	}
+      insn = NEXT_INSN (insn);
+    }
+
+  rtx_insn *set_insn;
+  if (insn == BB_HEAD (bb))
+    set_insn = emit_insn_before (set, insn);
+  else
+    set_insn = emit_insn_after (set,
+				insn ? PREV_INSN (insn) : BB_END (bb));
+  df_insn_rescan (set_insn);
+}
+
 /* At entry of the nearest common dominator for basic blocks with
    conversions/rcp/sqrt/rsqrt/round, generate a single
 	vxorps %xmmN, %xmmN, %xmmN
@@ -3188,35 +3224,10 @@ remove_partial_avx_dependency (void)
       calculate_dominance_info (CDI_DOMINATORS);
       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
 
-      /* Generate a vxorps at entry of the nearest dominator for basic
-	 blocks with conversions, which is in the fake loop that
-	 contains the whole function, so that there is only a single
-	 vxorps in the whole function.   */
-      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
-					     convert_bbs);
-      while (bb->loop_father->latch
-	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
-	bb = get_immediate_dominator (CDI_DOMINATORS,
-				      bb->loop_father->header);
-
-      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
-
-      insn = BB_HEAD (bb);
-      while (insn && !NONDEBUG_INSN_P (insn))
-	{
-	  if (insn == BB_END (bb))
-	    {
-	      insn = NULL;
-	      break;
-	    }
-	  insn = NEXT_INSN (insn);
-	}
-      if (insn == BB_HEAD (bb))
-	set_insn = emit_insn_before (set, insn);
-      else
-	set_insn = emit_insn_after (set,
-				    insn ? PREV_INSN (insn) : BB_END (bb));
-      df_insn_rescan (set_insn);
+      ix86_place_single_vector_set (v4sf_const0,
+				    CONST0_RTX (V4SFmode),
+				    convert_bbs);
+
       loop_optimizer_finalize ();
 
       if (!control_flow_insns.is_empty ())
@@ -3288,6 +3299,240 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* Return a machine mode suitable for vector SIZE.  */
+
+static machine_mode
+ix86_get_vector_load_mode (unsigned int size)
+{
+  machine_mode mode;
+  if (size == 64)
+    mode = V64QImode;
+  else if (size == 32)
+    mode = V32QImode;
+  else
+    mode = V16QImode;
+  return mode;
+}
+
+/* Replace the constant source operand of each single-set instruction in
+   VECTOR_INSNS with VECTOR_CONST, a pseudo register in VECTOR_MODE.  */
+
+static void
+replace_vector_const (machine_mode vector_mode, rtx vector_const,
+		      auto_bitmap &vector_insns)
+{
+  bitmap_iterator bi;
+  unsigned int id;
+
+  EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
+    {
+      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+      /* Get the single SET and the mode of its current source operand.  */
+      rtx set = single_set (insn);
+      rtx src = SET_SRC (set);
+      machine_mode mode = GET_MODE (src);
+
+      rtx replace;
+      /* Use VECTOR_CONST as is, or wrap it in a SUBREG of SRC's mode.  */
+      if (SUBREG_P (src) || mode == vector_mode)
+	replace = vector_const;
+      else
+	replace = gen_rtx_SUBREG (mode, vector_const, 0);
+
+      /* NB: Don't run recog_memoized here since vector SUBREG may not
+	 be valid.  Let LRA handle vector SUBREG.  */
+      SET_SRC (set) = replace;
+      /* Drop possible dead definitions.  */
+      PATTERN (insn) = set;
+      df_insn_rescan (insn);
+    }
+}
+
+/* At entry of the nearest common dominator for basic blocks with vector
+   CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
+   vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
+   uses.
+
+   NB: We want to generate only a single widest vector set to cover the
+   whole function.  The LCM algorithm isn't appropriate here since it
+   may place a vector set inside the loop.  */
+
+static unsigned int
+remove_redundant_vector_load (void)
+{
+  timevar_push (TV_MACH_DEP);
+
+  auto_bitmap zero_bbs;
+  auto_bitmap m1_bbs;
+  auto_bitmap zero_insns;
+  auto_bitmap m1_insns;
+
+  basic_block bb;
+  rtx_insn *insn;
+  unsigned HOST_WIDE_INT zero_count = 0;
+  unsigned HOST_WIDE_INT m1_count = 0;
+  unsigned int zero_size = 0;
+  unsigned int m1_size = 0;
+
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+	{
+	  if (!NONDEBUG_INSN_P (insn))
+	    continue;
+
+	  rtx set = single_set (insn);
+	  if (!set)
+	    continue;
+
+	  /* Record single set vector instruction with CONST0_RTX and
+	     CONSTM1_RTX source.  Record basic blocks with CONST0_RTX and
+	     CONSTM1_RTX.  Count CONST0_RTX and CONSTM1_RTX.  Record the
+	     maximum size of CONST0_RTX and CONSTM1_RTX.  */
+
+	  rtx dest = SET_DEST (set);
+	  machine_mode mode = GET_MODE (dest);
+	  /* Skip non-vector instruction.  */
+	  if (!VECTOR_MODE_P (mode))
+	    continue;
+
+	  rtx src = SET_SRC (set);
+	  /* Skip non-vector load instruction.  */
+	  if (!REG_P (dest) && !SUBREG_P (dest))
+	    continue;
+
+	  if (src == CONST0_RTX (mode))
+	    {
+	      /* Record vector instruction with CONST0_RTX.  */
+	      bitmap_set_bit (zero_insns, INSN_UID (insn));
+
+	      /* Record the maximum vector size.  */
+	      if (zero_size < GET_MODE_SIZE (mode))
+		zero_size = GET_MODE_SIZE (mode);
+
+	      /* Record the basic block with CONST0_RTX.  */
+	      bitmap_set_bit (zero_bbs, bb->index);
+	      zero_count++;
+	    }
+	  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+		   && src == CONSTM1_RTX (mode))
+	    {
+	      /* Record vector instruction with CONSTM1_RTX.  */
+	      bitmap_set_bit (m1_insns, INSN_UID (insn));
+
+	      /* Record the maximum vector size.  */
+	      if (m1_size < GET_MODE_SIZE (mode))
+		m1_size = GET_MODE_SIZE (mode);
+
+	      /* Record the basic block with CONSTM1_RTX.  */
+	      bitmap_set_bit (m1_bbs, bb->index);
+	      m1_count++;
+	    }
+	}
+    }
+
+  if (zero_count > 1 || m1_count > 1)
+    {
+      machine_mode zero_mode, m1_mode;
+      rtx vector_const0, vector_constm1;
+
+      if (zero_count > 1)
+	{
+	  zero_mode = ix86_get_vector_load_mode (zero_size);
+	  vector_const0 = gen_reg_rtx (zero_mode);
+	  replace_vector_const (zero_mode, vector_const0, zero_insns);
+	}
+      else
+	{
+	  zero_mode = VOIDmode;
+	  vector_const0 = nullptr;
+	}
+
+      if (m1_count > 1)
+	{
+	  m1_mode = ix86_get_vector_load_mode (m1_size);
+	  vector_constm1 = gen_reg_rtx (m1_mode);
+	  replace_vector_const (m1_mode, vector_constm1, m1_insns);
+	}
+      else
+	{
+	  m1_mode = VOIDmode;
+	  vector_constm1 = nullptr;
+	}
+
+      /* (Re-)discover loops so that bb->loop_father can be used in the
+	 analysis below.  */
+      calculate_dominance_info (CDI_DOMINATORS);
+      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+      if (vector_const0)
+	ix86_place_single_vector_set (vector_const0,
+				      CONST0_RTX (zero_mode),
+				      zero_bbs);
+
+      if (vector_constm1)
+	ix86_place_single_vector_set (vector_constm1,
+				      CONSTM1_RTX (m1_mode),
+				      m1_bbs);
+
+      loop_optimizer_finalize ();
+
+      df_process_deferred_rescans ();
+    }
+
+  df_clear_flags (DF_DEFER_INSN_RESCAN);
+
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_redundant_vector_load =
+{
+  RTL_PASS, /* type */
+  "rrvl", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_remove_redundant_vector_load : public rtl_opt_pass
+{
+public:
+  pass_remove_redundant_vector_load (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *fun) final override
+    {
+      return (TARGET_SSE2
+	      && optimize
+	      && optimize_function_for_speed_p (fun));
+    }
+
+  unsigned int execute (function *) final override
+    {
+      return remove_redundant_vector_load ();
+    }
+}; // class pass_remove_redundant_vector_load
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+{
+  return new pass_remove_redundant_vector_load (ctxt);
+}
+
 /* Convert legacy instructions that clobbers EFLAGS to APX_NF
    instructions when there are no flag set between a flag
    producer and user.  */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 39f8bc65ddc..06f0288b067 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,5 +35,6 @@ along with GCC; see the file COPYING3.  If not see
      PR116174.  */
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
 
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
   INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
   INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bea3fd4b2e2..c59b5a67e3a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -427,6 +427,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_remove_redundant_vector_load
+  (gcc::context *);
 extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
 extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dd076242177..ae2386785af 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21493,19 +21493,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
     return mode1 == SFmode;
 
   /* If MODE2 is only appropriate for an SSE register, then tie with
-     any other mode acceptable to SSE registers.  */
-  if (GET_MODE_SIZE (mode2) == 64
-      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 64
-	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
-  if (GET_MODE_SIZE (mode2) == 32
-      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 32
-	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
-  if (GET_MODE_SIZE (mode2) == 16
+     any vector modes or scalar floating point modes acceptable to SSE
+     registers, excluding scalar integer modes with SUBREG:
+	(subreg:QI (reg:TI 99) 0))
+	(subreg:HI (reg:TI 99) 0))
+	(subreg:SI (reg:TI 99) 0))
+	(subreg:DI (reg:TI 99) 0))
+     to avoid unnecessary move from SSE register to integer register.
+   */
+  if (GET_MODE_SIZE (mode2) >= 16
+      && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
+	  || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
+	      && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 16
-	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+    return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
 
   /* If MODE2 is appropriate for an MMX register, then tie
      with any other mode acceptable to MMX registers.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
new file mode 100644
index 00000000000..4501cfbcad4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+void
+clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
+{
+  size_t *d1 = (size_t *) mem1;
+
+  *(d1 + 0) = 0;
+  *(d1 + 1) = 0;
+  *(d1 + 2) = 0;
+  if (nclears1 > 3)
+    {
+      *(d1 + nclears1 - 4) = 0;
+      *(d1 + nclears1 - 4 + 1) = 0;
+      *(d1 + nclears1 - 4 + 2) = 0;
+      *(d1 + nclears1 - 4 + 3) = 0;
+    }
+
+  double *d2 = (double *) mem2;
+
+  *(d2 + 0) = 0;
+  *(d2 + 1) = 0;
+  *(d2 + 2) = 0;
+  if (nclears2 > 3)
+    {
+      *(d2 + nclears2 - 4) = 0;
+      *(d2 + nclears2 - 4 + 1) = 0;
+      *(d2 + nclears2 - 4 + 2) = 0;
+      *(d2 + nclears2 - 4 + 3) = 0;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
new file mode 100644
index 00000000000..e71b991a207
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include "pr117839-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c
new file mode 100644
index 00000000000..c76744cf98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+float
+clear_memory (void *mem, size_t clearsize)
+{
+  size_t *d = (size_t *) mem;
+  size_t nclears = clearsize / sizeof (size_t);
+
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  if (nclears > 9)
+    {
+      *(d + 5) = 0;
+      *(d + 5 + 1) = 0;
+      *(d + 5 + 2) = 0;
+      *(d + 5 + 3) = 0;
+      *(d + nclears - 8) = 0;
+      *(d + nclears - 8 + 1) = 0;
+      *(d + nclears - 8 + 2) = 0;
+      *(d + nclears - 8 + 3) = 0;
+    }
+  else
+    {
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      *(d + 3) = 0;
+      *(d + 4) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+      *(d + nclears - 4 + 2) = 0;
+      *(d + nclears - 4 + 3) = 0;
+    }
+
+  return nclears;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c
new file mode 100644
index 00000000000..82d1ffd4e1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v2di l1;
+v4sf f1;
+v2df d1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+v4di l2;
+v8sf f2;
+v4df d2;
+
+void
+foo ()
+{
+  d1 = __extension__(v2df){0, 0};
+  f1 = __extension__(v4sf){0, 0, 0};
+  l1 = __extension__(v2di){0, 0};
+  s1 = __extension__(v4si){0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+  s1 = __extension__(v4si){-1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+  d2 = __extension__(v4df){0, 0, 0, 0};
+  f2 = __extension__(v8sf){0, 0, 0, 0, 0, 0, 0, 0};
+  l2 = __extension__(v4di){0, 0, 0, 0};
+  s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+  s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c
new file mode 100644
index 00000000000..d160d90de53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo (int i, int j)
+{
+  switch (i)
+    {
+    case 1:
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      s1 = __extension__(v4si){0, 0, 0, 0};
+      s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    case 2:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    default:
+      break;
+    }
+
+  switch (i)
+    {
+    case 1:
+      s1 = __extension__(v4si){-1, -1, -1, -1};
+      b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    case 2:
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c
new file mode 100644
index 00000000000..2174def4e6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "pxor" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+
+void
+foo (int i, int j)
+{
+  switch (i)
+    {
+    case 1:
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      s1 = __extension__(v4si){0, 0, 0, 0};
+      break;
+    case 2:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    default:
+      break;
+    }
+
+  switch (i)
+    {
+    case 1:
+      s1 = __extension__(v4si){-1, -1, -1, -1};
+      break;
+    case 2:
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    }
+}
-- 
2.49.0

[PATCH v3] x86: Add a pass to remove redundant all 0s/1s vector load

Reply via email to