On Mon, Jun 23, 2025 at 4:53 PM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Mon, Jun 23, 2025 at 4:45 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > On Mon, Jun 23, 2025 at 4:10 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> > >
> > > On Mon, Jun 23, 2025 at 3:11 PM Hongtao Liu <crazy...@gmail.com> wrote:
> > > >
> > > > On Thu, Jun 19, 2025 at 10:25 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> > > > >
> > > > > Extend the remove_redundant_vector pass to handle vector broadcasts 
> > > > > from
> > > > > constant and variable scalars.  When broadcasting from constants and
> > > > > function arguments, we can place a single widest vector broadcast at
> > > > > entry of the nearest common dominator for basic blocks with all uses
> > > > > since constants and function arguments aren't changed.  For broadcast
> > > > > from variables with a single definition, the single definition is
> > > > > replaced with the widest broadcast.
> > > > >
> > > > > gcc/
> > > > >
> > > > >         PR target/92080
> > > > >         * config/i386/i386-expand.cc (ix86_expand_call): Set
> > > > >         recursive_function to true for recursive call.
> > > > >         * config/i386/i386-features.cc (ix86_place_single_vector_set):
> > > > >         Add an argument for inner scalar, default to nullptr.  Set the
> > > > >         source from inner scalar if not nullptr.
> > > > >         (ix86_get_vector_load_mode): Renamed to ...
> > > > >         (ix86_get_vector_cse_mode): This.  Add an argument for scalar 
> > > > > mode
> > > > >         and handle integer and float scalar modes.
> > > > >         (replace_vector_const): Add an argument for scalar mode and 
> > > > > pass
> > > > >         it to ix86_get_vector_load_mode.
> > > > >         (x86_cse_kind): New.
> > > > >         (redundant_load): Likewise.
> > > > >         (ix86_broadcast_inner): Likewise.
> > > > >         (remove_redundant_vector_load): Also support const0_rtx and
> > > > >         constm1_rtx broadcasts.  Handle vector broadcasts from 
> > > > > constant
> > > > >         and variable scalars.
> > > > >         * config/i386/i386.h (machine_function): Add 
> > > > > recursive_function.
> > > > >
> > > > > gcc/testsuite/
> > > > >
> > > > >         * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to 
> > > > > expect
> > > > >         movdqa instead pxor.
> > > > >         * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> > > > >         * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> > > > >         * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-4.c: New test.
> > > > >         * gcc.target/i386/pr92080-5.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-6.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-7.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-8.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-9.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-10.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-11.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-12.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-13.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-14.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-15.c: Likewise.
> > > > >         * gcc.target/i386/pr92080-16.c: Likewise.
> > > > >
> > > > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> > > > > ---
> > > > >  gcc/config/i386/i386-expand.cc                |   3 +
> > > > >  gcc/config/i386/i386-features.cc              | 410 
> > > > > ++++++++++++++----
> > > > >  gcc/config/i386/i386.h                        |   3 +
> > > > >  .../i386/keylocker-aesdecwide128kl.c          |  14 +-
> > > > >  .../i386/keylocker-aesdecwide256kl.c          |  14 +-
> > > > >  .../i386/keylocker-aesencwide128kl.c          |  14 +-
> > > > >  .../i386/keylocker-aesencwide256kl.c          |  14 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-10.c    |  13 +
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-11.c    |  33 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-12.c    |  16 +
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-13.c    |  32 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-14.c    |  31 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-15.c    |  25 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-16.c    |  26 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-4.c     |  50 +++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-5.c     | 109 +++++
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-6.c     |  19 +
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-7.c     |  20 +
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-8.c     |  16 +
> > > > >  gcc/testsuite/gcc.target/i386/pr92080-9.c     |  81 ++++
> > > > >  20 files changed, 823 insertions(+), 120 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
> > > > >
> >
> > > > > +  else
> > > > > +    {
> > > > > +      while (SUBREG_P (dest))
> > > > > +       dest = SUBREG_REG (dest);
> > > > > +
> > > > > +      /* Skip if the SET destination mode doesn't match.  */
> > > > > +      if (GET_MODE (dest) != mode)
> > > > > +       return nullptr;
> > > >
> > > > Can we just require (dest == reg || dest == op), otherwise we need to
> > > > make sure GET_MODE of the original dest can cover mode of op(which is
> > > > more complicated, need to make sure SUBREG_BYTE is also zero???)
> > >
> > > I will change it to
> > >
> > >       /* Skip if the SET destination isn't the broadcast source.  */
> > >       if (dest != reg)
> > >         return nullptr;
> >
> > Here is the v4 patch with:
> >
> >       /* The SET destination must be the broadcast source.  */
> >       gcc_assert (dest == op);
> I don't understand this; it looks like you posted the dump patch instead
> of the original one.

Ooops.   Here is the real v4 patch which simplifies ix86_broadcast_inner
to

 rtx src = SET_SRC (set);

  if (CONST_INT_P (src))
    {
      op = src;
      *insn_p = nullptr;
    }
  else
    {
      *insn_p = insn;
    }

  *scalar_mode_p = mode;
  return op;

OK for master?

Thanks.

-- 
H.J.
From ee4ad23415a1b19b2b54d6a034dcd68ba05908a5 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Fri, 9 May 2025 07:17:07 +0800
Subject: [PATCH v4] x86: Extend the remove_redundant_vector pass

Extend the remove_redundant_vector pass to handle vector broadcasts from
constant and variable scalars.  When broadcasting from constants and
function arguments, we can place a single widest vector broadcast at
entry of the nearest common dominator for basic blocks with all uses
since constants and function arguments aren't changed.  For broadcast
from variables with a single definition, the single definition is
replaced with the widest broadcast.

gcc/

	PR target/92080
	* config/i386/i386-expand.cc (ix86_expand_call): Set
	recursive_function to true for recursive call.
	* config/i386/i386-features.cc (ix86_place_single_vector_set):
	Add an argument for inner scalar, default to nullptr.  Set the
	source from inner scalar if not nullptr.
	(ix86_get_vector_load_mode): Renamed to ...
	(ix86_get_vector_cse_mode): This.  Add an argument for scalar mode
	and handle integer and float scalar modes.
	(replace_vector_const): Add an argument for scalar mode and pass
	it to ix86_get_vector_cse_mode.
	(x86_cse_kind): New.
	(redundant_load): Likewise.
	(ix86_broadcast_inner): Likewise.
	(remove_redundant_vector_load): Also support const0_rtx and
	constm1_rtx broadcasts.  Handle vector broadcasts from constant
	and variable scalars.
	* config/i386/i386.h (machine_function): Add recursive_function.

gcc/testsuite/

	* gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
	movdqa instead of pxor.
	* gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
	* gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
	* gcc.target/i386/pr92080-4.c: New test.
	* gcc.target/i386/pr92080-5.c: Likewise.
	* gcc.target/i386/pr92080-6.c: Likewise.
	* gcc.target/i386/pr92080-7.c: Likewise.
	* gcc.target/i386/pr92080-8.c: Likewise.
	* gcc.target/i386/pr92080-9.c: Likewise.
	* gcc.target/i386/pr92080-10.c: Likewise.
	* gcc.target/i386/pr92080-11.c: Likewise.
	* gcc.target/i386/pr92080-12.c: Likewise.
	* gcc.target/i386/pr92080-13.c: Likewise.
	* gcc.target/i386/pr92080-14.c: Likewise.
	* gcc.target/i386/pr92080-15.c: Likewise.
	* gcc.target/i386/pr92080-16.c: Likewise.
	* gcc.target/i386/pr92080-17.c: Likewise.
	* gcc.target/i386/pr92080-18.c: Likewise.
	* gcc.target/i386/pr92080-19.c: Likewise.
	* gcc.target/i386/pr92080-20.c: Likewise.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/config/i386/i386-expand.cc                |   3 +
 gcc/config/i386/i386-features.cc              | 427 ++++++++++++++----
 gcc/config/i386/i386.h                        |   3 +
 .../i386/keylocker-aesdecwide128kl.c          |  14 +-
 .../i386/keylocker-aesdecwide256kl.c          |  14 +-
 .../i386/keylocker-aesencwide128kl.c          |  14 +-
 .../i386/keylocker-aesencwide256kl.c          |  14 +-
 gcc/testsuite/gcc.target/i386/pr92080-10.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr92080-11.c    |  33 ++
 gcc/testsuite/gcc.target/i386/pr92080-12.c    |  16 +
 gcc/testsuite/gcc.target/i386/pr92080-13.c    |  32 ++
 gcc/testsuite/gcc.target/i386/pr92080-14.c    |  31 ++
 gcc/testsuite/gcc.target/i386/pr92080-15.c    |  25 +
 gcc/testsuite/gcc.target/i386/pr92080-16.c    |  26 ++
 gcc/testsuite/gcc.target/i386/pr92080-17.c    |  40 ++
 gcc/testsuite/gcc.target/i386/pr92080-18.c    |  19 +
 gcc/testsuite/gcc.target/i386/pr92080-19.c    |  20 +
 gcc/testsuite/gcc.target/i386/pr92080-20.c    |  20 +
 gcc/testsuite/gcc.target/i386/pr92080-4.c     |  50 ++
 gcc/testsuite/gcc.target/i386/pr92080-5.c     | 109 +++++
 gcc/testsuite/gcc.target/i386/pr92080-6.c     |  19 +
 gcc/testsuite/gcc.target/i386/pr92080-7.c     |  20 +
 gcc/testsuite/gcc.target/i386/pr92080-8.c     |  16 +
 gcc/testsuite/gcc.target/i386/pr92080-9.c     |  81 ++++
 24 files changed, 939 insertions(+), 120 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 423fc632003..8e556f1b9c1 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10141,6 +10141,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
 	  else if (lookup_attribute ("no_callee_saved_registers",
 				     TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
 	    call_no_callee_saved_registers = true;
+	  if (fndecl == current_function_decl
+	      && decl_binds_to_current_def_p (fndecl))
+	    cfun->machine->recursive_function = true;
 	}
     }
   else
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 56ab7f2d23b..36878aab968 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3088,10 +3088,12 @@ ix86_rpad_gate ()
 /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
    for basic block map BBS, which is in the fake loop that contains the
    whole function, so that there is only a single vector set in the
-   whole function.   */
+   whole function.  If not nullptr, INNER_SCALAR is the inner scalar of
+   SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)).  */
 
 static void
-ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
+			      rtx inner_scalar = nullptr)
 {
   basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
   while (bb->loop_father->latch
@@ -3112,10 +3114,23 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
       insn = NEXT_INSN (insn);
     }
 
+  rtx_insn *set_insn;
   if (insn == BB_HEAD (bb))
-    emit_insn_before (set, insn);
+    set_insn = emit_insn_before (set, insn);
   else
-    emit_insn_after (set, insn ? PREV_INSN (insn) : BB_END (bb));
+    set_insn = emit_insn_after (set,
+				insn ? PREV_INSN (insn) : BB_END (bb));
+
+  if (inner_scalar)
+    {
+      /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
+      rtx reg = XEXP (src, 0);
+      if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
+	  && GET_MODE (reg) != GET_MODE (inner_scalar))
+	inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
+      rtx set = gen_rtx_SET (reg, inner_scalar);
+      emit_insn_before (set, set_insn);
+    }
 }
 
 /* At entry of the nearest common dominator for basic blocks with
@@ -3346,26 +3361,15 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
-/* Return a machine mode suitable for vector SIZE.  */
+/* Return a machine mode suitable for vector SIZE with SMODE inner
+   mode.  */
 
 static machine_mode
-ix86_get_vector_load_mode (unsigned int size)
+ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
 {
-  machine_mode mode;
-  if (size == 64)
-    mode = V64QImode;
-  else if (size == 32)
-    mode = V32QImode;
-  else if (size == 16)
-    mode = V16QImode;
-  else if (size == 8)
-    mode = V8QImode;
-  else if (size == 4)
-    mode = V4QImode;
-  else if (size == 2)
-    mode = V2QImode;
-  else
-    gcc_unreachable ();
+  scalar_mode s_mode = as_a <scalar_mode> (smode);
+  poly_uint64 nunits = size / GET_MODE_SIZE (smode);
+  machine_mode mode = mode_for_vector (s_mode, nunits).require ();
   return mode;
 }
 
@@ -3374,7 +3378,8 @@ ix86_get_vector_load_mode (unsigned int size)
 
 static void
 replace_vector_const (machine_mode vector_mode, rtx vector_const,
-		      auto_bitmap &vector_insns)
+		      auto_bitmap &vector_insns,
+		      machine_mode scalar_mode)
 {
   bitmap_iterator bi;
   unsigned int id;
@@ -3386,7 +3391,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
       /* Get the single SET instruction.  */
       rtx set = single_set (insn);
       rtx src = SET_SRC (set);
-      machine_mode mode = GET_MODE (src);
+      rtx dest = SET_DEST (set);
+      machine_mode mode = GET_MODE (dest);
 
       rtx replace;
       /* Replace the source operand with VECTOR_CONST.  */
@@ -3400,7 +3406,8 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
 	      /* If the mode size is smaller than its natural size,
 		 first insert an extra move with a QI vector SUBREG
 		 of the same size to avoid validate_subreg failure.  */
-	      machine_mode vmode = ix86_get_vector_load_mode (size);
+	      machine_mode vmode
+		= ix86_get_vector_cse_mode (size, scalar_mode);
 	      rtx vreg;
 	      if (mode == vmode)
 		vreg = vector_const;
@@ -3426,6 +3433,186 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
     }
 }
 
+enum x86_cse_kind
+{
+  X86_CSE_CONST0_VECTOR,
+  X86_CSE_CONSTM1_VECTOR,
+  X86_CSE_VEC_DUP
+};
+
+struct redundant_load
+{
+  /* Bitmap of basic blocks with broadcast instructions.  */
+  auto_bitmap bbs;
+  /* Bitmap of broadcast instructions.  */
+  auto_bitmap insns;
+  /* The broadcast inner scalar.  */
+  rtx val;
+  /* The inner scalar mode.  */
+  machine_mode mode;
+  /* The instruction which sets the inner scalar.  Nullptr if the inner
+     scalar is applied to the whole function, instead of within the same
+     block.  */
+  rtx_insn *def_insn;
+  /* The widest broadcast source.  */
+  rtx broadcast_source;
+  /* The widest broadcast register.  */
+  rtx broadcast_reg;
+  /* The basic block of the broadcast instruction.  */
+  basic_block bb;
+  /* The number of broadcast instructions with the same inner scalar.  */
+  unsigned HOST_WIDE_INT count;
+  /* The threshold of broadcast instructions with the same inner
+     scalar.  */
+  unsigned int threshold;
+  /* The widest broadcast size in bytes.  */
+  unsigned int size;
+  /* Load kind.  */
+  x86_cse_kind kind;
+};
+
+/* Return the inner scalar if OP is a broadcast, else return nullptr.  */
+
+static rtx
+ix86_broadcast_inner (rtx op, machine_mode mode,
+		      machine_mode *scalar_mode_p,
+		      x86_cse_kind *kind_p, rtx_insn **insn_p)
+{
+  if (op == const0_rtx || op == CONST0_RTX (mode))
+    {
+      *scalar_mode_p = QImode;
+      *kind_p = X86_CSE_CONST0_VECTOR;
+      *insn_p = nullptr;
+      return const0_rtx;
+    }
+  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	   && (op == constm1_rtx || op == CONSTM1_RTX (mode)))
+    {
+      *scalar_mode_p = QImode;
+      *kind_p = X86_CSE_CONSTM1_VECTOR;
+      *insn_p = nullptr;
+      return constm1_rtx;
+    }
+
+  mode = GET_MODE (op);
+  int nunits = GET_MODE_NUNITS (mode);
+  if (nunits < 2)
+    return nullptr;
+
+  *kind_p = X86_CSE_VEC_DUP;
+
+  rtx reg;
+  if (GET_CODE (op) == VEC_DUPLICATE)
+    {
+      /* Only
+	  (vec_duplicate:V4SI (reg:SI 99))
+	  (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0  S8 A64]))
+	 are supported.  Set OP to the broadcast source by default.  */
+      op = XEXP (op, 0);
+      reg = op;
+      if (SUBREG_P (op)
+	  && SUBREG_BYTE (op) == 0
+	  && !paradoxical_subreg_p (op))
+	reg = SUBREG_REG (op);
+      if (!REG_P (reg))
+	{
+	  if (MEM_P (op)
+	      && SYMBOL_REF_P (XEXP (op, 0))
+	      && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
+	    {
+	      /* Handle constant broadcast from memory.  */
+	      *scalar_mode_p = GET_MODE_INNER (mode);
+	      *insn_p = nullptr;
+	      return op;
+	    }
+	  return nullptr;
+	}
+    }
+  else if (CONST_VECTOR_P (op))
+    {
+      rtx first = XVECEXP (op, 0, 0);
+      for (int i = 1; i < nunits; ++i)
+	{
+	  rtx tmp = XVECEXP (op, 0, i);
+	  /* Vector duplicate value.  */
+	  if (!rtx_equal_p (tmp, first))
+	    return nullptr;
+	}
+      *scalar_mode_p = GET_MODE (first);
+      *insn_p = nullptr;
+      return first;
+    }
+  else
+    return nullptr;
+
+  mode = GET_MODE (op);
+
+  /* Only single def chain is supported.  */
+  df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+  if (!ref
+      || DF_REF_IS_ARTIFICIAL (ref)
+      || DF_REF_NEXT_REG (ref) != nullptr)
+    return nullptr;
+
+  rtx_insn *insn = DF_REF_INSN (ref);
+  rtx set = single_set (insn);
+  if (!set)
+    return nullptr;
+
+  rtx src = SET_SRC (set);
+
+  if (CONST_INT_P (src))
+    {
+      /* Handle sequences like
+
+	 (set (reg:SI 99)
+	       (const_int 34 [0x22]))
+	 (set (reg:V4SI 98)
+	       (vec_duplicate:V4SI (reg:SI 99)))
+
+	 Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
+	 integer constant.  */
+      op = src;
+      *insn_p = nullptr;
+    }
+  else
+    {
+      /* Handle sequences like
+
+	 (set (reg:QI 105 [ c ])
+	      (reg:QI 5 di [ c ]))
+	 (set (reg:V64QI 102 [ _1 ])
+	      (vec_duplicate:V64QI (reg:QI 105 [ c ])))
+
+	 (set (reg/v:SI 116 [ argc ])
+	      (mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
+	 (set (reg:V4SI 119 [ _45 ])
+	      (vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))
+
+	 (set (reg:SI 98 [ _1 ])
+	      (sign_extend:SI (reg:QI 106 [ c ])))
+	 (set (reg:V16SI 103 [ _2 ])
+	       (vec_duplicate:V16SI (reg:SI 98 [ _1 ])))
+
+	 (set (reg:SI 102 [ cost ])
+	      (mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
+	 (set (reg:V4HI 103 [ _16 ])
+	      (vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))
+
+	 (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
+	      (ashift:SI (reg:SI 158)
+			 (subreg:QI (reg:SI 156 [ _2 ]) 0)))
+	 (set (reg:V16HI 183 [ _61 ])
+	      (vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))
+
+	 Set *INSN_P to INSN and return the broadcast source otherwise.  */
+      *insn_p = insn;
+    }
+
+  *scalar_mode_p = mode;
+  return op;
+}
+
 /* At entry of the nearest common dominator for basic blocks with vector
    CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
    vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
@@ -3440,20 +3627,16 @@ remove_redundant_vector_load (void)
 {
   timevar_push (TV_MACH_DEP);
 
-  auto_bitmap zero_bbs;
-  auto_bitmap m1_bbs;
-  auto_bitmap zero_insns;
-  auto_bitmap m1_insns;
-
+  auto_vec<redundant_load *> loads;
+  redundant_load *load;
   basic_block bb;
   rtx_insn *insn;
-  unsigned HOST_WIDE_INT zero_count = 0;
-  unsigned HOST_WIDE_INT m1_count = 0;
-  unsigned int zero_size = 0;
-  unsigned int m1_size = 0;
+  unsigned int i;
 
   df_set_flags (DF_DEFER_INSN_RESCAN);
 
+  bool recursive_call_p = cfun->machine->recursive_function;
+
   FOR_EACH_BB_FN (bb, cfun)
     {
       FOR_BB_INSNS (bb, insn)
@@ -3481,79 +3664,139 @@ remove_redundant_vector_load (void)
 	  if (!REG_P (dest) && !SUBREG_P (dest))
 	    continue;
 
-	  if (src == CONST0_RTX (mode))
-	    {
-	      /* Record vector instruction with CONST0_RTX.  */
-	      bitmap_set_bit (zero_insns, INSN_UID (insn));
+	  rtx_insn *def_insn;
+	  machine_mode scalar_mode;
+	  x86_cse_kind kind;
+	  rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
+					  &kind, &def_insn);
+	  if (!val)
+	    continue;
 
-	      /* Record the maximum vector size.  */
-	      if (zero_size < GET_MODE_SIZE (mode))
-		zero_size = GET_MODE_SIZE (mode);
+	   /* Remove redundant register loads if there are more than 2
+	      loads will be used.  */
+	  unsigned int threshold = 2;
+
+	  /* Check if there is a matching redundant vector load.   */
+	  bool matched = false;
+	  FOR_EACH_VEC_ELT (loads, i, load)
+	    if (load->val
+		&& load->kind == kind
+		&& load->mode == scalar_mode
+		&& (load->bb == bb
+		    || kind < X86_CSE_VEC_DUP
+		    /* Non all 0s/1s vector load must be in the same
+		       basic block if it is in a recursive call.  */
+		    || !recursive_call_p)
+		&& rtx_equal_p (load->val, val))
+	      {
+		/* Record vector instruction.  */
+		bitmap_set_bit (load->insns, INSN_UID (insn));
 
-	      /* Record the basic block with CONST0_RTX.  */
-	      bitmap_set_bit (zero_bbs, bb->index);
-	      zero_count++;
-	    }
-	  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-		   && src == CONSTM1_RTX (mode))
-	    {
-	      /* Record vector instruction with CONSTM1_RTX.  */
-	      bitmap_set_bit (m1_insns, INSN_UID (insn));
+		/* Record the maximum vector size.  */
+		if (load->size < GET_MODE_SIZE (mode))
+		  load->size = GET_MODE_SIZE (mode);
 
-	      /* Record the maximum vector size.  */
-	      if (m1_size < GET_MODE_SIZE (mode))
-		m1_size = GET_MODE_SIZE (mode);
+		/* Record the basic block.  */
+		bitmap_set_bit (load->bbs, bb->index);
+		load->count++;
+		matched = true;
+		break;
+	      }
 
-	      /* Record the basic block with CONSTM1_RTX.  */
-	      bitmap_set_bit (m1_bbs, bb->index);
-	      m1_count++;
-	    }
-	}
-    }
+	  if (matched)
+	    continue;
 
-  if (zero_count > 1 || m1_count > 1)
-    {
-      machine_mode zero_mode, m1_mode;
-      rtx vector_const0, vector_constm1;
+	  /* We see this vector broadcast the first time.  */
+	  load = new redundant_load;
 
-      if (zero_count > 1)
-	{
-	  zero_mode = ix86_get_vector_load_mode (zero_size);
-	  vector_const0 = gen_reg_rtx (zero_mode);
-	  replace_vector_const (zero_mode, vector_const0, zero_insns);
-	}
-      else
-	{
-	  zero_mode = VOIDmode;
-	  vector_const0 = nullptr;
-	}
+	  load->val = copy_rtx (val);
+	  load->mode = scalar_mode;
+	  load->size = GET_MODE_SIZE (mode);
+	  load->def_insn = def_insn;
+	  load->count = 1;
+	  load->threshold = threshold;
+	  load->bb = BLOCK_FOR_INSN (insn);
+	  load->kind = kind;
 
-      if (m1_count > 1)
-	{
-	  m1_mode = ix86_get_vector_load_mode (m1_size);
-	  vector_constm1 = gen_reg_rtx (m1_mode);
-	  replace_vector_const (m1_mode, vector_constm1, m1_insns);
-	}
-      else
-	{
-	  m1_mode = VOIDmode;
-	  vector_constm1 = nullptr;
+	  bitmap_set_bit (load->insns, INSN_UID (insn));
+	  bitmap_set_bit (load->bbs, bb->index);
+
+	  loads.safe_push (load);
 	}
+    }
+
+  bool replaced = false;
+  rtx reg, broadcast_source, broadcast_reg;
+  FOR_EACH_VEC_ELT (loads, i, load)
+    if (load->count >= load->threshold)
+      {
+	machine_mode mode = ix86_get_vector_cse_mode (load->size,
+						      load->mode);
+	broadcast_reg = gen_reg_rtx (mode);
+	if (load->def_insn)
+	  {
+	    /* Replace redundant vector loads with a single vector load
+	       in the same basic block.  */
+	    reg = load->val;
+	    if (load->mode != GET_MODE (reg))
+	      reg = gen_rtx_SUBREG (load->mode, reg, 0);
+	    broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+	    replace_vector_const (mode, broadcast_reg, load->insns,
+				  load->mode);
+	  }
+	else
+	  {
+	    /* This is a constant integer/double vector.  If the
+	       inner scalar is 0 or -1, set vector to CONST0_RTX
+	       or CONSTM1_RTX directly.  */
+	    rtx reg;
+	    switch (load->kind)
+	      {
+	      case X86_CSE_CONST0_VECTOR:
+		broadcast_source = CONST0_RTX (mode);
+		break;
+	      case X86_CSE_CONSTM1_VECTOR:
+		broadcast_source = CONSTM1_RTX (mode);
+		break;
+	      default:
+		reg = gen_reg_rtx (load->mode);
+		broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+		break;
+	      }
+	    replace_vector_const (mode, broadcast_reg, load->insns,
+				  load->mode);
+	  }
+	load->broadcast_source = broadcast_source;
+	load->broadcast_reg = broadcast_reg;
+	replaced = true;
+      }
 
+  if (replaced)
+    {
       /* (Re-)discover loops so that bb->loop_father can be used in the
 	 analysis below.  */
       calculate_dominance_info (CDI_DOMINATORS);
       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
 
-      if (vector_const0)
-	ix86_place_single_vector_set (vector_const0,
-				      CONST0_RTX (zero_mode),
-				      zero_bbs);
-
-      if (vector_constm1)
-	ix86_place_single_vector_set (vector_constm1,
-				      CONSTM1_RTX (m1_mode),
-				      m1_bbs);
+      FOR_EACH_VEC_ELT (loads, i, load)
+	if (load->count >= load->threshold)
+	  {
+	    if (load->def_insn)
+	      {
+		/* Insert a broadcast after the original scalar
+		   definition.  */
+		rtx set = gen_rtx_SET (load->broadcast_reg,
+				       load->broadcast_source);
+		insn = emit_insn_after (set, load->def_insn);
+	      }
+	    else
+	      ix86_place_single_vector_set (load->broadcast_reg,
+					    load->broadcast_source,
+					    load->bbs,
+					    (load->kind == X86_CSE_VEC_DUP
+					     ? load->val
+					     : nullptr));
+	  }
 
       loop_optimizer_finalize ();
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 7c16eac7700..812055085bb 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2924,6 +2924,9 @@ struct GTY(()) machine_function {
   /* True if inline asm with redzone clobber has been seen.  */
   BOOL_BITFIELD asm_redzone_clobber_seen : 1;
 
+  /* True if this is a recursive function.  */
+  BOOL_BITFIELD recursive_function : 1;
+
   /* The largest alignment, in bytes, of stack slot actually used.  */
   unsigned int max_used_stack_alignment;
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
index 93806e51508..e73ba35ddd1 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
@@ -19,14 +19,14 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
 /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
index f9ccc82c7ca..33cd998bfdf 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
@@ -19,14 +19,14 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
 /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
index c0fcd28fb07..75106e59b77 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
@@ -19,14 +19,14 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
 /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
index 31463a8b2da..2787732229a 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
@@ -19,14 +19,14 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
-/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
 /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
+/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-10.c b/gcc/testsuite/gcc.target/i386/pr92080-10.c
new file mode 100644
index 00000000000..b67f9d8d285
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-10.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sapphirerapids -Ofast" } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+extern short write_picture_p_Vid_0;
+extern unsigned short *write_picture_p_2_0_0;
+extern int write_picture_p_0, write_picture_p_1, write_picture_i;
+void write_picture() {
+  unsigned short cr_val = 1 << write_picture_p_Vid_0;
+  for (; write_picture_p_1;)
+    for (; write_picture_i < write_picture_p_0; write_picture_i++)
+      write_picture_p_2_0_0[write_picture_i] = cr_val;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-11.c b/gcc/testsuite/gcc.target/i386/pr92080-11.c
new file mode 100644
index 00000000000..8747fc47640
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-11.c
@@ -0,0 +1,33 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O3" } */
+
+struct s {
+  char s[sizeof(long double)];
+};
+
+union u {
+  long double d;
+  struct s s;
+};
+
+int main()
+{
+  union u x = {0};
+#if __SIZEOF_LONG_DOUBLE__ == 16
+  x.s = (struct s){"xxxxxxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 12
+  x.s = (struct s){"xxxxxxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 8
+  x.s = (struct s){"xxxxxxxx"};
+#elif __SIZEOF_LONG_DOUBLE__ == 4
+  x.s = (struct s){"xxxx"};
+#endif
+
+  union u y = x;
+
+  for (unsigned char *p = (unsigned char *)&y + sizeof y;
+       p-- > (unsigned char *)&y;)
+    if (*p != (unsigned char)'x')
+      __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-12.c b/gcc/testsuite/gcc.target/i386/pr92080-12.c
new file mode 100644
index 00000000000..cb09eb2f0a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mno-mmx -march=icelake-server" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+signed char a;
+signed char f (int i, int j)
+{
+  signed char c;
+  while (i != 0)
+  {
+    a ^= j;
+    ++c;
+    ++i;
+  }
+  return c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-13.c b/gcc/testsuite/gcc.target/i386/pr92080-13.c
new file mode 100644
index 00000000000..24b7616c894
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-13.c
@@ -0,0 +1,32 @@
+/* { dg-do run { target { avx512f_runtime } } } */
+/* { dg-options "-mavx512f -mtune=icelake-server -O2 -save-temps" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+
+#include <assert.h>
+
+#define CONTAINER_KIND union
+
+typedef CONTAINER_KIND container { int value; } container;
+
+void move(container* end, container* start) {
+    container* p;
+    for (p = end; p > start; p--) {
+	(p)->value = (p-1)->value;
+    }
+}
+
+#define N 100
+
+int main(int argc, char* argv[]) {
+    container vals[N];
+    int i;
+    for (i=0; i<N; i++) {
+        vals[i].value = argc + i;
+    }
+    move(&vals[N-1], &vals[0]);
+    assert(vals[0].value == argc + 0);
+    for (i=1; i<N; i++) {
+        assert(vals[i].value == argc + i - 1);
+    }
+    return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-14.c b/gcc/testsuite/gcc.target/i386/pr92080-14.c
new file mode 100644
index 00000000000..6be41b63400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-14.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v4si sinksx1;
+
+extern void bar (void);
+
+void
+foo (char c, int i)
+{
+  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  if (i == 1)
+    {
+      sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+      bar ();
+    }
+  else if (i == 2)
+    {
+      sinksx = __extension__(v4si){c,c,c,c};
+      bar ();
+    }
+  sinksx1 = __extension__(v4si){c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-15.c b/gcc/testsuite/gcc.target/i386/pr92080-15.c
new file mode 100644
index 00000000000..fa55d82e48e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-15.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+  if (j == 1)
+    s1[i] = __extension__(v4si){34, 34, 34, 34};
+  else if (i == 1)
+    s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+  if ((i + j) == 1234)
+    i = foo (j, i);
+  s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+				   34, 34, 34, 34, 34, 34, 34, 34};
+  return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-16.c b/gcc/testsuite/gcc.target/i386/pr92080-16.c
new file mode 100644
index 00000000000..c8ab084b714
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-16.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+
+extern v4si *s1;
+extern v8si *s2;
+extern v16si *s3;
+
+int
+foo (int i, int j)
+{
+  if (j == 1)
+    {
+      s1[i] = __extension__(v4si){34, 34, 34, 34};
+      s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+      s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+				       34, 34, 34, 34, 34, 34, 34, 34};
+    }
+  if ((i + j) == 1234)
+    i = foo (j, i);
+  return i - j;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-17.c b/gcc/testsuite/gcc.target/i386/pr92080-17.c
new file mode 100644
index 00000000000..f3d6b691754
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-17.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.}  } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	vpbroadcastw	cost\(%rip\), %xmm0
+**	vmovq	%xmm0, cost1\(%rip\)
+**	vmovdqu	%xmm0, cost2\(%rip\)
+**	ret
+**...
+*/
+
+extern struct {
+  short cost[4];
+} cost1;
+extern struct {
+  short cost[8];
+} cost2;
+extern int cost;
+
+void
+foo (void)
+{
+  cost1.cost[0] = cost;
+  cost1.cost[1] = cost;
+  cost1.cost[2] = cost;
+  cost1.cost[3] = cost;
+  cost2.cost[0] = cost;
+  cost2.cost[1] = cost;
+  cost2.cost[2] = cost;
+  cost2.cost[3] = cost;
+  cost2.cost[4] = cost;
+  cost2.cost[5] = cost;
+  cost2.cost[6] = cost;
+  cost2.cost[7] = cost;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-18.c b/gcc/testsuite/gcc.target/i386/pr92080-18.c
new file mode 100644
index 00000000000..b4ec12e5536
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-18.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
+
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+typedef double v8df __attribute__((vector_size(64)));
+
+extern v2df d1;
+extern v4df d2;
+extern v8df d3;
+
+void
+foo ()
+{
+  d1 = __extension__(v2df){2.34, 2.34};
+  d2 = __extension__(v4df){2.34, 2.34, 2.34, 2.34};
+  d3 = __extension__(v8df){2.34, 2.34, 2.34, 2.34, 2.34, 2.34, 2.34, 2.34};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-19.c b/gcc/testsuite/gcc.target/i386/pr92080-19.c
new file mode 100644
index 00000000000..b1a1bdb47db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-19.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+extern v2di d1;
+extern v4di d2;
+extern v8di d3;
+
+void
+foo (long long a1, long long a2, long long a3, long long a4,
+     long long a5, long long a6, long long a7)
+{
+  d1 = __extension__(v2di){a7, a7};
+  d2 = __extension__(v4di){a7, a7, a7, a7};
+  d3 = __extension__(v8di){a7, a7, a7, a7, a7, a7, a7, a7};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-20.c b/gcc/testsuite/gcc.target/i386/pr92080-20.c
new file mode 100644
index 00000000000..542ef2a01b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-20.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
+
+typedef double v2di __attribute__((vector_size(16)));
+typedef double v4di __attribute__((vector_size(32)));
+typedef double v8di __attribute__((vector_size(64)));
+
+extern v2di d1;
+extern v4di d2;
+extern v8di d3;
+
+void
+foo (double a1, double a2, double a3, double a4,
+     double a5, double a6, double a7)
+{
+  d1 = __extension__(v2di){a7, a7};
+  d2 = __extension__(v4di){a7, a7, a7, a7};
+  d3 = __extension__(v8di){a7, a7, a7, a7, a7, a7, a7, a7};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-4.c b/gcc/testsuite/gcc.target/i386/pr92080-4.c
new file mode 100644
index 00000000000..ebe1384c691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-4.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+extern v16si sinksz;
+extern v8si sinksy;
+extern v4si sinksx;
+extern v32hi sinkhz;
+extern v16hi sinkhy;
+extern v8hi sinkhx;
+extern v64qi sinkbz;
+extern v32qi sinkby;
+extern v16qi sinkbx;
+
+void foo(char c) {
+  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+  sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+  sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+  sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-5.c b/gcc/testsuite/gcc.target/i386/pr92080-5.c
new file mode 100644
index 00000000000..380cd337e09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-5.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss" 1 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v16sf __attribute__((vector_size(64)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+typedef double v8df __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v4sf f1;
+extern v2df d1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v8sf f2;
+extern v4df d2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+extern v16sf f3;
+extern v8df d3;
+
+void
+foo1 ()
+{
+  b1 = __extension__(v16qi){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+  b2 = __extension__(v32qi){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+  b3 = __extension__(v64qi){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo2 ()
+{
+  h1 = __extension__(v8hi){34, 34, 34, 34, 34, 34, 34, 34};
+  h2 = __extension__(v16hi){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+  h3 = __extension__(v32hi){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo3 ()
+{
+  s1 = __extension__(v4si){34, 34, 34, 34};
+  s2 = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
+  s3 = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo4 ()
+{
+  l1 = __extension__(v2di){34, 34};
+  l2 = __extension__(v4di){34, 34, 34, 34};
+  l3 = __extension__(v8di){34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo5 ()
+{
+  f1 = __extension__(v4sf){34, 34, 34, 34};
+  f2 = __extension__(v8sf){34, 34, 34, 34, 34, 34, 34, 34};
+  f3 = __extension__(v16sf){34, 34, 34, 34, 34, 34, 34, 34,
+			    34, 34, 34, 34, 34, 34, 34, 34};
+}
+
+void
+foo6 ()
+{
+  d1 = __extension__(v2df){34, 34};
+  d2 = __extension__(v4df){34, 34, 34, 34};
+  d3 = __extension__(v8df){34, 34, 34, 34, 34, 34, 34, 34};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-6.c b/gcc/testsuite/gcc.target/i386/pr92080-6.c
new file mode 100644
index 00000000000..e4cdbee55be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-6.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+
+void
+foo(char c, int x)
+{
+  c += f;
+  sinkz = _mm512_set1_epi8(c);
+  if (x == 2)
+    f += 3;
+  sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-7.c b/gcc/testsuite/gcc.target/i386/pr92080-7.c
new file mode 100644
index 00000000000..8691684e96b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-7.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+
+#include <immintrin.h>
+
+extern __m512i sinkz;
+extern __m256i sinky;
+extern char f;
+extern void bar (void);
+
+void
+foo(char c, int x)
+{
+  c += f;
+  sinkz = _mm512_set1_epi8(c);
+  if (x == 2)
+    bar ();
+  sinky = _mm256_set1_epi8(c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-8.c b/gcc/testsuite/gcc.target/i386/pr92080-8.c
new file mode 100644
index 00000000000..7ebb62cea75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+extern v4si s;
+extern v2di l;
+
+void
+foo(void)
+{
+  l = __extension__(v2di){2,2};
+  s = __extension__(v4si){2,2,2,2};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-9.c b/gcc/testsuite/gcc.target/i386/pr92080-9.c
new file mode 100644
index 00000000000..f44ab563f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-9.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]+" 8 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]+" 3 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[\\t \]+" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long int v2di __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+
+extern v16qi b1;
+extern v8hi h1;
+extern v4si s1;
+extern v2di l1;
+extern v32qi b2;
+extern v16hi h2;
+extern v8si s2;
+extern v4di l2;
+extern v64qi b3;
+extern v32hi h3;
+extern v16si s3;
+extern v8di l3;
+
+void
+foo(void)
+{
+  b1 = __extension__(v16qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+  h1 = __extension__(v8hi){0x2222, 0x2222, 0x2222, 0x2222,
+			   0x2222, 0x2222, 0x2222, 0x2222};
+  s1 = __extension__(v4si){0x22222222,0x22222222,0x22222222,0x22222222};
+  l1 = __extension__(v2di){0x2222222222222222ULL,0x2222222222222222ULL};
+  b2 = __extension__(v32qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+  h2 = __extension__(v16hi){0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222};
+  s2 = __extension__(v8si){0x22222222,0x22222222,0x22222222,0x22222222,
+			   0x22222222,0x22222222,0x22222222,0x22222222};
+  l2 = __extension__(v4di){0x2222222222222222ULL,0x2222222222222222ULL,
+			   0x2222222222222222ULL,0x2222222222222222ULL};
+  b3 = __extension__(v64qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+			    0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
+  h3 = __extension__(v32hi){0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222,
+			    0x2222, 0x2222, 0x2222, 0x2222};
+  s3 = __extension__(v16si){0x22222222,0x22222222,0x22222222,0x22222222,
+			    0x22222222,0x22222222,0x22222222,0x22222222,
+			    0x22222222,0x22222222,0x22222222,0x22222222,
+			    0x22222222,0x22222222,0x22222222,0x22222222};
+  l3 = __extension__(v8di){0x2222222222222222ULL,0x2222222222222222ULL,
+			   0x2222222222222222ULL,0x2222222222222222ULL,
+			   0x2222222222222222ULL,0x2222222222222222ULL,
+			   0x2222222222222222ULL,0x2222222222222222ULL};
+}
-- 
2.49.0

Reply via email to