On Tue, Apr 29, 2025 at 11:27 AM H.J. Lu <[email protected]> wrote:
>
> On Tue, Apr 29, 2025 at 10:08 AM Hongtao Liu <[email protected]> wrote:
> >
> > On Mon, Apr 28, 2025 at 5:07 PM H.J. Lu <[email protected]> wrote:
> > >
> > > On Mon, Apr 28, 2025 at 4:26 PM H.J. Lu <[email protected]> wrote:
> > > >
> > >
> > > > > > This is what my patch does:
> > > > > But it iterates through vector_insns, using a def-ref chain to find
> > > > > those insns. I think we can just record those single_set with src as
> > > > > const_m1/zero, and replace src for them.
> > > >
> > > > Will fix it.
> > >
> > > Fixed in the v2 patch.
> > >
> > > > > >
> > > > > > /* Check the single definition of CONST0_RTX and
> > > > > > integer
> > > > > > CONSTM1_RTX. */
> > > > > > rtx src = SET_SRC (set);
> > > > > > rtx replace;
> > > > > > if (vector_const0 && src == CONST0_RTX (mode))
> > > > > > {
> > > > > > /* Replace REG with VECTOR_CONST0. */
> > > > > > if (SUBREG_P (reg) || mode == zero_mode)
> > > > > > replace = vector_const0;
> > > > > > else
> > > > > > replace = gen_rtx_SUBREG (mode, vector_const0,
> > > > > > 0);
> > > > > > *DF_REF_REAL_LOC (ref) = replace;
> > > > > > replaced = true;
> > > > > > zero_replaced = true;
> > > > > > }
> > > > > >
> > > > > > It changed the source to a subreg directly.
> > > > > >
> > > > > > > > We also need to change ix86_modes_tieable_p to make sure
> > > > > > > those
> > > > > > > inserted subreg can be handled by LRA and other passes?
> > > > > >
> > > > > > ix86_modes_tieable_p is OK:
> > > > > >
> > > > > > /* If MODE2 is only appropriate for an SSE register, then tie with
> > > > > > any other mode acceptable to SSE registers. */
> > > > > > if (GET_MODE_SIZE (mode2) == 64
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > > return (GET_MODE_SIZE (mode1) == 64
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > > if (GET_MODE_SIZE (mode2) == 32
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > > return (GET_MODE_SIZE (mode1) == 32
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > > if (GET_MODE_SIZE (mode2) == 16
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > > return (GET_MODE_SIZE (mode1) == 16
> > > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > >
> > > > > > It's OK only if the size of mode1 is equal to the size of mode2.
> > > > > But in the testcase, there are different size vectors(32-bytes,
> > > > > 16-bytes).
> > > > >
> > > > > So it would be better as, for mode2 >= 16 bytes, it can only be put
> > > > > into SSE_REGS(except for TImode, but TImode still can be tied to
> > > > > <=16bytes mode1 which can be put into SSE_REGS) , if mode1 can also be
> > > > > put into SSE_REGS, then mode2 tie with mode1.
> > > > >
> > > > > /* If MODE2 is only appropriate for an SSE register, then tie with
> > > > > any other mode acceptable to SSE registers. */
> > > > > - if (GET_MODE_SIZE (mode2) == 64
> > > > > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > - return (GET_MODE_SIZE (mode1) == 64
> > > > > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > - if (GET_MODE_SIZE (mode2) == 32
> > > > > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > - return (GET_MODE_SIZE (mode1) == 32
> > > > > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > - if (GET_MODE_SIZE (mode2) == 16
> > > > > + if (GET_MODE_SIZE (mode2) >= 16
> > > > > + && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)
> > > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > > > > - return (GET_MODE_SIZE (mode1) == 16
> > > > > - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
> > > > > + return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
> > > > >
> > > >
> > > > This caused:
> > > >
> > > > FAIL: gcc.target/i386/pr111267.c scan-assembler-not movd
> > > > FAIL: gcc.target/i386/pr111267.c scan-assembler-not movq
> > > > FAIL: gcc.target/i386/pr82580.c scan-assembler-not \\mmovzb
> > > >
> > > > since GCC thinks it is cheap to get QI/HI/SI/DI from TI in XMM.
> > > > I am testing:
> > > >
> > > > /* If MODE2 is only appropriate for an SSE register, then tie with
> > > > any other mode acceptable to SSE registers, excluding
> > > > (subreg:QI (reg:TI 99) 0))
> > > > (subreg:HI (reg:TI 99) 0))
> > > > (subreg:SI (reg:TI 99) 0))
> > > > (subreg:DI (reg:TI 99) 0))
> > > > to avoid unnecessary move from SSE register to integer register.
> > > > */
> > > > if (GET_MODE_SIZE (mode2) >= 16
> > > > && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
> > > > || (!INTEGRAL_MODE_P (mode1)
> > > > && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
> > > > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> > It looks like only scalar_int_mode_p should be excluded, not vector_mode?
> > so, how about
> > if (GET_MODE_SIZE (mode2) >= 16
> > && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
> > || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
> > && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
> > && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
> >
> > Only allow vector mode or scalar floating point in mode1 to be tied to
> > mode2 in SSE_REGS?
>
> Will fix.
Fixed in v3.
> >
> > >+ /* NB: Don't run recog_memoized here since vector SUBREG may not
> > >+ be valid. Let LRA handle vector SUBREG. */
> > It's tricky, we can insert a move pattern with dest same size as
> > SET_SRC (set), but with same component mode as constm1/constm0, so it
> > should be something like
> > mov v8qi, const0(v32qimode)
> > mov v2sf, subreg:v2sf (v8qi)
>
> It is exactly what my patch generates:
>
> (insn 31 2 5 2 (set (reg:V32QI 110)
> (const_vector:V32QI [
> (const_int 0 [0]) repeated x32
> ])) -1
> (nil))
> (insn 5 31 6 2 (set (reg:V2DF 98)
> (subreg:V2DF (reg:V32QI 110) 0)) "z2.c":35:6 2421 {movv2df_internal}
> (nil))
> (insn 6 5 7 2 (set (mem/c:V2DF (symbol_ref:DI ("d1") [flags 0x2]
> <var_decl 0x7fd11d6341c8 d1>) [1 d1+0 S16 A128])
> (reg:V2DF 98)) "z2.c":35:6 2421 {movv2df_internal}
> (expr_list:REG_DEAD (reg:V2DF 98)
> (nil)))
> (insn 7 6 8 2 (set (reg:V4SF 99)
> (subreg:V4SF (reg:V32QI 110) 0)) "z2.c":36:6 2418 {movv4sf_internal}
> (nil))
> (insn 8 7 9 2 (set (mem/c:V4SF (symbol_ref:DI ("f1") [flags 0x2]
> <var_decl 0x7fd11d634130 f1>) [2 f1+0 S16 A128])
> (reg:V4SF 99)) "z2.c":36:6 2418 {movv4sf_internal}
> (expr_list:REG_DEAD (reg:V4SF 99)
> (nil)))
> ...
>
> and LRA does eliminate the redundant moves.
>
> > I think LRA could eliminate the redundant move.
> >
> > >+static void
> > >+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > >+{
> > For ease of maintenance, can we also replace the corresponding
> > code in the remove_partial_avx_dependency function with a call to this
> > ix86_place_single_vector_set function?
>
> Will fix.
Fixed in v3.
Here is the v3 patch. OK for master?
Thanks.
--
H.J.
---
For all different modes of all 0s/1s vectors, we can use the single widest
all 0s/1s vector register for all 0s/1s vector uses in the whole function.
Add a pass to generate a single widest all 0s/1s vector set instruction at
entry of the nearest common dominator for basic blocks with all 0s/1s
vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number
of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
144 to 142.
NB: PR target/92080 and PR target/117839 aren't the same. PR target/117839
is for vectors of all 0s and all 1s with different sizes and different
components. PR target/92080 is for broadcast of the same component to
different vector sizes. This patch covers only all 0s and all 1s cases
of PR target/92080.
gcc/
PR target/92080
PR target/117839
* config/i386/i386-features.cc (ix86_place_single_vector_set):
New function.
(remove_partial_avx_dependency): Use it.
(ix86_get_vector_load_mode): New function.
(replace_vector_const): Likewise.
(remove_redundant_vector_load): Likewise.
(pass_data_remove_redundant_vector_load): Likewise.
(pass_remove_redundant_vector_load): Likewise.
(make_pass_remove_redundant_vector_load): Likewise.
* config/i386/i386-passes.def: Add
pass_remove_redundant_vector_load after
pass_remove_partial_avx_dependency.
* config/i386/i386-protos.h
(make_pass_remove_redundant_vector_load): New.
* config/i386/i386.cc (ix86_modes_tieable_p): Return true for
narrower non-scalar-integer modes in SSE registers.
gcc/testsuite/
PR target/92080
PR target/117839
* gcc.target/i386/pr117839-1a.c: New test.
* gcc.target/i386/pr117839-1b.c: Likewise.
* gcc.target/i386/pr117839-2.c: Likewise.
* gcc.target/i386/pr92080-1.c: Likewise.
* gcc.target/i386/pr92080-2.c: Likewise.
* gcc.target/i386/pr92080-3.c: Likewise.
From 0aaff253015852c353bc5b567b71b79f6c677b00 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <[email protected]>
Date: Fri, 29 Nov 2024 18:22:14 +0800
Subject: [PATCH v3] x86: Add a pass to remove redundant all 0s/1s vector load
For all different modes of all 0s/1s vectors, we can use the single widest
all 0s/1s vector register for all 0s/1s vector uses in the whole function.
Add a pass to generate a single widest all 0s/1s vector set instruction at
entry of the nearest common dominator for basic blocks with all 0s/1s
vector uses. On Linux/x86-64, in cc1plus, this patch reduces the number
of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
144 to 142.
NB: PR target/92080 and PR target/117839 aren't the same. PR target/117839
is for vectors of all 0s and all 1s with different sizes and different
components. PR target/92080 is for broadcast of the same component to
different vector sizes. This patch covers only all 0s and all 1s cases
of PR target/92080.
gcc/
PR target/92080
PR target/117839
* config/i386/i386-features.cc (ix86_place_single_vector_set):
New function.
(remove_partial_avx_dependency): Use it.
(ix86_get_vector_load_mode): New function.
(replace_vector_const): Likewise.
(remove_redundant_vector_load): Likewise.
(pass_data_remove_redundant_vector_load): Likewise.
(pass_remove_redundant_vector_load): Likewise.
(make_pass_remove_redundant_vector_load): Likewise.
* config/i386/i386-passes.def: Add
pass_remove_redundant_vector_load after
pass_remove_partial_avx_dependency.
* config/i386/i386-protos.h
(make_pass_remove_redundant_vector_load): New.
* config/i386/i386.cc (ix86_modes_tieable_p): Return true for
narrower non-scalar-integer modes in SSE registers.
gcc/testsuite/
PR target/92080
PR target/117839
* gcc.target/i386/pr117839-1a.c: New test.
* gcc.target/i386/pr117839-1b.c: Likewise.
* gcc.target/i386/pr117839-2.c: Likewise.
* gcc.target/i386/pr92080-1.c: Likewise.
* gcc.target/i386/pr92080-2.c: Likewise.
* gcc.target/i386/pr92080-3.c: Likewise.
Signed-off-by: H.J. Lu <[email protected]>
---
gcc/config/i386/i386-features.cc | 303 ++++++++++++++++++--
gcc/config/i386/i386-passes.def | 1 +
gcc/config/i386/i386-protos.h | 2 +
gcc/config/i386/i386.cc | 25 +-
gcc/testsuite/gcc.target/i386/pr117839-1a.c | 35 +++
gcc/testsuite/gcc.target/i386/pr117839-1b.c | 5 +
gcc/testsuite/gcc.target/i386/pr117839-2.c | 40 +++
gcc/testsuite/gcc.target/i386/pr92080-1.c | 72 +++++
gcc/testsuite/gcc.target/i386/pr92080-2.c | 59 ++++
gcc/testsuite/gcc.target/i386/pr92080-3.c | 48 ++++
10 files changed, 549 insertions(+), 41 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c35ac24fd8a..31f3ee2ef17 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3034,6 +3034,42 @@ ix86_rpad_gate ()
&& optimize_function_for_speed_p (cfun));
}
+/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
+ for basic block map BBS, which is in the fake loop that contains the
+ whole function, so that there is only a single vector set in the
+ whole function. */
+
+static void
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx set = gen_rtx_SET (dest, src);
+
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+ rtx_insn *set_insn;
+ if (insn == BB_HEAD (bb))
+ set_insn = emit_insn_before (set, insn);
+ else
+ set_insn = emit_insn_after (set,
+ insn ? PREV_INSN (insn) : BB_END (bb));
+ df_insn_rescan (set_insn);
+}
+
/* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -3188,35 +3224,10 @@ remove_partial_avx_dependency (void)
calculate_dominance_info (CDI_DOMINATORS);
loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
- /* Generate a vxorps at entry of the nearest dominator for basic
- blocks with conversions, which is in the fake loop that
- contains the whole function, so that there is only a single
- vxorps in the whole function. */
- bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
- convert_bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
-
- set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
-
- insn = BB_HEAD (bb);
- while (insn && !NONDEBUG_INSN_P (insn))
- {
- if (insn == BB_END (bb))
- {
- insn = NULL;
- break;
- }
- insn = NEXT_INSN (insn);
- }
- if (insn == BB_HEAD (bb))
- set_insn = emit_insn_before (set, insn);
- else
- set_insn = emit_insn_after (set,
- insn ? PREV_INSN (insn) : BB_END (bb));
- df_insn_rescan (set_insn);
+ ix86_place_single_vector_set (v4sf_const0,
+ CONST0_RTX (V4SFmode),
+ convert_bbs);
+
loop_optimizer_finalize ();
if (!control_flow_insns.is_empty ())
@@ -3288,6 +3299,240 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
return new pass_remove_partial_avx_dependency (ctxt);
}
+/* Return a machine mode suitable for vector SIZE. */
+
+static machine_mode
+ix86_get_vector_load_mode (unsigned int size)
+{
+ machine_mode mode;
+ if (size == 64)
+ mode = V64QImode;
+ else if (size == 32)
+ mode = V32QImode;
+ else
+ mode = V16QImode;
+ return mode;
+}
+
+/* Replace the source operand of instructions in VECTOR_INSNS with
+ VECTOR_CONST in VECTOR_MODE. */
+
+static void
+replace_vector_const (machine_mode vector_mode, rtx vector_const,
+ auto_bitmap &vector_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
+
+ EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* Get the single SET instruction. */
+ rtx set = single_set (insn);
+ rtx dest = SET_SRC (set);
+ machine_mode mode = GET_MODE (dest);
+
+ rtx replace;
+ /* Replace the source operand with VECTOR_CONST. */
+ if (SUBREG_P (dest) || mode == vector_mode)
+ replace = vector_const;
+ else
+ replace = gen_rtx_SUBREG (mode, vector_const, 0);
+
+ /* NB: Don't run recog_memoized here since vector SUBREG may not
+ be valid. Let LRA handle vector SUBREG. */
+ SET_SRC (set) = replace;
+ /* Drop possible dead definitions. */
+ PATTERN (insn) = set;
+ df_insn_rescan (insn);
+ }
+}
+
+/* At entry of the nearest common dominator for basic blocks with vector
+ CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
+ vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
+ uses.
+
+ NB: We want to generate only a single widest vector set to cover the
+ whole function. The LCM algorithm isn't appropriate here since it
+ may place a vector set inside the loop. */
+
+static unsigned int
+remove_redundant_vector_load (void)
+{
+ timevar_push (TV_MACH_DEP);
+
+ auto_bitmap zero_bbs;
+ auto_bitmap m1_bbs;
+ auto_bitmap zero_insns;
+ auto_bitmap m1_insns;
+
+ basic_block bb;
+ rtx_insn *insn;
+ unsigned HOST_WIDE_INT zero_count = 0;
+ unsigned HOST_WIDE_INT m1_count = 0;
+ unsigned int zero_size = 0;
+ unsigned int m1_size = 0;
+
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ rtx set = single_set (insn);
+ if (!set)
+ continue;
+
+ /* Record single set vector instruction with CONST0_RTX and
+ CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
+ CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
+ maximum size of CONST0_RTX and CONSTM1_RTX. */
+
+ rtx dest = SET_DEST (set);
+ machine_mode mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ continue;
+
+ rtx src = SET_SRC (set);
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ continue;
+
+ if (src == CONST0_RTX (mode))
+ {
+ /* Record vector instruction with CONST0_RTX. */
+ bitmap_set_bit (zero_insns, INSN_UID (insn));
+
+ /* Record the maximum vector size. */
+ if (zero_size < GET_MODE_SIZE (mode))
+ zero_size = GET_MODE_SIZE (mode);
+
+ /* Record the basic block with CONST0_RTX. */
+ bitmap_set_bit (zero_bbs, bb->index);
+ zero_count++;
+ }
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && src == CONSTM1_RTX (mode))
+ {
+ /* Record vector instruction with CONSTM1_RTX. */
+ bitmap_set_bit (m1_insns, INSN_UID (insn));
+
+ /* Record the maximum vector size. */
+ if (m1_size < GET_MODE_SIZE (mode))
+ m1_size = GET_MODE_SIZE (mode);
+
+ /* Record the basic block with CONSTM1_RTX. */
+ bitmap_set_bit (m1_bbs, bb->index);
+ m1_count++;
+ }
+ }
+ }
+
+ if (zero_count > 1 || m1_count > 1)
+ {
+ machine_mode zero_mode, m1_mode;
+ rtx vector_const0, vector_constm1;
+
+ if (zero_count > 1)
+ {
+ zero_mode = ix86_get_vector_load_mode (zero_size);
+ vector_const0 = gen_reg_rtx (zero_mode);
+ replace_vector_const (zero_mode, vector_const0, zero_insns);
+ }
+ else
+ {
+ zero_mode = VOIDmode;
+ vector_const0 = nullptr;
+ }
+
+ if (m1_count > 1)
+ {
+ m1_mode = ix86_get_vector_load_mode (m1_size);
+ vector_constm1 = gen_reg_rtx (m1_mode);
+ replace_vector_const (m1_mode, vector_constm1, m1_insns);
+ }
+ else
+ {
+ m1_mode = VOIDmode;
+ vector_constm1 = nullptr;
+ }
+
+ /* (Re-)discover loops so that bb->loop_father can be used in the
+ analysis below. */
+ calculate_dominance_info (CDI_DOMINATORS);
+ loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+ if (vector_const0)
+ ix86_place_single_vector_set (vector_const0,
+ CONST0_RTX (zero_mode),
+ zero_bbs);
+
+ if (vector_constm1)
+ ix86_place_single_vector_set (vector_constm1,
+ CONSTM1_RTX (m1_mode),
+ m1_bbs);
+
+ loop_optimizer_finalize ();
+
+ df_process_deferred_rescans ();
+ }
+
+ df_clear_flags (DF_DEFER_INSN_RESCAN);
+
+ timevar_pop (TV_MACH_DEP);
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_redundant_vector_load =
+{
+ RTL_PASS, /* type */
+ "rrvl", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_remove_redundant_vector_load : public rtl_opt_pass
+{
+public:
+ pass_remove_redundant_vector_load (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return remove_redundant_vector_load ();
+ }
+}; // class pass_remove_redundant_vector_load
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+{
+ return new pass_remove_redundant_vector_load (ctxt);
+}
+
/* Convert legacy instructions that clobbers EFLAGS to APX_NF
instructions when there are no flag set between a flag
producer and user. */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 39f8bc65ddc..06f0288b067 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,5 +35,6 @@ along with GCC; see the file COPYING3. If not see
PR116174. */
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bea3fd4b2e2..c59b5a67e3a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -427,6 +427,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
+extern rtl_opt_pass *make_pass_remove_redundant_vector_load
+ (gcc::context *);
extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dd076242177..ae2386785af 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21493,19 +21493,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
return mode1 == SFmode;
/* If MODE2 is only appropriate for an SSE register, then tie with
- any other mode acceptable to SSE registers. */
- if (GET_MODE_SIZE (mode2) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 16
+ any vector modes or scalar floating point modes acceptable to SSE
+ registers, excluding scalar integer modes with SUBREG:
+ (subreg:QI (reg:TI 99) 0))
+ (subreg:HI (reg:TI 99) 0))
+ (subreg:SI (reg:TI 99) 0))
+ (subreg:DI (reg:TI 99) 0))
+ to avoid unnecessary move from SSE register to integer register.
+ */
+ if (GET_MODE_SIZE (mode2) >= 16
+ && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
+ || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
+ && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
/* If MODE2 is appropriate for an MMX register, then tie
with any other mode acceptable to MMX registers. */
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
new file mode 100644
index 00000000000..4501cfbcad4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+void
+clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
+{
+ size_t *d1 = (size_t *) mem1;
+
+ *(d1 + 0) = 0;
+ *(d1 + 1) = 0;
+ *(d1 + 2) = 0;
+ if (nclears1 > 3)
+ {
+ *(d1 + nclears1 - 4) = 0;
+ *(d1 + nclears1 - 4 + 1) = 0;
+ *(d1 + nclears1 - 4 + 2) = 0;
+ *(d1 + nclears1 - 4 + 3) = 0;
+ }
+
+ double *d2 = (double *) mem2;
+
+ *(d2 + 0) = 0;
+ *(d2 + 1) = 0;
+ *(d2 + 2) = 0;
+ if (nclears2 > 3)
+ {
+ *(d2 + nclears2 - 4) = 0;
+ *(d2 + nclears2 - 4 + 1) = 0;
+ *(d2 + nclears2 - 4 + 2) = 0;
+ *(d2 + nclears2 - 4 + 3) = 0;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
new file mode 100644
index 00000000000..e71b991a207
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include "pr117839-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c
new file mode 100644
index 00000000000..c76744cf98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+float
+clear_memory (void *mem, size_t clearsize)
+{
+ size_t *d = (size_t *) mem;
+ size_t nclears = clearsize / sizeof (size_t);
+
+ *(d + 0) = 0;
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ if (nclears > 9)
+ {
+ *(d + 5) = 0;
+ *(d + 5 + 1) = 0;
+ *(d + 5 + 2) = 0;
+ *(d + 5 + 3) = 0;
+ *(d + nclears - 8) = 0;
+ *(d + nclears - 8 + 1) = 0;
+ *(d + nclears - 8 + 2) = 0;
+ *(d + nclears - 8 + 3) = 0;
+ }
+ else
+ {
+ *(d + 1) = 0;
+ *(d + 2) = 0;
+ *(d + 3) = 0;
+ *(d + 4) = 0;
+ *(d + nclears - 4) = 0;
+ *(d + nclears - 4 + 1) = 0;
+ *(d + nclears - 4 + 2) = 0;
+ *(d + nclears - 4 + 3) = 0;
+ }
+
+ return nclears;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c
new file mode 100644
index 00000000000..82d1ffd4e1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v2di l1;
+v4sf f1;
+v2df d1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+v4di l2;
+v8sf f2;
+v4df d2;
+
+void
+foo ()
+{
+ d1 = __extension__(v2df){0, 0};
+ f1 = __extension__(v4sf){0, 0, 0};
+ l1 = __extension__(v2di){0, 0};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+ d2 = __extension__(v4df){0, 0, 0, 0};
+ f2 = __extension__(v8sf){0, 0, 0, 0, 0, 0, 0, 0};
+ l2 = __extension__(v4di){0, 0, 0, 0};
+ s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+ s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c
new file mode 100644
index 00000000000..d160d90de53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo (int i, int j)
+{
+ switch (i)
+ {
+ case 1:
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ case 2:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ default:
+ break;
+ }
+
+ switch (i)
+ {
+ case 1:
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ case 2:
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c
new file mode 100644
index 00000000000..2174def4e6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "pxor" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+
+void
+foo (int i, int j)
+{
+ switch (i)
+ {
+ case 1:
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ break;
+ case 2:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ break;
+ default:
+ break;
+ }
+
+ switch (i)
+ {
+ case 1:
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ break;
+ case 2:
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ case 3:
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ break;
+ }
+}
--
2.49.0