On Tue, Jun 30, 2026 at 1:57 PM Sarvesh Chandra <[email protected]> wrote:
>
> The AVX-512 masked epilogue emits a masked load (UNSPEC_MASKLOAD under
> a VEC_MERGE) feeding a masked binary op; the wrapper hides the memory
> operand from combine, so it is not folded into the operation.
>
> Add four define_insn_and_split patterns that strip the wrapper, for
> commutative and non-commutative ops, with and without an outer mask.
> Iterator V1248FH_AVX512VLBW covers all maskable modes; a new
> noncommutative_binary_operator predicate handles minus and div.
>
> gcc/ChangeLog:
>
> PR target/123997
> * config/i386/predicates.md (noncommutative_binary_operator):
> New predicate.
> * config/i386/sse.md (V1248FH_AVX512VLBW): New mode iterator.
> (*comm_maskload_fold_unmasked_op): New pattern.
> (*comm_maskload_fold_masked_op): New pattern.
> (*noncomm_maskload_fold_unmasked_op): New pattern.
> (*noncomm_maskload_fold_masked_op): New pattern.
>
> gcc/testsuite/ChangeLog:
>
> PR target/123997
> * gcc.target/i386/avx512-maskload-fold-1.c: New test.
> * gcc.target/i386/avx512-maskload-fold-2.c: New test.
> * gcc.target/i386/avx512-maskload-fold-3.c: New test.
> * gcc.target/i386/avx512-maskload-fold-4.c: New test.
> * gcc.target/i386/avx512-maskload-fold-5.c: New test.
> * gcc.target/i386/avx512-maskload-fold-6.c: New test.
> * gcc.target/i386/avx512-maskload-fold-7.c: New test.
> * gcc.target/i386/avx512-maskload-fold-8.c: New test.
> * gcc.target/i386/avx512-maskload-fold-9.c: New test.
> * gcc.target/i386/avx512-maskload-fold-10.c: New test.
> * gcc.target/i386/avx512-maskload-fold-11.c: New test.
> * gcc.target/i386/avx512-maskload-fold-12.c: New test.
> * gcc.target/i386/avx512-maskload-fold-13.c: New test.
> * gcc.target/i386/avx512-maskload-fold-14.c: New test.
> * gcc.target/i386/avx512-maskload-fold-15.c: New test.
> * gcc.target/i386/avx512-maskload-fold-16.c: New test.
>
> Co-authored-by: Venkataramanan Kumar <[email protected]>
> Signed-off-by: Sarvesh Chandra <[email protected]>
> ---
> RFC: The generic approach to avoid the explosion of patterns here would be
> to hoist the vec_merge at the root of the RTL tree during combine; that
> would target unary, binary and ternary operations, including operations
> with a complex RTL pattern such as vpavgb. The following vpavgb case
> results in no fold at -O3/-Ofast (avx512-maskload-fold-15.c, -16.c):
>
> void vector_avg_uint8 (unsigned char * __restrict dst,
> const unsigned char * __restrict a,
> const unsigned char * __restrict b, int n)
> {
> for (int i = 0; i < n; ++i)
> dst[i] = (a[i] + b[i] + 1) >> 1;
> }
>
> Bootstrapped and regression-tested on x86_64-pc-linux-gnu.
>
> gcc/config/i386/predicates.md | 4 +
> gcc/config/i386/sse.md | 116 ++++++++++++++++++
> .../gcc.target/i386/avx512-maskload-fold-1.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-10.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-11.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-12.c | 15 +++
> .../gcc.target/i386/avx512-maskload-fold-13.c | 14 +++
> .../gcc.target/i386/avx512-maskload-fold-14.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-15.c | 15 +++
> .../gcc.target/i386/avx512-maskload-fold-16.c | 15 +++
> .../gcc.target/i386/avx512-maskload-fold-2.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-3.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-4.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-5.c | 15 +++
> .../gcc.target/i386/avx512-maskload-fold-6.c | 14 +++
> .../gcc.target/i386/avx512-maskload-fold-7.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-8.c | 12 ++
> .../gcc.target/i386/avx512-maskload-fold-9.c | 12 ++
> 18 files changed, 328 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
> create mode 100644 gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
>
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index cf3a68a9fe8..c98f77e6ad5 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1740,6 +1740,10 @@
> (define_predicate "commutative_operator"
> (match_code "plus,mult,and,ior,xor,smin,smax,umin,umax"))
>
> +;; Return true for non-commutative binary operators (minus, div).
> +(define_predicate "noncommutative_binary_operator"
> + (match_code "minus,div"))
The name is misleading, since the predicate only covers minus and div.
Would it be better to name it sub_div_operator?
> +
> ;; Return true if OP is a binary operator that can be promoted to wider mode.
> (define_predicate "promotable_binary_operator"
> (ior (match_code "plus,minus,and,ior,xor,ashift")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 51d1e9b455a..2538b39c0bf 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2792,6 +2792,122 @@
> (set_attr "btver2_decode" "direct,double")
> (set_attr "mode" "<MODE>")])
>
> +;; AVX-512 modes that support masked loads.
> +
> +(define_mode_iterator V1248FH_AVX512VLBW
> + [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
> + V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
> + (V32HF "TARGET_AVX512FP16")
> + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
> + V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
> + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")
> + (V64QI "TARGET_AVX512BW")
> + (V32QI "TARGET_AVX512VL && TARGET_AVX512BW")
> + (V16QI "TARGET_AVX512VL && TARGET_AVX512BW")
> + (V32HI "TARGET_AVX512BW")
> + (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
> + (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")])
> +
> +;; Fold masked load into commutative op, no outer mask.
> +(define_insn_and_split "*comm_maskload_fold_unmasked_op"
> + [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
> + (match_operator:V1248FH_AVX512VLBW 1 "commutative_operator"
> + [(vec_merge:V1248FH_AVX512VLBW
> + (unspec:V1248FH_AVX512VLBW
> + [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
> + UNSPEC_MASKLOAD)
> + (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 4 "register_operand"))
> + (match_operand:V1248FH_AVX512VLBW 5 "register_operand")]))]
> + "TARGET_AVX512F"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> +{
> + rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
> + operands[5], operands[2]);
> + rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[3], operands[4]);
> + emit_insn (gen_rtx_SET (operands[0], vm));
> + DONE;
> +})
It should be equal to vec_merge (op (operands[2], operands[5]), op (0,
operands[5]), mask), not vec_merge (op (operands[2], operands[5]), 0,
mask).
op (0, operands[5]) is not always equal to 0, e.g. when op is add, op
(0, operands[5]) is operands[5].
The same applies to noncomm_maskload_fold_unmasked_op.
> +
> +;; Fold masked load into commutative op, outer mask.
> +(define_insn_and_split "*comm_maskload_fold_masked_op"
> + [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
> + (vec_merge:V1248FH_AVX512VLBW
> + (match_operator:V1248FH_AVX512VLBW 1 "commutative_operator"
> + [(vec_merge:V1248FH_AVX512VLBW
> + (unspec:V1248FH_AVX512VLBW
> + [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
> + UNSPEC_MASKLOAD)
> + (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 4 "register_operand"))
> + (match_operand:V1248FH_AVX512VLBW 5 "register_operand")])
> + (match_operand:V1248FH_AVX512VLBW 6 "const0_operand")
> + (match_dup 4)))]
> + "TARGET_AVX512F"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> +{
> + rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
> + operands[5], operands[2]);
> + rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[6], operands[4]);
> + emit_insn (gen_rtx_SET (operands[0], vm));
> + DONE;
> +})
> +
> +;; Fold masked load into non-commutative op, no outer mask.
> +(define_insn_and_split "*noncomm_maskload_fold_unmasked_op"
> + [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
> + (match_operator:V1248FH_AVX512VLBW 1 "noncommutative_binary_operator"
> + [(match_operand:V1248FH_AVX512VLBW 5 "register_operand")
> + (vec_merge:V1248FH_AVX512VLBW
> + (unspec:V1248FH_AVX512VLBW
> + [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
> + UNSPEC_MASKLOAD)
> + (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 4 "register_operand"))]))]
> + "TARGET_AVX512F"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> +{
> + rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
> + operands[5], operands[2]);
> + rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[3], operands[4]);
> + emit_insn (gen_rtx_SET (operands[0], vm));
> + DONE;
> +})
> +
> +;; Fold masked load into non-commutative op, outer mask.
> +(define_insn_and_split "*noncomm_maskload_fold_masked_op"
> + [(set (match_operand:V1248FH_AVX512VLBW 0 "register_operand")
> + (vec_merge:V1248FH_AVX512VLBW
> + (match_operator:V1248FH_AVX512VLBW 1
> "noncommutative_binary_operator"
> + [(match_operand:V1248FH_AVX512VLBW 5 "register_operand")
> + (vec_merge:V1248FH_AVX512VLBW
> + (unspec:V1248FH_AVX512VLBW
> + [(match_operand:V1248FH_AVX512VLBW 2 "memory_operand")]
> + UNSPEC_MASKLOAD)
> + (match_operand:V1248FH_AVX512VLBW 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 4 "register_operand"))])
> + (match_operand:V1248FH_AVX512VLBW 6 "const0_operand")
> + (match_dup 4)))]
> + "TARGET_AVX512F"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> +{
> + rtx op = gen_rtx_fmt_ee (GET_CODE (operands[1]), <MODE>mode,
> + operands[5], operands[2]);
> + rtx vm = gen_rtx_VEC_MERGE (<MODE>mode, op, operands[6], operands[4]);
> + emit_insn (gen_rtx_SET (operands[0], vm));
> + DONE;
> +})
But if we remove the UNSPEC and split before RA, RA may reload the
whole memory operand into a register. That may cause a segmentation
fault if the full-width access is not valid.
So maybe it's better to make this a post-reload splitter, or to just
add define_insns for all of these with some define_subst magic.
> +
> +
> ;; Standard scalar operation patterns which preserve the rest of the
> ;; vector for combiner.
> (define_insn "*<sse>_vm<multdiv_mnemonic><mode>3"
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
> new file mode 100644
> index 00000000000..78125cad545
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vaddpd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_add_fp(double *__restrict dst,
> + const double *__restrict a,
> + const double *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] + b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
> new file mode 100644
> index 00000000000..839693bdf94
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-10.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vpaddd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_add_int(int *__restrict dst,
> + const int *__restrict a,
> + const int *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] + b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
> new file mode 100644
> index 00000000000..2841cf758a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-11.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vpmulld\[
> \\t\]+\[^\\n\]*\\(\[^\\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_mul_int(int *__restrict dst,
> + const int *__restrict a,
> + const int *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] * b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
> new file mode 100644
> index 00000000000..64b3870bb42
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-12.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vsubps\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void
> +vector_sub_fp (float * __restrict dst,
> + const float * __restrict a,
> + const float * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] - b[i];
> +}
> +
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
> new file mode 100644
> index 00000000000..d18aa4552bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-13.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vpsubq\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void
> +vector_sub_int64 (long long * __restrict dst,
> + const long long * __restrict a,
> + const long long * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] - b[i];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
> new file mode 100644
> index 00000000000..ea0a90596a1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-14.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler-not "vsubps\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_sub_no_fold(float *__restrict dst,
> + const float *__restrict a,
> + float b,
> + int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = a[i] - b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
> new file mode 100644
> index 00000000000..658f0a6cc8b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-15.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* vpavgb is a complex RTL pattern, not a plain binary operator, so the
> masked
> + load is not folded into it. A generic combine approach that hoists the
> + vec_merge to the root of the RTL tree would also cover this case. */
> +/* { dg-final { scan-assembler "vpavgb\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" { xfail *-*-* } } }
> */
> +
> +void vector_avg_uint8 (unsigned char * __restrict dst,
> + const unsigned char * __restrict a,
> + const unsigned char * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = (a[i] + b[i] + 1) >> 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
> new file mode 100644
> index 00000000000..f06c6ff33f2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-16.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* vpavgb is a complex RTL pattern, not a plain binary operator, so the
> masked
> + load is not folded into it. A generic combine approach that hoists the
> + vec_merge to the root of the RTL tree would also cover this case. */
> +/* { dg-final { scan-assembler "vpavgb\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" { xfail *-*-* } } }
> */
> +
> +void vector_avg_uint8 (unsigned char * __restrict dst,
> + const unsigned char * __restrict a,
> + const unsigned char * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = (a[i] + b[i] + 1) >> 1;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
> new file mode 100644
> index 00000000000..885b349e176
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vmulpd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_mul_fp(double *__restrict dst,
> + const double *__restrict a,
> + const double *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] * b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
> new file mode 100644
> index 00000000000..9fb0066fdb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vpaddd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_add_int(int *__restrict dst,
> + const int *__restrict a,
> + const int *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] + b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
> new file mode 100644
> index 00000000000..48a658d1d89
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-4.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vpmulld\[
> \\t\]+\[^\\n\]*\\(\[^\\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_mul_int(int *__restrict dst,
> + const int *__restrict a,
> + const int *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] * b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
> new file mode 100644
> index 00000000000..8663b32f2c7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-5.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vsubps\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void
> +vector_sub_fp (float * __restrict dst,
> + const float * __restrict a,
> + const float * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] - b[i];
> +}
> +
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
> new file mode 100644
> index 00000000000..20c86bce0fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-6.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler "vpsubq\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void
> +vector_sub_int64 (long long * __restrict dst,
> + long long * __restrict a,
> + long long * __restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] - b[i];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
> new file mode 100644
> index 00000000000..be0397bfa43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-7.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=x86-64-v4 --param vect-partial-vector-usage=1" }
> */
> +/* { dg-final { scan-assembler-not "vsubps\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_sub_no_fold(float * __restrict dst,
> + const float * __restrict a,
> + float b,
> + int n)
> +{
> + for (int i = 0; i < n; i++)
> + dst[i] = a[i] - b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
> new file mode 100644
> index 00000000000..9743d2902c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-8.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vaddpd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_add_fp(double *__restrict dst,
> + const double *__restrict a,
> + const double *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] + b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
> b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
> new file mode 100644
> index 00000000000..5a5c37fb4a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-maskload-fold-9.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-Ofast -march=x86-64-v4 --param
> vect-partial-vector-usage=1" } */
> +/* { dg-final { scan-assembler "vmulpd\[
> \\t\]+\[^\n\]*\\(\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}" } } */
> +
> +void vector_mul_fp(double *__restrict dst,
> + const double *__restrict a,
> + const double *__restrict b,
> + int n)
> +{
> + for (int i = 0; i < n; ++i)
> + dst[i] = a[i] * b[i];
> +}
> --
> 2.34.1
>
--
BR,
Hongtao