https://gcc.gnu.org/g:cdfa5fe03512f7ac5a293480f634df68fc973060
commit r16-1298-gcdfa5fe03512f7ac5a293480f634df68fc973060 Author: liuhongt <hongtao....@intel.com> Date: Tue Jun 3 14:12:23 2025 +0800 Also handle avx512 kmask & immediate 15 or 3 when VF is 4/2. Like r16-105-g599bca27dc37b3, the patch handles redundant cleanup of upper bits for maskload, i.e.: Successfully matched this instruction: (set (reg:V4DF 175) (vec_merge:V4DF (unspec:V4DF [ (mem:V4DF (plus:DI (reg/v/f:DI 155 [ b ]) (reg:DI 143 [ ivtmp.56 ])) [1 S32 A64]) ] UNSPEC_MASKLOAD) (const_vector:V4DF [ (const_double:DF 0.0 [0x0.0p+0]) repeated x4 ]) (and:QI (reg:QI 125 [ mask__29.16 ]) (const_int 15 [0xf])))) For maskstore, it looks like the generated code is already optimal (at least I can't make a testcase), so the patch only handles maskload. gcc/ChangeLog: PR target/103750 * config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for maskload. * config/i386/sse.md (*<avx512>_load<mode>mask_and15): New define_insn_and_split. (*<avx512>_load<mode>mask_and3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-pr103750-3.c: New test. Diff: --- gcc/config/i386/i386.cc | 12 ++++++- gcc/config/i386/sse.md | 38 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c | 26 +++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index d48654a729a1..20ee360dcb0f 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22938,7 +22938,17 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } /* This is masked instruction, assume the same cost, as nonmasked variant. */ - else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) + else if (TARGET_AVX512F + && (register_operand (mask, GET_MODE (mask)) + /* Redunduant clean up of high bits for kmask with VL=2/4 + .i.e (vec_merge op0, op1, (and op3 15)). 
*/ + || (GET_CODE (mask) == AND + && register_operand (XEXP (mask, 0), GET_MODE (mask)) + && CONST_INT_P (XEXP (mask, 1)) + && ((INTVAL (XEXP (mask, 1)) == 3 + && GET_MODE_NUNITS (mode) == 2) + || (INTVAL (XEXP (mask, 1)) == 15 + && GET_MODE_NUNITS (mode) == 4))))) { *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c40b0fd49978..252ba0796065 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1589,6 +1589,44 @@ "&& 1" [(set (match_dup 0) (match_dup 1))]) +(define_insn_and_split "*<avx512>_load<mode>mask_and15" + [(set (match_operand:V48_AVX512VL_4 0 "register_operand" "=v") + (vec_merge:V48_AVX512VL_4 + (unspec:V48_AVX512VL_4 + [(match_operand:V48_AVX512VL_4 1 "memory_operand" "m")] + UNSPEC_MASKLOAD) + (match_operand:V48_AVX512VL_4 2 "nonimm_or_0_operand" "0C") + (and:QI + (match_operand:QI 3 "register_operand" "Yk") + (const_int 15))))] + "TARGET_AVX512F" + "#" + "&& 1" + [(set (match_dup 0) + (vec_merge:V48_AVX512VL_4 + (unspec:V48_AVX512VL_4 [(match_dup 1)] UNSPEC_MASKLOAD) + (match_dup 2) + (match_dup 3)))]) + +(define_insn_and_split "*<avx512>_load<mode>mask_and3" + [(set (match_operand:V8_AVX512VL_2 0 "register_operand" "=v") + (vec_merge:V8_AVX512VL_2 + (unspec:V8_AVX512VL_2 + [(match_operand:V8_AVX512VL_2 1 "memory_operand" "m")] + UNSPEC_MASKLOAD) + (match_operand:V8_AVX512VL_2 2 "nonimm_or_0_operand" "0C") + (and:QI + (match_operand:QI 3 "register_operand" "Yk") + (const_int 3))))] + "TARGET_AVX512F" + "#" + "&& 1" + [(set (match_dup 0) + (vec_merge:V8_AVX512VL_2 + (unspec:V8_AVX512VL_2 [(match_dup 1)] UNSPEC_MASKLOAD) + (match_dup 2) + (match_dup 3)))]) + (define_expand "<avx512>_load<mode>_mask" [(set (match_operand:VI12_AVX512VL 0 "register_operand") (vec_merge:VI12_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c new file 
mode 100644 index 000000000000..9965e633b201 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v4 -mprefer-vector-width=256 -Ofast" } */ +/* { dg-final { scan-assembler-not "kmov" } } */ + +void +foo (double* a, double* __restrict b, double* c, double* d, int n) +{ + for (int i = 0; i != n; i++) + { + double tmp = 0.0; + if (c[i] > d[i]) + tmp = b[i]; + a[i] = tmp; + } +} + +void +foo1 (double* a, double* __restrict b, double* c, double* d, int n) +{ + for (int i = 0; i != n; i++) + { + double tmp = 0.0; + if (c[i] > d[i]) + a[i] = b[i]; + } +}