[gcc r15-4397] Support andn_optab for x86
https://gcc.gnu.org/g:70f59d2a1c51bde085d8fc7df002918851e76c9c commit r15-4397-g70f59d2a1c51bde085d8fc7df002918851e76c9c Author: Cui, Lili Date: Thu Oct 17 08:50:38 2024 +0800 Support andn_optab for x86 Add new andn pattern to match the new optab added by r15-1890-gf379596e0ba99d. Only enable 64bit, 128bit and 256bit vector ANDN, X86-64 has mask mov instruction when avx512 is enabled. gcc/ChangeLog: * config/i386/sse.md (andn<mode>3): New. * config/i386/mmx.md (andn<mode>3): New. gcc/testsuite/ChangeLog: * g++.target/i386/vect-cmp.C: New test. Diff: --- gcc/config/i386/mmx.md | 7 +++ gcc/config/i386/sse.md | 7 +++ gcc/testsuite/g++.target/i386/vect-cmp.C | 23 +++ 3 files changed, 37 insertions(+) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 9d2a82c598e5..ef4ed8b501a1 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -4467,6 +4467,13 @@ operands[0] = lowpart_subreg (V16QImode, operands[0], <MODE>mode); }) +(define_expand "andn<mode>3" + [(set (match_operand:MMXMODEI 0 "register_operand") +(and:MMXMODEI + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand")) + (match_operand:MMXMODEI 2 "register_operand")))] + "TARGET_SSE2") + (define_insn "mmx_andnot<mode>3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (and:MMXMODEI diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a45b50ad7324..7be313346677 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18438,6 +18438,13 @@ (match_operand:VI_AVX2 2 "vector_operand")))] "TARGET_SSE2") +(define_expand "andn<mode>3" + [(set (match_operand:VI 0 "register_operand") + (and:VI + (not:VI (match_operand:VI 2 "register_operand")) + (match_operand:VI 1 "register_operand")))] + "TARGET_SSE2") + (define_expand "<sse2_avx2>_andnot<mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL diff --git a/gcc/testsuite/g++.target/i386/vect-cmp.C b/gcc/testsuite/g++.target/i386/vect-cmp.C new file mode 100644 index ..c154474fa51c --- /dev/null +++ 
b/gcc/testsuite/g++.target/i386/vect-cmp.C @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3 -fdump-tree-optimized" } */ + +#define vect8 __attribute__((vector_size(8) )) +#define vect16 __attribute__((vector_size(16) )) +#define vect32 __attribute__((vector_size(32) )) + +vect8 int bar0 (vect8 float a, vect8 float b, vect8 int c) +{ + return (a > b) ? 0 : c; +} + +vect16 int bar1 (vect16 float a, vect16 float b, vect16 int c) +{ + return (a > b) ? 0 : c; +} + +vect32 int bar2 (vect32 float a, vect32 float b, vect32 int c) +{ + return (a > b) ? 0 : c; +} + +/* { dg-final { scan-tree-dump-times ".BIT_ANDN " 3 "optimized" { target { ! ia32 } } } } */
[gcc r15-5666] Optimize 128-bit vector permutation with pand, pandn and por.
https://gcc.gnu.org/g:60b708a9c878aff9a76ec0d446ae63e6527327a6 commit r15-5666-g60b708a9c878aff9a76ec0d446ae63e6527327a6 Author: Cui, Lili Date: Tue Nov 26 15:10:23 2024 +0800 Optimize 128-bit vector permutation with pand, pandn and por. This patch introduces a new subroutine in ix86_expand_vec_perm_const_1. On x86, use mixed constant permutation for V8HImode and V16QImode when SSE2 is supported. This patch handles certain vector shuffle operations more efficiently using pand, pandn, and por. This change is intended to improve assembly code generation for configurations that support SSE2. gcc/ChangeLog: PR target/116675 * config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por): New subroutine. (ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por. gcc/testsuite/ChangeLog: PR target/116675 * gcc.target/i386/pr116675.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 50 + gcc/testsuite/gcc.target/i386/pr116675.c | 75 2 files changed, 125 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index f8dcce465e9a..2eb619725047 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -23102,6 +23102,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) return true; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a + permutation (which is a blend) with and, andnot and or when pshufb is not available. + + It handles case: + __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15); + __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15); + + An element[i] must be chosen between op0[i] and op1[i] to satisfy the + requirement. 
+ */ + +static bool +expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d) +{ + rtx rperm[16], vperm; + unsigned int i, nelt = d->nelt; + + if (!TARGET_SSE2 + || d->one_operand_p + || (d->vmode != V16QImode && d->vmode != V8HImode)) +return false; + + if (d->perm[0] != 0) +return false; + + /* The dest[i] must select an element between op0[i] and op1[i]. */ + for (i = 1; i < nelt; i++) +if ((d->perm[i] % nelt) != i) + return false; + + if (d->testing_p) + return true; + + /* Generates a blend mask for the operators AND and ANDNOT. */ + machine_mode inner_mode = GET_MODE_INNER (d->vmode); + for (i = 0; i < nelt; i++) +rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode) + : CONST0_RTX (inner_mode); + + vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm)); + vperm = force_reg (d->vmode, vperm); + + ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1); + + return true; +} + /* Implement permutation with pslldq + psrldq + por when pshufb is not available. */ static bool @@ -24161,6 +24208,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_psrlw_psllw_por (d)) return true; + if (expand_vec_perm_pand_pandn_por (d)) +return true; + /* Try sequences of four instructions. 
*/ if (expand_vec_perm_even_odd_trunc (d)) diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c new file mode 100644 index ..e463dd8415f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr116675.c @@ -0,0 +1,75 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ +/* { dg-final { scan-assembler-times "pand" 4 } } */ +/* { dg-final { scan-assembler-times "pandn" 4 } } */ +/* { dg-final { scan-assembler-times "por" 4 } } */ + +#include + +__attribute__((noinline, noclone, target("sse2"))) +static __v8hi foo1 (__v8hi a, __v8hi b) +{ + return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15); +} + +__attribute__((noinline, noclone, target("sse2"))) +static __v8hi foo2 (__v8hi a, __v8hi b) +{ + return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15); +} + +__attribute__((noinline, noclone, target("sse2"))) +static __v16qi foo3 (__v16qi a, __v16qi b) +{ + return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23, + 8, 25, 10, 27, 12, 29, 14, 31); +} + +__attribute__((noinline, noclone, target("sse2"))) +static __v16qi foo4 (__v16qi a, __v16qi b) +{ + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23, +8, 25, 10, 27,12,29,14,31); +} + +__attribute__((noinline, noclone)) void +compare_v8hi (__v8hi a, __v8hi b) +{ + for (int i = 0; i < 8; i++) +if (a[i] != b[i]) + __builtin_abort (); +} + +__attribute__((noinline, noclone)) void +compare_v16qi (__v16qi a, __v16qi b) +{ + for (int i = 0; i < 16; i++) +if (a[i] != b[i]) + __builtin_abort (); +} + +int main (void) +{ + __v8hi s1, s2, s3, s4, s5, s6; + __v16qi s7, s8, s9, s10, s11, s
[gcc r16-1569] x86: Fix shrink wrap separate ICE under -fstack-clash-protection [PR120697]
https://gcc.gnu.org/g:1f2e4058e57c68b5ea91ab2bac469d5e57b6ff46 commit r16-1569-g1f2e4058e57c68b5ea91ab2bac469d5e57b6ff46 Author: Lili Cui Date: Thu Jun 19 08:39:54 2025 +0800 x86: Fix shrink wrap separate ICE under -fstack-clash-protection [PR120697] gcc/ChangeLog: PR target/120697 * config/i386/i386.cc (ix86_expand_prologue): Remove 3 assertions and associated code. gcc/testsuite/ChangeLog: PR target/120697 * gcc.target/i386/stack-clash-protection.c: New test. Diff: --- gcc/config/i386/i386.cc | 14 +- .../gcc.target/i386/stack-clash-protection.c | 19 +++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 9bf198c7416c..77853297a2fa 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -9234,10 +9234,9 @@ ix86_expand_prologue (void) the stack frame saving one cycle of the prologue. However, avoid doing this if we have to probe the stack; at least on x86_64 the stack probe can turn into a call that clobbers a red zone location. */ - else if ((ix86_using_red_zone () + else if (ix86_using_red_zone () && (! TARGET_STACK_PROBE || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) - || crtl->shrink_wrapped_separate) { HOST_WIDE_INT allocate_offset; if (crtl->shrink_wrapped_separate) @@ -9253,11 +9252,6 @@ ix86_expand_prologue (void) ix86_emit_save_regs_using_mov (frame.reg_save_offset); int_registers_saved = true; - - if (ix86_using_red_zone () - && (! 
TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) - cfun->machine->red_zone_used = true; } } @@ -9377,8 +9371,6 @@ ix86_expand_prologue (void) && flag_stack_clash_protection && !ix86_target_stack_probe ()) { - gcc_assert (!crtl->shrink_wrapped_separate); - ix86_adjust_stack_and_probe (allocate, int_registers_saved, false); allocate = 0; } @@ -9389,8 +9381,6 @@ ix86_expand_prologue (void) { const HOST_WIDE_INT probe_interval = get_probe_interval (); - gcc_assert (!crtl->shrink_wrapped_separate); - if (STACK_CHECK_MOVING_SP) { if (crtl->is_leaf @@ -9447,8 +9437,6 @@ ix86_expand_prologue (void) else if (!ix86_target_stack_probe () || frame.stack_pointer_offset < CHECK_STACK_LIMIT) { - gcc_assert (!crtl->shrink_wrapped_separate); - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-allocate), -1, m->fs.cfa_reg == stack_pointer_rtx); diff --git a/gcc/testsuite/gcc.target/i386/stack-clash-protection.c b/gcc/testsuite/gcc.target/i386/stack-clash-protection.c new file mode 100644 index ..5be28cb3ac7b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/stack-clash-protection.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fstack-clash-protection" } */ + +int flag; +void open(); +int getChar(); +typedef enum { QUOTE } CharType; +typedef enum { UNQ } State; +CharType getCharType(); +void expand() { + open(); + if (flag) +return; + int ch = getChar(); + State nextState = getCharType(); + if (nextState) +while (ch) + ; +}
[gcc r16-1551] x86: Enable separate shrink wrapping
https://gcc.gnu.org/g:2c30f828e4507863713cff44cd30c88aa7f27865 commit r16-1551-g2c30f828e4507863713cff44cd30c88aa7f27865 Author: Lili Cui Date: Tue Jun 17 21:39:38 2025 +0800 x86: Enable separate shrink wrapping This commit implements the target macros (TARGET_SHRINK_WRAP_*) that enable separate shrink wrapping for function prologues/epilogues in x86. When performing separate shrink wrapping, we choose to use mov instead of push/pop, because using push/pop is more complicated to handle rsp adjustment and may lose performance, so here we choose to use mov, which has a small impact on code size, but guarantees performance. Using mov means we need to use sub/add to maintain the stack frame. In some special cases, we need to use lea to prevent affecting EFlags. Inserting sub between test-je-jle would change EFlags, so lea should be used here. foo: xorl %eax, %eax testl %edi, %edi je .L11 sub $16, %rsp --> leaq -16(%rsp), %rsp movq %r13, 8(%rsp) movl $1, %r13d jle .L4 Tested against SPEC CPU 2017, this change always has a net-positive effect on the dynamic instruction count. See the following table for the breakdown on how this reduces the number of dynamic instructions per workload on a like-for-like (with/without this commit): instruction count base with commit (commit-base)/commit 502.gcc_r 98666845943 96891561634 -1.80% 526.blender_r 6.21226E+11 6.12992E+11 -1.33% 520.omnetpp_r 1.1241E+11 1.11093E+11 -1.17% 500.perlbench_r 1271558717 1263268350 -0.65% 523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58% 531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54% 500.perlbench_r 64195557393 63881512409 -0.49% 541.leela_r 2.99097E+11 2.98245E+11 -0.29% 548.exchange2_r 1.27976E+11 1.27784E+11 -0.15% 527.cam4_r 88981458425 7334679 -0.11% 554.roms_r 2.60072E+11 2.59809E+11 -0.10% Collected spec2017 performance on ZNVER5, EMR and ICELAKE. No performance regression was observed. For O2 multi-copy : 511.povray_r improved by 2.8% on ZNVER5. 
511.povray_r improved by 4% on EMR 511.povray_r improved by 3.3 % ~ 4.6% on ICELAKE. gcc/ChangeLog: * config/i386/i386-protos.h (ix86_get_separate_components): New function. (ix86_components_for_bb): Likewise. (ix86_disqualify_components): Likewise. (ix86_emit_prologue_components): Likewise. (ix86_emit_epilogue_components): Likewise. (ix86_set_handled_components): Likewise. * config/i386/i386.cc (save_regs_using_push_pop): Split from ix86_compute_frame_layout. (ix86_compute_frame_layout): Use save_regs_using_push_pop. (pro_epilogue_adjust_stack): Use gen_pro_epilogue_adjust_stack_add_nocc. (ix86_expand_prologue): Add some assertions and adjust the stack frame at the beginning of the prolog for shrink wrapping separate. (ix86_emit_save_regs_using_mov): Skip registers that are wrapped separately. (ix86_emit_restore_regs_using_mov): Likewise. (ix86_expand_epilogue): Add some assertions and set restore_regs_via_mov to true for shrink wrapping separate. (ix86_get_separate_components): New function. (ix86_components_for_bb): Likewise. (ix86_disqualify_components): Likewise. (ix86_emit_prologue_components): Likewise. (ix86_emit_epilogue_components): Likewise. (ix86_set_handled_components): Likewise. (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define. (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise. (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise. (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise. * config/i386/i386.h (struct machine_function):Add reg_is_wrapped_separately array for register wrapping information. * config/i386/i386.md (@pro_epilogue_adjust_stack_add_nocc): New. gcc/testsuite/ChangeLog: * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test. * gcc.target/i386/interrupt-16.c: Likewise. * gfortran.dg/guality/arg1.f90: Likewise. * gcc.target/i386/avx10_2-comibf-1.c: Likewise. * g++.target/i386/shrink_wrap_separa
[gcc r16-1640] Fix shrink wrap separate ICE for mingw [PR120741]
https://gcc.gnu.org/g:4b739c020a90dfe2569a292c44b2293a94d4bff5 commit r16-1640-g4b739c020a90dfe2569a292c44b2293a94d4bff5 Author: Lili Cui Date: Tue Jun 24 10:49:43 2025 +0800 Fix shrink wrap separate ICE for mingw [PR120741] gcc/ChangeLog: PR target/120741 * config/i386/i386.cc (ix86_expand_prologue): Remove 1 assertion. gcc/testsuite/ChangeLog: PR target/120741 * gcc.target/i386/pr120741.c: New test. * gcc.target/i386/shrink-wrap-separate-mingw.c: Likewise. Diff: --- gcc/config/i386/i386.cc| 2 -- gcc/testsuite/gcc.target/i386/pr120741.c | 22 ++ .../gcc.target/i386/shrink-wrap-separate-mingw.c | 22 ++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index fc3105919f45..84081ab12670 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -9443,8 +9443,6 @@ ix86_expand_prologue (void) } else { - gcc_assert (!crtl->shrink_wrapped_separate); - rtx eax = gen_rtx_REG (Pmode, AX_REG); rtx r10 = NULL; const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); diff --git a/gcc/testsuite/gcc.target/i386/pr120741.c b/gcc/testsuite/gcc.target/i386/pr120741.c new file mode 100644 index ..b59a58c48b89 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120741.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mstack-arg-probe" } */ + +short __mingw_swformat_format; +__builtin_va_list __mingw_swformat_arg; +int __mingw_swformat_fc; +typedef struct { + void *fp; + int bch[1024]; +} _IFP; +void __mingw_swformat(_IFP *s) { + if (s->fp) +while (__mingw_swformat_format) + if (__mingw_swformat_fc == 'A') + *__builtin_va_arg(__mingw_swformat_arg, double *) = 0; +} +void +__mingw_vswscanf (void) +{ + _IFP ifp; + __mingw_swformat(&ifp); +} diff --git a/gcc/testsuite/gcc.target/i386/shrink-wrap-separate-mingw.c b/gcc/testsuite/gcc.target/i386/shrink-wrap-separate-mingw.c new file mode 100644 index ..58635e49647a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/shrink-wrap-separate-mingw.c 
@@ -0,0 +1,22 @@ +/* { dg-do compile { target *-*-mingw* *-*-cygwin* } } */ +/* { dg-options "-std=gnu99 -O2" } */ + +short __mingw_swformat_format; +__builtin_va_list __mingw_swformat_arg; +int __mingw_swformat_fc; +typedef struct { + void *fp; + int bch[1024]; +} _IFP; +void __mingw_swformat(_IFP *s) { + if (s->fp) +while (__mingw_swformat_format) + if (__mingw_swformat_fc == 'A') +*__builtin_va_arg(__mingw_swformat_arg, double *) = 0; +} +void +__mingw_vswscanf (void) +{ + _IFP ifp; + __mingw_swformat(&ifp); +}
[gcc r16-1757] Relax the testcase check for Solaris [PR120818]
https://gcc.gnu.org/g:e7fb2459b00cde4fb14062076df29320efafdb98 commit r16-1757-ge7fb2459b00cde4fb14062076df29320efafdb98 Author: Lili Cui Date: Sat Jun 28 06:19:19 2025 -0700 Relax the testcase check for Solaris [PR120818] gcc/testsuite/ChangeLog: PR target/120818 * g++.target/i386/shrink_wrap_separate.C: Relax the check. Diff: --- gcc/testsuite/g++.target/i386/shrink_wrap_separate.C | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C index 294dccde5d31..b924fd02ca94 100644 --- a/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C +++ b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.C @@ -21,5 +21,4 @@ bool k() { b *n; return h(l->g, n); } -/* { dg-final { scan-rtl-dump "The components we wrap separately are \\\[sep 3 4\\\]" "pro_and_epilogue" { target { ia32 } } } } */ -/* { dg-final { scan-rtl-dump "The components we wrap separately are \\\[sep 40 41 42 43\\\]" "pro_and_epilogue" { target { ! ia32 } } } } */ +/* { dg-final { scan-rtl-dump "The components we wrap separately are" "pro_and_epilogue" } } */