Hi! In i386.md, we have nearbyint<mode>2 and rint<mode>2 patterns that expand the SF/DF/XF mode operations to rounding instructions. Before SSE4.1 this is done via XFmode, which makes it unsuitable for vectorization, but for SSE4.1 and later we can simply use the packed {,v}{round,rndscale}p{s,d} instructions for vector modes, in the same way that we emit {,v}rounds{s,d} for SF/DF mode.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2019-12-28 Jakub Jelinek <ja...@redhat.com> PR target/93078 * config/i386/sse.md (nearbyint<mode>2, rint<mode>2): New expanders with VF iterator. * gcc.target/i386/sse4_1-pr93078.c: New test. * gcc.target/i386/avx-pr93078.c: New test. * gcc.target/i386/avx512f-pr93078.c: New test. --- gcc/config/i386/sse.md.jj 2019-12-21 00:12:54.000000000 +0100 +++ gcc/config/i386/sse.md 2019-12-27 18:16:48.146431083 +0100 @@ -17977,6 +17977,24 @@ (define_insn "ptesttf2" (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) +(define_expand "nearbyint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);") + +(define_expand "rint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR);") + (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 --- gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c.jj 2019-12-27 18:26:05.436970472 +0100 +++ gcc/testsuite/gcc.target/i386/sse4_1-pr93078.c 2019-12-27 18:32:29.107147604 +0100 @@ -0,0 +1,42 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msse4.1 -mno-sse4.2 -masm=att" } */ +/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$12," } } */ +/* { dg-final { scan-assembler "roundps\[ \t]\+\\\$4," } } */ +/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$12," } } */ +/* { dg-final { scan-assembler "roundpd\[ \t]\+\\\$4," } } */ + +float a[16], b[16]; +double c[8], d[8]; + +void +foo (void) +{ + int i; + for (i = 0; i < 16; ++i) + b[i] = __builtin_nearbyintf (a[i]); +} + +void +bar (void) +{ + int i; + for (i = 0; i < 16; ++i) + b[i] = 
__builtin_rintf (a[i]); +} + +void +baz (void) +{ + int i; + for (i = 0; i < 8; ++i) + d[i] = __builtin_nearbyint (c[i]); +} + +void +qux (void) +{ + int i; + for (i = 0; i < 8; ++i) + d[i] = __builtin_rint (c[i]); +} --- gcc/testsuite/gcc.target/i386/avx-pr93078.c.jj 2019-12-27 18:32:47.567867421 +0100 +++ gcc/testsuite/gcc.target/i386/avx-pr93078.c 2019-12-27 18:34:41.527137818 +0100 @@ -0,0 +1,9 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx -mno-avx2 -mprefer-vector-width=256 -masm=att" } */ +/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$12,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundps\[ \t]\+\\\$4,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$12,\[^\n\r]*%y" } } */ +/* { dg-final { scan-assembler "vroundpd\[ \t]\+\\\$4,\[^\n\r]*%y" } } */ + +#include "sse4_1-pr93078.c" --- gcc/testsuite/gcc.target/i386/avx512f-pr93078.c.jj 2019-12-27 18:34:56.632908546 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr93078.c 2019-12-27 18:35:38.650270831 +0100 @@ -0,0 +1,9 @@ +/* PR target/93078 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mavx512f -mprefer-vector-width=512 -masm=att" } */ +/* { dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$12,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscaleps\[ \t]\+\\\$4,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$12,\[^\n\r]*%z" } } */ +/* { dg-final { scan-assembler "vrndscalepd\[ \t]\+\\\$4,\[^\n\r]*%z" } } */ + +#include "sse4_1-pr93078.c" Jakub