https://gcc.gnu.org/g:6669dc51515313dd1e60c493596dbc90429fc362
commit r15-1239-g6669dc51515313dd1e60c493596dbc90429fc362
Author: Richard Biener <rguent...@suse.de>
Date:   Fri Jun 7 14:47:12 2024 +0200

    tree-optimization/115385 - handle more gaps with peeling of a single iteration

    The following makes peeling of a single scalar iteration handle more
    gaps, including non-power-of-two cases.  This can be done by rounding
    up the remaining access to the next power of two, which ensures that
    the next scalar iteration covers at least the number of excess
    elements we access.

    I've added a correctness testcase and an x86-specific testcase
    scanning for the optimization.

            PR tree-optimization/115385
            * tree-vect-stmts.cc (get_group_load_store_type): Peeling
            of a single scalar iteration is sufficient if we can narrow
            the access to the next power of two of the bits in the last
            access.
            (vectorizable_load): Ensure that the last access is narrowed.

            * gcc.dg/vect/pr115385.c: New testcase.
            * gcc.target/i386/vect-pr115385.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115385.c          | 88 +++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 ++++++++++++++++
 gcc/tree-vect-stmts.cc                        | 44 ++++++++++++--
 3 files changed, 180 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c
new file mode 100644
index 000000000000..a18cd665d7d0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115385.c
@@ -0,0 +1,88 @@
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+TYPE x[COUNT * 16];
+
+int
+main (void)
+{
+  void *y;
+  TYPE *end_y;
+
+  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (y == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+  foo (x, end_y - COUNT * 3, COUNT);
+  bar (x, end_y - COUNT * 5, COUNT);
+
+  return 0;
+}
+
+/* We always require a scalar epilogue here but we don't know which
+   targets support vector composition this way.  */
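To put numbers to the rounding rule, here is a small standalone
illustration (not part of the patch) applied to the two load groups in
the testcase above.  It assumes the last vector access still needs
"remain" elements and that remain equals the load group size; that is a
simplification, since the patch derives the value from the vectorization
factor and the vector lane count:

/* Illustration only, not GCC source: apply the patch's rounding of the
   remaining elements (1 << ceil_log2 (cremain) in the tree-vect-stmts.cc
   changes below) and print how far past the group each load width
   reads.  */
#include <stdio.h>

static unsigned
next_pow2 (unsigned n)
{
  unsigned p = 1;
  while (p < n)
    p <<= 1;
  return p;
}

int
main (void)
{
  const unsigned nunits = 16;          /* lanes of a 16-byte char vector */
  const unsigned groups[] = { 3, 5 };  /* load group sizes of foo and bar */

  for (unsigned i = 0; i < 2; ++i)
    {
      unsigned remain = groups[i];     /* simplifying assumption, see above */
      unsigned part = next_pow2 (remain);
      printf ("group %u: full %u-lane load overreads %u elements, "
              "narrowed %u-lane load overreads %u\n",
              groups[i], nunits, nunits - remain, part, part - remain);
    }
  return 0;
}

For foo this prints an overread of 13 elements for a full-width load but
only 1 for the narrowed 4-lane load; roughly speaking, the peeled scalar
iteration reads 3 further elements of y, so the narrowed access stays
within the mapped page, which is what the mmap-based test checks.
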
diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
new file mode 100644
index 000000000000..a6be9ce4e543
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b151f53c2fbd..05fe523da993 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2151,11 +2151,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                                             nunits, &tem, &remain)
                       || maybe_lt (remain + group_size, nunits)))
             {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "peeling for gaps insufficient for "
-                                 "access\n");
-              return false;
+              /* But peeling a single scalar iteration is enough if
+                 we can use the next power-of-two sized partial
+                 access.  */
+              unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
+              if (!nunits.is_constant (&cnunits)
+                  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
+                  || ((cremain = remain.to_constant (), true)
+                      && ((cpart_size = (1 << ceil_log2 (cremain))) != cnunits)
+                      && vector_vector_composition_type
+                           (vectype, cnunits / cpart_size,
+                            &half_vtype) == NULL_TREE))
+                {
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "peeling for gaps insufficient for "
+                                     "access\n");
+                  return false;
+                }
             }
 
           /* If this is single-element interleaving with an element
@@ -11584,6 +11597,27 @@ vectorizable_load (vec_info *vinfo,
                   gcc_assert (new_vtype
                               || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+                  /* But still reduce the access size to the next
+                     required power-of-two so peeling a single
+                     scalar iteration is sufficient.  */
+                  unsigned HOST_WIDE_INT cremain;
+                  if (remain.is_constant (&cremain))
+                    {
+                      unsigned HOST_WIDE_INT cpart_size
+                        = 1 << ceil_log2 (cremain);
+                      if (known_gt (nunits, cpart_size)
+                          && constant_multiple_p (nunits, cpart_size,
+                                                  &num))
+                        {
+                          tree ptype;
+                          new_vtype
+                            = vector_vector_composition_type (vectype,
+                                                              num,
+                                                              &ptype);
+                          if (new_vtype)
+                            ltype = ptype;
+                        }
+                    }
                 }
             }
 
           tree offset
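
The two tree-vect-stmts.cc hunks implement the same idea at analysis
time (get_group_load_store_type) and at code-generation time
(vectorizable_load).  The following standalone sketch restates the
combined logic under simplifying assumptions: sizes are plain integers
rather than poly_ints, and vector_vector_composition_type is reduced to
a divisibility test, whereas the real code also asks the target whether
it can build a vector out of the pieces.  The helper and struct names
are made up for the sketch:

/* Hypothetical sketch, not GCC source.  nunits is the lane count of
   the vector type, remain the number of elements the last vector
   access still needs.  */
#include <stdbool.h>

struct narrowed_access
{
  unsigned part_size;  /* lanes per narrowed load */
  unsigned num;        /* narrowed pieces composed into one vector */
};

/* Mirrors 1 << ceil_log2 (n) in the patch.  */
static unsigned
next_pow2 (unsigned n)
{
  unsigned p = 1;
  while (p < n)
    p <<= 1;
  return p;
}

/* Analysis side: peeling one scalar iteration is sufficient when the
   last access can be narrowed to power-of-two sized parts that the
   target can recompose into a full vector.  */
static bool
single_iteration_peeling_ok (unsigned nunits, unsigned remain)
{
  unsigned part_size = next_pow2 (remain);
  if (part_size == nunits)
    return true;            /* full-width access, nothing to narrow */
  /* Stand-in for the vector_vector_composition_type query.  */
  return nunits % part_size == 0;
}

/* Code-generation side: narrow the last access and record how many
   pieces compose one full vector.  */
static struct narrowed_access
narrow_last_access (unsigned nunits, unsigned remain)
{
  struct narrowed_access a = { nunits, 1 };  /* default: full width */
  unsigned part_size = next_pow2 (remain);
  if (part_size < nunits && nunits % part_size == 0)
    {
      a.part_size = part_size;
      a.num = nunits / part_size;
    }
  return a;
}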
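
Worked through for foo's 3-element group with 16-lane byte vectors, the
sketch yields part_size 4 and num 4: the last full-width load is
replaced by 4-lane pieces whose single-element overread is covered by
the peeled scalar iteration.  On x86 with SSE4.1 this composition is
presumably what the i386 testcase pins down when it scans both for the
"Data access with gaps requires scalar epilogue loop" message and for
both loops still being vectorized.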