On Wed, Jun 15, 2016 at 10:48 AM, Richard Sandiford
<[email protected]> wrote:
> I recently relaxed the peeling-for-gaps conditions for LD3 but
> kept them as-is for load-and-permute. I don't think the conditions
> are needed for load-and-permute either though. No current load-and-
> permute should load outside the group, so if there is no gap at the end,
> the final vector element loaded will correspond to an element loaded
> by the original scalar loop.
>
> The patch for PR68559 (a missed optimisation PR) increased the peeled
> cases from "exact_log2 (groupsize) == -1" to "vf % group_size == 0", so
> before that fix, we didn't peel for gaps if there was no gap at the end
> of the group and if the group size was a power of 2.
>
> The only current non-power-of-2 load-and-permute size is 3, which
> doesn't require loading more than 3 vectors.
>
> The testcase is based on gcc.dg/vect/pr49038.c.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Ok.
Thanks,
Richard.
> Thanks,
> Richard
>
>
> gcc/
> * tree-vect-stmts.c (vectorizable_load): Remove unnecessary
> peeling-for-gaps condition.
>
> gcc/testsuite/
> * gcc.dg/vect/group-no-gaps-1.c: New test.
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -6356,13 +6356,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator
> *gsi, gimple **vec_stmt,
> gcc_assert (GROUP_GAP (stmt_info));
> }
>
> - /* If there is a gap in the end of the group or the group size cannot
> - be made a multiple of the vector element count then we access excess
> + /* If there is a gap in the end of the group then we access excess
> elements in the last iteration and thus need to peel that off. */
> if (loop_vinfo
> && ! STMT_VINFO_STRIDED_P (stmt_info)
> - && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
> - || (!slp && !load_lanes_p && vf % group_size != 0)))
> + && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> Index: gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> ===================================================================
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/vect/group-no-gaps-1.c
> @@ -0,0 +1,108 @@
> +/* { dg-require-effective-target mmap } */
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define COUNT 320
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS1 0x1122000000
> +#define ADDRESS2 (ADDRESS1 + MMAP_SIZE * 16)
> +#define TYPE unsigned int
> +
> +#ifndef MAP_ANONYMOUS
> +#define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +#define RHS0(B) b[B]
> +#define RHS1(B) RHS0(B) + b[(B) + 1]
> +#define RHS2(B) RHS1(B) + b[(B) + 2]
> +#define RHS3(B) RHS2(B) + b[(B) + 3]
> +#define RHS4(B) RHS3(B) + b[(B) + 4]
> +#define RHS5(B) RHS4(B) + b[(B) + 5]
> +#define RHS6(B) RHS5(B) + b[(B) + 6]
> +#define RHS7(B) RHS6(B) + b[(B) + 7]
> +
> +#define LHS0(B) a[B]
> +#define LHS1(B) LHS0(B) = a[(B) + 1]
> +#define LHS2(B) LHS1(B) = a[(B) + 2]
> +#define LHS3(B) LHS2(B) = a[(B) + 3]
> +#define LHS4(B) LHS3(B) = a[(B) + 4]
> +#define LHS5(B) LHS4(B) = a[(B) + 5]
> +#define LHS6(B) LHS5(B) = a[(B) + 6]
> +#define LHS7(B) LHS6(B) = a[(B) + 7]
> +
> +#define DEF_GROUP_SIZE(MULT, GAP, NO_GAP) \
> + void __attribute__((noinline, noclone)) \
> + gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> + { \
> + for (int i = 0; i < COUNT; i++) \
> + a[i] = RHS##GAP (i * MULT); \
> + } \
> + void __attribute__((noinline, noclone)) \
> + no_gap_load_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> + { \
> + for (int i = 0; i < COUNT; i++) \
> + a[i] = RHS##NO_GAP (i * MULT); \
> + } \
> + void __attribute__((noinline, noclone)) \
> + gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> + { \
> + for (int i = 0; i < COUNT; i++) \
> + LHS##GAP (i * MULT) = b[i]; \
> + } \
> + void __attribute__((noinline, noclone)) \
> + no_gap_store_##MULT (TYPE *__restrict a, TYPE *__restrict b) \
> + { \
> + for (int i = 0; i < COUNT; i++) \
> + LHS##NO_GAP (i * MULT) = b[i]; \
> + }
> +
> +#define USE_GROUP_SIZE(MULT) \
> + gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT + 1); \
> + no_gap_load_##MULT (end_x - COUNT, end_y - COUNT * MULT); \
> + gap_store_##MULT (end_x - COUNT * MULT + 1, end_y - COUNT); \
> + no_gap_store_##MULT (end_x - COUNT * MULT, end_y - COUNT)
> +
> +DEF_GROUP_SIZE (2, 0, 1)
> +DEF_GROUP_SIZE (3, 1, 2)
> +DEF_GROUP_SIZE (4, 2, 3)
> +DEF_GROUP_SIZE (5, 3, 4)
> +DEF_GROUP_SIZE (6, 4, 5)
> +DEF_GROUP_SIZE (7, 5, 6)
> +DEF_GROUP_SIZE (8, 6, 7)
> +
> +int
> +main (void)
> +{
> + void *x, *y;
> + TYPE *end_x, *end_y;
> +
> + x = mmap ((void *) ADDRESS1, MMAP_SIZE, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> + if (x == MAP_FAILED)
> + {
> + perror ("mmap");
> + return 1;
> + }
> +
> + y = mmap ((void *) ADDRESS2, MMAP_SIZE, PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> + if (y == MAP_FAILED)
> + {
> + perror ("mmap");
> + return 1;
> + }
> +
> + end_x = (TYPE *) ((char *) x + MMAP_SIZE);
> + end_y = (TYPE *) ((char *) y + MMAP_SIZE);
> +
> + USE_GROUP_SIZE (2);
> + USE_GROUP_SIZE (3);
> + USE_GROUP_SIZE (4);
> + USE_GROUP_SIZE (5);
> + USE_GROUP_SIZE (6);
> + USE_GROUP_SIZE (7);
> + USE_GROUP_SIZE (8);
> +
> + return 0;
> +}