Hi! This patch fixes masked gather load vectorization when a narrowing (or widening) gather has to be used, i.e. one where either the index is 32-bit and the loaded type is 64-bit, or the index is 64-bit and the loaded type is 32-bit.
Bootstrapped/regtested on x86_64-linux and i686-linux, Kyrill has also kindly tested the testcases on Haswell CPU. Ok for trunk? 2013-12-30 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/59591 * tree-vect-stmts.c (vectorizable_mask_load_store): Fix up handling of modifier = NARROW masked gathers. (permute_vec_elements): Use gimple_get_lhs instead of gimple_assign_lhs. * gcc.dg/vect/pr59591-1.c: New test. * gcc.dg/vect/pr59591-2.c: New test. * gcc.target/i386/pr59591-1.c: New test. * gcc.target/i386/pr59591-2.c: New test. --- gcc/tree-vect-stmts.c.jj 2013-12-27 19:24:33.000000000 +0100 +++ gcc/tree-vect-stmts.c 2013-12-30 13:10:24.366030631 +0100 @@ -1855,14 +1855,24 @@ vectorizable_mask_load_store (gimple stm tree vec_oprnd0 = NULL_TREE, op; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl)); tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; - tree ptr, vec_mask = NULL_TREE, mask_op, var, scale; + tree ptr, vec_mask = NULL_TREE, mask_op = NULL_TREE, var, scale; tree perm_mask = NULL_TREE, prev_res = NULL_TREE; + tree mask_perm_mask = NULL_TREE; edge pe = loop_preheader_edge (loop); gimple_seq seq; basic_block new_bb; enum { NARROW, NONE, WIDEN } modifier; int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype); + rettype = TREE_TYPE (TREE_TYPE (gather_decl)); + srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); + scaletype = TREE_VALUE (arglist); + gcc_checking_assert (types_compatible_p (srctype, rettype) + && types_compatible_p (srctype, masktype)); + if (nunits == gather_off_nunits) modifier = NONE; else if (nunits == gather_off_nunits / 2) @@ -1888,19 +1898,14 @@ vectorizable_mask_load_store (gimple stm perm_mask = vect_gen_perm_mask (vectype, sel); gcc_assert (perm_mask != NULL_TREE); ncopies *= 2; + for (i = 0; i < 
nunits; ++i) + sel[i] = i | gather_off_nunits; + mask_perm_mask = vect_gen_perm_mask (masktype, sel); + gcc_assert (mask_perm_mask != NULL_TREE); } else gcc_unreachable (); - rettype = TREE_TYPE (TREE_TYPE (gather_decl)); - srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); - ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); - idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); - masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist); - scaletype = TREE_VALUE (arglist); - gcc_checking_assert (types_compatible_p (srctype, rettype) - && types_compatible_p (srctype, masktype)); - vec_dest = vect_create_destination_var (gimple_call_lhs (stmt), vectype); ptr = fold_convert (ptrtype, gather_base); @@ -1940,28 +1945,35 @@ vectorizable_mask_load_store (gimple stm op = var; } - if (j == 0) - vec_mask = vect_get_vec_def_for_operand (mask, stmt, NULL); + if (mask_perm_mask && (j & 1)) + mask_op = permute_vec_elements (mask_op, mask_op, + mask_perm_mask, stmt, gsi); else { - vect_is_simple_use (vec_mask, NULL, loop_vinfo, NULL, &def_stmt, - &def, &dt); - vec_mask = vect_get_vec_def_for_stmt_copy (dt, vec_mask); - } + if (j == 0) + vec_mask = vect_get_vec_def_for_operand (mask, stmt, NULL); + else + { + vect_is_simple_use (vec_mask, NULL, loop_vinfo, NULL, + &def_stmt, &def, &dt); + vec_mask = vect_get_vec_def_for_stmt_copy (dt, vec_mask); + } - mask_op = vec_mask; - if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask))) - { - gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)) - == TYPE_VECTOR_SUBPARTS (masktype)); - var = vect_get_new_vect_var (masktype, vect_simple_var, NULL); - var = make_ssa_name (var, NULL); - mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op); - new_stmt - = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, - mask_op, NULL_TREE); - vect_finish_stmt_generation (stmt, new_stmt, gsi); - mask_op = var; + mask_op = vec_mask; + if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask))) 
+ { + gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op)) + == TYPE_VECTOR_SUBPARTS (masktype)); + var = vect_get_new_vect_var (masktype, vect_simple_var, + NULL); + var = make_ssa_name (var, NULL); + mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op); + new_stmt + = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, + mask_op, NULL_TREE); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + mask_op = var; + } } new_stmt @@ -5446,7 +5458,7 @@ permute_vec_elements (tree x, tree y, tr tree perm_dest, data_ref; gimple perm_stmt; - perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); + perm_dest = vect_create_destination_var (gimple_get_lhs (stmt), vectype); data_ref = make_ssa_name (perm_dest, NULL); /* Generate the permute statement. */ --- gcc/testsuite/gcc.dg/vect/pr59591-1.c.jj 2013-12-30 12:59:27.012435290 +0100 +++ gcc/testsuite/gcc.dg/vect/pr59591-1.c 2013-12-30 13:08:40.386562796 +0100 @@ -0,0 +1,55 @@ +/* PR tree-optimization/59591 */ +/* { dg-do run } */ +/* { dg-additional-options "-fopenmp-simd" } */ + +#ifndef CHECK_H +#include "tree-vect.h" +#endif + +extern void abort (void); + +int p[256], q[256], r[256], t[256]; + +__attribute__((noinline, noclone)) void +foo (void) +{ + int i; + #pragma omp simd safelen(64) + for (i = 0; i < 256; i++) + if (r[i] > 32) + t[i] = p[q[i] * 3L + 2L]; +} + +__attribute__((noinline, noclone)) void +bar (void) +{ + int i; + for (i = 0; i < 256; i++) + { + r[i] = ((i >> 2) & (1 << (i & 3))) ? 32 + i : 32 - i; + q[i] = r[i] > 32 ? 
((i * 7) % 84) : 99 + i; + p[i] = i * 11; + t[i] = i * 13; + } + foo (); + for (i = 0; i < 256; i++) + if ((i >> 2) & (1 << (i & 3))) + { + if (t[i] != (((i * 7) % 84) * 3 + 2) * 11) + abort (); + } + else if (t[i] != i * 13) + abort (); +} + +#ifndef CHECK_H +int +main () +{ + check_vect (); + bar (); + return 0; +} +#endif + +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.dg/vect/pr59591-2.c.jj 2013-12-30 12:59:35.249391492 +0100 +++ gcc/testsuite/gcc.dg/vect/pr59591-2.c 2013-12-30 13:08:58.791467078 +0100 @@ -0,0 +1,56 @@ +/* PR tree-optimization/59591 */ +/* { dg-do run } */ +/* { dg-additional-options "-fopenmp-simd" } */ + +#ifndef CHECK_H +#include "tree-vect.h" +#endif + +extern void abort (void); + +long long int p[256], r[256], t[256]; +int q[256]; + +__attribute__((noinline, noclone)) void +foo (void) +{ + int i; + #pragma omp simd safelen(64) + for (i = 0; i < 256; i++) + if (r[i] > 32LL) + t[i] = p[q[i]]; +} + +__attribute__((noinline, noclone)) void +bar (void) +{ + int i; + for (i = 0; i < 256; i++) + { + r[i] = ((i >> 2) & (1 << (i & 3))) ? 32 + i : 32 - i; + q[i] = r[i] > 32 ? 
((i * 7) % 256) : 258 + i; + p[i] = i * 11; + t[i] = i * 13; + } + foo (); + for (i = 0; i < 256; i++) + if ((i >> 2) & (1 << (i & 3))) + { + if (t[i] != ((i * 7) % 256) * 11) + abort (); + } + else if (t[i] != i * 13) + abort (); +} + +#ifndef CHECK_H +int +main () +{ + check_vect (); + bar (); + return 0; +} +#endif + +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/pr59591-1.c.jj 2013-12-30 13:05:51.243440518 +0100 +++ gcc/testsuite/gcc.target/i386/pr59591-1.c 2013-12-30 13:09:16.100377227 +0100 @@ -0,0 +1,17 @@ +/* PR tree-optimization/59591 */ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fno-vect-cost-model" } */ +/* { dg-require-effective-target avx2 } */ + +#define CHECK_H "avx2-check.h" +#define TEST avx2_test + +#include "../../gcc.dg/vect/pr59591-1.c" + +#include CHECK_H + +static void +TEST (void) +{ + bar (); +} --- gcc/testsuite/gcc.target/i386/pr59591-2.c.jj 2013-12-30 13:06:13.000328094 +0100 +++ gcc/testsuite/gcc.target/i386/pr59591-2.c 2013-12-30 13:09:24.093319805 +0100 @@ -0,0 +1,17 @@ +/* PR tree-optimization/59591 */ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fno-vect-cost-model" } */ +/* { dg-require-effective-target avx2 } */ + +#define CHECK_H "avx2-check.h" +#define TEST avx2_test + +#include "../../gcc.dg/vect/pr59591-2.c" + +#include CHECK_H + +static void +TEST (void) +{ + bar (); +} Jakub