The following patch implements vectorization of bswap via a VEC_PERM_EXPR on the corresponding QImode (byte-element) vector.
ARM already has backend handling via the builtin_vectorized_call hook and thus there were already testcases available. It doesn't end up working for vect-bswap16.c because we have a promoted argument to __builtin_bswap16 which confuses vectorization. Eventually the testcase should also succeed on vect_perm_byte targets but I have no way to verify that. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied. Richard. 2016-11-09 Richard Biener <rguent...@suse.de> PR tree-optimization/78007 * tree-vect-stmts.c (vectorizable_bswap): New function. (vectorizable_call): Call vectorizable_bswap for BUILT_IN_BSWAP{16,32,64} if arguments are not promoted. * gcc.dg/vect/vect-bswap32.c: Adjust. * gcc.dg/vect/vect-bswap64.c: Likewise. Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 241959) +++ gcc/tree-vect-stmts.c (working copy) @@ -2432,6 +2432,116 @@ vectorizable_mask_load_store (gimple *st return true; } +/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */ + +static bool +vectorizable_bswap (gimple *stmt, gimple_stmt_iterator *gsi, + gimple **vec_stmt, slp_tree slp_node, + tree vectype_in, enum vect_def_type *dt) +{ + tree op, vectype; + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + unsigned ncopies, nunits; + + op = gimple_call_arg (stmt, 0); + vectype = STMT_VINFO_VECTYPE (stmt_info); + nunits = TYPE_VECTOR_SUBPARTS (vectype); + + /* Multiple types in SLP are handled by creating the appropriate number of + vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in + case of SLP. */ + if (slp_node) + ncopies = 1; + else + ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + + gcc_assert (ncopies >= 1); + + tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in); + if (! 
char_vectype) + return false; + + unsigned char *elts + = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (char_vectype)); + unsigned char *elt = elts; + unsigned word_bytes = TYPE_VECTOR_SUBPARTS (char_vectype) / nunits; + for (unsigned i = 0; i < nunits; ++i) + for (unsigned j = 0; j < word_bytes; ++j) + *elt++ = (i + 1) * word_bytes - j - 1; + + if (! can_vec_perm_p (TYPE_MODE (char_vectype), false, elts)) + return false; + + if (! vec_stmt) + { + STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "=== vectorizable_bswap ===" + "\n"); + if (! PURE_SLP_STMT (stmt_info)) + { + add_stmt_cost (stmt_info->vinfo->target_cost_data, + 1, vector_stmt, stmt_info, 0, vect_prologue); + add_stmt_cost (stmt_info->vinfo->target_cost_data, + ncopies, vec_perm, stmt_info, 0, vect_body); + } + return true; + } + + tree *telts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (char_vectype)); + for (unsigned i = 0; i < TYPE_VECTOR_SUBPARTS (char_vectype); ++i) + telts[i] = build_int_cst (char_type_node, elts[i]); + tree bswap_vconst = build_vector (char_vectype, telts); + + /* Transform. */ + vec<tree> vec_oprnds = vNULL; + gimple *new_stmt = NULL; + stmt_vec_info prev_stmt_info = NULL; + for (unsigned j = 0; j < ncopies; j++) + { + /* Handle uses. */ + if (j == 0) + vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node, -1); + else + vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds, NULL); + + /* Arguments are ready. create the new vector stmt. 
*/ + unsigned i; + tree vop; + FOR_EACH_VEC_ELT (vec_oprnds, i, vop) + { + tree tem = make_ssa_name (char_vectype); + new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR, + char_vectype, vop)); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + tree tem2 = make_ssa_name (char_vectype); + new_stmt = gimple_build_assign (tem2, VEC_PERM_EXPR, + tem, tem, bswap_vconst); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + tem = make_ssa_name (vectype); + new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR, + vectype, tem2)); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + if (slp_node) + continue; + + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + + prev_stmt_info = vinfo_for_stmt (new_stmt); + } + + vec_oprnds.release (); + return true; +} + /* Return true if vector types VECTYPE_IN and VECTYPE_OUT have integer elements and if we can narrow VECTYPE_IN to VECTYPE_OUT in a single step. On success, store the binary pack code in @@ -2658,6 +2768,12 @@ vectorizable_call (gimple *gs, gimple_st { 0, 1, 2, ... vf - 1 } vector. 
*/ gcc_assert (nargs == 0); } + else if (modifier == NONE + && (gimple_call_builtin_p (stmt, BUILT_IN_BSWAP16) + || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP32) + || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP64))) + return vectorizable_bswap (stmt, gsi, vec_stmt, slp_node, + vectype_in, dt); else { if (dump_enabled_p ()) Index: gcc/testsuite/gcc.dg/vect/vect-bswap32.c =================================================================== --- gcc/testsuite/gcc.dg/vect/vect-bswap32.c (revision 241959) +++ gcc/testsuite/gcc.dg/vect/vect-bswap32.c (working copy) @@ -1,4 +1,4 @@ -/* { dg-require-effective-target vect_bswap } */ +/* { dg-additional-options "-msse4" { target sse4_runtime } } */ #include "tree-vect.h" @@ -42,4 +42,4 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */ Index: gcc/testsuite/gcc.dg/vect/vect-bswap64.c =================================================================== --- gcc/testsuite/gcc.dg/vect/vect-bswap64.c (revision 241959) +++ gcc/testsuite/gcc.dg/vect/vect-bswap64.c (working copy) @@ -1,4 +1,4 @@ -/* { dg-require-effective-target vect_bswap } */ +/* { dg-additional-options "-msse4" { target sse4_runtime } } */ #include "tree-vect.h" @@ -42,4 +42,4 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */