Hi! This issue is practically the same as PR63341, except in this case it is for dr_explicit_realign rather than dr_explicit_realign_optimized, and the bump isn't passed through multiple functions and thus is easier to fix.
Without the patch we use (dataptr & -16) for the first load and ((dataptr + 12) & -16) for the second load, which works just fine if the elements are properly aligned (4 byte at least), but in this case we have underaligned accesses (coming from folding of memcpy in this testcase, and from 4 byte loads combined together recognized by bswap pass in the original source), and so we really want to use ((dataptr + 15) & -16), otherwise if we are unlucky we might read the same memory twice even when dataptr is not 16 byte aligned. Bootstrapped/regtested on {x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk? 2015-03-14 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/65369 * tree-vect-stmts.c (vectorizable_load) <case dr_explicit_realign>: Set bump to vs * TYPE_SIZE_UNIT (elem_type) - 1 instead of (vs - 1) * TYPE_SIZE_UNIT (elem_type). * gcc.c-torture/execute/pr65369.c: New test. --- gcc/tree-vect-stmts.c.jj 2015-03-09 08:05:13.000000000 +0100 +++ gcc/tree-vect-stmts.c 2015-03-13 17:27:30.613529768 +0100 @@ -6468,9 +6468,8 @@ vectorizable_load (gimple stmt, gimple_s case dr_explicit_realign: { tree ptr, bump; - tree vs_minus_1; - vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); if (compute_in_loop) msq = vect_setup_realignment (first_stmt, gsi, @@ -6499,8 +6498,9 @@ vectorizable_load (gimple stmt, gimple_s vect_finish_stmt_generation (stmt, new_stmt, gsi); msq = new_temp; - bump = size_binop (MULT_EXPR, vs_minus_1, + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); + bump = size_binop (MINUS_EXPR, bump, size_one_node); ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump); new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR, ptr, --- gcc/testsuite/gcc.c-torture/execute/pr65369.c.jj 2015-03-13 17:37:10.926175685 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65369.c 2015-03-13 17:35:40.000000000 +0100 @@ -0,0 +1,45 @@ +/* PR tree-optimization/65369 */ + +static const char data[] = + "12345678901234567890123456789012345678901234567890" + "123456789012345678901234567890"; + +__attribute__ ((noinline)) +static void foo (const unsigned int *buf) +{ + if (__builtin_memcmp (buf, data, 64)) + __builtin_abort (); +} + +__attribute__ ((noinline)) +static void bar (const unsigned char *block) +{ + unsigned int buf[16]; + __builtin_memcpy (buf + 0, block + 0, 4); + __builtin_memcpy (buf + 1, block + 4, 4); + __builtin_memcpy (buf + 2, block + 8, 4); + __builtin_memcpy (buf + 3, block + 12, 4); + __builtin_memcpy (buf + 4, block + 16, 4); + __builtin_memcpy (buf + 5, block + 20, 4); + __builtin_memcpy (buf + 6, block + 24, 4); + __builtin_memcpy (buf + 7, block + 28, 4); + __builtin_memcpy (buf + 8, block + 32, 4); + __builtin_memcpy (buf + 9, block + 36, 4); + __builtin_memcpy (buf + 10, block + 40, 4); + __builtin_memcpy (buf + 11, block + 44, 4); + __builtin_memcpy (buf + 12, block + 48, 4); + __builtin_memcpy (buf + 13, block + 52, 4); + __builtin_memcpy (buf + 14, block + 56, 4); + __builtin_memcpy (buf + 15, block + 60, 4); + foo (buf); +} + +int +main () +{ + unsigned char input[sizeof data + 16] __attribute__((aligned (16))); + __builtin_memset (input, 0, sizeof input); + __builtin_memcpy (input + 1, data, sizeof data); + bar (input + 1); + return 0; +} Jakub