https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112331
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
I'm not sure what the problem is with a zero DR step for an inner loop
reference (possibly dependence analysis runs into some unhandled cases - who
knows). The following patch vectorizes the inner loop (the load is hoisted as
invariant, but the store is not sunk - there's no sinking phase after
interchange).
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..7d1f0697fe7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2944,6 +2944,7 @@ vect_analyze_data_ref_access (vec_info *vinfo,
dr_vec_info *dr_info)
DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
if (!nested_in_vect_loop_p (loop, stmt_info))
return DR_IS_READ (dr);
+#if 0
/* Allow references with zero step for outer loops marked
with pragma omp simd only - it guarantees absence of
loop-carried dependencies between inner loop iterations. */
@@ -2954,6 +2955,7 @@ vect_analyze_data_ref_access (vec_info *vinfo,
dr_vec_info *dr_info)
"zero step in inner loop of nest\n");
return false;
}
+#endif
}
if (loop && nested_in_vect_loop_p (loop, stmt_info))
Note that when we don't vectorize, the inner loop is elided later; when we
vectorize, it is not.
unvectorized:
s111:
.LFB0:
.cfi_startproc
xorl %eax, %eax
.L2:
movl Y(%rax), %ecx
addq $4, %rax
leal 1(%rcx), %edx
movl %edx, X-4(%rax)
cmpq $128000, %rax
jne .L2
xorl %eax, %eax
ret
vectorized:
s111:
.LFB0:
.cfi_startproc
movdqa .LC0(%rip), %xmm1
xorl %ecx, %ecx
.L2:
movdqa Y(%rcx), %xmm0
leaq X(%rcx), %rdx
movl $400000, %eax
paddd %xmm1, %xmm0
.p2align 4,,10
.p2align 3
.L3:
movaps %xmm0, (%rdx)
subl $2, %eax
jne .L3
addq $16, %rcx
cmpq $128000, %rcx
jne .L2
xorl %eax, %eax
ret