https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121949
--- Comment #1 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #0)
> While the optimal solution may be to just extend row to a 64-bit IV, it's
> unclear why we didn't support unpacking in this case.
Indeed, just removing the check
incompatible_op1_vectype_p
= (op1_vectype == NULL_TREE
|| maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype),
TYPE_VECTOR_SUBPARTS (vectype))
|| TYPE_MODE (op1_vectype) != TYPE_MODE (vectype));
if (incompatible_op1_vectype_p
&& (SLP_TREE_DEF_TYPE (slp_op1) != vect_constant_def
|| slp_op1->refcnt != 1))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"unusable type for last operand in"
" vector/vector shift/rotate.\n");
return false;
}
gives the unpacked version, as expected:
.L3:
movprfx z0, z29
uxtw z0.d, p6/m, z29.d
movprfx z27, z29
sxtw z27.d, p6/m, z29.d
lsrr z0.d, p6/m, z0.d, z30.d
and z0.d, z0.d, #0x1
cmpne p7.d, p7/z, z0.d, #0
ld1d z28.d, p7/z, [x2, x0, lsl 3]
add z28.d, z27.d, z28.d
st1d z28.d, p7, [x2, x0, lsl 3]
add z29.s, z29.s, z31.s
add x0, x0, x3
whilelo p7.d, w0, w1
b.any .L3
So it's unclear to me why that check is there. I will check the git history.