https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115843
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
The loops in question are:
for (i = 0; i < 64; i++) {
KnightMoves[i] = 0;
if (Rank(i) > 0) {
if (Rank(i) > 1) {
if (File(i) > 0) KnightMoves[i] |= Mask[i-17];
if (File(i) < 7) KnightMoves[i] |= Mask[i-15];
}
if (File(i) > 1) KnightMoves[i] |= Mask[i-10];
if (File(i) < 6) KnightMoves[i] |= Mask[i-6];
}
if (Rank(i) < 7) {
if (Rank(i) < 6) {
if (File(i) > 0) KnightMoves[i] |= Mask[i+15];
if (File(i) < 7) KnightMoves[i] |= Mask[i+17];
}
if (File(i) > 1) KnightMoves[i] |= Mask[i+6];
if (File(i) < 6) KnightMoves[i] |= Mask[i+10];
}
}
for (i = 0; i < 64; i++) {
if (File(i) == FileA) {
KingPressureMask[i] = KingSafetyMask[i + 1];
} else if (File(i) == FileH) {
KingPressureMask[i] = KingSafetyMask[i - 1];
} else {
KingPressureMask[i] = KingSafetyMask[i];
}
}
for (i = 0; i < 64; i++) {
if (File(i) == FileA) {
KingPressureMask1[i] = KingSafetyMask1[i + 1];
} else if (File(i) == FileH) {
KingPressureMask1[i] = KingSafetyMask1[i - 1];
} else {
KingPressureMask1[i] = KingSafetyMask1[i];
}
}
The last loop, in GIMPLE before vectorization, is:
<bb 302> [local count: 145013]:
<bb 183> [local count: 9271420]:
# i_38 = PHI <_1526(215), 0(302)>
# ivtmp_1427 = PHI <ivtmp_1430(215), 64(302)>
_296 = i_38 & 7;
_1526 = i_38 + 1;
_380 = _296 == 0;
_1371 = &KingSafetyMask1[_1526];
_298 = .MASK_LOAD (_1371, 64B, _380);
_804 = _296 == 7;
_1370 = (unsigned int) i_38;
_1369 = _1370 + 4294967295;
_299 = (int) _1369;
_1368 = &KingSafetyMask1[_299];
_300 = .MASK_LOAD (_1368, 64B, _804);
_301 = KingSafetyMask1[i_38];
_ifc__1431 = _804 ? _300 : _301;
_336 = _380 ? _298 : _ifc__1431;
KingPressureMask1[i_38] = _336;
ivtmp_1430 = ivtmp_1427 - 1;
if (ivtmp_1430 != 0)
goto <bb 215>; [98.44%]
else
goto <bb 189>; [1.56%]
<bb 215> [local count: 9126407]:
goto <bb 183>; [100.00%]
It is vectorized as:
<bb 183> [local count: 579464]:
# vect_vec_iv_.194_1737 = PHI <_1915(215), { -15, -14, -13, -12, -11, -10,
-9, -8, -7, -6, -5, -4, -3, -2, -1, 0 }(198)>
# vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215),
&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -112B](198)>
# vectp_KingSafetyMask1.204_1878 = PHI <vectp_KingSafetyMask1.204_1879(215),
&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -128B](198)>
# vectp_KingSafetyMask1.208_2015 = PHI <vectp_KingSafetyMask1.208_2017(215),
&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -120B](198)>
# vectp_KingPressureMask1.216_2023 = PHI
<vectp_KingPressureMask1.216_2025(215), &MEM <BITBOARD[64]> [(void
*)&KingPressureMask1 + -120B](198)>
# ivtmp_2028 = PHI <ivtmp_2030(215), 79(198)>
# loop_mask_1995 = PHI <_1989(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)>
# loop_mask_1860 = PHI <_1990(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)>
_1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16 };
vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7 };
mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0 };
mask_patt_1854.197_1855 = [vec_unpack_lo_expr] mask__380.196_1920;
mask_patt_1854.197_1733 = [vec_unpack_hi_expr] mask__380.196_1920;
vec_mask_and_1997 = mask_patt_1854.197_1855 & loop_mask_1860;
vect_patt_1732.200_1998 = .MASK_LOAD (vectp_KingSafetyMask1.198_1768, 128B,
vec_mask_and_1997);
vectp_KingSafetyMask1.198_1865 = vectp_KingSafetyMask1.198_1768 + 64;
vec_mask_and_2002 = mask_patt_1854.197_1733 & loop_mask_1995;
vect_patt_1732.201_2003 = .MASK_LOAD (vectp_KingSafetyMask1.198_1865, 128B,
vec_mask_and_2002);
mask__804.202_1876 = vect__296.195_1901 == { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7 };
mask_patt_1734.203_2005 = [vec_unpack_lo_expr] mask__804.202_1876;
mask_patt_1734.203_2007 = [vec_unpack_hi_expr] mask__804.202_1876;
vec_mask_and_2010 = mask_patt_1734.203_2005 & loop_mask_1860;
vect_patt_1772.206_2012 = .MASK_LOAD (vectp_KingSafetyMask1.204_1878, 512B,
vec_mask_and_2010);
vectp_KingSafetyMask1.204_2013 = vectp_KingSafetyMask1.204_1878 + 64;
vec_mask_and_1980 = mask_patt_1734.203_2007 & loop_mask_1995;
vect_patt_1772.207_1981 = .MASK_LOAD (vectp_KingSafetyMask1.204_2013, 512B,
vec_mask_and_1980);
vect__301.210_1882 = .MASK_LOAD (vectp_KingSafetyMask1.208_2015, 64B,
loop_mask_1860);
vectp_KingSafetyMask1.208_2018 = vectp_KingSafetyMask1.208_2015 + 64;
vect__301.211_2019 = .MASK_LOAD (vectp_KingSafetyMask1.208_2018, 64B,
loop_mask_1995);
vect_patt_1775.213_2021 = VEC_COND_EXPR <mask_patt_1734.203_2005,
vect_patt_1772.206_2012, vect__301.210_1882>;
vect_patt_1775.213_2022 = VEC_COND_EXPR <mask_patt_1734.203_2007,
vect_patt_1772.207_1981, vect__301.211_2019>;
vect_patt_1897.215_1984 = VEC_COND_EXPR <mask_patt_1854.197_1855,
vect_patt_1732.200_1998, vect_patt_1775.213_2021>;
vect_patt_1897.215_1985 = VEC_COND_EXPR <mask_patt_1854.197_1733,
vect_patt_1732.201_2003, vect_patt_1775.213_2022>;
.MASK_STORE (vectp_KingPressureMask1.216_2023, 64B, loop_mask_1860,
vect_patt_1897.215_1984);
vectp_KingPressureMask1.216_2026 = vectp_KingPressureMask1.216_2023 + 64;
.MASK_STORE (vectp_KingPressureMask1.216_2026, 64B, loop_mask_1995,
vect_patt_1897.215_1985);
vectp_KingSafetyMask1.198_1859 = vectp_KingSafetyMask1.198_1865 + 64;
vectp_KingSafetyMask1.204_1879 = vectp_KingSafetyMask1.204_2013 + 64;
vectp_KingSafetyMask1.208_2017 = vectp_KingSafetyMask1.208_2018 + 64;
vectp_KingPressureMask1.216_2025 = vectp_KingPressureMask1.216_2026 + 64;
ivtmp_2030 = ivtmp_2028 - 16;
_2031 = (unsigned short) ivtmp_2030;
_1988 = {_2031, _2031, _2031, _2031, _2031, _2031, _2031, _2031};
_1989 = { 8, 9, 10, 11, 12, 13, 14, 15 } < _1988;
_1990 = { 0, 1, 2, 3, 4, 5, 6, 7 } < _1988;
if (ivtmp_2028 > 16)
goto <bb 215>; [74.97%]
else
goto <bb 529>; [25.03%]
<bb 215> [local count: 434451]:
goto <bb 183>; [100.00%]
With -mtune=cascadelake -mprefer-vector-width=512 we avoid the failure; the
vectorized loop then starts as:
<bb 183> [local count: 435039]:
# i_38 = PHI <_1526(215), 0(198)>
# ivtmp_1427 = PHI <ivtmp_1430(215), 64(198)>
# vect_vec_iv_.194_1737 = PHI <_1915(215), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15 }(198)>
# vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215),
&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + 8B](198)>
# vectp_KingSafetyMask1.204_1876 = PHI <vectp_KingSafetyMask1.204_2005(215),
&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -8B](198)>
# vectp_KingSafetyMask1.208_1879 = PHI <vectp_KingSafetyMask1.208_2010(215),
&KingSafetyMask1(198)>
# vectp_KingPressureMask1.216_2020 = PHI
<vectp_KingPressureMask1.216_2021(215), &KingPressureMask1(198)>
# ivtmp_1984 = PHI <ivtmp_1985(215), 0(198)>
_1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16 };
vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7 };
mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0 };
...
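The difference is peeling for alignment: the failing variant starts its vector
IV at -15 and biases every pointer by -120B (15 elements of 8 bytes), whereas
the -mtune=cascadelake variant starts at element 0 (peeling for alignment is an
odd thing to do here, but ...).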