https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119234
Jeevitha <jeevitha at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |jeevitha at gcc dot gnu.org --- Comment #5 from Jeevitha <jeevitha at gcc dot gnu.org> --- I am able to reproduce the issue on the Power8 build. The issue occurs in the SGEMV kernels 4x8 and 4x4. For the source below in sgemv_4x8, the following assembly was generated by the compiler: for (i = 0; i < n/4; i ++) { temp0 += v_x[i] * va0[i]; temp1 += v_x[i] * va1[i]; temp2 += v_x[i] * va2[i]; temp3 += v_x[i] * va3[i]; temp4 += v_x[i] * va4[i]; temp5 += v_x[i] * va5[i]; temp6 += v_x[i] * va6[i]; temp7 += v_x[i] * va7[i]; } Assembly: 0x000020000051d8a0 <+96>: add r30,r6,r9 0x000020000051d8a4 <+100>: lvx v12,r5,r9 0x000020000051d8a8 <+104>: lvx v13,r7,r9 0x000020000051d8ac <+108>: clrrdi r30,r30,4 0x000020000051d8b0 <+112>: lvx v1,r12,r9 0x000020000051d8b4 <+116>: lxvd2x vs0,0,r30 0x000020000051d8b8 <+120>: add r30,r11,r9 0x000020000051d8bc <+124>: clrrdi r30,r30,4 0x000020000051d8c0 <+128>: lxvd2x vs13,0,r30 0x000020000051d8c4 <+132>: add r30,r3,r9 0x000020000051d8c8 <+136>: clrrdi r30,r30,4 0x000020000051d8cc <+140>: xxswapd vs0,vs0 0x000020000051d8d0 <+144>: lxvd2x vs2,0,r30 0x000020000051d8d4 <+148>: add r30,r4,r9 0x000020000051d8d8 <+152>: clrrdi r30,r30,4 0x000020000051d8dc <+156>: xvmaddasp vs6,vs0,vs44 0x000020000051d8e0 <+160>: xvmaddasp vs7,vs0,vs45 0x000020000051d8e4 <+164>: xxswapd vs13,vs13 0x000020000051d8e8 <+168>: xvmaddasp vs10,vs0,vs33 0x000020000051d8ec <+172>: lxvd2x vs3,0,r30 0x000020000051d8f0 <+176>: add r30,r8,r9 0x000020000051d8f4 <+180>: clrrdi r30,r30,4 0x000020000051d8f8 <+184>: xvmaddasp vs12,vs0,vs13 0x000020000051d8fc <+188>: xxswapd vs2,vs2 0x000020000051d900 <+192>: lxvd2x vs4,0,r30 0x000020000051d904 <+196>: add r30,r10,r9 0x000020000051d908 <+200>: addi r9,r9,16 0x000020000051d90c <+204>: clrrdi r30,r30,4 0x000020000051d910 <+208>: xvmaddasp vs8,vs0,vs2 0x000020000051d914 <+212>: xxswapd vs3,vs3 
0x000020000051d918 <+216>: lxvd2x vs5,0,r30 0x000020000051d91c <+220>: xvmaddasp vs9,vs0,vs3 0x000020000051d920 <+224>: xxswapd vs4,vs4 0x000020000051d924 <+228>: xvmaddasp vs11,vs0,vs4 0x000020000051d928 <+232>: xxswapd vs5,vs5 0x000020000051d92c <+236>: xvmaddasp vs32,vs0,vs5 0x000020000051d930 <+240>: bdnz 0x20000051d8a0 <sgemv_kernel_4x8+96> While debugging the issue, I observed an lvx instruction with a misaligned address in sgemv_kernel_4x8, which results in incorrect vector loads: 0x000020000051d8a4 71 temp0 += v_x[i] * va0[i]; => 0x000020000051d8a4 <sgemv_kernel_4x8+100>: ce 48 85 7d lvx v12,r5,r9 (gdb) p/x $r5 $3 = 0x10033940 (gdb) p/x $r9 $4 = 0x0 (gdb) x/10f $r5 0x10033940: 0 1 2 3 0x10033950: 4 5 6 7 0x10033960: 8 9 (gdb) si (gdb) p $v12 $6 = {uint128 = 85402898748987885467211345242806550528, v4_float = {0, 1, 2, 3}, v4_int32 = {0, 1065353216, 1073741824, 1077936128}, v8_int16 = {0, 0, 0, 16256, 0, 16384, 0, 16448}, v16_int8 = {0, 0, 0, 0, 0, 0, -128, 63, 0, 0, 0, 64, 0, 0, 64, 64}} The lvx above has no issue because the address is 16-byte aligned; lvx fetches the expected values. 0x000020000051d8a8 72 temp1 += v_x[i] * va1[i]; => 0x000020000051d8a8 <sgemv_kernel_4x8+104>: ce 48 a7 7d lvx v13,r7,r9 (gdb) p/x $r7 $7 = 0x10033a08 (gdb) p/x $r9 $8 = 0x0 (gdb) x/10f $r7 0x10033a08: 50 51 52 53 0x10033a18: 54 55 56 57 0x10033a28: 58 59 (gdb) si (gdb) p $v13 $9 = {uint128 = 88123662303566147823415553041968922624, v4_float = {48, 49, 50, 51}, v4_int32 = {1111490560, 1111752704, 1112014848, 1112276992}, v8_int16 = {0, 16960, 0, 16964, 0, 16968, 0, 16972}, v16_int8 = {0, 0, 64, 66, 0, 0, 68, 66, 0, 0, 72, 66, 0, 0, 76, 66}} In this case, the address 0x10033a08 is not 16-byte aligned. 
Internally, lvx masks the effective address with 0xFFFF_FFFF_FFFF_FFF0, which changes it to 0x10033a00, so it fetches values starting two floats earlier than intended (v4_float = {48, 49, 50, 51}). This causes incorrect computation in temp1 during the loop, and the error propagates through further vectorized operations. Sample Values (for one loop iteration): i = 0 v_x[i] = {1.000000, 1.000000, 1.000000, 1.000000} va0[i] = {0.000000, 1.000000, 2.000000, 3.000000} va1[i] = {50.000000, 51.000000, 52.000000, 53.000000} va2[i] = {100.000000, 101.000000, 102.000000, 103.000000} va3[i] = {150.000000, 151.000000, 152.000000, 153.000000} va4[i] = {200.000000, 201.000000, 202.000000, 203.000000} va5[i] = {250.000000, 251.000000, 252.000000, 253.000000} va6[i] = {300.000000, 301.000000, 302.000000, 303.000000} va7[i] = {350.000000, 351.000000, 352.000000, 353.000000} Accumulated temp values: temp0 = {0.000000, 1.000000, 2.000000, 3.000000} temp1 = {48.000000, 49.000000, 50.000000, 51.000000} /* it should be {50.000000, 51.000000, 52.000000, 53.000000} */ temp2 = {100.000000, 101.000000, 102.000000, 103.000000} temp3 = {148.000000, 149.000000, 150.000000, 151.000000}//wrong temp4 = {200.000000, 201.000000, 202.000000, 203.000000} temp5 = {248.000000, 249.000000, 250.000000, 251.000000}//wrong temp6 = {300.000000, 301.000000, 302.000000, 303.000000} temp7 = {348.000000, 349.000000, 350.000000, 351.000000}//wrong As shown above, incorrect values are being accumulated in specific temp variables inside sgemv_kernel_4x8. Specifically, temp1, temp3, temp5, and temp7 have incorrect values due to misaligned memory access in the lvx instruction. 
Note: temp1 = va1; // should load {50.0, 51.0, 52.0, 53.0} If the kernel is not manually vectorized in the source, then the for loop generates the following code on Power8: 0x000020000051dc70 <+896>: lxvd2x vs0,r14,r30 0x000020000051dc74 <+900>: lxvd2x vs1,r27,r30 0x000020000051dc78 <+904>: lxvd2x vs13,r25,r30 0x000020000051dc7c <+908>: lxvd2x vs2,r24,r30 0x000020000051dc80 <+912>: lxvd2x vs3,r23,r30 0x000020000051dc84 <+916>: lxvd2x vs4,r22,r30 0x000020000051dc88 <+920>: lxvd2x vs5,r21,r30 0x000020000051dc8c <+924>: lxvd2x vs6,r28,r30 0x000020000051dc90 <+928>: addi r30,r30,16 0x000020000051dc94 <+932>: xvmaddasp vs7,vs0,vs1 0x000020000051dc98 <+936>: xvmaddasp vs8,vs0,vs13 0x000020000051dc9c <+940>: xvmaddasp vs9,vs0,vs2 0x000020000051dca0 <+944>: xvmaddasp vs10,vs0,vs3 0x000020000051dca4 <+948>: xvmaddasp vs11,vs0,vs4 0x000020000051dca8 <+952>: xvmaddasp vs12,vs0,vs5 0x000020000051dcac <+956>: xvmaddasp vs32,vs0,vs6 0x000020000051dcb0 <+960>: bdnz 0x20000051dc70 <sgemv_kernel_4x8+896> In the above assembly, there is no alignment issue with lxvd2x because, unlike lvx, it does not perform any internal alignment adjustment of the address (it does not mask off the low-order address bits), so the data is loaded correctly. 
When I manually vectorized the for loop, the compiler generated proper lxvd2x instructions like the above, and there was no issue: "All Openblas sgemm test passed" with the code below and small changes to the v_y computation: for (i = 0; i < n; i += 4) { vector float vx = vec_vsx_ld(0, &x[i]); vector float vva0 = vec_vsx_ld(0, &a0[i]); vector float vva1 = vec_vsx_ld(0, &a1[i]); vector float vva2 = vec_vsx_ld(0, &a2[i]); vector float vva3 = vec_vsx_ld(0, &a3[i]); vector float vva4 = vec_vsx_ld(0, &a4[i]); vector float vva5 = vec_vsx_ld(0, &a5[i]); vector float vva6 = vec_vsx_ld(0, &a6[i]); vector float vva7 = vec_vsx_ld(0, &a7[i]); temp0 = vec_madd(vx, vva0, temp0); temp1 = vec_madd(vx, vva1, temp1); temp2 = vec_madd(vx, vva2, temp2); temp3 = vec_madd(vx, vva3, temp3); temp4 = vec_madd(vx, vva4, temp4); temp5 = vec_madd(vx, vva5, temp5); temp6 = vec_madd(vx, vva6, temp6); temp7 = vec_madd(vx, vva7, temp7); } On Power9 and Power10, we don’t face this issue because the compiler generates the lxvx instruction, which has no alignment restriction.