https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87077
--- Comment #3 from trashyankes at wp dot pl --- (In reply to Richard Biener from comment #2) > Can you attach the source please? These stupid Web 2.0 sites do not allow > to save it to a file. Code: ``` #include <pmmintrin.h> #include <immintrin.h> struct alignas(32) Vx { float x[4]; }; struct alignas(32) Mx { Vx x[4]; }; #define M_COMMON_ATTR() __attribute__ ((target("fma"), optimize("-ffast-math"))) M_COMMON_ATTR() Vx mul(const Mx& mtx, const Vx& vec) { Vx res; for (int i = 0; i < 4; ++i) { auto r = 0.0f; for (int j = 0; j < 4; ++j) { r += mtx.x[i].x[j] * vec.x[j]; } res.x[i] = r; } return res; } M_COMMON_ATTR() Vx mulSSE(const Mx& mtx, const Vx& vec) { Vx res; auto v0 = _mm_load_ps(vec.x); auto m0 = _mm_load_ps(mtx.x[0].x); auto m1 = _mm_load_ps(mtx.x[1].x); auto m2 = _mm_load_ps(mtx.x[2].x); auto m3 = _mm_load_ps(mtx.x[3].x); m0 = _mm_mul_ps(m0, v0); m1 = _mm_mul_ps(m1, v0); m2 = _mm_mul_ps(m2, v0); m3 = _mm_mul_ps(m3, v0); m0 = _mm_hadd_ps(m0, m1); m2 = _mm_hadd_ps(m2, m3); m0 = _mm_hadd_ps(m0, m2); _mm_store_ps(res.x, m0); return res; } ``` `mul` relies on the GCC optimizer; `mulSSE` is the hand-written code that I expect `mul` to be compiled to. I use `optimize("-ffast-math")` to eliminate the case where the compiler is forbidden to change the order of summation (because, in floating point, `(a + b) + c != a + (b + c)`). Similarly, `target("fma")` is needed to enable `_mm_hadd_ps`.