Hello world,

the attached patch enables FMA (fused multiply-add) for the AVX2 and AVX512F variants of matmul. This should bring a very nice speedup, although I have not been able to run benchmarks because I lack a suitable machine.
Question: Is this still appropriate for the current state of trunk? Or is it rather OK for when GCC 8 opens (which might still be some time in the future)?

2017-03-01  Thomas Koenig  <tkoe...@gcc.gnu.org>

        PR fortran/78379
        * m4/matmul.m4 (matmul_'rtype_code`_avx2): Also generate for reals.
        Add fma to target options.
        (matmul_'rtype_code`_avx512f): Add fma to target options.
        (matmul_'rtype_code`): Call AVX2 and AVX512F only if FMA is available.
        * generated/matmul_c10.c: Regenerated.
        * generated/matmul_c16.c: Regenerated.
        * generated/matmul_c4.c: Regenerated.
        * generated/matmul_c8.c: Regenerated.
        * generated/matmul_i1.c: Regenerated.
        * generated/matmul_i16.c: Regenerated.
        * generated/matmul_i2.c: Regenerated.
        * generated/matmul_i4.c: Regenerated.
        * generated/matmul_i8.c: Regenerated.
        * generated/matmul_r10.c: Regenerated.
        * generated/matmul_r16.c: Regenerated.
        * generated/matmul_r4.c: Regenerated.
        * generated/matmul_r8.c: Regenerated.

Regards

        Thomas
Index: m4/matmul.m4 =================================================================== --- m4/matmul.m4 (Revision 245760) +++ m4/matmul.m4 (Arbeitskopie) @@ -75,14 +75,6 @@ int blas_limit, blas_call gemm); export_proto(matmul_'rtype_code`); -'ifelse(rtype_letter,`r',dnl -`#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif') -` - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -101,7 +93,7 @@ `static void 'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static' include(matmul_internal.m4)dnl `#endif /* HAVE_AVX2 */ @@ -110,7 +102,7 @@ `static void 'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f,fma"))); static' include(matmul_internal.m4)dnl `#endif /* HAVE_AVX512F */ @@ -138,7 +130,9 @@ { /* Run down the available processors in order of preference. */ #ifdef HAVE_AVX512F - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) + { matmul_p = matmul_'rtype_code`_avx512f; goto tailcall; @@ -147,7 +141,8 @@ #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_'rtype_code`_avx2; goto tailcall;