From: Bert Tenjy <tng...@gmail.com> This patch adds support for using the POWER Architecture's VSX extensions to speed up certain code sequences by calling SIMD clones of functions (for example, those declared with '#pragma omp declare simd'). Typically, 2 or 4 consecutive loop iterations that each make a single scalar function call are combined into one iteration that makes a single vector function call.
The patch organization follows that used in the x86_64 and AArch64 implementations of the same vectorization functionality. The x86_64 document describing its vector function interface is at: https://software.intel.com/sites/default/files/managed/b4/c8/Intel-Vector-Function-ABI.pdf AArch64's document is at: https://developer.arm.com/docs/101129/latest The document describing the POWER Architecture Vector Function interface is tentatively at: https://sourceware.org/glibc/wiki/Homepage?action=AttachFile&do=view&target=powerarchvectfuncabi.html The major test of this patch autovectorizes math functions and thus requires libmvec. PPC64 libmvec functionality is only available on the GLIBC development branch tuliom/libmvec. Until that branch is merged into the main development branch, testing vector functions will require building and installing from branch tuliom/libmvec into a non-system directory. The development GCC with this patch will similarly have to be built and installed into a non-standard location. The DejaGnu vector SIMD tests have been enabled for PPC64. After the necessary changes to the 4 vect-simd-clone-{1,4,5,8}.c files, 45 tests change status from UNSUPPORTED to PASS. --- gcc/config/rs6000/rs6000.c | 147 ++++++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-simd-clone-1.c | 12 ++ gcc/testsuite/gcc.dg/vect/vect-simd-clone-4.c | 4 + gcc/testsuite/gcc.dg/vect/vect-simd-clone-5.c | 4 + gcc/testsuite/gcc.dg/vect/vect-simd-clone-8.c | 8 + gcc/testsuite/lib/target-supports.exp | 3 +- 6 files changed, 177 insertions(+), 1 deletion(-) A few notes: 1. Version v0 of this patch is at: https://gcc.gnu.org/pipermail/gcc-patches/2020-February/540426.html 2. The code changes in rs6000.c are in response to Bill Schmidt's feedback on v0. 3. The vector SIMD testsuite files are now enabled for POWER8 and higher PPC64 processors. 4. Changes to the files vect-simd-clone-{1,4,5,8}.c are needed since PPC64 has only a 128-bit-wide vector bus. 
x86_64 for which the tests were initially written has buses wider than that for AVX and higher architectures. 5. Per Segher's response to v0, we still need to agree a name for the guiding document whose name is currently 'POWER Architecture Vector Function ABI'. diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index d26a18f3ece..8216a7d7c2c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1273,6 +1273,142 @@ static const struct attribute_spec rs6000_attribute_table[] = #endif { NULL, 0, 0, false, false, false, false, NULL, NULL } }; + +/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */ + +static int +rs6000_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, + struct cgraph_simd_clone *clonei, + tree base_type, int num) +{ + int ret = 1; + + if (clonei->simdlen + && (clonei->simdlen < 2 + || clonei->simdlen > 128 + || (clonei->simdlen & (clonei->simdlen - 1)) != 0)) + { + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, + "unsupported simdlen %d", clonei->simdlen); + return 0; + } + + tree ret_type = TREE_TYPE (TREE_TYPE (node->decl)); + if (TREE_CODE (ret_type) != VOID_TYPE) + switch (TYPE_MODE (ret_type)) + { + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_SFmode: + case E_DFmode: + if (!AGGREGATE_TYPE_P (ret_type)) + break; + /* FALLTHRU */ + default: + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, + "unsupported return type %qT for simd", ret_type); + return 0; + } + + tree t; + int i; + tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); + bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE); + + for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0; + t && t != void_list_node; t = TREE_CHAIN (t), i++) + { + tree arg_type = decl_arg_p ? 
TREE_TYPE (t) : TREE_VALUE (t); + switch (TYPE_MODE (arg_type)) + { + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_SFmode: + case E_DFmode: + if (!AGGREGATE_TYPE_P (arg_type)) + break; + /* FALLTHRU */ + default: + if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_UNIFORM) + break; + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, + "unsupported argument type %qT for simd", arg_type); + return 0; + } + } + + if (TARGET_VSX) + { + clonei->vecsize_mangle = 'b'; + ret = 1; + } + clonei->mask_mode = VOIDmode; + switch (clonei->vecsize_mangle) + { + case 'b': + clonei->vecsize_int = 128; + clonei->vecsize_float = 128; + break; + case 'c': + clonei->vecsize_int = 128; + clonei->vecsize_float = 128; + break; + default: + gcc_unreachable (); + } + if (clonei->simdlen == 0) + { + if (SCALAR_INT_MODE_P (TYPE_MODE (base_type))) + clonei->simdlen = clonei->vecsize_int; + else + clonei->simdlen = clonei->vecsize_float; + clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type)); + } + else + { + tree ctype = ret_type; + if (TREE_CODE (ret_type) == VOID_TYPE) + ctype = base_type; + int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen; + if (SCALAR_INT_MODE_P (TYPE_MODE (ctype))) + cnt /= clonei->vecsize_int; + else + cnt /= clonei->vecsize_float; + if (cnt > 8) + { + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, + "unsupported simdlen %d", clonei->simdlen); + return 0; + } + } + return ret; +} + +/* Add target attribute to SIMD clone NODE if needed. */ + +void +rs6000_simd_clone_adjust (struct cgraph_node *node) +{ +} + +static int +rs6000_simd_clone_usable (struct cgraph_node *node) +{ + switch (node->simdclone->vecsize_mangle) + { + case 'b': + if (!TARGET_VSX) + return -1; + return 0; + default: + gcc_unreachable (); + } +} + #ifndef TARGET_PROFILE_KERNEL #define TARGET_PROFILE_KERNEL 0 @@ -1281,6 +1417,17 @@ static const struct attribute_spec rs6000_attribute_table[] = /* Initialize the GCC target structure. 
*/ #undef TARGET_ATTRIBUTE_TABLE #define TARGET_ATTRIBUTE_TABLE rs6000_attribute_table + +#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN +#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \ + rs6000_simd_clone_compute_vecsize_and_simdlen + +#undef TARGET_SIMD_CLONE_ADJUST +#define TARGET_SIMD_CLONE_ADJUST rs6000_simd_clone_adjust + +#undef TARGET_SIMD_CLONE_USABLE +#define TARGET_SIMD_CLONE_USABLE rs6000_simd_clone_usable + #undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES #define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES rs6000_set_default_type_attributes #undef TARGET_ATTRIBUTE_TAKES_IDENTIFIER_P diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-1.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-1.c index 50429049500..13f20aaa16a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-1.c @@ -10,10 +10,22 @@ int array[N]; +#ifdef __powerpc__ + +#pragma omp declare simd simdlen(2) notinbranch +#pragma omp declare simd simdlen(2) notinbranch uniform(b) linear(c:3) +#pragma omp declare simd simdlen(4) notinbranch +#pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3) + +#else + #pragma omp declare simd simdlen(4) notinbranch #pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3) #pragma omp declare simd simdlen(8) notinbranch #pragma omp declare simd simdlen(8) notinbranch uniform(b) linear(c:3) + +#endif + __attribute__((noinline)) int foo (int a, int b, int c) { diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-4.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-4.c index debbe77b79d..c440582776e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-4.c @@ -12,7 +12,11 @@ float d[N]; int e[N]; unsigned short f[N]; +#ifdef __powerpc__ +#pragma omp declare simd simdlen(4) notinbranch uniform(b) +#else #pragma omp declare simd simdlen(8) notinbranch uniform(b) +#endif __attribute__((noinline)) float foo (float a, float b, float c) { 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-5.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-5.c index 6a098d9a51a..d9dc792b81e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-5.c @@ -10,7 +10,11 @@ int d[N], e[N]; +#ifdef __powerpc__ +#pragma omp declare simd simdlen(2) notinbranch uniform(b) linear(c:3) +#else #pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3) +#endif __attribute__((noinline)) long long int foo (int a, int b, int c) { diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-8.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-8.c index 1bfd19dc8ab..4ed3da47449 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-8.c @@ -12,14 +12,22 @@ int a[N], b[N]; long int c[N]; unsigned char d[N]; +#ifdef __powerpc__ +#pragma omp declare simd simdlen(2) notinbranch +#else #pragma omp declare simd simdlen(8) notinbranch +#endif __attribute__((noinline)) int foo (long int a, int b, int c) { return a + b + c; } +#ifdef __powerpc__ +#pragma omp declare simd simdlen(2) notinbranch +#else #pragma omp declare simd simdlen(8) notinbranch +#endif __attribute__((noinline)) long int bar (int a, int b, long int c) { diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index e79015b4d54..48edbc8e415 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3499,7 +3499,8 @@ proc check_effective_target_vect_simd_clones { } { return [check_cached_effective_target_indexed vect_simd_clones { expr { (([istarget i?86-*-*] || [istarget x86_64-*-*]) && [check_effective_target_avx512f]) - || [istarget amdgcn-*-*] }}] + || [istarget amdgcn-*-*] + || ([istarget powerpc*-*-*] && [check_p8vector_hw_available]) }}] } # Return 1 if this is a AArch64 target supporting big endian -- 2.20.1