This patch fixes a PR (target/81594) that I noticed several years ago during power8 development: the compiler would often create a two-element vector and then store that vector, instead of simply storing the two elements.
Particularly for DImode on power8, this could involve two direct moves and an XXPERMDI to glue the two parts together. On power9, there is a single direct move instruction that combines the two elements. Originally I had the optimization for DFmode as well as DImode. I found that if the values were already in vector registers, it was generally faster to do the XXPERMDI and vector store. So I rewrote this patch to only optimize the DImode case, where the assumption is the DImode values will be in GPRs. I have done bootstraps with/without the patch, and there were no regressions. I did the builds on a little endian power9 linux system and a big endian power8 system (both 32/64-bit support on big endian). Can I check this change into the master branch? gcc/ 2020-06-30 Michael Meissner <meiss...@linux.ibm.com> PR target/81594 * config/rs6000/predicates.md (ds_form_memory): New predicate. * config/rs6000/vsx.md (concatv2di_store): New insn. (dupv2di_store): New insn. gcc/testsuite/ 2020-06-30 Michael Meissner <meiss...@linux.ibm.com> PR target/81594 * gcc.target/powerpc/pr81594.c: New test. --- gcc/config/rs6000/predicates.md | 42 +++++++++++++++ gcc/config/rs6000/vsx.md | 84 ++++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/powerpc/pr81594.c | 61 ++++++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr81594.c diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 9762855..4f7e313 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1856,3 +1856,45 @@ (define_predicate "prefixed_memory" { return address_is_prefixed (XEXP (op, 0), mode, NON_PREFIXED_DEFAULT); }) + +;; Return true if the operand is a valid memory operand with an offsettable +;; address that can be split into 2 sub-addresses, each of which is a valid +;; DS-form (bottom 2 bits of the offset are 0). 
This is used to optimize +;; creating a vector of two DImode elements and then storing the vector. We +;; want to eliminate the direct moves from GPRs to form the vector and do the +;; store directly from the GPRs. + +(define_predicate "ds_form_memory" + (match_code "mem") +{ + if (!memory_operand (op, mode)) + return false; + + rtx addr = XEXP (op, 0); + + if (REG_P (addr) || SUBREG_P (addr)) + return true; + + if (GET_CODE (addr) != PLUS) + return false; + + if (!base_reg_operand (XEXP (addr, 0), Pmode)) + return false; + + rtx offset = XEXP (addr, 1); + if (!CONST_INT_P (offset)) + return false; + + HOST_WIDE_INT value = INTVAL (offset); + + if (TARGET_PREFIXED) + return SIGNED_34BIT_OFFSET_EXTRA_P (value, GET_MODE_SIZE (DImode)); + + /* If we don't support prefixed addressing, ensure that the two addresses + created would each be valid for doing a STD instruction (which is a + DS-form instruction that requires the bottom 2 bits to be 0). */ + if ((value & 0x3) != 0) + return false; + + return SIGNED_16BIT_OFFSET_EXTRA_P (value, GET_MODE_SIZE (DImode)); +}) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 732a548..a9ebd24 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -2896,6 +2896,90 @@ (define_insn "*vsx_concat_<mode>_3" } [(set_attr "type" "vecperm")]) +;; If the only use for a VEC_CONCAT is to store 2 64-bit values, replace it +;; with two stores. Only do this on DImode, since it saves doing 1 direct move +;; on power9, and 2 direct moves + XXPERMDI on power8 to form the vector so we +;; can do a vector store. This typically shows up with -O3 where two stores +;; are combined into a vector. +;; +;; Typically DFmode would generate XXPERMDI and a vector store. Benchmarks +;; like Spec show that is typically the same speed or faster than doing the two +;; scalar DFmode stores. 
+(define_insn_and_split "*concatv2di_store" + [(set (match_operand:V2DI 0 "memory_operand" "=m,m,m,m") + (vec_concat:V2DI + (match_operand:DI 1 "gpc_reg_operand" "r,wa,r,wa") + (match_operand:DI 2 "gpc_reg_operand" "r,wa,wa,r"))) + (clobber (match_scratch:DI 3 "=&b,&b,&b,&b"))] + "TARGET_DIRECT_MOVE_64BIT" + "#" + "&& 1" + [(set (match_dup 4) + (match_dup 5)) + (set (match_dup 6) + (match_dup 7))] +{ + rtx mem = operands[0]; + + /* If the address can't be used directly for both stores, copy it to the + temporary base register. */ + if (!ds_form_memory (mem, V2DImode)) + { + rtx old_addr = XEXP (mem, 0); + rtx new_addr = operands[3]; + if (GET_CODE (new_addr) == SCRATCH) + new_addr = gen_reg_rtx (Pmode); + + emit_move_insn (new_addr, old_addr); + mem = change_address (mem, VOIDmode, new_addr); + } + + /* Because we are creating scalar stores, we don't have to swap the order + of the elements and then swap the stores to get the right order on + little endian systems. */ + operands[4] = adjust_address (mem, DImode, 0); + operands[5] = operands[1]; + operands[6] = adjust_address (mem, DImode, 8); + operands[7] = operands[2]; +} + [(set_attr "length" "8") + (set_attr "type" "store,fpstore,fpstore,store")]) + +;; Optimize creating a vector with 2 duplicate DImode elements and storing it. +(define_insn_and_split "*dupv2di_store" + [(set (match_operand:V2DI 0 "memory_operand" "=m,m") + (vec_duplicate:V2DI + (match_operand:DI 1 "gpc_reg_operand" "r,wa"))) + (clobber (match_scratch:DI 2 "=&b,&b"))] + "TARGET_DIRECT_MOVE_64BIT" + "#" + "&& 1" + [(set (match_dup 3) + (match_dup 1)) + (set (match_dup 4) + (match_dup 1))] +{ + rtx mem = operands[0]; + + /* If the address can't be used directly for both stores, copy it to the + temporary base register. 
*/ + if (!ds_form_memory (mem, V2DImode)) + { + rtx old_addr = XEXP (mem, 0); + rtx new_addr = operands[2]; + if (GET_CODE (new_addr) == SCRATCH) + new_addr = gen_reg_rtx (Pmode); + + emit_move_insn (new_addr, old_addr); + mem = change_address (mem, VOIDmode, new_addr); + } + + operands[3] = adjust_address (mem, DImode, 0); + operands[4] = adjust_address (mem, DImode, 8); +} + [(set_attr "length" "8") + (set_attr "type" "store,fpstore")]) + ;; Special purpose concat using xxpermdi to glue two single precision values ;; together, relying on the fact that internally scalar floats are represented ;; as doubles. This is used to initialize a V4SF vector with 4 floats diff --git a/gcc/testsuite/gcc.target/powerpc/pr81594.c b/gcc/testsuite/gcc.target/powerpc/pr81594.c new file mode 100644 index 0000000..35a9749 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr81594.c @@ -0,0 +1,61 @@ +/* { dg-do compile { target { powerpc-*-* && ilp64 } } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */ + +/* PR target/81594. Optimize creating a vector of 2 64-bit elements and then + storing the vector into separate stores. */ + +void +store_v2di_0 (vector unsigned long long *p, + unsigned long long a, + unsigned long long b) +{ + *p = (vector unsigned long long) { a, b }; +} + +void +store_v2di_4 (vector unsigned long long *p, + unsigned long long a, + unsigned long long b) +{ + p[4] = (vector unsigned long long) { a, b }; +} + +void +store_v2di_splat_0 (vector unsigned long long *p, unsigned long long a) +{ + *p = (vector unsigned long) { a, a }; +} + +void +store_v2di_splat_8 (vector unsigned long long *p, unsigned long long a) +{ + p[8] = (vector unsigned long long) { a, a }; +} + +/* 2047 is the largest index that can be used with DS-form instructions. 
*/ +void +store_v2di_2047 (vector unsigned long long *p, + unsigned long long a, + unsigned long long b) +{ + p[2047] = (vector unsigned long long) { a, b }; +} + +/* 2048 will require the constant to be loaded because we can't use a pair of + DS-form instructions. If we have prefixed addressing, a prefixed form will + be generated instead. Two separate stores should still be issued. */ +void +store_v2di_2048 (vector unsigned long long *p, + unsigned long long a, + unsigned long long b) +{ + p[2048] = (vector unsigned long long) { a, b }; +} + +/* { dg-final { scan-assembler-not {\mstxv\M} } } */ +/* { dg-final { scan-assembler-not {\mstxvx\M} } } */ +/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */ +/* { dg-final { scan-assembler-not {\mmtvsrd\M} } } */ +/* { dg-final { scan-assembler-not {\mmtvsrdd\M} } } */ +/* { dg-final { scan-assembler-not {\mxxpermdi\M} } } */ -- 1.8.3.1