Hi, This patch introduces a simple peephole2 optimization that replaces a sequence of four consecutive load or store (LDR, STR) instructions with two load-pair or store-pair (LDP, STP) instructions for the supported 2-element vector modes (V2SI, V2SF, V2DI, and V2DF). The offset of each generated load/store pair instruction is adjusted accordingly.
Bootstrapped and tested on aarch64-none-linux-gnu. Example: $ cat stp_vec_v2sf.c typedef float __attribute__((vector_size(8))) vec; void store_adjusted(vec *out, vec x, vec y) { out[400] = x; out[401] = y; out[402] = y; out[403] = x; } Example compiled with: $ ./aarch64-none-linux-gnu-gcc -S -O2 stp_vec_v2sf.c -dp Before the patch: store_adjusted: str d0, [x0, 3200] // 9 [c=4 l=4] *aarch64_simd_movv2si/2 str d1, [x0, 3208] // 11 [c=4 l=4] *aarch64_simd_movv2si/2 str d1, [x0, 3216] // 13 [c=4 l=4] *aarch64_simd_movv2si/2 str d0, [x0, 3224] // 15 [c=4 l=4] *aarch64_simd_movv2si/2 ret // 26 [c=0 l=4] *do_return After the patch: store_adjusted: add x1, x0, 3200 // 27 [c=4 l=4] *adddi3_aarch64/0 stp d0, d1, [x1] // 28 [c=0 l=4] vec_store_pairv2siv2si stp d1, d0, [x1, 16] // 29 [c=0 l=4] vec_store_pairv2siv2si ret // 22 [c=0 l=4] *do_return OK for master? Kind regards, Przemyslaw gcc/ChangeLog: * config/aarch64/aarch64-ldpstp.md: Add two peepholes for adjusted vector V2SI, V2SF, V2DI, V2DF load and store modes. * config/aarch64/aarch64-protos.h (aarch64_gen_adjusted_ldpstp): Add new parameter nunits. (aarch64_operands_adjust_ok_for_ldpstp): Add new parameter nunits. * config/aarch64/aarch64.c (aarch64_operands_adjust_ok_for_ldpstp): Add new parameter nunits and support for vector types. (aarch64_gen_adjusted_ldpstp): Add new parameter nunits and support for vector types. * config/aarch64/iterators.md (VP_2E): New iterator for 2 element vectors. (nunits): Add SI, SF, DI and DF to mode attribute. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ldp_vec_v2sf.c: New test. * gcc.target/aarch64/ldp_vec_v2si.c: New test. * gcc.target/aarch64/stp_vec_v2df.c: New test. * gcc.target/aarch64/stp_vec_v2di.c: New test. * gcc.target/aarch64/stp_vec_v2sf.c: New test. * gcc.target/aarch64/stp_vec_v2si.c: New test.
diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md index dd6f39615c51105a45b7b3dcde7b86e900ae7119..94c312f8f4f6472ebbeca0c2f3e760e0e316f7b7 100644 --- a/gcc/config/aarch64/aarch64-ldpstp.md +++ b/gcc/config/aarch64/aarch64-ldpstp.md @@ -186,10 +186,10 @@ (define_peephole2 (set (match_operand:GPI 6 "register_operand" "") (match_operand:GPI 7 "memory_operand" "")) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)" + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode, <nunits>)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN)) + if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, <nunits>, UNKNOWN)) DONE; else FAIL; @@ -206,10 +206,10 @@ (define_peephole2 (set (match_operand:GPF 6 "register_operand" "") (match_operand:GPF 7 "memory_operand" "")) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)" + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode, <nunits>)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN)) + if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, <nunits>, UNKNOWN)) DONE; else FAIL; @@ -226,10 +226,10 @@ (define_peephole2 (set (match_operand:DI 6 "register_operand" "") (sign_extend:DI (match_operand:SI 7 "memory_operand" ""))) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)" + "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode, 1)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, SIGN_EXTEND)) + if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, 1, SIGN_EXTEND)) DONE; else FAIL; @@ -246,10 +246,10 @@ (define_peephole2 (set (match_operand:DI 6 "register_operand" "") (zero_extend:DI (match_operand:SI 7 "memory_operand" ""))) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode)" + 
"aarch64_operands_adjust_ok_for_ldpstp (operands, true, SImode, 1)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, ZERO_EXTEND)) + if (aarch64_gen_adjusted_ldpstp (operands, true, SImode, 1, ZERO_EXTEND)) DONE; else FAIL; @@ -266,10 +266,10 @@ (define_peephole2 (set (match_operand:GPI 6 "memory_operand" "") (match_operand:GPI 7 "aarch64_reg_or_zero" "")) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)" + "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode, <nunits>)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN)) + if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, <nunits>, UNKNOWN)) DONE; else FAIL; @@ -286,10 +286,52 @@ (define_peephole2 (set (match_operand:GPF 6 "memory_operand" "") (match_operand:GPF 7 "aarch64_reg_or_fp_zero" "")) (match_dup 8)] - "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)" + "aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode, <nunits>)" [(const_int 0)] { - if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN)) + if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, <nunits>, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + [(match_scratch:DI 8 "r") + (set (match_operand:VP_2E 0 "memory_operand" "") + (match_operand:VP_2E 1 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 2 "memory_operand" "") + (match_operand:VP_2E 3 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 4 "memory_operand" "") + (match_operand:VP_2E 5 "aarch64_reg_or_zero" "")) + (set (match_operand:VP_2E 6 "memory_operand" "") + (match_operand:VP_2E 7 "aarch64_reg_or_zero" "")) + (match_dup 8)] + "TARGET_SIMD + && aarch64_operands_adjust_ok_for_ldpstp (operands, false, <VEL>mode, <nunits>)" + [(const_int 0)] +{ + if (aarch64_gen_adjusted_ldpstp (operands, false, <VEL>mode, <nunits>, UNKNOWN)) + DONE; + else + FAIL; +}) + +(define_peephole2 + 
[(match_scratch:DI 8 "r") + (set (match_operand:VP_2E 0 "register_operand" "") + (match_operand:VP_2E 1 "memory_operand" "")) + (set (match_operand:VP_2E 2 "register_operand" "") + (match_operand:VP_2E 3 "memory_operand" "")) + (set (match_operand:VP_2E 4 "register_operand" "") + (match_operand:VP_2E 5 "memory_operand" "")) + (set (match_operand:VP_2E 6 "register_operand" "") + (match_operand:VP_2E 7 "memory_operand" "")) + (match_dup 8)] + "TARGET_SIMD + && aarch64_operands_adjust_ok_for_ldpstp (operands, true, <VEL>mode, <nunits>)" + [(const_int 0)] +{ + if (aarch64_gen_adjusted_ldpstp (operands, true, <VEL>mode, <nunits>, UNKNOWN)) DONE; else FAIL; diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 9e43adb7db0373df6cc5ef1d2b22f217aca2aad2..8855fcbedbca8784e30511c017d95b58d03ee452 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -681,7 +681,7 @@ void aarch64_split_compare_and_swap (rtx op[]); void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx); -bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE); +bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, int nunits, RTX_CODE); void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx); bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool); @@ -732,7 +732,7 @@ int aarch64_ccmp_mode_to_code (machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode); -bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, scalar_mode); +bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, scalar_mode, int nunits); void aarch64_swap_ldrstr_operands (rtx *, bool); extern void aarch64_asm_output_pool_epilogue (FILE *, const char *, diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 973c65aa4fb348450872036617362aa17310fb20..15bfbc29f68eadd6c7e5458228cd74bc734ab627 
100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -21873,6 +21873,9 @@ aarch64_ldrstr_offset_compare (const void *x, const void *y) /* Given OPERANDS of consecutive load/store, check if we can merge them into ldp/stp by adjusting the offset. LOAD is true if they are load instructions. MODE is the mode of memory operands. + NUNITS is the number of units for MODE of memory operands. This + allows us to, in addition to scalar modes (NUNITS == 1), adjust + vector modes (NUNITS > 1) of memory operands. Given below consecutive stores: @@ -21893,7 +21896,7 @@ aarch64_ldrstr_offset_compare (const void *x, const void *y) bool aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, - scalar_mode mode) + scalar_mode mode, int nunits) { const int num_insns = 4; enum reg_class rclass; @@ -21970,7 +21973,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, for (int i = 0; i < num_insns; i++) offvals[i] = INTVAL (offset[i]); - msize = GET_MODE_SIZE (mode); + msize = GET_MODE_SIZE (mode) * nunits; /* Check if the offsets can be put in the right order to do a ldp/stp. */ qsort (offvals, num_insns, sizeof (HOST_WIDE_INT), @@ -22010,13 +22013,14 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, bool aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, - scalar_mode mode, RTX_CODE code) + scalar_mode mode, int nunits, RTX_CODE code) { rtx base, offset_1, offset_3, t1, t2; rtx mem_1, mem_2, mem_3, mem_4; rtx temp_operands[8]; HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3, stp_off_upper_limit, stp_off_lower_limit, msize; + machine_mode mem_mode; /* We make changes on a copy as we may still bail out. */ for (int i = 0; i < 8; i ++) @@ -22049,7 +22053,7 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, && offset_3 != NULL_RTX); /* Adjust offset so it can fit in LDP/STP instruction. 
*/ - msize = GET_MODE_SIZE (mode); + msize = GET_MODE_SIZE (mode) * nunits; stp_off_upper_limit = msize * (0x40 - 1); stp_off_lower_limit = - msize * 0x40; @@ -22114,8 +22118,11 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8], new_off_3 + msize), true); - if (!aarch64_mem_pair_operand (mem_1, mode) - || !aarch64_mem_pair_operand (mem_3, mode)) + /* If nunits > 1 we are adjusting for vector mode. In this case we should + generate mode for vector built from nunits and scalar_mode provided. */ + mem_mode = (nunits == 1) ? mode : mode_for_vector(mode, nunits).else_void(); + if (!aarch64_mem_pair_operand (mem_1, mem_mode) + || !aarch64_mem_pair_operand (mem_3, mem_mode)) return false; if (code == ZERO_EXTEND) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index a568cf21b99d4b169d7e367c5f00d65c544ef790..8c5765476a9db2b93775f7da770bb2ba03677763 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -98,6 +98,9 @@ (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF]) ;; Copy of the above. (define_mode_iterator DREG2 [V8QI V4HI V4HF V2SI V2SF DF]) +;; All modes suitable to store/load pair (2 elements) using STP/LDP. +(define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF]) + ;; Advanced SIMD, 64-bit container, all integer modes. 
(define_mode_iterator VD_BHSI [V8QI V4HI V2SI]) @@ -935,6 +938,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16") (V4BF "4") (V8BF "8") (V2SF "2") (V4SF "4") (V1DF "1") (V2DF "2") + (SI "1") (SF "1") (DI "1") (DF "1")]) ;; Map a mode to the number of bits in it, if the size of the mode diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c new file mode 100644 index 0000000000000000000000000000000000000000..f46dea1f748a094509ecfa0292a7c54e94164c9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef float __attribute__((vector_size(8))) vec; + +vec +load_long(vec *v) { + return v[110] + v[111] + v[112] + v[113]; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 880" } } */ +/* { dg-final { scan-assembler "ldp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "ldp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" } } */ +/* { dg-final { scan-assembler-not "ldr\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c new file mode 100644 index 0000000000000000000000000000000000000000..0abd94f942ae7ec49afda590989773f52556404c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef int __attribute__((vector_size(8))) vec; + +vec +load_long(vec *v) { + return v[110] + v[111] + v[112] + v[113]; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 880" } } */ +/* { dg-final { scan-assembler "ldp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "ldp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" } } */ +/* { dg-final { scan-assembler-not "ldr\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c new file mode 100644 
index 0000000000000000000000000000000000000000..cb7a65c006af451b873f8adc0546af6f8efa3c43 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef double __attribute__((vector_size(16))) vec; + +void +store_adjusted(vec *out, vec x, vec y) +{ + out[100] = x; + out[101] = y; + out[102] = y; + out[103] = x; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 1600" } } */ +/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]+, \\\[x\[0-9\]+, 32\\\]" } } */ +/* { dg-final { scan-assembler-not "str\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c new file mode 100644 index 0000000000000000000000000000000000000000..a5b298d5c43beb2df4c21d1cb81a961cca908192 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef long long __attribute__((vector_size(16))) vec; + +void +store_adjusted(vec *out, vec x, vec y) +{ + out[100] = x; + out[101] = y; + out[102] = y; + out[103] = x; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 1600" } } */ +/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]+, \\\[x\[0-9\]+, 32\\\]" } } */ +/* { dg-final { scan-assembler-not "str\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c new file mode 100644 index 0000000000000000000000000000000000000000..3bf8c58faa3b687040b5a5bccec54f771914b474 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef float __attribute__((vector_size(8))) vec; + +void +store_adjusted(vec *out, vec x, vec 
y) +{ + out[400] = x; + out[401] = y; + out[402] = y; + out[403] = x; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 3200" } } */ +/* { dg-final { scan-assembler "stp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "stp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" } } */ +/* { dg-final { scan-assembler-not "str\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c new file mode 100644 index 0000000000000000000000000000000000000000..f9d1cf4ac6bad7d44604a71037dd15cff55ced51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef int __attribute__((vector_size(8))) vec; + +void +store_adjusted(vec *out, vec x, vec y) +{ + out[400] = x; + out[401] = y; + out[402] = y; + out[403] = x; +} + +/* { dg-final { scan-assembler "add\tx\[0-9\]+, x\[0-9\]+, 3200" } } */ +/* { dg-final { scan-assembler "stp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */ +/* { dg-final { scan-assembler "stp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" } } */ +/* { dg-final { scan-assembler-not "str\t" } } */