Greetings,

This patch removes the arch-common aese/aesmc and aesd/aesimc fusions (i.e. AES fusion) implemented in the scheduling phase through the aarch_crypto_can_dual_issue function. The reason is that undesired behaviour was observed in cases such as:

- register allocation going bad (e.g. extra movs)
- aes operations with xor and zeroed keys among interleaved operations
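For illustration, here is a minimal sketch (hypothetical code, not taken from the patch; the function and variable names are made up) of the kind of sequence where the scheduler-based fusion could be defeated, e.g. a separate veor feeding an aese with a zeroed key operand, with unrelated work interleaved:

#include <arm_neon.h>

uint8x16_t round_step (uint8x16_t state, uint8x16_t key, uint8x16_t other)
{
  const uint8x16_t zero = vdupq_n_u8 (0);

  state = state ^ key;              /* separate veor feeding the aese */
  state = vaeseq_u8 (state, zero);  /* aese with a zeroed key operand */
  other = vaddq_u8 (other, other);  /* unrelated interleaved work */
  state = vaesmcq_u8 (state);       /* aesmc that should pair with the aese */
  return state ^ other;
}

With the old scheme, keeping the aese/aesmc pair adjacent and in the same register depended on the scheduler and the register allocator, and sequences like the one above could break that.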
A more stable version should be provided by instead doing the AES fusion during the combine pass. As such, new combine patterns have been added to enable this.

The second change is that the aese and aesd patterns have been rewritten to encapsulate a xor operation. This avoids the need for additional combine patterns for cases like the ones below.

For AESE (it also applies to AESD, as both include a xor operation):

data = data ^ key;
data = vaeseq_u8(data, zero);
---
veor   q1, q0, q1
aese.8 q1, q9

should mean and generate the same as:

data = vaeseq_u8(data, key);
---
aese.8 q1, q0

Bootstrapped and tested on arm-none-linux-gnueabihf.

Cheers,
Syl

gcc/ChangeLog:

2019-07-05  Sylvia Taylor  <sylvia.tay...@arm.com>

	* config/arm/crypto.md:
	(crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
	(crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
	(crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
	(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
	* config/arm/arm.c (aarch_macro_fusion_pair_p): Remove aes/aesmc
	fusion check.
	* config/arm/aarch-common-protos.h
	(aarch_crypto_can_dual_issue): Remove.
	* config/arm/aarch-common.c (aarch_crypto_can_dual_issue): Likewise.
	* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
	* config/arm/cortex-a53.md: Likewise.
	* config/arm/cortex-a57.md: Likewise.
	* config/arm/iterators.md:
	(CRYPTO_BINARY): Redefine.
	(CRYPTO_UNARY): Remove.
	(CRYPTO_AES, CRYPTO_AESMC): New.

gcc/testsuite/ChangeLog:

2019-07-05  Sylvia Taylor  <sylvia.tay...@arm.com>

	* gcc.target/arm/aes-fuse-1.c: New.
	* gcc.target/arm/aes-fuse-2.c: New.
	* gcc.target/arm/aes_xor_combine.c: New.
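For reference, a minimal sketch (illustrative only; it mirrors the intent of the new tests rather than being part of the patch) of the kind of source the new *aarch32_crypto_aese_fused pattern is meant to match after combine:

#include <arm_neon.h>

uint8x16_t enc_round (uint8x16_t data, uint8x16_t key)
{
  data = vaeseq_u8 (data, key);  /* aese.8  qD, qK */
  data = vaesmcq_u8 (data);      /* aesmc.8 qD, qD */
  return data;
}

When built for a core that enables FUSE_AES_AESMC (e.g. -mcpu=cortex-a72, as in the new tests), the pair is expected to be emitted back to back from the single fused pattern, with the aesmc reusing the aese destination register.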
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 11cd5145bbc77ab35e7874a75a93ec0e7bb0ea28..3bf38a104f6941eec1ce88db7d6b6ceb7da0af92 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -24,7 +24,6 @@ #define GCC_AARCH_COMMON_PROTOS_H extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *); -extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *); extern bool aarch_rev16_p (rtx); extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode); extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode); diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c index c7af12d4cd1714c70ebc6d6c7d4454606d15f864..965a07a43e3129dd1743d4a79813a597feca0b71 100644 --- a/gcc/config/arm/aarch-common.c +++ b/gcc/config/arm/aarch-common.c @@ -31,46 +31,6 @@ #include "rtl-iter.h" #include "memmodel.h" -/* In ARMv8-A there's a general expectation that AESE/AESMC - and AESD/AESIMC sequences of the form: - - AESE Vn, _ - AESMC Vn, Vn - - will issue both instructions in a single cycle on super-scalar - implementations. This function identifies such pairs. */ - -int -aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn) -{ - rtx producer_set, consumer_set; - rtx producer_src, consumer_src; - - producer_set = single_set (producer_insn); - consumer_set = single_set (consumer_insn); - - producer_src = producer_set ? SET_SRC (producer_set) : NULL; - consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL; - - if (producer_src && consumer_src - && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC - && ((XINT (producer_src, 1) == UNSPEC_AESE - && XINT (consumer_src, 1) == UNSPEC_AESMC) - || (XINT (producer_src, 1) == UNSPEC_AESD - && XINT (consumer_src, 1) == UNSPEC_AESIMC))) - { - unsigned int regno = REGNO (SET_DEST (producer_set)); - - /* Before reload the registers are virtual, so the destination of - consumer_set doesn't need to match. */ - - return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed) - && REGNO (XVECEXP (consumer_src, 0, 0)) == regno; - } - - return 0; -} - /* Return TRUE if X is either an arithmetic shift left, or is a multiplication by a power of two. */ bool diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index e9aba65c70563f23ba3049702072a59cf555b9ce..5c5129a8e52adb07bb431eb51c6f6239b9b0c941 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -30565,10 +30565,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr) if (!arm_macro_fusion_p ()) return false; - if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC - && aarch_crypto_can_dual_issue (prev, curr)) - return true; - if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT && arm_sets_movw_movt_fusible_p (prev_set, curr_set)) return true; diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md index b55d34e91c03216cefa23de23f2191579f2e0edb..4b8412236b9b8f3edadb50eba500a6a606fbd8c4 100644 --- a/gcc/config/arm/cortex-a53.md +++ b/gcc/config/arm/cortex-a53.md @@ -722,10 +722,3 @@ (define_bypass 4 "cortex_a53_fpmac" "cortex_a53_fpmac" "aarch_accumulator_forwarding") - -;; We want AESE and AESMC to end up consecutive to one another. 
- -(define_bypass 0 "cortex_a53_crypto_aese" - "cortex_a53_crypto_aesmc" - "aarch_crypto_can_dual_issue") - diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md index 577dc8d7fe255bebf496d8384b1019c0f211232c..7d567b060f461bf173610409bf66acb456421900 100644 --- a/gcc/config/arm/cortex-a57.md +++ b/gcc/config/arm/cortex-a57.md @@ -796,9 +796,3 @@ ;; help. (define_bypass 1 "cortex_a57_*" "cortex_a57_call,cortex_a57_branch") - -;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency -(define_bypass 0 "cortex_a57_crypto_simple" - "cortex_a57_crypto_simple" - "aarch_crypto_can_dual_issue") - diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md index 63d9d9ffa424fa51b05ebee5138b2c7c0f304745..bf34f69fc75c4b16dd947a7fe48728a97a319abf 100644 --- a/gcc/config/arm/crypto.md +++ b/gcc/config/arm/crypto.md @@ -19,33 +19,76 @@ ;; <http://www.gnu.org/licenses/>. -;; When AES/AESMC fusion is enabled we want the register allocation to -;; look like: -;; AESE Vn, _ -;; AESMC Vn, Vn -;; So prefer to tie operand 1 to operand 0 when fusing. - (define_insn "crypto_<crypto_pattern>" - [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w") - (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 - "register_operand" "0,w")] - CRYPTO_UNARY))] + [(set (match_operand:<crypto_mode> 0 "register_operand" "=w") + (unspec:<crypto_mode> + [(match_operand:<crypto_mode> 1 "register_operand" "w")] + CRYPTO_AESMC))] "TARGET_CRYPTO" "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1" - [(set_attr "type" "<crypto_type>") - (set_attr_alternative "enabled" - [(if_then_else (match_test - "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)") - (const_string "yes" ) - (const_string "no")) - (const_string "yes")])] + [(set_attr "type" "<crypto_type>")] +) + +(define_insn "crypto_<crypto_pattern>" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + CRYPTO_AES))] + "TARGET_CRYPTO" + "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2" + [(set_attr "type" "<crypto_type>")] +) + +;; When AESE/AESMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. + +(define_insn "*aarch32_crypto_aese_fused" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESE)] + UNSPEC_AESMC))] + "TARGET_CRYPTO + && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)" + "aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] +) + +;; When AESD/AESIMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. 
+ +(define_insn "*aarch32_crypto_aesd_fused" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESD)] + UNSPEC_AESIMC))] + "TARGET_CRYPTO + && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)" + "aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] ) (define_insn "crypto_<crypto_pattern>" [(set (match_operand:<crypto_mode> 0 "register_operand" "=w") - (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 "register_operand" "0") - (match_operand:<crypto_mode> 2 "register_operand" "w")] - CRYPTO_BINARY))] + (unspec:<crypto_mode> + [(match_operand:<crypto_mode> 1 "register_operand" "0") + (match_operand:<crypto_mode> 2 "register_operand" "w")] + CRYPTO_BINARY))] "TARGET_CRYPTO" "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2" [(set_attr "type" "<crypto_type>")] diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md index 3d04a52ac3d550ca97497cd8ebf058456614881c..150ac85ebc7dad0c0028a35a8851bef66a997642 100644 --- a/gcc/config/arm/exynos-m1.md +++ b/gcc/config/arm/exynos-m1.md @@ -950,11 +950,6 @@ "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\ exynos_m1_crypto_poly*") -;; AES{D,E}/AESMC pairs can feed each other instantly. -(define_bypass 0 "exynos_m1_crypto_simple" - "exynos_m1_crypto_simple" - "aarch_crypto_can_dual_issue") - ;; Predicted branches take no time, but mispredicted ones take forever anyway. (define_bypass 1 "exynos_m1_*" "exynos_m1_call, exynos_m1_branch") diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index c33e572c3e89c3dc5848bd6b825d618481247558..24d9583deff17e74aadf182d5cd7e1659c02c6dc 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -411,10 +411,11 @@ (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW]) -(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC]) +(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC]) -(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE - UNSPEC_SHA1SU1 UNSPEC_SHA256SU0]) +(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE]) + +(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0]) (define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H UNSPEC_SHA256H2 UNSPEC_SHA256SU1]) diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-1.c b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c new file mode 100644 index 0000000000000000000000000000000000000000..27b08aeef7ba7c9fc8b5cbebcbcbf576ca88f064 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */ + +#include <arm_neon.h> + +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); +#define AESMC(r, i) (r = vaesmcq_u8 (i)) + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t dummy; +uint8x16_t a; +uint8x16_t b; +uint8x16_t c; +uint8x16_t d; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) +{ + AESE (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (d, d, k); + 
dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + x = x ^ k; + AESE (x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + y = y ^ k; + AESE (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (x, x); +} + +/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-2.c b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c new file mode 100644 index 0000000000000000000000000000000000000000..1266a287531691f84d80c62d3e1e70915aff9668 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */ + +#include <arm_neon.h> + +#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); +#define AESIMC(r, i) (r = vaesimcq_u8 (i)) + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t dummy; +uint8x16_t a; +uint8x16_t b; +uint8x16_t c; +uint8x16_t d; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) +{ + AESD (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (d, d, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + x = x ^ k; + AESD (x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + y = y ^ k; + AESD (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (x, x); +} + +/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/arm/aes_xor_combine.c b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c new file mode 100644 index 0000000000000000000000000000000000000000..17ae1c53e4a9bcb0bd439fd504ceafda4496d809 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ + +#include <arm_neon.h> + +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); +#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar) +{ + 
dummy = dummy ^ foo; + AESE(dummy, dummy, zero); + dummy = dummy ^ bar; + AESE(dummy, dummy, zero); + + dummy = dummy ^ foo; + AESD(dummy, dummy, zero); + dummy = dummy ^ bar; + AESD(dummy, dummy, zero); + + return dummy; +} + +uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar) +{ + dummy = dummy ^ foo; + AESE(dummy, zero, dummy); + dummy = dummy ^ bar; + AESE(dummy, zero, dummy); + + dummy = dummy ^ foo; + AESD(dummy, zero, dummy); + dummy = dummy ^ bar; + AESD(dummy, zero, dummy); + + return dummy; +} + +/* { dg-final { scan-assembler-not "veor" } } */