On 14/10/15 13:30, Wilco Dijkstra wrote:
Enable instruction fusion of dependent AESE; AESMC and AESD; AESIMC pairs.
This can give up to a 2x speedup on many AArch64 implementations. Also model
the crypto instructions on Cortex-A57 according to the Optimization Guide.

Passes regression tests.
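
For reference (not part of the patch), this is the kind of dependent pair
the fusion targets. A minimal sketch using the standard ACLE crypto
intrinsics from arm_neon.h, compiled with something like
-O2 -march=armv8-a+crypto:

  #include <arm_neon.h>

  /* One AES round: the AESMC consumes the AESE result, so the two
     instructions form a dependent pair that can issue fused.  */
  uint8x16_t
  aes_round (uint8x16_t state, uint8x16_t key)
  {
    state = vaeseq_u8 (state, key);   /* AESE  v0.16b, v1.16b  */
    return vaesmcq_u8 (state);        /* AESMC v0.16b, v0.16b  */
  }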

arm-wise this is ok, but I'd like a follow-up patch to enable this fusion
for the arm port as well. It should be fairly simple: add a new enum value
to fuse_ops inside tune_params in arm-protos.h and update the arm
implementation in aarch_macro_fusion_pair_p, similarly to your aarch64
implementation (see the sketch below).
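
A rough, untested sketch of what that arm-side follow-up could look like.
The ARM_FUSE_AES_AESMC value is hypothetical, and the exact field and
macro names (fuse_ops, current_tune) should be checked against the
current arm backend:

  /* arm-protos.h: hypothetical new fuse_ops value in tune_params.  */
  ARM_FUSE_AES_AESMC = (1 << 1),

  /* arm implementation of aarch_macro_fusion_pair_p: mirror the aarch64
     check, reusing aarch_crypto_can_dual_issue from aarch-common.c.  */
  if ((current_tune->fuse_ops & ARM_FUSE_AES_AESMC)
      && aarch_crypto_can_dual_issue (prev, curr))
    return true;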

Thanks,
Kyrill


ChangeLog:
2015-10-14  Wilco Dijkstra  <wdijk...@arm.com>

        * gcc/config/aarch64/aarch64.c (cortexa53_tunings): Add AES fusion.
        (cortexa57_tunings): Likewise.
        (cortexa72_tunings): Likewise.
        (aarch_macro_fusion_pair_p): Add support for AES fusion.
        * gcc/config/aarch64/aarch64-fusion-pairs.def: Add AES_AESMC entry.
        * gcc/config/arm/aarch-common.c (aarch_crypto_can_dual_issue):
        Allow virtual registers before reload so early scheduling works.
        * gcc/config/arm/cortex-a57.md (cortex_a57_crypto_simple): Use
        correct latency and pipeline.
        (cortex_a57_crypto_complex): Likewise.
        (cortex_a57_crypto_xor): Likewise.
        (define_bypass): Add AES bypass.


---
  gcc/config/aarch64/aarch64-fusion-pairs.def |  1 +
  gcc/config/aarch64/aarch64.c                | 10 +++++++---
  gcc/config/arm/aarch-common.c               |  7 +++++--
  gcc/config/arm/cortex-a57.md                | 17 +++++++++++------
  4 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index 53bbef4..fea79fc 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -33,4 +33,5 @@ AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)
  AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
  AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
  AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
+AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 230902d..96368c6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -376,7 +376,7 @@ static const struct tune_params cortexa53_tunings =
    &generic_branch_cost,
    4, /* memmov_cost  */
    2, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
     | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
    8,  /* function_align.  */
    8,  /* jump_align.  */
@@ -398,7 +398,7 @@ static const struct tune_params cortexa57_tunings =
    &generic_branch_cost,
    4, /* memmov_cost  */
    3, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
     | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
    16, /* function_align.  */
    8,  /* jump_align.  */
@@ -420,7 +420,7 @@ static const struct tune_params cortexa72_tunings =
    &generic_branch_cost,
    4, /* memmov_cost  */
    3, /* issue_rate  */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
     | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
    16, /* function_align.  */
    8,  /* jump_align.  */
@@ -12843,6 +12843,10 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
          }
      }
+  if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
+      && aarch_crypto_can_dual_issue (prev, curr))
+    return true;
+
    if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
        && any_condjump_p (curr))
      {
diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 5dd8222..e191ab6 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -63,8 +63,11 @@ aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
    {
      unsigned int regno = REGNO (SET_DEST (producer_set));
-    return REGNO (SET_DEST (consumer_set)) == regno
-           && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
+    /* Before reload the registers are virtual, so the destination of
+       consumer_set doesn't need to match.  */
+
+    return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
+           && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
    }
  return 0;
diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md
index a32c848..eab9d99 100644
--- a/gcc/config/arm/cortex-a57.md
+++ b/gcc/config/arm/cortex-a57.md
@@ -745,20 +745,20 @@
                         neon_fp_sqrt_s_q, neon_fp_sqrt_d_q"))
    "ca57_cx2_block*3")
-(define_insn_reservation "cortex_a57_crypto_simple" 4
+(define_insn_reservation "cortex_a57_crypto_simple" 3
    (and (eq_attr "tune" "cortexa57")
       (eq_attr "type" "crypto_aese,crypto_aesmc,crypto_sha1_fast,crypto_sha256_fast"))
-  "ca57_cx2")
+  "ca57_cx1")
-(define_insn_reservation "cortex_a57_crypto_complex" 7
+(define_insn_reservation "cortex_a57_crypto_complex" 6
    (and (eq_attr "tune" "cortexa57")
         (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow"))
-  "ca57_cx2+(ca57_cx2_issue,ca57_cx2)")
+  "ca57_cx1*2")
-(define_insn_reservation "cortex_a57_crypto_xor" 7
+(define_insn_reservation "cortex_a57_crypto_xor" 6
    (and (eq_attr "tune" "cortexa57")
         (eq_attr "type" "crypto_sha1_xor"))
-  "(ca57_cx1+ca57_cx2)")
+  "(ca57_cx1*2)|(ca57_cx2*2)")
  ;; We lie with calls.  They take up all issue slots, but are otherwise
  ;; not harmful.
@@ -795,3 +795,8 @@
  (define_bypass 1 "cortex_a57_*"
                 "cortex_a57_call,cortex_a57_branch")
+;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
+(define_bypass 0 "cortex_a57_crypto_simple"
+                "cortex_a57_crypto_simple"
+                "aarch_crypto_can_dual_issue")
+
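
As a quick sanity check (again, not part of the patch), extending the
earlier single-round sketch to two chained rounds should show back-to-back
AESE/AESMC pairs in the output, which the define_bypass above forwards
with zero latency; assuming something like -O2 -mcpu=cortex-a57+crypto:

  #include <arm_neon.h>

  /* Two chained rounds: the expectation is that the scheduler keeps
     each AESE adjacent to its dependent AESMC, so the zero-latency
     bypass (guarded by aarch_crypto_can_dual_issue) applies to both
     pairs.  */
  uint8x16_t
  two_rounds (uint8x16_t s, uint8x16_t k0, uint8x16_t k1)
  {
    s = vaesmcq_u8 (vaeseq_u8 (s, k0));
    s = vaesmcq_u8 (vaeseq_u8 (s, k1));
    return s;
  }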
