Hi All,

This updates the cost model for Neoverse V2 to reflect the updated
Software Optimization Guide.

It also makes Cortex-X3 use the Neoverse V2 cost model.  (A short
worked example of the updated reduction-cost arithmetic follows the
patch.)

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-cores.def (cortex-x3): Use Neoverse-V2 costs.
	* config/aarch64/tuning_models/neoversev2.h: Update costs.

---
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index e58bc0f27de3d60d39c02d2be2aa15570bd5db4d..34307fe0c1721dda67adab768dd22a5649687f6e 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -186,7 +186,7 @@ AARCH64_CORE("cortex-a720", cortexa720, cortexa57, V9_2A, (SVE2_BITPERM, MEMTA
 AARCH64_CORE("cortex-x2", cortexx2, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd48, -1)
 
-AARCH64_CORE("cortex-x3", cortexx3, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd4e, -1)
+AARCH64_CORE("cortex-x3", cortexx3, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversev2, 0x41, 0xd4e, -1)
 
 AARCH64_CORE("cortex-x4", cortexx4, cortexa57, V9_2A, (SVE2_BITPERM, MEMTAG, PROFILE), neoversen2, 0x41, 0xd81, -1)
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index f76e4ef358f7dfb9c7d7b470ea7240eaa2120f8e..cca459e32c1384f57f8345d86b42b7814ae44115 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -57,13 +57,13 @@ static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
   2, /* ld2_st2_permute_cost */
   2, /* ld3_st3_permute_cost */
   3, /* ld4_st4_permute_cost */
-  3, /* permute_cost */
+  2, /* permute_cost */
   4, /* reduc_i8_cost */
   4, /* reduc_i16_cost */
   2, /* reduc_i32_cost */
   2, /* reduc_i64_cost */
   6, /* reduc_f16_cost */
-  3, /* reduc_f32_cost */
+  4, /* reduc_f32_cost */
   2, /* reduc_f64_cost */
   2, /* store_elt_extra_cost */
   /* This value is just inherited from the Cortex-A57 table.  */
@@ -86,28 +86,28 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
 {
   2, /* int_stmt_cost */
   2, /* fp_stmt_cost */
-  3, /* ld2_st2_permute_cost */
+  2, /* ld2_st2_permute_cost */
   3, /* ld3_st3_permute_cost */
-  4, /* ld4_st4_permute_cost */
-  3, /* permute_cost */
+  3, /* ld4_st4_permute_cost */
+  2, /* permute_cost */
   /* Theoretically, a reduction involving 15 scalar ADDs could
-     complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
-     completes in 11 cycles, so give it a cost of 15 + 8.  */
-  21, /* reduc_i8_cost */
-  /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
-  14, /* reduc_i16_cost */
-  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
+     complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
+     completes in 9 cycles, so give it a cost of 15 + 4.  */
+  19, /* reduc_i8_cost */
+  /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
+  12, /* reduc_i16_cost */
+  /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
   7, /* reduc_i32_cost */
-  /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-  2, /* reduc_i64_cost */
+  /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
+  4, /* reduc_i64_cost */
   /* Theoretically, a reduction involving 7 scalar FADDs could
-     complete in ~6 cycles and would have a cost of 14.  FADDV
-     completes in 8 cycles, so give it a cost of 14 + 2.  */
-  16, /* reduc_f16_cost */
-  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
-  8, /* reduc_f32_cost */
-  /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
-  4, /* reduc_f64_cost */
+     complete in ~6 cycles and would have a cost of 7.  FADDV
+     completes in 8 cycles, so give it a cost of 7 + 2.  */
+  9, /* reduc_f16_cost */
+  /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 3 + 2.  */
+  5, /* reduc_f32_cost */
+  /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 1 + 2.  */
+  3, /* reduc_f64_cost */
   2, /* store_elt_extra_cost */
   /* This value is just inherited from the Cortex-A57 table.  */
   8, /* vec_to_scalar_cost */
@@ -127,7 +127,7 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
   /* A strided Advanced SIMD x64 load would take two parallel FP loads
      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
      (cost 2) to that, to avoid the difference being lost in rounding.
 
      There is no easy comparison between a strided Advanced SIMD x32 load
@@ -165,14 +165,14 @@ static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
 {
   {
     {
-      3, /* loads_per_cycle */
+      3, /* loads_stores_per_cycle */
       2, /* stores_per_cycle */
       4, /* general_ops_per_cycle */
       0, /* fp_simd_load_general_ops */
       1 /* fp_simd_store_general_ops */
     },
     2, /* ld2_st2_general_ops */
-    3, /* ld3_st3_general_ops */
+    2, /* ld3_st3_general_ops */
     3 /* ld4_st4_general_ops */
   },
   2, /* pred_ops_per_cycle */
@@ -190,7 +190,7 @@ static const aarch64_vec_issue_info neoversev2_vec_issue_info =
   &neoversev2_sve_issue_info
 };
 
-/* Demeter costs for vector insn classes.  */
+/* Neoversev2 costs for vector insn classes.  */
 static const struct cpu_vector_cost neoversev2_vector_cost =
 {
   1, /* scalar_int_stmt_cost */
@@ -243,4 +243,4 @@ static const struct tune_params neoversev2_tunings =
   AARCH64_LDP_STP_POLICY_ALWAYS	/* stp_policy_model.  */
 };
 
-#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
+#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
\ No newline at end of file
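As a sanity check on the new reduc_* values: each updated comment
applies the same arithmetic, costing a reduction as the equivalent
scalar chain (one cost unit per ADD/FADD) plus the number of cycles
by which the vector reduction is slower than that chain.  Below is a
minimal standalone sketch of that arithmetic (illustration only, not
part of the patch; reduc_cost is a made-up helper, not a GCC
function):

#include <stdio.h>

/* Cost a reduction as the cost of the equivalent scalar chain (one
   unit per ADD/FADD) plus the extra latency of the vector reduction
   over that chain.  */
static int
reduc_cost (int scalar_ops, int scalar_cycles, int vec_cycles)
{
  return scalar_ops + (vec_cycles - scalar_cycles);
}

int
main (void)
{
  /* 15 scalar ADDs in ~5 cycles vs. [SU]ADDV in 9: 15 + 4.  */
  printf ("reduc_i8_cost  = %d\n", reduc_cost (15, 5, 9)); /* 19 */
  /* 7 scalar ADDs in ~3 cycles vs. 8: 7 + 5.  */
  printf ("reduc_i16_cost = %d\n", reduc_cost (7, 3, 8));  /* 12 */
  /* 3 scalar ADDs in ~2 cycles vs. 6: 3 + 4.  */
  printf ("reduc_i32_cost = %d\n", reduc_cost (3, 2, 6));  /* 7 */
  /* 1 scalar ADD in ~1 cycle vs. 4: 1 + 3.  */
  printf ("reduc_i64_cost = %d\n", reduc_cost (1, 1, 4));  /* 4 */
  /* 7 scalar FADDs in ~6 cycles vs. FADDV in 8: 7 + 2.  */
  printf ("reduc_f16_cost = %d\n", reduc_cost (7, 6, 8));  /* 9 */
  /* 3 scalar FADDs in ~4 cycles vs. 6: 3 + 2.  */
  printf ("reduc_f32_cost = %d\n", reduc_cost (3, 4, 6));  /* 5 */
  /* 1 scalar FADD in ~2 cycles vs. 4: 1 + 2.  */
  printf ("reduc_f64_cost = %d\n", reduc_cost (1, 2, 4));  /* 3 */
  return 0;
}

Running this prints the seven values now in the SVE cost table
(19, 12, 7, 4, 9, 5 and 3).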