Hi Guys, I am applying the patch below to fix a couple of performance regressions for the RX toolchain on the 4.6 branch (when compared to the 4.5 branch). These include aligning jumps, loops and labels, and combining extending loads and simple arithmetic operations.
Cheers Nick gcc/ChangeLog 2011-03-29 Nick Clifton <ni...@redhat.com> * config/rx/rx.h (LABEL_ALIGN_AFTER_BARRIER): Define. (ASM_OUTPUT_MAX_SKIP): Define. * config/rx/predicates.md (rx_zs_comparison_operator): Do not allow LT aor GE comparisons. * config/rx/rx-protos.h (rx_align_for_label): Prototype. * config/rx/rx.md: Add peepholes and patterns to combine extending loads with simple arithmetic instructions. * config/rx/rx.c (rx_is_legitimate_address): Allow QI and HI modes to use pre-decrement and post-increment addressing. (rx_is_restricted_memory_address): For REG+INT addressing, ensure that the INT is a valid offset. (rx_print_operand): Handle %R. Fix %Q's handling of MEMs. (rx_option_override): Set alignments. (rx_align_for_label): New function. (rx_max_skip_for_label): New function. (TARGET_ASM_JUMP_ALIGN_MAX_SKIP): Define. (TARGET_ASM_LOOP_ALIGN_MAX_SKIP): Define. (TARGET_ASM_LABEL_ALIGN_MAX_SKIP): Define. (TARGET_ASM_LABEL_ALIGN_AFTER_BARRIER_MAX_SKIP): Define. Index: gcc/config/rx/rx.h =================================================================== --- gcc/config/rx/rx.h (revision 171651) +++ gcc/config/rx/rx.h (working copy) @@ -615,4 +615,23 @@ #define BRANCH_COST(SPEED,PREDICT) 1 #define REGISTER_MOVE_COST(MODE,FROM,TO) 2 -#define SELECT_CC_MODE(OP,X,Y) rx_select_cc_mode(OP, X, Y) +#define SELECT_CC_MODE(OP,X,Y) rx_select_cc_mode((OP), (X), (Y)) + +#define LABEL_ALIGN_AFTER_BARRIER(x) rx_align_for_label () + +#define ASM_OUTPUT_MAX_SKIP_ALIGN(STREAM, LOG, MAX_SKIP) \ + do \ + { \ + if ((LOG) == 0 || (MAX_SKIP) == 0) \ + break; \ + if (TARGET_AS100_SYNTAX) \ + { \ + if ((LOG) >= 2) \ + fprintf (STREAM, "\t.ALIGN 4\t; %d alignment actually requested\n", 1 << (LOG)); \ + else \ + fprintf (STREAM, "\t.ALIGN 2\n"); \ + } \ + else \ + fprintf (STREAM, "\t.balign %d,3,%d\n", 1 << (LOG), (MAX_SKIP)); \ + } \ + while (0) Index: gcc/config/rx/predicates.md =================================================================== --- gcc/config/rx/predicates.md (revision 171651) +++ gcc/config/rx/predicates.md (working copy) @@ -284,7 +284,7 @@ ) (define_predicate "rx_zs_comparison_operator" - (match_code "eq,ne,lt,ge") + (match_code "eq,ne") ) ;; GT and LE omitted due to operand swap required. Index: gcc/config/rx/rx-protos.h =================================================================== --- gcc/config/rx/rx-protos.h (revision 171651) +++ gcc/config/rx/rx-protos.h (working copy) @@ -30,16 +30,17 @@ extern int rx_initial_elimination_offset (int, int); #ifdef RTX_CODE +extern int rx_align_for_label (void); extern void rx_emit_stack_popm (rtx *, bool); extern void rx_emit_stack_pushm (rtx *); extern void rx_expand_epilogue (bool); extern char * rx_gen_move_template (rtx *, bool); extern bool rx_is_legitimate_constant (rtx); extern bool rx_is_restricted_memory_address (rtx, Mmode); +extern bool rx_match_ccmode (rtx, Mmode); extern void rx_notice_update_cc (rtx body, rtx insn); extern void rx_split_cbranch (Mmode, Rcode, rtx, rtx, rtx); extern Mmode rx_select_cc_mode (Rcode, rtx, rtx); -extern bool rx_match_ccmode (rtx, Mmode); #endif #endif /* GCC_RX_PROTOS_H */ Index: gcc/config/rx/rx.md =================================================================== --- gcc/config/rx/rx.md (revision 171651) +++ gcc/config/rx/rx.md (working copy) @@ -1545,6 +1545,139 @@ (set_attr "length" "3,4,5,6,7,6")] ) +;; A set of peepholes to catch extending loads followed by arithmetic operations. +;; We use iterators where possible to reduce the amount of typing and hence the +;; possibilities for typos. + +(define_code_iterator extend_types [(zero_extend "") (sign_extend "")]) +(define_code_attr letter [(zero_extend "R") (sign_extend "Q")]) + +(define_code_iterator memex_commutative [(plus "") (and "") (ior "") (xor "")]) +(define_code_iterator memex_noncomm [(div "") (udiv "") (minus "")]) +(define_code_iterator memex_nocc [(smax "") (smin "") (mult "")]) + +(define_code_attr op [(plus "add") (and "and") (div "div") (udiv "divu") (smax "max") (smin "min") (mult "mul") (ior "or") (minus "sub") (xor "xor")]) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (parallel [(set (match_operand:SI 2 "register_operand") + (memex_commutative:SI (match_dup 0) + (match_dup 2))) + (clobber (reg:CC CC_REG))])] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(parallel [(set:SI (match_dup 2) + (memex_commutative:SI (match_dup 2) + (extend_types:SI (match_dup 1)))) + (clobber (reg:CC CC_REG))])] +) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (parallel [(set (match_operand:SI 2 "register_operand") + (memex_commutative:SI (match_dup 2) + (match_dup 0))) + (clobber (reg:CC CC_REG))])] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(parallel [(set:SI (match_dup 2) + (memex_commutative:SI (match_dup 2) + (extend_types:SI (match_dup 1)))) + (clobber (reg:CC CC_REG))])] +) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (parallel [(set (match_operand:SI 2 "register_operand") + (memex_noncomm:SI (match_dup 2) + (match_dup 0))) + (clobber (reg:CC CC_REG))])] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(parallel [(set:SI (match_dup 2) + (memex_noncomm:SI (match_dup 2) + (extend_types:SI (match_dup 1)))) + (clobber (reg:CC CC_REG))])] +) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (set (match_operand:SI 2 "register_operand") + (memex_nocc:SI (match_dup 0) + (match_dup 2)))] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(set:SI (match_dup 2) + (memex_nocc:SI (match_dup 2) + (extend_types:SI (match_dup 1))))] +) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (set (match_operand:SI 2 "register_operand") + (memex_nocc:SI (match_dup 2) + (match_dup 0)))] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(set:SI (match_dup 2) + (memex_nocc:SI (match_dup 2) + (extend_types:SI (match_dup 1))))] +) + +(define_insn "*<memex_commutative:code>si3_<extend_types:code><small_int_modes:mode>" + [(set (match_operand:SI 0 "register_operand" "=r") + (memex_commutative:SI (match_operand:SI 1 "register_operand" "%0") + (extend_types:SI (match_operand:small_int_modes 2 "rx_restricted_mem_operand" "Q")))) + (clobber (reg:CC CC_REG))] + "" + "<memex_commutative:op>\t%<extend_types:letter>2, %0" + [(set_attr "timings" "33") + (set_attr "length" "5")] ;; Worst case sceanario. FIXME: If we defined separate patterns +) ;; rather than using iterators we could specify exact sizes. + +(define_insn "*<memex_noncomm:code>si3_<extend_types:code><small_int_modes:mode>" + [(set (match_operand:SI 0 "register_operand" "=r") + (memex_noncomm:SI (match_operand:SI 1 "register_operand" "0") + (extend_types:SI (match_operand:small_int_modes 2 "rx_restricted_mem_operand" "Q")))) + (clobber (reg:CC CC_REG))] + "" + "<memex_noncomm:op>\t%<extend_types:letter>2, %0" + [(set_attr "timings" "33") + (set_attr "length" "5")] ;; Worst case sceanario. FIXME: If we defined separate patterns +) ;; rather than using iterators we could specify exact sizes. + +(define_insn "*<memex_nocc:code>si3_<extend_types:code><small_int_modes:mode>" + [(set (match_operand:SI 0 "register_operand" "=r") + (memex_nocc:SI (match_operand:SI 1 "register_operand" "%0") + (extend_types:SI (match_operand:small_int_modes 2 "rx_restricted_mem_operand" "Q"))))] + "" + "<memex_nocc:op>\t%<extend_types:letter>2, %0" + [(set_attr "timings" "33") + (set_attr "length" "5")] ;; Worst case sceanario. FIXME: If we defined separate patterns +) ;; rather than using iterators we could specify exact sizes. + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand"))) + (set (reg:CC CC_REG) + (compare:CC (match_operand:SI 2 "register_operand") + (match_dup 0)))] + "peep2_regno_dead_p (2, REGNO (operands[0]))" + [(set (reg:CC CC_REG) + (compare:CC (match_dup 2) + (extend_types:SI (match_dup 1))))] +) + +(define_insn "*comparesi3_<extend_types:code><small_int_modes:mode>" + [(set (reg:CC CC_REG) + (compare:CC (match_operand:SI 0 "register_operand" "=r") + (extend_types:SI (match_operand:small_int_modes 1 "rx_restricted_mem_operand" "Q"))))] + "" + "cmp\t%<extend_types:letter>1, %0" + [(set_attr "timings" "33") + (set_attr "length" "5")] ;; Worst case sceanario. FIXME: If we defined separate patterns +) ;; rather than using iterators we could specify exact sizes. + ;; Floating Point Instructions (define_insn "addsf3" Index: gcc/config/rx/rx.c =================================================================== --- gcc/config/rx/rx.c (revision 171651) +++ gcc/config/rx/rx.c (working copy) @@ -57,7 +57,7 @@ #define CC_FLAG_Z (1 << 1) #define CC_FLAG_O (1 << 2) #define CC_FLAG_C (1 << 3) -#define CC_FLAG_FP (1 << 4) /* fake, to differentiate CC_Fmode */ +#define CC_FLAG_FP (1 << 4) /* Fake, to differentiate CC_Fmode. */ static unsigned int flags_from_mode (enum machine_mode mode); static unsigned int flags_from_code (enum rtx_code code); @@ -85,7 +85,9 @@ /* Register Indirect. */ return true; - if (GET_MODE_SIZE (mode) == 4 + if ((GET_MODE_SIZE (mode) == 4 + || GET_MODE_SIZE (mode) == 2 + || GET_MODE_SIZE (mode) == 1) && (GET_CODE (x) == PRE_DEC || GET_CODE (x) == POST_INC)) /* Pre-decrement Register Indirect or Post-increment Register Indirect. */ @@ -187,8 +189,11 @@ base = XEXP (mem, 0); index = XEXP (mem, 1); - return RX_REG_P (base) && CONST_INT_P (index); + if (! RX_REG_P (base) || ! CONST_INT_P (index)) + return false; + return IN_RANGE (INTVAL (index), 0, (0x10000 * GET_MODE_SIZE (mode)) - 1); + case SYMBOL_REF: /* Can happen when small data is being supported. Assume that it will be resolved into GP+INT. */ @@ -386,11 +391,14 @@ %L Print low part of a DImode register, integer or address. %N Print the negation of the immediate value. %Q If the operand is a MEM, then correctly generate - register indirect or register relative addressing. */ + register indirect or register relative addressing. + %R Like %Q but for zero-extending loads. */ static void rx_print_operand (FILE * file, rtx op, int letter) { + bool unsigned_load = false; + switch (letter) { case 'A': @@ -450,6 +458,7 @@ else { unsigned int flags = flags_from_mode (mode); + switch (code) { case LT: @@ -588,10 +597,15 @@ rx_print_integer (file, - INTVAL (op)); break; + case 'R': + gcc_assert (GET_MODE_SIZE (GET_MODE (op)) < 4); + unsigned_load = true; + /* Fall through. */ case 'Q': if (MEM_P (op)) { HOST_WIDE_INT offset; + rtx mem = op; op = XEXP (op, 0); @@ -626,22 +640,24 @@ rx_print_operand (file, op, 0); fprintf (file, "]."); - switch (GET_MODE_SIZE (GET_MODE (op))) + switch (GET_MODE_SIZE (GET_MODE (mem))) { case 1: - gcc_assert (offset < 65535 * 1); - fprintf (file, "B"); + gcc_assert (offset <= 65535 * 1); + fprintf (file, unsigned_load ? "UB" : "B"); break; case 2: gcc_assert (offset % 2 == 0); - gcc_assert (offset < 65535 * 2); - fprintf (file, "W"); + gcc_assert (offset <= 65535 * 2); + fprintf (file, unsigned_load ? "UW" : "W"); break; - default: + case 4: gcc_assert (offset % 4 == 0); - gcc_assert (offset < 65535 * 4); + gcc_assert (offset <= 65535 * 4); fprintf (file, "L"); break; + default: + gcc_unreachable (); } break; } @@ -2336,6 +2352,13 @@ flag_strict_volatile_bitfields = 1; rx_override_options_after_change (); + + if (align_jumps == 0 && ! optimize_size) + align_jumps = 3; + if (align_loops == 0 && ! optimize_size) + align_loops = 3; + if (align_labels == 0 && ! optimize_size) + align_labels = 3; } /* Implement TARGET_OPTION_OPTIMIZATION_TABLE. */ @@ -2728,6 +2751,45 @@ } +int +rx_align_for_label (void) +{ + return optimize_size ? 1 : 3; +} + +static int +rx_max_skip_for_label (rtx lab) +{ + int opsize; + rtx op; + + if (lab == NULL_RTX) + return 0; + op = lab; + do + { + op = next_nonnote_insn (op); + } + while (op && (LABEL_P (op) + || (INSN_P (op) && GET_CODE (PATTERN (op)) == USE))); + if (!op) + return 0; + + opsize = get_attr_length (op); + if (opsize >= 0 && opsize < 8) + return opsize - 1; + return 0; +} + +#undef TARGET_ASM_JUMP_ALIGN_MAX_SKIP +#define TARGET_ASM_JUMP_ALIGN_MAX_SKIP rx_max_skip_for_label +#undef TARGET_ASM_LOOP_ALIGN_MAX_SKIP +#define TARGET_ASM_LOOP_ALIGN_MAX_SKIP rx_max_skip_for_label +#undef TARGET_LABEL_ALIGN_AFTER_BARRIER_MAX_SKIP +#define TARGET_LABEL_ALIGN_AFTER_BARRIER_MAX_SKIP rx_max_skip_for_label +#undef TARGET_ASM_LABEL_ALIGN_MAX_SKIP +#define TARGET_ASM_LABEL_ALIGN_MAX_SKIP rx_max_skip_for_label + #undef TARGET_FUNCTION_VALUE #define TARGET_FUNCTION_VALUE rx_function_value