On Fri, Jan 10, 2014 at 2:17 PM, Paulo Matos <pma...@broadcom.com> wrote:
> Hello,
>
> I have been investigating a few loops and while looking at:
> void matrix_mul_vect(unsigned int N, int *C, short *A, short *B) {
>  unsigned int i,j;
>  for (i=0; i<N; i++) {
>   C[i]=0;
>   for (j=0; j<N; j++) {
>    C[i]+=A[i*N+j] * B[j];
>   }
>  }
> }
>
> I noticed GCC is generating worst code now (trunk and 4.8) than in 4.5.
> Using x86_64 as an example (compiled with O2 and gcc 4.8):
> matrix_mul_vect:
> .LFB0:
>         .cfi_startproc
>         pushq   %rbp
>         .cfi_def_cfa_offset 16
>         .cfi_offset 6, -16
>         xorl    %eax, %eax
>         xorl    %ebp, %ebp
>         testl   %edi, %edi
>         pushq   %rbx
>         .cfi_def_cfa_offset 24
>         .cfi_offset 3, -24
>         je      .L1
>         .p2align 4,,10
>         .p2align 3
> .L7:
>         movq    %rcx, %r9
>         leal    (%rax,%rdi), %ebx
>         xorl    %r10d, %r10d
>         jmp     .L4
>         .p2align 4,,10
>         .p2align 3
> .L10:
>         movl    %r8d, %r10d
> .L4:
>         movl    %eax, %r8d
>         movswl  (%r9), %r11d
>         addl    $1, %eax
>         movswl  (%rdx,%r8,2), %r8d
>         addq    $2, %r9
>         imull   %r11d, %r8d
>         addl    %r10d, %r8d
>         cmpl    %ebx, %eax
>         jne     .L10
>         movl    %r8d, (%rsi,%rbp,4)
>         addq    $1, %rbp
>         cmpl    %ebp, %edi
>         ja      .L7
> .L1:
>         popq    %rbx
>         .cfi_def_cfa_offset 16
>         popq    %rbp
>         .cfi_def_cfa_offset 8
>         ret
>         .cfi_endproc
> .LFE0:
>
>
> For 4.5:
> matrix_mul_vect:
> .LFB0:
>         .cfi_startproc
>         pushq   %r13
>         .cfi_def_cfa_offset 16
>         testl   %edi, %edi
>         pushq   %r12
>         .cfi_def_cfa_offset 24
>         pushq   %rbp
>         .cfi_def_cfa_offset 32
>         pushq   %rbx
>         .cfi_def_cfa_offset 40
>         je      .L1
>         .cfi_offset 3, -40
>         .cfi_offset 6, -32
>         .cfi_offset 12, -24
>         .cfi_offset 13, -16
>         leal    -1(%rdi), %ebx
>         xorl    %r12d, %r12d
>         xorl    %ebp, %ebp
>         addq    $1, %rbx
>         leaq    0(,%rbx,4), %r13
>         addq    %rbx, %rbx
>         .p2align 4,,10
>         .p2align 3
> .L4:
>         movl    $0, (%rsi,%rbp)
>         movl    %r12d, %r9d
>         xorl    %eax, %eax
>         xorl    %r10d, %r10d
>         .p2align 4,,10
>         .p2align 3
> .L3:
>         mov     %r9d, %r8d
>         movswl  (%rcx,%rax), %r11d
>         addq    $2, %rax
>         movswl  (%rdx,%r8,2), %r8d
>         addl    $1, %r9d
>         imull   %r11d, %r8d
>         addl    %r8d, %r10d
>         cmpq    %rbx, %rax
>         jne     .L3
>         movl    %r10d, (%rsi,%rbp)
>         addq    $4, %rbp
>         addl    %edi, %r12d
>         cmpq    %r13, %rbp
>         jne     .L4
> .L1:
>         popq    %rbx
>         .cfi_def_cfa_offset 32
>         popq    %rbp
>         .cfi_def_cfa_offset 24
>         popq    %r12
>         .cfi_def_cfa_offset 16
>         popq    %r13
>         .cfi_def_cfa_offset 8
>         ret
>         .cfi_endproc
> .LFE0:
>
>
> The problem seems to be, at least partially, due to the creation of an extra 
> basic block that ends up being a loop latch with a useless insn. This can be 
> seen in the code output by gcc 4.8, label .L10.
> Before register allocation blocks 4 and 5 look like (gcc 4.8):
> ;; basic block 4, loop depth 2, count 0, freq 9100, maybe hot
> ;;  prev block 3, next block 5, flags: (RTL, MODIFIED)
> ;;  pred:       3 [100.0%]  (FALLTHRU)
> ;;              5 [100.0%]  (DFS_BACK)
> ;; bb 4 artificial_defs: { }
> ;; bb 4 artificial_uses: { u18(6){ }u19(7){ }u20(16){ }u21(20){ }}
> ;; lr  in        6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 97 98 101 102 103 
> 104 105
> ;; lr  use       6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 97 98 101 104
> ;; lr  def       17 [flags] 85 96 97 106 108 109
> (code_label 61 36 50 4 4 "" [1 uses])
> (note 50 61 51 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
> (insn 51 50 52 4 (set (reg:DI 106 [ D.3277 ])
>         (zero_extend:DI (reg:SI 85 [ ivtmp.7 ]))) coremark.c:142 139 
> {*zero_extendsidi2_rex64}
>      (nil))
> (insn 52 51 53 4 (set (reg:SI 109 [ D.3280 ])
>         (sign_extend:SI (mem:HI (plus:DI (mult:DI (reg:DI 106 [ D.3277 ])
>                         (const_int 2 [0x2]))
>                     (reg/v/f:DI 104 [ A ])) [4 *_18+0 S2 A16]))) 
> coremark.c:142 153 {extendhisi2}
>      (expr_list:REG_DEAD (reg:DI 106 [ D.3277 ])
>         (nil)))
> (insn 53 52 54 4 (set (reg:SI 108 [ D.3280 ])
>         (sign_extend:SI (mem:HI (reg:DI 97 [ ivtmp.8 ]) [4 MEM[base: _63, 
> offset: 0B]+0 S2 A16]))) coremark.c:142 153 {extendhisi2}
>      (nil))
> (insn 54 53 55 4 (parallel [
>             (set (reg:SI 109 [ D.3280 ])
>                 (mult:SI (reg:SI 109 [ D.3280 ])
>                     (reg:SI 108 [ D.3280 ])))
>             (clobber (reg:CC 17 flags))
>         ]) coremark.c:142 353 {*mulsi3_1}
>      (expr_list:REG_DEAD (reg:SI 108 [ D.3280 ])
>         (expr_list:REG_UNUSED (reg:CC 17 flags)
>             (nil))))
> (insn 55 54 56 4 (parallel [
>             (set (reg:SI 96 [ D.3280 ])
>                 (plus:SI (reg:SI 109 [ D.3280 ])
>                     (reg:SI 101 [ D.3283 ])))
>             (clobber (reg:CC 17 flags))
>         ]) coremark.c:142 273 {*addsi_1}
>      (expr_list:REG_DEAD (reg:SI 109 [ D.3280 ])
>         (expr_list:REG_DEAD (reg:SI 101 [ D.3283 ])
>             (expr_list:REG_UNUSED (reg:CC 17 flags)
>                 (nil)))))
> (insn 56 55 57 4 (parallel [
>             (set (reg:SI 85 [ ivtmp.7 ])
>                 (plus:SI (reg:SI 85 [ ivtmp.7 ])
>                     (const_int 1 [0x1])))
>             (clobber (reg:CC 17 flags))
>         ]) 273 {*addsi_1}
>      (expr_list:REG_UNUSED (reg:CC 17 flags)
>         (nil)))
> (insn 57 56 58 4 (parallel [
>             (set (reg:DI 97 [ ivtmp.8 ])
>                 (plus:DI (reg:DI 97 [ ivtmp.8 ])
>                     (const_int 2 [0x2])))
>             (clobber (reg:CC 17 flags))
>         ]) 274 {*adddi_1}
>      (expr_list:REG_UNUSED (reg:CC 17 flags)
>         (nil)))
> (insn 58 57 59 4 (set (reg:CCZ 17 flags)
>         (compare:CCZ (reg:SI 85 [ ivtmp.7 ])
>             (reg:SI 98 [ D.3281 ]))) coremark.c:141 7 {*cmpsi_1}
>      (nil))
> (jump_insn 59 58 60 4 (set (pc)
>         (if_then_else (eq (reg:CCZ 17 flags)
>                 (const_int 0 [0]))
>             (label_ref 64)
>             (pc))) coremark.c:141 621 {*jcc_1}
>      (expr_list:REG_DEAD (reg:CCZ 17 flags)
>         (expr_list:REG_BR_PROB (const_int 900 [0x384])
>             (nil)))
>  -> 64)
> ;;  succ:       5 [91.0%]  (FALLTHRU)
> ;;              6 [9.0%]  (LOOP_EXIT)
> ;; lr  out       6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 96 97 98 102 103 
> 104 105
>
> ;; basic block 5, loop depth 2, count 0, freq 8281, maybe hot
> ;;  prev block 4, next block 6, flags: (RTL, MODIFIED)
> ;;  pred:       4 [91.0%]  (FALLTHRU)
> ;; bb 5 artificial_defs: { }
> ;; bb 5 artificial_uses: { u35(6){ }u36(7){ }u37(16){ }u38(20){ }}
> ;; lr  in        6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 96 97 98 102 103 
> 104 105
> ;; lr  use       6 [bp] 7 [sp] 16 [argp] 20 [frame] 96
> ;; lr  def       101
> (note 60 59 34 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
> (insn 34 60 84 5 (set (reg:SI 101 [ D.3283 ])
>         (reg:SI 96 [ D.3280 ])) coremark.c:141 89 {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 96 [ D.3280 ])
>         (nil)))
> (jump_insn 84 34 85 5 (set (pc)
>         (label_ref 61)) 659 {jump}
>      (nil)
>  -> 61)
> ;;  succ:       4 [100.0%]  (DFS_BACK)
> ;; lr  out       6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 97 98 101 102 103 
> 104 105
>
> However, registers 96 and 101 only show up in block 4 here:
> (insn 55 54 56 4 (parallel [
>             (set (reg:SI 96 [ D.3280 ])
>                 (plus:SI (reg:SI 109 [ D.3280 ])
>                     (reg:SI 101 [ D.3283 ])))
>             (clobber (reg:CC 17 flags))
>         ]) coremark.c:142 273 {*addsi_1}
>      (expr_list:REG_DEAD (reg:SI 109 [ D.3280 ])
>         (expr_list:REG_DEAD (reg:SI 101 [ D.3283 ])
>             (expr_list:REG_UNUSED (reg:CC 17 flags)
>                 (nil)))))
>
> I don't really understand why GCC is generating this extra basic block 
> however it looks like a regression. Block 5 could disappear and insn 55 
> rewritten as (replaced reg101 by reg96):
>
>             (set (reg:SI 96 [ D.3280 ])
>                 (plus:SI (reg:SI 109 [ D.3280 ])
>                     (reg:SI 96 [ D.3283 ])))
>
> It is worse on my port due to several technical constraints. For example, 
> zero overhead loops can only be generated if there are no internal branches, 
> which means that any loop without an empty latch can't be transformed into a 
> zero overhead loop, which is the case with this example. So, loops with empty 
> latches are desired as much as possible. Once GCC starts creating code like 
> this performance on my port drops drastically.
>
> What are your thoughts?
>
> Note: I just noticed this doesn't happen in trunk anymore. Any idea of what 
> went into trunk to fix this?

Most likely changes to SSA coalescing at out-of-SSA time like

2013-09-26  Richard Biener  <rguent...@suse.de>

        * tree-ssa-live.c (var_map_base_init): Handle SSA names with
        DECL_IGNORED_P base VAR_DECLs like anonymous SSA names.
        (loe_visit_block): Use gcc_checking_assert.
        * tree-ssa-coalesce.c (create_outofssa_var_map): Use
        gimple_assign_ssa_name_copy_p.
        (gimple_can_coalesce_p): Adjust according to the var_map_base_init
        change.

and an earlier patch by Jeff Law.

Richard.

> Paulo Matos
>
>

Reply via email to