/* Store the element-wise product of the first 100 ints of B and C into A.

   The loop is unrolled by hand: each pass handles four consecutive
   elements, so 25 passes cover indices 0..99.  All three pointers are
   restrict-qualified; because A is written through, it must not overlap
   the memory read through B or C.  */
void
foo (int * restrict a, int * restrict b, int * restrict c)
{
  int base = 0;

  while (base < 100)
    {
      /* Four independent load/load/multiply/store groups per pass.  */
      a[base]     = b[base]     * c[base];
      a[base + 1] = b[base + 1] * c[base + 1];
      a[base + 2] = b[base + 2] * c[base + 2];
      a[base + 3] = b[base + 3] * c[base + 3];

      base += 4;
    }
}
The trunk x86-64 compiler (162821) produces code in which the later load instructions are not scheduled ahead of the preceding store instructions, as would be expected. Clearly, the restrict qualifier is not being used here.

~/work/install-x86/bin/gcc tst3.c -O2 -S -std=c99 -da -fschedule-insns -frename-registers

.L2:
        movl    (%rdx,%rax), %r10d
        imull   (%rsi,%rax), %r10d
        movl    %r10d, (%rdi,%rax)
        movl    4(%rdx,%rax), %r9d
        imull   4(%rsi,%rax), %r9d
        movl    %r9d, 4(%rdi,%rax)
        movl    8(%rdx,%rax), %r8d
        imull   8(%rsi,%rax), %r8d
        movl    %r8d, 8(%rdi,%rax)
        movl    12(%rdx,%rax), %ecx
        imull   12(%rsi,%rax), %ecx
        movl    %ecx, 12(%rdi,%rax)
        addq    $16, %rax
        cmpq    $400, %rax

Richard has a patch, and it seems to work for this example.

Index: expr.c
===================================================================
--- expr.c (revision 162841)
+++ expr.c (working copy)
@@ -8665,7 +8665,7 @@ expand_expr_real_1 (tree exp, rtx target
       set_mem_addr_space (temp, as);
       base = get_base_address (TMR_ORIGINAL (exp));
       if (base
-          && INDIRECT_REF_P (base)
+          && (INDIRECT_REF_P (base) || TREE_CODE (base) == MEM_REF)
           && TMR_BASE (exp)
           && TREE_CODE (TMR_BASE (exp)) == SSA_NAME
           && POINTER_TYPE_P (TREE_TYPE (TMR_BASE (exp))))

The code generated with the patch applied:

.L2:
        movl    (%rdx,%rax), %r10d
        movl    4(%rdx,%rax), %r9d
        imull   (%rsi,%rax), %r10d
        imull   4(%rsi,%rax), %r9d
        movl    8(%rdx,%rax), %r8d
        movl    12(%rdx,%rax), %ecx
        imull   8(%rsi,%rax), %r8d
        imull   12(%rsi,%rax), %ecx
        movl    %r10d, (%rdi,%rax)
        movl    %r9d, 4(%rdi,%rax)
        movl    %r8d, 8(%rdi,%rax)
        movl    %ecx, 12(%rdi,%rax)
        addq    $16, %rax
        cmpq    $400, %rax
        jne     .L2

--
           Summary: restrict qualifier is not used in a manually unrolled loop
           Product: gcc
           Version: 4.6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: bmei at broadcom dot com

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45176