https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99068

--- Comment #9 from Brian Grayson <brian.grayson at sifive dot com> ---
If I understand correctly, you're saying that gcc sometimes prefers to avoid
update form even when the load and addi are next to each other and update form
could be used, as in the example assembly discussed in this bug? It seems the
situation here is exactly the poster child for when it
*should* generate an lhau -- same register being used, contiguous instructions,
no change in register pressure by using lhau. In fact, it seems a peephole
optimizer would do exactly that type of change.

If it's preferred to not use update form even when one has contiguous
instructions and no register pressure change, then I guess it's a bug that gcc
*does* generate an lhau under -O3 for found_zero(). Or to state it differently,
I don't understand why gcc generates code differently for these two cases
w.r.t. lhau, as they are both the same underlying computation and gcc knows
that:

#include <stdint.h>
// Array of 16-bit elements scanned by both test functions below.
int16_t a[1000];
// Inclusive upper bound used by both loops (i <= N and p <= &a[N]).
int64_t N = 100;
// Index-based scan: returns 1 if any element a[0]..a[N] (inclusive) is zero,
// otherwise 0. This is the variant for which the report says gcc -O3 emits an
// update-form load (lhau).
int found_zero() { // Generates an lhau.
  for (int64_t i = 0; i <= N; i++) { if (a[i] == 0) return 1; }
  return 0;
}
// Pointer-based form of the same scan: walks p from &a[0] through &a[N]
// (inclusive) and returns 1 at the first zero element, otherwise 0. This is
// the variant for which the report says gcc emits a plain lha followed by a
// separate addi rather than an lhau.
int found_zero_ptr() { // Does not generate an lhau.
  for (int16_t* p = &a[0]; p <= &a[N]; p++) { if (*p == 0) return 1; }
  return 0;
}

This emits:

  .file "stub.c"
  .machine power4
  .section  ".text"
  .section  ".toc","aw"
  .align 3
.LC0:
  .quad a-2
  .section  ".text"
  .align 2
  .p2align 4,,15
  .globl found_zero
  .section  ".opd","aw"
  .align 3
found_zero:
  .quad .L.found_zero,.TOC.@tocbase,0
  .previous
  .type found_zero, @function
.L.found_zero:
.LFB0:
  .cfi_startproc
  addis 7,2,.LANCHOR0@toc@ha
  ld 7,.LANCHOR0@toc@l(7)
  cmpdi 0,7,0
  blt 0,.L4
  addis 8,2,.LC0@toc@ha
  ld 8,.LC0@toc@l(8)
  li 10,0
  b .L3
  .p2align 4,,15
.L8:
  bgt 7,.L4
.L3:
  lhau 9,2(8)   <--- lhau generated here, instead of an lha plus a separate addi 8,8,2
  addi 10,10,1
  cmpd 7,10,7
  cmpwi 0,9,0
  bne 0,.L8
  li 3,1
  blr
  .p2align 4,,15
.L4:
  li 3,0
  blr
  .long 0
  .byte 0,0,0,0,0,0,0,0
  .cfi_endproc
.LFE0:
  .size found_zero,.-.L.found_zero
  .section  ".toc","aw"
.LC1:
  .quad a
  .section  ".text"
  .align 2
  .p2align 4,,15
  .globl found_zero_ptr
  .section  ".opd","aw"
  .align 3
found_zero_ptr:
  .quad .L.found_zero_ptr,.TOC.@tocbase,0
  .previous
  .type found_zero_ptr, @function
.L.found_zero_ptr:
.LFB1:
  .cfi_startproc
  addis 8,2,.LANCHOR0@toc@ha
  ld 8,.LANCHOR0@toc@l(8)
  addis 9,2,.LC1@toc@ha
  ld 9,.LC1@toc@l(9)
  sldi 8,8,1
  add 8,8,9
  cmpld 0,8,9
  bge 0,.L11
  b .L12
  .p2align 4,,15
.L15:
  bgt 7,.L12
.L11:
  lha 10,0(9)   <--- No lhau here, even though r9 is both source and
  addi 9,9,2    <--- dest of the addi, and the two instructions are contiguous
  cmpld 7,9,8
  cmpwi 0,10,0
  bne 0,.L15
  li 3,1
  blr
  .p2align 4,,15
.L12:
  li 3,0
  blr
  .long 0
  .byte 0,0,0,0,0,0,0,0
  .cfi_endproc
.LFE1:
  .size found_zero_ptr,.-.L.found_zero_ptr

Reply via email to