https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64036
Oleg Endo <olegendo at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2024-10-14
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
--- Comment #7 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Another case where the long fdiv execution time could be overlapped with other
instructions. Currently this doesn't get scheduled properly and ends up as
very bad code that stalls on every operation.
However, if the if-block in sh.cc
if (targetm.small_register_classes_for_mode_p (VOIDmode))
{
/* Never run scheduling before reload, since that can
...
is taken out, scheduling is performed before register allocation (RA) and
results in better code for this case.
One not-so-good thing, as pointed out at (1) below, is that the write-back
order of the results is not very good. The write-back is unnecessarily far away
from the fmul that produces the result, which creates long register live
ranges and will result in worse code if the loop is more complex (has more
variables).
/* Input/output element of the reproducer: eight floats per vertex.
   x, y, z are divided by w in test (); a, r, g, b are multiplied by the
   matching members of struct material.  (Presumably a homogeneous position
   plus an ARGB color -- the names suggest it, the code does not depend on
   it.)  */
struct vertex
{
float x, y, z, w;
float a, r, g, b;
};
/* Per-call multipliers applied component-wise to the a, r, g, b members of
   every vertex processed by test ().  */
struct material
{
float a, r, g, b;
};
/* Reproducer for the fdiv scheduling issue.

   For each of the COUNT vertices starting at VTX, writes to the
   corresponding element of VTX_OUT:
     - x, y, z multiplied by 1.0f / w (one division, three multiplies);
     - a, r, g, b multiplied by the same-named member of *MAT.
   vtx_out->w is never written.  MAT is not advanced, so the same material
   is used for every vertex.  All pointers are restrict-qualified.

   The four vtx * mat multiplies do not depend on inv_w, so they are the
   independent work that could be overlapped with the division.  */
void test (const struct vertex* restrict vtx,
const struct material* restrict mat,
unsigned int count,
struct vertex* restrict vtx_out)
{
/* Executes COUNT iterations; the body is skipped entirely when COUNT is 0.  */
while (count--)
{
/* The only division in the loop; its result feeds the next three lines.  */
float inv_w = 1.0f / vtx->w;
vtx_out->x = vtx->x * inv_w;
vtx_out->y = vtx->y * inv_w;
vtx_out->z = vtx->z * inv_w;
/* These four products are independent of inv_w.  */
vtx_out->a = vtx->a * mat->a;
vtx_out->r = vtx->r * mat->r;
vtx_out->g = vtx->g * mat->g;
vtx_out->b = vtx->b * mat->b;
/* Advance both the output and the input by one vertex.  */
vtx_out++;
vtx++;
}
}
compiled with -m4-single -ml -O2:
// Excerpt of the generated code for test (): loop body, epilogue and a second
// return path.  The function entry and prologue are not shown, so register
// contents set up before the loop are only inferred below.
// r0 is the index register of the @(r0,Rn) addressing mode; it is reloaded
// from r12 or r1, both of which are advanced by 32 per iteration
// (32 = 8 floats * 4 bytes, i.e. one struct vertex assuming 4-byte float).
.L3:
mov r12,r0
// fr1 = divisor of the fdiv below (vtx->w in the C source).
fmov.s @(r0,r9),fr1
// fr7 = fr8; fr8 presumably holds 1.0f loaded before the loop -- not shown.
fmov fr8,fr7
// Decrement the loop counter; T is set when it reaches zero and is consumed
// by the bf/s at the end of the loop.
dt r6
fdiv fr1,fr7 // fdiv start
// fr0, fr6, fr5: the three values later multiplied by fr7 (inv_w).
fmov.s @(r0,r4),fr0
fmov.s @(r0,r8),fr6
fmov.s @(r0,r3),fr5
mov r1,r0
// fr4, fr3, fr2, fr1: the four values multiplied by fr12, fr10, fr11, fr9.
// Those four registers are not written inside the loop; presumably they hold
// the mat-> members loaded before the loop -- not shown.
fmov.s @(r0,r4),fr4
fmov.s @(r0,r8),fr3
fmov.s @(r0,r3),fr2
fmul fr12,fr4 // vtx->{a|r|g|b} * mat->{a|r|g|b}
fmov.s @(r0,r9),fr1
mov r12,r0
fmul fr10,fr3 // vtx->{a|r|g|b} * mat->{a|r|g|b}
fmul fr11,fr2 // vtx->{a|r|g|b} * mat->{a|r|g|b}
fmul fr9,fr1 // vtx->{a|r|g|b} * mat->{a|r|g|b}
// First uses of the fdiv result (fr7).
fmul fr7,fr0 // vtx->{x|y|z} * inv_w
fmul fr7,fr6
fmov.s fr0,@(r0,r7) // (1) write-back maybe too early
fmul fr7,fr5
fmov.s fr6,@(r0,r5)
fmov.s fr5,@(r0,r2)
mov r1,r0
fmov.s fr4,@(r0,r7)
// r1 is advanced here; r0 still holds the old value for the stores below.
add #32,r1
fmov.s fr3,@(r0,r5) // (1) this should be written back first
fmov.s fr2,@(r0,r2)
mov r12,r0
fmov.s fr1,@(r0,r10)
// Delayed branch: loop again while T is clear; the add below is in the
// delay slot and executes on every iteration.
bf/s .L3
add #32,r12
// Loop exit: pop fr12 and r12..r8 from the stack (r15, post-increment);
// the matching saves are in the prologue, which is not shown.
fmov.s @r15+,fr12
mov.l @r15+,r12
mov.l @r15+,r11
mov.l @r15+,r10
mov.l @r15+,r9
// Delayed return: the last pop sits in the rts delay slot.
rts
mov.l @r15+,r8
.align 1
// Second return path without any register restores; presumably the
// count == 0 early exit -- the branch to it is not shown.
.L10:
rts
nop