https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64036

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2024-10-14
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1

--- Comment #7 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Here is another case where the long fdiv execution time can be overlapped with
other instructions.  Currently this doesn't get scheduled properly and ends up
with very bad code that stalls on every operation.

However, if the if-block in sh.cc

  if (targetm.small_register_classes_for_mode_p (VOIDmode))
    {
      /* Never run scheduling before reload, since that can
      ...

is taken out, scheduling before RA is performed and results in better code for
this case.

One not-so-good thing, as pointed out at (1) below, is that the order in which
the results are written back is not very good.  The stores are unnecessarily
far away from the fmul results, which creates long register live ranges and
will result in worse code when the function is more complex (has more
variables).


/* Per-vertex data consumed and produced by test () below: eight floats,
   x/y/z/w followed by a/r/g/b.  */
struct vertex
{
  float x, y, z, w;  /* test () multiplies x, y and z by 1.0f / w.  */
  float a, r, g, b;  /* test () multiplies these by the material fields.  */
};

/* Per-material factors; test () below multiplies each vertex a/r/g/b field
   by the material field of the same name.  */
struct material
{
  float a, r, g, b;
};

/* Reproducer: transform COUNT vertices from VTX into VTX_OUT.

   For every vertex, x, y and z are multiplied by the reciprocal of w, and
   a, r, g and b are multiplied by the corresponding fields of *MAT.
   VTX and VTX_OUT advance by one element per iteration; MAT is never
   advanced, so the same material is used for every vertex.  The w field
   of the output vertex is not written by this function.

   All three pointers are restrict-qualified, i.e. the caller promises
   that the objects they point to do not overlap.  */
void test (const struct vertex* restrict vtx,
           const struct material* restrict mat,
           unsigned int count,
           struct vertex* restrict vtx_out)
{
  /* Post-decrement test: the body runs exactly COUNT times.  */
  while (count--)
  {
    /* The single division per iteration; the three multiplies below
       depend on its result.  */
    float inv_w = 1.0f / vtx->w;

    vtx_out->x = vtx->x * inv_w;
    vtx_out->y = vtx->y * inv_w;
    vtx_out->z = vtx->z * inv_w;

    /* These four multiplies do not use inv_w, so they are independent
       of the division above.  */
    vtx_out->a = vtx->a * mat->a;
    vtx_out->r = vtx->r * mat->r;
    vtx_out->g = vtx->g * mat->g;
    vtx_out->b = vtx->b * mat->b;

    vtx_out++;
    vtx++;
  }
}

compiled with -m4-single -ml -O2:

.L3:
        mov     r12,r0
        fmov.s  @(r0,r9),fr1
        fmov    fr8,fr7
        dt      r6
        fdiv    fr1,fr7        // fdiv start

        fmov.s  @(r0,r4),fr0
        fmov.s  @(r0,r8),fr6
        fmov.s  @(r0,r3),fr5
        mov     r1,r0
        fmov.s  @(r0,r4),fr4
        fmov.s  @(r0,r8),fr3
        fmov.s  @(r0,r3),fr2
        fmul    fr12,fr4       // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmov.s  @(r0,r9),fr1
        mov     r12,r0
        fmul    fr10,fr3       // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmul    fr11,fr2       // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmul    fr9,fr1        // vtx->{a|r|g|b} * mat->{a|r|g|b}

        fmul    fr7,fr0        // vtx->{x|y|z} * inv_w
        fmul    fr7,fr6
        fmov.s  fr0,@(r0,r7)   // (1) write-back maybe too early
        fmul    fr7,fr5
        fmov.s  fr6,@(r0,r5)
        fmov.s  fr5,@(r0,r2)
        mov     r1,r0
        fmov.s  fr4,@(r0,r7)
        add     #32,r1
        fmov.s  fr3,@(r0,r5)   // (1) this should be written back first
        fmov.s  fr2,@(r0,r2)
        mov     r12,r0
        fmov.s  fr1,@(r0,r10)
        bf/s    .L3
        add     #32,r12

        fmov.s  @r15+,fr12
        mov.l   @r15+,r12
        mov.l   @r15+,r11
        mov.l   @r15+,r10
        mov.l   @r15+,r9
        rts     
        mov.l   @r15+,r8
        .align 1
.L10:
        rts     
        nop

Reply via email to