https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64036
Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Last reconfirmed|                            |2024-10-14
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1

--- Comment #7 from Oleg Endo <olegendo at gcc dot gnu.org> ---
Another case, where the long fdiv execution time can be overlapped with
other instructions.  Currently this doesn't get scheduled properly and ends
up with very bad code that stalls on every operation.

However, if the if-block in sh.cc

  if (targetm.small_register_classes_for_mode_p (VOIDmode))
    {
      /* Never run scheduling before reload, since that can
      ...

is taken out, scheduling before RA is performed and results in better code
for this case.

One not so good thing is as pointed out at (1) below, the results write-back
order is not very good.  It's unnecessarily far away from the fmul result and
creates long register live ranges, which will result in worse code if it's
more complex (has more variables).

struct vertex
{
  float x, y, z, w;
  float a, r, g, b;
};

struct material
{
  float a, r, g, b;
};

void test (const struct vertex* restrict vtx,
           const struct material* restrict mat,
           unsigned int count,
           struct vertex* restrict vtx_out)
{
  while (count--)
  {
    float inv_w = 1.0f / vtx->w;

    vtx_out->x = vtx->x * inv_w;
    vtx_out->y = vtx->y * inv_w;
    vtx_out->z = vtx->z * inv_w;

    vtx_out->a = vtx->a * mat->a;
    vtx_out->r = vtx->r * mat->r;
    vtx_out->g = vtx->g * mat->g;
    vtx_out->b = vtx->b * mat->b;

    vtx_out++;
    vtx++;
  }
}

compiled with -m4-single -ml -O2:

.L3:
        mov     r12,r0
        fmov.s  @(r0,r9),fr1
        fmov    fr8,fr7
        dt      r6
        fdiv    fr1,fr7             // fdiv start
        fmov.s  @(r0,r4),fr0
        fmov.s  @(r0,r8),fr6
        fmov.s  @(r0,r3),fr5
        mov     r1,r0
        fmov.s  @(r0,r4),fr4
        fmov.s  @(r0,r8),fr3
        fmov.s  @(r0,r3),fr2
        fmul    fr12,fr4            // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmov.s  @(r0,r9),fr1
        mov     r12,r0
        fmul    fr10,fr3            // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmul    fr11,fr2            // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmul    fr9,fr1             // vtx->{a|r|g|b} * mat->{a|r|g|b}
        fmul    fr7,fr0             // vtx->{x|y|z} * inv_w
        fmul    fr7,fr6
        fmov.s  fr0,@(r0,r7)        // (1) write-back maybe too early
        fmul    fr7,fr5
        fmov.s  fr6,@(r0,r5)
        fmov.s  fr5,@(r0,r2)
        mov     r1,r0
        fmov.s  fr4,@(r0,r7)
        add     #32,r1
        fmov.s  fr3,@(r0,r5)        // (1) this should be written back first
        fmov.s  fr2,@(r0,r2)
        mov     r12,r0
        fmov.s  fr1,@(r0,r10)
        bf/s    .L3
        add     #32,r12
        fmov.s  @r15+,fr12
        mov.l   @r15+,r12
        mov.l   @r15+,r11
        mov.l   @r15+,r10
        mov.l   @r15+,r9
        rts
        mov.l   @r15+,r8
        .align 1
.L10:
        rts
        nop