================
@@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: movprfx z1, z0
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
----------------
gbossu wrote:
In that example, we do get one more instruction now (the `movprfx`), but I
think the schedule is actually better because we eliminate one dependency
between `ext` and the second `uunpklo`. Now the two `uunpklo` can execute in
parallel.
This is the theme of the test updates in general: sometimes more
instructions, but more freedom for the `MachineScheduler`.
https://github.com/llvm/llvm-project/pull/152554
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits