Module: Mesa Branch: main Commit: 629bef2a4c0b394f5db2753164d3ebc91bbad777 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=629bef2a4c0b394f5db2753164d3ebc91bbad777
Author: Faith Ekstrand <[email protected]> Date: Fri Dec 15 11:54:42 2023 -0600 nak: Handle minimum execution latencies in the dep tracker Some instructions have a minimum latency before another instruction can execute. It's a little unclear exactly what the details on these are. For things like OpBar it's probably something to do with when stuff actually converges. For MemBar, maybe we only need to wait before we do something that also touches memory? Unclear and the few docs seem to imply that it's a straight-up stall. For now, we model it as an execution latency where nothing is allowed to happen until then. The blob also inserts a NOP with a delay of 2 in these cases. It's not entirely clear why but it's probably best we do the same. These instructions tend to be pretty heavy-weight anyway so 2 cycles isn't really going to cost much compared to the chances that we're missing some subtle HW issues somewhere. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26716> --- src/nouveau/compiler/nak/calc_instr_deps.rs | 18 ++++++++++++++++-- src/nouveau/compiler/nak/from_nir.rs | 1 - src/nouveau/compiler/nak/ir.rs | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/nouveau/compiler/nak/calc_instr_deps.rs b/src/nouveau/compiler/nak/calc_instr_deps.rs index 53edd834166..e4f0cdf5f70 100644 --- a/src/nouveau/compiler/nak/calc_instr_deps.rs +++ b/src/nouveau/compiler/nak/calc_instr_deps.rs @@ -470,8 +470,8 @@ fn calc_delays(f: &mut Function, sm: u8) { let mut ready = RegTracker::new(0_u32); let mut bars_ready = [0_u32; 6]; for instr in b.instrs.iter_mut().rev() { - let mut min_start = cycle + 1; // TODO: co-issue - // Barriers take two cycles before we can wait on them + // TODO: co-issue + let mut min_start = cycle + instr.get_exec_latency(sm); if let Some(bar) = instr.deps.rd_bar() { min_start = max(min_start, bars_ready[usize::from(bar)] + 2); } @@ -507,6 +507,20 @@ fn calc_delays(f: &mut Function, sm: u8) { 
cycle = min_start; } } + + // It's unclear exactly why but the blob inserts a Nop with a delay of 2 + // after every instruction which has an exec latency. Perhaps it has + // something to do with .yld? In any case, the extra 2 cycles aren't worth + // the chance of weird bugs. + f.map_instrs(|instr, _| { + if instr.get_exec_latency(sm) > 1 { + let mut nop = Instr::new_boxed(OpNop { label: None }); + nop.deps.set_delay(2); + MappedInstrs::Many(vec![instr, nop]) + } else { + MappedInstrs::One(instr) + } + }); } impl Shader { diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs index 63be492b6d2..c37f76f06d8 100644 --- a/src/nouveau/compiler/nak/from_nir.rs +++ b/src/nouveau/compiler/nak/from_nir.rs @@ -2245,7 +2245,6 @@ impl<'a> ShaderFromNir<'a> { ); self.info.num_barriers = 1; b.push_op(OpBar {}); - b.push_op(OpNop { label: None }); } _ => panic!("Unhandled execution scope"), } diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs index c004d1627e3..2406134fff2 100644 --- a/src/nouveau/compiler/nak/ir.rs +++ b/src/nouveau/compiler/nak/ir.rs @@ -5142,6 +5142,25 @@ impl Instr { } } + /// Minimum latency before another instruction can execute + pub fn get_exec_latency(&self, sm: u8) -> u32 { + match &self.op { + Op::Bar(_) | Op::MemBar(_) => { + if sm >= 80 { + 6 + } else { + 5 + } + } + Op::CCtl(_op) => { + // CCTL.C needs 8, CCTL.I needs 11 + 11 + } + // Op::DepBar(_) => 4, + _ => 1, // TODO: co-issue + } + } + pub fn get_dst_latency(&self, sm: u8, dst_idx: usize) -> u32 { debug_assert!(self.has_fixed_latency(sm)); let file = match self.dsts()[dst_idx] {
