Module: Mesa
Branch: main
Commit: 629bef2a4c0b394f5db2753164d3ebc91bbad777
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=629bef2a4c0b394f5db2753164d3ebc91bbad777

Author: Faith Ekstrand <[email protected]>
Date:   Fri Dec 15 11:54:42 2023 -0600

nak: Handle minimum execution latencies in the dep tracker

Some instructions have a minimum latency before another instruction can
execute.  It's a little unclear exactly what the details on these are.
For things like OpBar it's probably something to do with when stuff
actually converges.  For MemBar, maybe we only need to wait before we
do something that also touches memory?  Unclear and the few docs seem to
imply that it's a straight-up stall.  For now, we model it as an
execution latency where nothing is allowed to happen until then.

The blob also inserts a NOP with a delay of 2 in these cases.  It's not
entirely clear why but it's probably best we do the same.  These
instructions tend to be pretty heavy-weight anyway so 2 cycles isn't
really going to cost much compared to the chances that we're missing
some subtle HW issues somewhere.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26716>

---

 src/nouveau/compiler/nak/calc_instr_deps.rs | 18 ++++++++++++++++--
 src/nouveau/compiler/nak/from_nir.rs        |  1 -
 src/nouveau/compiler/nak/ir.rs              | 19 +++++++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/nouveau/compiler/nak/calc_instr_deps.rs b/src/nouveau/compiler/nak/calc_instr_deps.rs
index 53edd834166..e4f0cdf5f70 100644
--- a/src/nouveau/compiler/nak/calc_instr_deps.rs
+++ b/src/nouveau/compiler/nak/calc_instr_deps.rs
@@ -470,8 +470,8 @@ fn calc_delays(f: &mut Function, sm: u8) {
         let mut ready = RegTracker::new(0_u32);
         let mut bars_ready = [0_u32; 6];
         for instr in b.instrs.iter_mut().rev() {
-            let mut min_start = cycle + 1; // TODO: co-issue
-                                           // Barriers take two cycles before we can wait on them
+            // TODO: co-issue
+            let mut min_start = cycle + instr.get_exec_latency(sm);
             if let Some(bar) = instr.deps.rd_bar() {
                 min_start = max(min_start, bars_ready[usize::from(bar)] + 2);
             }
@@ -507,6 +507,20 @@ fn calc_delays(f: &mut Function, sm: u8) {
             cycle = min_start;
         }
     }
+
+    // It's unclear exactly why but the blob inserts a Nop with a delay of 2
+    // after every instruction which has an exec latency.  Perhaps it has
+    // something to do with .yld?  In any case, the extra 2 cycles aren't worth
+    // the chance of weird bugs.
+    f.map_instrs(|instr, _| {
+        if instr.get_exec_latency(sm) > 1 {
+            let mut nop = Instr::new_boxed(OpNop { label: None });
+            nop.deps.set_delay(2);
+            MappedInstrs::Many(vec![instr, nop])
+        } else {
+            MappedInstrs::One(instr)
+        }
+    });
 }
 
 impl Shader {
diff --git a/src/nouveau/compiler/nak/from_nir.rs b/src/nouveau/compiler/nak/from_nir.rs
index 63be492b6d2..c37f76f06d8 100644
--- a/src/nouveau/compiler/nak/from_nir.rs
+++ b/src/nouveau/compiler/nak/from_nir.rs
@@ -2245,7 +2245,6 @@ impl<'a> ShaderFromNir<'a> {
                         );
                         self.info.num_barriers = 1;
                         b.push_op(OpBar {});
-                        b.push_op(OpNop { label: None });
                     }
                     _ => panic!("Unhandled execution scope"),
                 }
diff --git a/src/nouveau/compiler/nak/ir.rs b/src/nouveau/compiler/nak/ir.rs
index c004d1627e3..2406134fff2 100644
--- a/src/nouveau/compiler/nak/ir.rs
+++ b/src/nouveau/compiler/nak/ir.rs
@@ -5142,6 +5142,25 @@ impl Instr {
         }
     }
 
+    /// Minimum latency before another instruction can execute
+    pub fn get_exec_latency(&self, sm: u8) -> u32 {
+        match &self.op {
+            Op::Bar(_) | Op::MemBar(_) => {
+                if sm >= 80 {
+                    6
+                } else {
+                    5
+                }
+            }
+            Op::CCtl(_op) => {
+                // CCTL.C needs 8, CCTL.I needs 11
+                11
+            }
+            // Op::DepBar(_) => 4,
+            _ => 1, // TODO: co-issue
+        }
+    }
+
     pub fn get_dst_latency(&self, sm: u8, dst_idx: usize) -> u32 {
         debug_assert!(self.has_fixed_latency(sm));
         let file = match self.dsts()[dst_idx] {

Reply via email to