On Mon, 2015-09-21 at 14:59 +0100, David Woodhouse wrote: > After which, I think we might be able to turn on TX checksumming by > default and I also have a way to implement early detection of the TX > stall I've been seeing.
This is a patch I've been playing with to catch the TX stall.
When it happens we get a TxEmpty interrupt *without* TxDone.
After loading the driver, we never get TxEmpty without TxDone until the
problem has happened. I've got a minimal cp_tx_timeout() which just
clears the TxOn bit in the Cmd register and turns it back on again,
after moving the descriptors all down to the start of the TX ring.
Strangely, *after* this has happened we do see a lot of instances of
TxEmpty without TxDone. But then the TxDone usually comes immediately
afterwards. Until the driver is reloaded, at which point the next
instance of TxEmpty without TxDone *is* the hardware stalling again.
I'm very confused about what's going on there. I think I possibly need
to set a timer when the TxEmpty/!TxDone interrupt happens, and do a
preemptive reset of the TX queue (as shown below) if the timer manages
to expire before the TxDone actually happens.
diff --git a/drivers/net/ethernet/realtek/8139cp.c
b/drivers/net/ethernet/realtek/8139cp.c
index 67a4fcf..64b44ec 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -592,8 +592,8 @@ static irqreturn_t cp_interrupt (int irq, void
*dev_instance)
if (!status || (status == 0xFFFF))
goto out_unlock;
- netif_dbg(cp, intr, dev, "intr, status %04x cmd %02x cpcmd %04x\n",
- status, cpr8(Cmd), cpr16(CpCmd));
+ netif_dbg(cp, intr, dev, "intr, status %04x cmd %02x cpcmd %04x desc
%04x\n",
+ status, cpr8(Cmd), cpr16(CpCmd), cpr16(TxDmaOkLowDesc));
cpw16(IntrStatus, status & ~cp_rx_intr_mask);
@@ -623,6 +623,10 @@ static irqreturn_t cp_interrupt (int irq, void
*dev_instance)
Let it stop. */
cp->tx_running = 0;
} else {
+ if (!(status & (TxOK | TxErr)))
+ netdev_warn(dev, "TxEmpty without TxDone. h %d
t %d d %04x\n",
+ cp->tx_head, cp->tx_tail,
cpr16(TxDmaOkLowDesc));
+
/* The hardware raced with us adding a new descriptor,
and we didn't get the TxEmpty IRQ in time so we
didn't prod it. Prod it now to restart. */
@@ -1307,12 +1311,49 @@ static void cp_tx_timeout(struct net_device *dev)
le64_to_cpu(cp->tx_ring[i].addr),
cp->tx_skb[i]);
}
-
+#if 1
+ /* Stop the TX (which is already stopped), move the
+ descriptors down, and start it again */
+ cpw8_f(Cmd, RxOn);
+ if (cp->tx_tail == 0) {
+ /* Do nothing */
+ } else if (cp->tx_head > cp->tx_tail) {
+ for (i = 0; i < cp->tx_head - cp->tx_tail; i++) {
+ int from = i + cp->tx_tail;
+ cp->tx_ring[i].addr = cp->tx_ring[from].addr;
+ cp->tx_ring[i].opts2 = cp->tx_ring[from].opts2;
+ cp->tx_ring[i].opts1 = cp->tx_ring[from].opts1;
+ cp->tx_ring[from].opts1 &= ~cpu_to_le32(DescOwn);
+ cp->tx_opts[i] = cp->tx_opts[from];
+ cp->tx_skb[i] = cp->tx_skb[from];
+ cp->tx_skb[from] = NULL;
+ printk("Ring move %d->%d\n", from, i);
+ }
+ } else {
+ for (i = cp->tx_tail - cp->tx_head - 1; i >= 0; i--) {
+ int from = (i + cp->tx_tail) & (CP_TX_RING_SIZE - 1);
+ cp->tx_ring[i].addr = cp->tx_ring[from].addr;
+ cp->tx_ring[i].opts2 = cp->tx_ring[from].opts2;
+ cp->tx_ring[i].opts1 = cp->tx_ring[from].opts1;
+ cp->tx_ring[from].opts1 &= ~cpu_to_le32(DescOwn);
+ cp->tx_opts[i] = cp->tx_opts[from];
+ cp->tx_skb[i] = cp->tx_skb[from];
+ cp->tx_skb[from] = NULL;
+ printk("Ring move %d->%d\n", from, i);
+ }
+ }
+ cpw8_f(Cmd, RxOn|TxOn);
+ cp->tx_head = (cp->tx_head - cp->tx_tail) & (CP_TX_RING_SIZE - 1);
+ cp->tx_tail = 0;
+ printk("head %d tail %d\n", cp->tx_head, cp->tx_tail);
+ cpw8(TxPoll, NormalTxPoll);
+#else
cp_stop_hw(cp);
cp_clean_rings(cp);
rc = cp_init_rings(cp);
cp_start_hw(cp);
__cp_set_rx_mode(dev);
+#endif
cpw16_f(IntrMask, cp_norx_intr_mask);
netif_wake_queue(dev);
--
dwmw2
smime.p7s
Description: S/MIME cryptographic signature
