On Mon, Nov 08, 2021 at 10:59:55AM +0100, Stefan Sperling wrote:
> iwx(4) has an issue which occurs very occasionally for me (every couple
> of days or sometimes even weeks) where ssh(1) fails to connect until the
> interface is reset with ifconfig down/up.
> 
> The initial protocol exchange with the SSH server succeeds, but as soon
> as the client enters interactive state and sets TCP_NODELAY the connection
> will hang.
> While the interface is in this broken state other traffic such as ping
> still makes it through, but any new ssh connection attempts show the
> same symptom. It is just the interactive SSH packets which are stuck.
> 
> TCP_NODELAY packets are mapped to a special QoS TID by net80211, and each
> such TID gets mapped to a dedicated Tx aggregation queue by the driver.
> Combined with the observation that the Linux driver carries a workaround
> for "stuck" individual Tx queues, we are likely facing a situation where
> only one Tx queue in the device has stopped working.
> 
> Our interface watchdog cannot detect single Tx queue failures at present.
> It is satisfied as long as packets make it out on any queue.
> With the patch below we use a Tx timer per queue which will hopefully result
> in a device timeout when the problem occurs. A manual reset will no longer be
> required to unwedge things since the watchdog should now trigger a reset.
> 
> It is possible that other hangs which have been reported are related to
> this but show different symptoms depending on which Tx queue gets stuck.
> If the default Tx agg queue gets stuck it will become impossible to send
> data packets that are not marked as TCP_NODELAY. If you are connected over
> ssh while this happens the open ssh session might prevent the watchdog from
> triggering and no new connections can be made.
> 
> I wish we had a fix for Tx queues getting stuck in the first place but this
> is the best I can do.
> 
> ok?

Did anyone try this yet? It is working well for me.

This new version of the patch has been rebased on top of two small
iwx(4) driver fixes I have sent out separately today. With this,
the size of the sc_tx_timer array is corrected to match the new
size of the Tx ring array.

Apply them all in sequence:

"fix Tx ring array size"
https://marc.info/?l=openbsd-tech&m=163671980016530&w=2

"fix TID array index bound checks"
https://marc.info/?l=openbsd-tech&m=163672074217035&w=2

ok?
 
diff 6629860d5767033824e2fb1c0f621c18216a7316 
fbfd8a2c2e804554680d3879819c789931916dc5
blob - 6e67f7db864ec9a7ca61057b7284e7d7a6bffe5d
blob + e1793a5b742f691cbffa930e8f6363538190d590
--- sys/dev/pci/if_iwx.c
+++ sys/dev/pci/if_iwx.c
@@ -4552,8 +4552,6 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack
        bus_dmamap_sync(sc->sc_dmat, data->map, 0, IWX_RBUF_SIZE,
            BUS_DMASYNC_POSTREAD);
 
-       sc->sc_tx_timer = 0;
-
        /* Sanity checks. */
        if (sizeof(*tx_resp) > len)
                return;
@@ -4563,6 +4561,8 @@ iwx_rx_tx_cmd(struct iwx_softc *sc, struct iwx_rx_pack
            tx_resp->frame_count * sizeof(tx_resp->status) > len)
                return;
 
+       sc->sc_tx_timer[qid] = 0;
+
        if (tx_resp->frame_count > 1) /* A-MPDU */
                return;
 
@@ -4658,7 +4658,7 @@ iwx_rx_compressed_ba(struct iwx_softc *sc, struct iwx_
                idx = le16toh(ba_tfd->tfd_index);
                if (idx >= IWX_TX_RING_COUNT)
                        continue;
-               sc->sc_tx_timer = 0;
+               sc->sc_tx_timer[qid] = 0;
                iwx_txq_advance(sc, ring, idx);
                iwx_clear_oactive(sc, ring);
        }
@@ -5433,6 +5433,9 @@ iwx_tx(struct iwx_softc *sc, struct mbuf *m, struct ie
                sc->qfullmsk |= 1 << ring->qid;
        }
 
+       if (ic->ic_if.if_flags & IFF_UP)
+               sc->sc_tx_timer[ring->qid] = 15;
+
        return 0;
 }
 
@@ -7973,10 +7976,8 @@ iwx_start(struct ifnet *ifp)
                        continue;
                }
 
-               if (ifp->if_flags & IFF_UP) {
-                       sc->sc_tx_timer = 15;
+               if (ifp->if_flags & IFF_UP)
                        ifp->if_timer = 1;
-               }
        }
 
        return;
@@ -8045,7 +8046,8 @@ iwx_stop(struct ifnet *ifp)
                struct iwx_rxba_data *rxba = &sc->sc_rxba_data[i];
                iwx_clear_reorder_buffer(sc, rxba);
        }
-       ifp->if_timer = sc->sc_tx_timer = 0;
+       memset(sc->sc_tx_timer, 0, sizeof(sc->sc_tx_timer));
+       ifp->if_timer = 0;
 
        splx(s);
 }
@@ -8054,21 +8056,30 @@ void
 iwx_watchdog(struct ifnet *ifp)
 {
        struct iwx_softc *sc = ifp->if_softc;
+       int i;
 
        ifp->if_timer = 0;
-       if (sc->sc_tx_timer > 0) {
-               if (--sc->sc_tx_timer == 0) {
-                       printf("%s: device timeout\n", DEVNAME(sc));
-                       if (ifp->if_flags & IFF_DEBUG) {
-                               iwx_nic_error(sc);
-                               iwx_dump_driver_status(sc);
+
+       /*
+        * We maintain a separate timer for each Tx queue because
+        * Tx aggregation queues can get "stuck" while other queues
+        * keep working. The Linux driver uses a similar workaround.
+        */
+       for (i = 0; i < nitems(sc->sc_tx_timer); i++) {
+               if (sc->sc_tx_timer[i] > 0) {
+                       if (--sc->sc_tx_timer[i] == 0) {
+                               printf("%s: device timeout\n", DEVNAME(sc));
+                               if (ifp->if_flags & IFF_DEBUG) {
+                                       iwx_nic_error(sc);
+                                       iwx_dump_driver_status(sc);
+                               }
+                               if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0)
+                                       task_add(systq, &sc->init_task);
+                               ifp->if_oerrors++;
+                               return;
                        }
-                       if ((sc->sc_flags & IWX_FLAG_SHUTDOWN) == 0)
-                               task_add(systq, &sc->init_task);
-                       ifp->if_oerrors++;
-                       return;
+                       ifp->if_timer = 1;
                }
-               ifp->if_timer = 1;
        }
 
        ieee80211_watchdog(ifp);
blob - 719d9b6295b9eb9c31b36a98bcb0259a8a2f4cf3
blob + 50d9e6346079be78b020a0d43a3c755e5945f1d3
--- sys/dev/pci/if_iwxvar.h
+++ sys/dev/pci/if_iwxvar.h
@@ -563,7 +563,7 @@ struct iwx_softc {
        struct iwx_nvm_data sc_nvm;
        struct iwx_bf_data sc_bf;
 
-       int sc_tx_timer;
+       int sc_tx_timer[IWX_NUM_TX_QUEUES];
        int sc_rx_ba_sessions;
 
        int sc_scan_last_antenna;

Reply via email to