The diff below is a first step towards running the em(4) interrupt handler without grabbing the kernel lock. It runs the rx completion path without the lock, which is the important bit to be able to test the work that has been going on to make the network stack run on multiple CPUs. But the handler still grabs the kernel lock for tx completions, rx ring refills and everything else. Which means that it will still grab the lock every time it runs, which may affect how it interacts with the network stack.
The locking strategy is centered around the if_rxring interface. The tricky bit is to prevent the code that brings down the interface from frees the data structures accessed by the interrupt handler while it is running. The interrupt handler's rx completion path checks if there are any filled ring slots before checking them and holds the lock while doing so. The code that frees the rx ring now explicitly calls if_rxr_init(&sc->rx_ring, 0, 0); which clears the number of filled ring slots, so after the initialization call completes, we can be sure the rx completion code doesn't run again until we refill the ring. The nice side effect if resetting the if_rxring structure this way is that if you bring down the interface, systat mbuf correctly shows the number of allocated mbufs on the ring as 0. But it also resets the water marks, which may or may not be a good idea. ok? Index: if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v retrieving revision 1.299 diff -u -p -r1.299 if_em.c --- if_em.c 24 Jun 2015 09:40:54 -0000 1.299 +++ if_em.c 17 Aug 2015 09:24:58 -0000 @@ -353,6 +353,8 @@ em_attach(struct device *parent, struct sc = (struct em_softc *)self; sc->osdep.em_pa = *pa; + mtx_init(&sc->rx_mtx, IPL_NET); + timeout_set(&sc->timer_handle, em_local_timer, sc); timeout_set(&sc->tx_fifo_timer_handle, em_82547_move_tail, sc); @@ -922,6 +924,8 @@ em_intr(void *arg) refill = 1; } + KERNEL_LOCK(); + /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { sc->hw.get_link_status = 1; @@ -942,6 +946,8 @@ em_intr(void *arg) E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled); } + KERNEL_UNLOCK(); + return (1); } @@ -1698,8 +1704,8 @@ em_allocate_pci_resources(struct em_soft sc->hw.back = &sc->osdep; intrstr = pci_intr_string(pc, ih); - sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET, em_intr, sc, - sc->sc_dv.dv_xname); + sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE, + em_intr, sc, sc->sc_dv.dv_xname); if (sc->sc_intrhand == NULL) { printf(": couldn't establish interrupt"); if (intrstr != NULL) @@ -2413,6 +2419,8 @@ em_txeof(struct em_softc *sc) if (sc->num_tx_desc_avail == sc->num_tx_desc) return; + KERNEL_LOCK(); + num_avail = sc->num_tx_desc_avail; first = sc->next_tx_to_clean; tx_desc = &sc->tx_desc_base[first]; @@ -2494,6 +2502,8 @@ em_txeof(struct em_softc *sc) ifp->if_timer = EM_TX_TIMEOUT; sc->num_tx_desc_avail = num_avail; + + KERNEL_UNLOCK(); } /********************************************************************* @@ -2743,6 +2753,10 @@ em_free_receive_structures(struct em_sof INIT_DEBUGOUT("free_receive_structures: begin"); + mtx_enter(&sc->rx_mtx); + if_rxr_init(&sc->rx_ring, 0, 0); + mtx_leave(&sc->rx_mtx); + if (sc->rx_buffer_area != NULL) { rx_buffer = sc->rx_buffer_area; for (i = 0; i < sc->num_rx_desc; i++, rx_buffer++) { @@ -2825,6 +2839,8 @@ em_rxfill(struct em_softc *sc) int post = 0; int i; + mtx_enter(&sc->rx_mtx); + i = sc->last_rx_desc_filled; for (slots = if_rxr_get(&sc->rx_ring, sc->num_rx_desc); @@ -2841,6 +2857,8 @@ em_rxfill(struct em_softc *sc) if_rxr_put(&sc->rx_ring, slots); + mtx_leave(&sc->rx_mtx); + return (post); } @@ -2856,6 +2874,7 @@ em_rxeof(struct em_softc *sc) { struct ifnet *ifp = &sc->interface_data.ac_if; struct mbuf_list ml = MBUF_LIST_INITIALIZER(); + struct mbuf_list free_ml = MBUF_LIST_INITIALIZER(); struct mbuf *m; u_int8_t accept_frame = 0; u_int8_t eop = 0; @@ -2867,8 +2886,11 @@ em_rxeof(struct em_softc *sc) struct em_buffer *pkt; u_int8_t status; - if (if_rxr_inuse(&sc->rx_ring) == 0) + mtx_enter(&sc->rx_mtx); + if (if_rxr_inuse(&sc->rx_ring) == 0) { + mtx_leave(&sc->rx_mtx); return; + } i = sc->next_rx_desc_to_check; @@ -2988,12 +3010,12 @@ em_rxeof(struct em_softc *sc) sc->dropped_pkts++; if (sc->fmp != NULL) { - m_freem(sc->fmp); + ml_enqueue(&free_ml, sc->fmp); sc->fmp = NULL; sc->lmp = NULL; } - m_freem(m); + ml_enqueue(&free_ml, m); } /* Advance our pointers to the next descriptor. */ @@ -3005,9 +3027,14 @@ em_rxeof(struct em_softc *sc) 0, sizeof(*desc) * sc->num_rx_desc, BUS_DMASYNC_PREREAD); - if_input(ifp, &ml); - sc->next_rx_desc_to_check = i; + + mtx_leave(&sc->rx_mtx); + + MBUF_LIST_FOREACH(&free_ml, m) + m_freem(m); + + if_input(ifp, &ml); } /********************************************************************* Index: if_em.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.h,v retrieving revision 1.54 diff -u -p -r1.54 if_em.h --- if_em.h 26 Dec 2014 05:46:32 -0000 1.54 +++ if_em.h 17 Aug 2015 09:24:58 -0000 @@ -373,6 +373,7 @@ struct em_softc { struct em_dma_alloc rxdma; /* bus_dma glue for rx desc */ struct em_rx_desc *rx_desc_base; struct if_rxring rx_ring; + struct mutex rx_mtx; u_int32_t next_rx_desc_to_check; u_int32_t last_rx_desc_filled; u_int32_t rx_buffer_len;