On 17/08/15(Mon) 19:55, Mark Kettenis wrote: > The diff below is a first step towards running the em(4) interrupt > handler without grabbing the kernel lock. It runs the rx completion > path without the lock, which is the important bit to be able to test > the work that has been going on to make the network stack run on > multiple CPUs. But the handler still grabs the kernel lock for tx > completions, rx ring refills and everything else. Which means that it > will still grab the lock every time it runs, which may affect how it > interacts with the network stack. > > The locking strategy is centered around the if_rxring interface. The > tricky bit is to prevent the code that brings down the interface from > frees the data structures accessed by the interrupt handler while it > is running. The interrupt handler's rx completion path checks if > there are any filled ring slots before checking them and holds the > lock while doing so. The code that frees the rx ring now explicitly > calls > > if_rxr_init(&sc->rx_ring, 0, 0); > > which clears the number of filled ring slots, so after the > initialization call completes, we can be sure the rx completion code > doesn't run again until we refill the ring. > > The nice side effect if resetting the if_rxring structure this way is > that if you bring down the interface, systat mbuf correctly shows the > number of allocated mbufs on the ring as 0. But it also resets the > water marks, which may or may not be a good idea. > > ok?
ok mpi@ > Index: if_em.c > =================================================================== > RCS file: /cvs/src/sys/dev/pci/if_em.c,v > retrieving revision 1.299 > diff -u -p -r1.299 if_em.c > --- if_em.c 24 Jun 2015 09:40:54 -0000 1.299 > +++ if_em.c 17 Aug 2015 09:24:58 -0000 > @@ -353,6 +353,8 @@ em_attach(struct device *parent, struct > sc = (struct em_softc *)self; > sc->osdep.em_pa = *pa; > > + mtx_init(&sc->rx_mtx, IPL_NET); > + > timeout_set(&sc->timer_handle, em_local_timer, sc); > timeout_set(&sc->tx_fifo_timer_handle, em_82547_move_tail, sc); > > @@ -922,6 +924,8 @@ em_intr(void *arg) > refill = 1; > } > > + KERNEL_LOCK(); > + > /* Link status change */ > if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) { > sc->hw.get_link_status = 1; > @@ -942,6 +946,8 @@ em_intr(void *arg) > E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled); > } > > + KERNEL_UNLOCK(); > + > return (1); > } > > @@ -1698,8 +1704,8 @@ em_allocate_pci_resources(struct em_soft > sc->hw.back = &sc->osdep; > > intrstr = pci_intr_string(pc, ih); > - sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET, em_intr, sc, > - sc->sc_dv.dv_xname); > + sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE, > + em_intr, sc, sc->sc_dv.dv_xname); > if (sc->sc_intrhand == NULL) { > printf(": couldn't establish interrupt"); > if (intrstr != NULL) > @@ -2413,6 +2419,8 @@ em_txeof(struct em_softc *sc) > if (sc->num_tx_desc_avail == sc->num_tx_desc) > return; > > + KERNEL_LOCK(); > + > num_avail = sc->num_tx_desc_avail; > first = sc->next_tx_to_clean; > tx_desc = &sc->tx_desc_base[first]; > @@ -2494,6 +2502,8 @@ em_txeof(struct em_softc *sc) > ifp->if_timer = EM_TX_TIMEOUT; > > sc->num_tx_desc_avail = num_avail; > + > + KERNEL_UNLOCK(); > } > > /********************************************************************* > @@ -2743,6 +2753,10 @@ em_free_receive_structures(struct em_sof > > INIT_DEBUGOUT("free_receive_structures: begin"); > > + mtx_enter(&sc->rx_mtx); > + if_rxr_init(&sc->rx_ring, 0, 0); > + mtx_leave(&sc->rx_mtx); > + > if (sc->rx_buffer_area != NULL) { > rx_buffer = sc->rx_buffer_area; > for (i = 0; i < sc->num_rx_desc; i++, rx_buffer++) { > @@ -2825,6 +2839,8 @@ em_rxfill(struct em_softc *sc) > int post = 0; > int i; > > + mtx_enter(&sc->rx_mtx); > + > i = sc->last_rx_desc_filled; > > for (slots = if_rxr_get(&sc->rx_ring, sc->num_rx_desc); > @@ -2841,6 +2857,8 @@ em_rxfill(struct em_softc *sc) > > if_rxr_put(&sc->rx_ring, slots); > > + mtx_leave(&sc->rx_mtx); > + > return (post); > } > > @@ -2856,6 +2874,7 @@ em_rxeof(struct em_softc *sc) > { > struct ifnet *ifp = &sc->interface_data.ac_if; > struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > + struct mbuf_list free_ml = MBUF_LIST_INITIALIZER(); > struct mbuf *m; > u_int8_t accept_frame = 0; > u_int8_t eop = 0; > @@ -2867,8 +2886,11 @@ em_rxeof(struct em_softc *sc) > struct em_buffer *pkt; > u_int8_t status; > > - if (if_rxr_inuse(&sc->rx_ring) == 0) > + mtx_enter(&sc->rx_mtx); > + if (if_rxr_inuse(&sc->rx_ring) == 0) { > + mtx_leave(&sc->rx_mtx); > return; > + } > > i = sc->next_rx_desc_to_check; > > @@ -2988,12 +3010,12 @@ em_rxeof(struct em_softc *sc) > sc->dropped_pkts++; > > if (sc->fmp != NULL) { > - m_freem(sc->fmp); > + ml_enqueue(&free_ml, sc->fmp); > sc->fmp = NULL; > sc->lmp = NULL; > } > > - m_freem(m); > + ml_enqueue(&free_ml, m); > } > > /* Advance our pointers to the next descriptor. */ > @@ -3005,9 +3027,14 @@ em_rxeof(struct em_softc *sc) > 0, sizeof(*desc) * sc->num_rx_desc, > BUS_DMASYNC_PREREAD); > > - if_input(ifp, &ml); > - > sc->next_rx_desc_to_check = i; > + > + mtx_leave(&sc->rx_mtx); > + > + MBUF_LIST_FOREACH(&free_ml, m) > + m_freem(m); > + > + if_input(ifp, &ml); > } > > /********************************************************************* > Index: if_em.h > =================================================================== > RCS file: /cvs/src/sys/dev/pci/if_em.h,v > retrieving revision 1.54 > diff -u -p -r1.54 if_em.h > --- if_em.h 26 Dec 2014 05:46:32 -0000 1.54 > +++ if_em.h 17 Aug 2015 09:24:58 -0000 > @@ -373,6 +373,7 @@ struct em_softc { > struct em_dma_alloc rxdma; /* bus_dma glue for rx desc */ > struct em_rx_desc *rx_desc_base; > struct if_rxring rx_ring; > + struct mutex rx_mtx; > u_int32_t next_rx_desc_to_check; > u_int32_t last_rx_desc_filled; > u_int32_t rx_buffer_len; >