On Fri, Dec 14, 2018 at 11:59:00AM +0100, Ard Biesheuvel wrote: > On Fri, 14 Dec 2018 at 09:59, Ilias Apalodimas > <ilias.apalodi...@linaro.org> wrote: > > > > Currently the driver issues 2 mmio reads to figure out the number of > > transmitted packets and clean them. We can get rid of the expensive > > reads since BIT 31 of the Tx descriptor can be used for that. > > We can also remove the budget counting of Tx completions since all of > > the descriptors are not deliberately processed. > > > > This last sentence reflects what we discussed off-line, right? That > reaping used Tx descriptors shouldn't be counted against the budget, > but should only occur if the budget is > 0 after processing the Rx > queue?
yes > > Performance numbers using pktgen are: > > size pre-patch(pps) post-patch(pps) > > 64 362483 427916 > > 128 358315 411686 > > 256 352725 389683 > > 512 215675 216464 > > 1024 113812 114442 > > > > Signed-off-by: Ilias Apalodimas <ilias.apalodi...@linaro.org> > > --- > > drivers/net/ethernet/socionext/netsec.c | 97 ++++++++++++++----------- > > 1 file changed, 54 insertions(+), 43 deletions(-) > > > > diff --git a/drivers/net/ethernet/socionext/netsec.c > > b/drivers/net/ethernet/socionext/netsec.c > > index 584a6b3f6542..05a0948ad929 100644 > > --- a/drivers/net/ethernet/socionext/netsec.c > > +++ b/drivers/net/ethernet/socionext/netsec.c > > @@ -257,7 +257,6 @@ struct netsec_desc_ring { > > dma_addr_t desc_dma; > > struct netsec_desc *desc; > > void *vaddr; > > - u16 pkt_cnt; > > u16 head, tail; > > }; > > > > @@ -598,33 +597,26 @@ static void netsec_set_rx_de(struct netsec_priv *priv, > > dring->desc[idx].len = desc->len; > > } > > > > -static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget) > > +static bool netsec_clean_tx_dring(struct netsec_priv *priv) > > { > > struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_TX]; > > unsigned int pkts, bytes; > > - > > - dring->pkt_cnt += netsec_read(priv, NETSEC_REG_NRM_TX_DONE_PKTCNT); > > - > > - if (dring->pkt_cnt < budget) > > - budget = dring->pkt_cnt; > > + struct netsec_de *entry; > > + int tail = dring->tail; > > + int cnt = 0; > > > > pkts = 0; > > bytes = 0; > > + entry = dring->vaddr + DESC_SZ * tail; > > > > - while (pkts < budget) { > > + while (!(entry->attr & (1U << NETSEC_TX_SHIFT_OWN_FIELD)) && > > + cnt < DESC_NUM) { > > struct netsec_desc *desc; > > - struct netsec_de *entry; > > - int tail, eop; > > - > > - tail = dring->tail; > > - > > - /* move tail ahead */ > > - dring->tail = (tail + 1) % DESC_NUM; > > + int eop; > > > > desc = &dring->desc[tail]; > > - entry = dring->vaddr + DESC_SZ * tail; > > - > > eop = (entry->attr >> NETSEC_TX_LAST) & 1; > > + dma_rmb(); > > > > dma_unmap_single(priv->dev, desc->dma_addr, desc->len, > > DMA_TO_DEVICE); > > @@ -633,38 +625,51 @@ static int netsec_clean_tx_dring(struct netsec_priv > > *priv, int budget) > > bytes += desc->skb->len; > > dev_kfree_skb(desc->skb); > > } > > + /* clean up so netsec_uninit_pkt_dring() won't free the skb > > + * again > > + */ > > *desc = (struct netsec_desc){}; > > + > > + /* entry->attr is not going to be accessed by the NIC until > > + * netsec_set_tx_de() is called. No need for a dma_wmb() > > here > > + */ > > + entry->attr = 1U << NETSEC_TX_SHIFT_OWN_FIELD; > > + /* move tail ahead */ > > + dring->tail = (tail + 1) % DESC_NUM; > > + > > + tail = dring->tail; > > + entry = dring->vaddr + DESC_SZ * tail; > > + cnt++; > > } > > - dring->pkt_cnt -= budget; > > > > - priv->ndev->stats.tx_packets += budget; > > + if (!cnt) > > + return false; > > + > > + /* reading the register clears the irq */ > > + netsec_read(priv, NETSEC_REG_NRM_TX_DONE_PKTCNT); > > + > > + priv->ndev->stats.tx_packets += cnt; > > priv->ndev->stats.tx_bytes += bytes; > > > > - netdev_completed_queue(priv->ndev, budget, bytes); > > + netdev_completed_queue(priv->ndev, cnt, bytes); > > > > - return budget; > > + return true; > > } > > > > -static int netsec_process_tx(struct netsec_priv *priv, int budget) > > +static void netsec_process_tx(struct netsec_priv *priv) > > { > > struct net_device *ndev = priv->ndev; > > - int new, done = 0; > > + bool cleaned; > > > > - do { > > - new = netsec_clean_tx_dring(priv, budget); > > - done += new; > > - budget -= new; > > - } while (new); > > + cleaned = netsec_clean_tx_dring(priv); > > > > - if (done && netif_queue_stopped(ndev)) { > > + if (cleaned && netif_queue_stopped(ndev)) { > > /* Make sure we update the value, anyone stopping the queue > > * after this will read the proper consumer idx > > */ > > smp_wmb(); > > netif_wake_queue(ndev); > > } > > - > > - return done; > > } > > > > static void *netsec_alloc_rx_data(struct netsec_priv *priv, > > @@ -813,24 +818,17 @@ static int netsec_process_rx(struct netsec_priv > > *priv, int budget) > > static int netsec_napi_poll(struct napi_struct *napi, int budget) > > { > > struct netsec_priv *priv; > > - int tx, rx, done, todo; > > + int rx, done, todo; > > > > priv = container_of(napi, struct netsec_priv, napi); > > > > + netsec_process_tx(priv); > > + > > So shouldn't we do this *after* processing Rx, and only if there is budget > left? I am not really sure, this would drown Tx processing if you had a bunch of received packets that exhausted the budget. Intel 1gbit drivers are doing something similar. They reclaim Tx prior to processing Rx. The only thing that could be checked here i guess is keep the napi poll running if *all* Tx descriptors were processed at some point instead of re-enabling irqs. > > > todo = budget; > > do { > > - if (!todo) > > - break; > > - > > - tx = netsec_process_tx(priv, todo); > > - todo -= tx; > > - > > - if (!todo) > > - break; > > - > > rx = netsec_process_rx(priv, todo); > > todo -= rx; > > - } while (rx || tx); > > + } while (rx); > > > > done = budget - todo; > > > > @@ -1007,7 +1005,6 @@ static void netsec_uninit_pkt_dring(struct > > netsec_priv *priv, int id) > > > > dring->head = 0; > > dring->tail = 0; > > - dring->pkt_cnt = 0; > > > > if (id == NETSEC_RING_TX) > > netdev_reset_queue(priv->ndev); > > @@ -1030,6 +1027,7 @@ static void netsec_free_dring(struct netsec_priv > > *priv, int id) > > static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id) > > { > > struct netsec_desc_ring *dring = &priv->desc_ring[id]; > > + int i; > > > > dring->vaddr = dma_zalloc_coherent(priv->dev, DESC_SZ * DESC_NUM, > > &dring->desc_dma, GFP_KERNEL); > > @@ -1040,6 +1038,19 @@ static int netsec_alloc_dring(struct netsec_priv > > *priv, enum ring_id id) > > if (!dring->desc) > > goto err; > > > > + if (id == NETSEC_RING_TX) { > > + for (i = 0; i < DESC_NUM; i++) { > > + struct netsec_de *de; > > + > > + de = dring->vaddr + (DESC_SZ * i); > > + /* de->attr is not going to be accessed by the NIC > > + * until netsec_set_tx_de() is called. > > + * No need for a dma_wmb() here > > + */ > > + de->attr = 1U << NETSEC_TX_SHIFT_OWN_FIELD; > > + } > > + } > > + > > return 0; > > err: > > netsec_free_dring(priv, id); > > -- > > 2.19.1 > >