Performance Enhancements
- Aggressively prefetch rx_desc and skb->data, just as we do for 10gig.
- Align the prefetches to a dword to help speed them up.
- Add copybreak for packets < 256 bytes. This helps the small-MTU, many-reassemblies case; ideally we would like to modify ethtool to allow this value to be changed.
- Fix RX buffer size changes.
- Fix Jumbo frames and memory allocation.

Signed-off-by: Jeff Kirsher <[EMAIL PROTECTED]> Signed-off-by: John Ronciak <[EMAIL PROTECTED]> Signed-off-by: Jesse Brandeburg <[EMAIL PROTECTED]> diff -up linux-2.6/drivers/net/e1000/e1000.h linux-2.6.new/drivers/net/e1000/e1000.h --- linux-2.6/drivers/net/e1000/e1000.h 2005-11-14 16:20:34.000000000 -0800 +++ linux-2.6.new/drivers/net/e1000/e1000.h 2005-11-04 01:23:40.000000000 -0800 @@ -216,6 +216,12 @@ struct e1000_rx_ring { struct e1000_ps_page *ps_page; struct e1000_ps_page_dma *ps_page_dma; + struct sk_buff *rx_skb_top; + struct sk_buff *rx_skb_prev; + + /* cpu for rx queue */ + int cpu; + uint16_t rdh; uint16_t rdt; uint64_t pkt; @@ -288,7 +288,8 @@ struct e1000_adapter { struct e1000_rx_ring *rx_ring); #endif void (*alloc_rx_buf) (struct e1000_adapter *adapter, - struct e1000_rx_ring *rx_ring); + struct e1000_rx_ring *rx_ring, + int cleaned_count); struct e1000_rx_ring *rx_ring; /* One per active queue */ #ifdef CONFIG_E1000_NAPI struct net_device *polling_netdev; /* One per active queue */ diff -up linux-2.6/drivers/net/e1000/e1000_main.c linux-2.6.new/drivers/net/e1000/e1000_main.c --- linux-2.6/drivers/net/e1000/e1000_main.c 2005-11-14 16:20:34.000000000 -0800 +++ linux-2.6.new/drivers/net/e1000/e1000_main.c 2005-11-04 01:23:40.000000000 -0800 @@ -171,9 +171,11 @@ static boolean_t e1000_clean_rx_irq_ps(s struct e1000_rx_ring *rx_ring); #endif static void e1000_alloc_rx_buffers(struct e1000_adapter *adapter, - struct e1000_rx_ring *rx_ring); + struct e1000_rx_ring *rx_ring, + int cleaned_count); static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter, - struct e1000_rx_ring *rx_ring); + struct e1000_rx_ring *rx_ring, + int cleaned_count); static int e1000_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); @@ -344,7 +344,8 @@ e1000_up(struct e1000_adapter *adapter) e1000_setup_rctl(adapter); e1000_configure_rx(adapter); for (i = 0; i < 
adapter->num_queues; i++) - adapter->alloc_rx_buf(adapter, &adapter->rx_ring[i]); + adapter->alloc_rx_buf(adapter, &adapter->rx_ring[i], + adapter->rx_ring[i].count); #ifdef CONFIG_PCI_MSI if(adapter->hw.mac_type > e1000_82547_rev_2) { @@ -1454,6 +1457,8 @@ setup_rx_desc_die: rxdr->next_to_clean = 0; rxdr->next_to_use = 0; + rxdr->rx_skb_top = NULL; + rxdr->rx_skb_prev = NULL; return 0; } @@ -1527,23 +1532,8 @@ e1000_setup_rctl(struct e1000_adapter *a rctl |= adapter->rx_buffer_len << 0x11; } else { rctl &= ~E1000_RCTL_SZ_4096; - rctl |= E1000_RCTL_BSEX; - switch (adapter->rx_buffer_len) { - case E1000_RXBUFFER_2048: - default: - rctl |= E1000_RCTL_SZ_2048; - rctl &= ~E1000_RCTL_BSEX; - break; - case E1000_RXBUFFER_4096: - rctl |= E1000_RCTL_SZ_4096; - break; - case E1000_RXBUFFER_8192: - rctl |= E1000_RCTL_SZ_8192; - break; - case E1000_RXBUFFER_16384: - rctl |= E1000_RCTL_SZ_16384; - break; - } + rctl &= ~E1000_RCTL_BSEX; + rctl |= E1000_RCTL_SZ_2048; } #ifdef CONFIG_E1000_PACKET_SPLIT @@ -1935,6 +1925,16 @@ e1000_clean_rx_ring(struct e1000_adapter } } + /* there also may be some cached data in our adapter */ + if(rx_ring->rx_skb_top) { + dev_kfree_skb(rx_ring->rx_skb_top); + + /* rx_skb_prev will be wiped out by rx_skb_top */ + rx_ring->rx_skb_top = NULL; + rx_ring->rx_skb_prev = NULL; + } + + size = sizeof(struct e1000_buffer) * rx_ring->count; memset(rx_ring->buffer_info, 0, size); size = sizeof(struct e1000_ps_page) * rx_ring->count; @@ -2005,7 +2005,8 @@ e1000_leave_82542_rst(struct e1000_adapt if(netif_running(netdev)) { e1000_configure_rx(adapter); - e1000_alloc_rx_buffers(adapter, &adapter->rx_ring[0]); + e1000_alloc_rx_buffers(adapter, &adapter->rx_ring[0], + adapter->rx_ring[0].count); } } @@ -2903,29 +2904,30 @@ e1000_change_mtu(struct net_device *netd "on 82573\n"); return -EINVAL; } + if(unlikely((adapter->hw.mac_type < e1000_82543) && + (max_frame > MAXIMUM_ETHERNET_FRAME_SIZE))) { + DPRINTK(PROBE, ERR, "Jumbo Frames not supported on 82542\n"); + 
return -EINVAL; + } + + /* since the driver code now supports splitting a packet across + * multiple descriptors, most of the fifo related limitations on + * jumbo frame traffic have gone away. + * simply use 2k descriptors for everything. + * + * NOTE: dev_alloc_skb reserves 16 bytes, and typically NET_IP_ALIGN + * means we reserve 2 more, this pushes us to allocate from the next + * larger slab size + * i.e. RXBUFFER_2048 --> size-4096 slab */ + /* recent hardware supports 1KB granularity */ if(adapter->hw.mac_type > e1000_82547_rev_2) { - adapter->rx_buffer_len = max_frame; + adapter->rx_buffer_len = + ((max_frame < E1000_RXBUFFER_2048) ? + max_frame : E1000_RXBUFFER_2048); E1000_ROUNDUP(adapter->rx_buffer_len, 1024); - } else { - if(unlikely((adapter->hw.mac_type < e1000_82543) && - (max_frame > MAXIMUM_ETHERNET_FRAME_SIZE))) { - DPRINTK(PROBE, ERR, "Jumbo Frames not supported " - "on 82542\n"); - return -EINVAL; - - } else { - if(max_frame <= E1000_RXBUFFER_2048) { - adapter->rx_buffer_len = E1000_RXBUFFER_2048; - } else if(max_frame <= E1000_RXBUFFER_4096) { - adapter->rx_buffer_len = E1000_RXBUFFER_4096; - } else if(max_frame <= E1000_RXBUFFER_8192) { - adapter->rx_buffer_len = E1000_RXBUFFER_8192; - } else if(max_frame <= E1000_RXBUFFER_16384) { - adapter->rx_buffer_len = E1000_RXBUFFER_16384; - } - } - } + } else + adapter->rx_buffer_len = E1000_RXBUFFER_2048; netdev->mtu = new_mtu; @@ -3049,8 +3188,8 @@ e1000_update_stats(struct e1000_adapter adapter->net_stats.rx_errors = adapter->stats.rxerrc + adapter->stats.crcerrs + adapter->stats.algnerrc + - adapter->stats.rlec + adapter->stats.mpc + - adapter->stats.cexterr; + adapter->stats.rlec + adapter->stats.cexterr; + adapter->net_stats.rx_dropped = 0; adapter->net_stats.rx_length_errors = adapter->stats.rlec; adapter->net_stats.rx_crc_errors = adapter->stats.crcerrs; adapter->net_stats.rx_frame_errors = adapter->stats.algnerrc; @@ -3294,9 +3454,6 @@ e1000_clean_tx_irq(struct e1000_adapter 
E1000_STATUS_TXOFF)) { /* detected Tx unit hang */ - i = tx_ring->next_to_clean; - eop = tx_ring->buffer_info[i].next_to_watch; - eop_desc = E1000_TX_DESC(*tx_ring, eop); DPRINTK(DRV, ERR, "Detected Tx Unit Hang\n" " TDH <%x>\n" " TDT <%x>\n" @@ -3303,7 +3454,6 @@ " next_to_use <%x>\n" " next_to_clean <%x>\n" "buffer_info[next_to_clean]\n" - " dma <%llx>\n" " time_stamp <%lx>\n" " next_to_watch <%x>\n" " jiffies <%lx>\n" @@ -3311,9 +3477,8 @@ e1000_clean_tx_irq(struct e1000_adapter readl(adapter->hw.hw_addr + tx_ring->tdh), readl(adapter->hw.hw_addr + tx_ring->tdt), tx_ring->next_to_use, - i, - (unsigned long long)tx_ring->buffer_info[i].dma, - tx_ring->buffer_info[i].time_stamp, + tx_ring->next_to_clean, + tx_ring->buffer_info[eop].time_stamp, eop, jiffies, eop_desc->upper.fields.status); @@ -3391,46 +3388,98 @@ e1000_clean_rx_irq(struct e1000_adapter { struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; - struct e1000_rx_desc *rx_desc; - struct e1000_buffer *buffer_info; - struct sk_buff *skb; + struct e1000_rx_desc *rx_desc, *next_rxd; + struct e1000_buffer *buffer_info, *next_buffer, *next2_buffer; unsigned long flags; uint32_t length; uint8_t last_byte; - unsigned int i; - boolean_t cleaned = FALSE; + unsigned int i, j; + int cleaned_count = 0; + boolean_t cleaned = FALSE, multi_descriptor = FALSE; i = rx_ring->next_to_clean; rx_desc = E1000_RX_DESC(*rx_ring, i); + buffer_info = &rx_ring->buffer_info[i]; while(rx_desc->status & E1000_RXD_STAT_DD) { - buffer_info = &rx_ring->buffer_info[i]; + struct sk_buff *skb, *next_skb; + u8 status; + #ifdef CONFIG_E1000_NAPI if(*work_done >= work_to_do) break; (*work_done)++; #endif - cleaned = TRUE; + status = rx_desc->status; + skb = buffer_info->skb; + buffer_info->skb = NULL; + + prefetch(skb->data - NET_IP_ALIGN); + if(++i == rx_ring->count) i = 0; + next_rxd = E1000_RX_DESC(*rx_ring, i); + prefetch(next_rxd); + + if((j = i + 1) == rx_ring->count) j = 0; + next2_buffer = 
&rx_ring->buffer_info[j]; + prefetch(next2_buffer); + + next_buffer = &rx_ring->buffer_info[i]; + next_skb = next_buffer->skb; + prefetch(next_skb); + prefetch(next_skb->data - NET_IP_ALIGN); + + cleaned = TRUE; + cleaned_count++; pci_unmap_single(pdev, buffer_info->dma, buffer_info->length, PCI_DMA_FROMDEVICE); - skb = buffer_info->skb; length = le16_to_cpu(rx_desc->length); - if(unlikely(!(rx_desc->status & E1000_RXD_STAT_EOP))) { - /* All receives must fit into a single buffer */ - E1000_DBG("%s: Receive packet consumed multiple" - " buffers\n", netdev->name); - dev_kfree_skb_irq(skb); + if(!(status & E1000_RXD_STAT_EOP)) { + skb_put(skb, length); + if(!rx_ring->rx_skb_top) { + rx_ring->rx_skb_top = skb; + rx_ring->rx_skb_top->len = length; + rx_ring->rx_skb_prev = skb; + } else { + if(skb_shinfo(rx_ring->rx_skb_top)->frag_list) { + rx_ring->rx_skb_prev->next = skb; + skb->prev = rx_ring->rx_skb_prev; + } else { + skb_shinfo(rx_ring->rx_skb_top)->frag_list = skb; + } + rx_ring->rx_skb_prev = skb; + rx_ring->rx_skb_top->data_len += length; + } goto next_desc; + } else { + skb_put(skb, length); + if (rx_ring->rx_skb_top) { + if(skb_shinfo(rx_ring->rx_skb_top) + ->frag_list) { + rx_ring->rx_skb_prev->next = skb; + skb->prev = rx_ring->rx_skb_prev; + } else + skb_shinfo(rx_ring->rx_skb_top) + ->frag_list = skb; + + rx_ring->rx_skb_top->data_len += length; + rx_ring->rx_skb_top->len += + rx_ring->rx_skb_top->data_len; + + skb = rx_ring->rx_skb_top; + multi_descriptor = TRUE; + rx_ring->rx_skb_top = NULL; + rx_ring->rx_skb_prev = NULL; + } } if(unlikely(rx_desc->errors & E1000_RXD_ERR_FRAME_ERR_MASK)) { last_byte = *(skb->data + length - 1); - if(TBI_ACCEPT(&adapter->hw, rx_desc->status, + if(TBI_ACCEPT(&adapter->hw, status, rx_desc->errors, length, last_byte)) { spin_lock_irqsave(&adapter->stats_lock, flags); e1000_tbi_adjust_stats(&adapter->hw, @@ -3445,18 +3494,41 @@ e1000_clean_rx_irq(struct e1000_adapter } } - /* Good Receive */ - skb_put(skb, length - 
ETHERNET_FCS_SIZE); + /* code added for copybreak, this should improve + * performance for small packets with large amounts + * of reassembly being done in the stack */ +#define E1000_CB_LENGTH 256 + if((length < E1000_CB_LENGTH) && + !rx_ring->rx_skb_top && + /* or maybe (status & E1000_RXD_STAT_EOP) && */ + !multi_descriptor) { + struct sk_buff *new_skb = + dev_alloc_skb(length + NET_IP_ALIGN); + if(new_skb) { + skb_reserve(new_skb, NET_IP_ALIGN); + new_skb->dev = netdev; + memcpy(new_skb->data - NET_IP_ALIGN, + skb->data - NET_IP_ALIGN, + length + NET_IP_ALIGN); + /* save the skb in buffer_info as good */ + buffer_info->skb = skb; + skb = new_skb; + skb_put(skb, length); + } + } + + /* end copybreak code */ /* Receive Checksum Offload */ e1000_rx_checksum(adapter, - (uint32_t)(rx_desc->status) | + (uint32_t)(status) | ((uint32_t)(rx_desc->errors) << 24), rx_desc->csum, skb); + skb->protocol = eth_type_trans(skb, netdev); #ifdef CONFIG_E1000_NAPI if(unlikely(adapter->vlgrp && - (rx_desc->status & E1000_RXD_STAT_VP))) { + (status & E1000_RXD_STAT_VP))) { vlan_hwaccel_receive_skb(skb, adapter->vlgrp, le16_to_cpu(rx_desc->special) & E1000_RXD_SPC_VLAN_MASK); @@ -3465,7 +3537,7 @@ e1000_clean_rx_irq(struct e1000_adapter } #else /* CONFIG_E1000_NAPI */ if(unlikely(adapter->vlgrp && - (rx_desc->status & E1000_RXD_STAT_VP))) { + (status & E1000_RXD_STAT_VP))) { vlan_hwaccel_rx(skb, adapter->vlgrp, le16_to_cpu(rx_desc->special) & E1000_RXD_SPC_VLAN_MASK); @@ -3478,13 +3646,22 @@ e1000_clean_rx_irq(struct e1000_adapter next_desc: rx_desc->status = 0; - buffer_info->skb = NULL; - if(unlikely(++i == rx_ring->count)) i = 0; - rx_desc = E1000_RX_DESC(*rx_ring, i); + /* return some buffers to hardware, one at a time is too slow */ + if (unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) { + adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count); + cleaned_count = 0; + } + + /* use prefetched values */ + rx_desc = next_rxd; + buffer_info = next_buffer; } rx_ring->next_to_clean = 
i; - adapter->alloc_rx_buf(adapter, rx_ring); + + cleaned_count = E1000_DESC_UNUSED(rx_ring); + if (cleaned_count) + adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count); return cleaned; } @@ -3504,16 +3585,17 @@ e1000_clean_rx_irq_ps(struct e1000_adapt struct e1000_rx_ring *rx_ring) #endif { - union e1000_rx_desc_packet_split *rx_desc; + union e1000_rx_desc_packet_split *rx_desc, *next_rxd; struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; - struct e1000_buffer *buffer_info; + struct e1000_buffer *buffer_info, *next_buffer, *next2_buffer; struct e1000_ps_page *ps_page; struct e1000_ps_page_dma *ps_page_dma; - struct sk_buff *skb; + struct sk_buff *skb, *next_skb; unsigned int i, j; uint32_t length, staterr; boolean_t cleaned = FALSE; + int cleaned_count = 0; i = rx_ring->next_to_clean; rx_desc = E1000_RX_DESC_PS(*rx_ring, i); @@ -3528,13 +3610,29 @@ e1000_clean_rx_irq_ps(struct e1000_adapt break; (*work_done)++; #endif + skb = buffer_info->skb; + + prefetch(skb->data - NET_IP_ALIGN); + + if(++i == rx_ring->count) i = 0; + next_rxd = E1000_RX_DESC_PS(*rx_ring, i); + prefetch(next_rxd); + + if((j = i + 1) == rx_ring->count) j = 0; + next2_buffer = &rx_ring->buffer_info[j]; + prefetch(next2_buffer); + + next_buffer = &rx_ring->buffer_info[i]; + next_skb = next_buffer->skb; + prefetch(next_skb); + prefetch(next_skb->data - NET_IP_ALIGN); + cleaned = TRUE; + cleaned_count++; pci_unmap_single(pdev, buffer_info->dma, buffer_info->length, PCI_DMA_FROMDEVICE); - skb = buffer_info->skb; - if(unlikely(!(staterr & E1000_RXD_STAT_EOP))) { E1000_DBG("%s: Packet Split buffers didn't pick up" " the full packet\n", netdev->name); @@ -3610,13 +3780,24 @@ e1000_clean_rx_irq_ps(struct e1000_adapt next_desc: rx_desc->wb.middle.status_error &= ~0xFF; buffer_info->skb = NULL; - if(unlikely(++i == rx_ring->count)) i = 0; - rx_desc = E1000_RX_DESC_PS(*rx_ring, i); + /* return some buffers to hardware, one at a time is too slow */ + if 
(unlikely(cleaned_count >= E1000_RX_BUFFER_WRITE)) { + adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count); + cleaned_count = 0; + } + + /* use prefetched values */ + rx_desc = next_rxd; + buffer_info = next_buffer; + staterr = le32_to_cpu(rx_desc->wb.middle.status_error); } rx_ring->next_to_clean = i; - adapter->alloc_rx_buf(adapter, rx_ring); + + cleaned_count = E1000_DESC_UNUSED(rx_ring); + if (cleaned_count) + adapter->alloc_rx_buf(adapter, rx_ring, cleaned_count); return cleaned; } @@ -3628,7 +3737,8 @@ next_desc: static void e1000_alloc_rx_buffers(struct e1000_adapter *adapter, - struct e1000_rx_ring *rx_ring) + struct e1000_rx_ring *rx_ring, + int cleaned_count) { struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; @@ -3641,8 +3819,14 @@ e1000_alloc_rx_buffers(struct e1000_adap i = rx_ring->next_to_use; buffer_info = &rx_ring->buffer_info[i]; - while(!buffer_info->skb) { - skb = dev_alloc_skb(bufsz); + while(cleaned_count--) { + if(!(skb = buffer_info->skb)) + skb = dev_alloc_skb(bufsz); + else { + skb->tail = skb->head; + skb->len = 0; + goto map_skb; + } if(unlikely(!skb)) { /* Better luck next round */ @@ -3682,6 +3861,7 @@ e1000_alloc_rx_buffers(struct e1000_adap buffer_info->skb = skb; buffer_info->length = adapter->rx_buffer_len; +map_skb: buffer_info->dma = pci_map_single(pdev, skb->data, adapter->rx_buffer_len, @@ -3707,20 +3824,21 @@ e1000_alloc_rx_buffers(struct e1000_adap rx_desc = E1000_RX_DESC(*rx_ring, i); rx_desc->buffer_addr = cpu_to_le64(buffer_info->dma); - if(unlikely((i & ~(E1000_RX_BUFFER_WRITE - 1)) == i)) { - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). 
*/ - wmb(); - writel(i, adapter->hw.hw_addr + rx_ring->rdt); - } - if(unlikely(++i == rx_ring->count)) i = 0; buffer_info = &rx_ring->buffer_info[i]; } - rx_ring->next_to_use = i; + if (rx_ring->next_to_use != i) { + rx_ring->next_to_use = i; + if(unlikely(i-- == 0)) i = (rx_ring->count - 1); + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). */ + wmb(); + writel(i, adapter->hw.hw_addr + rx_ring->rdt); + } } /** @@ -3730,7 +3848,8 @@ e1000_alloc_rx_buffers(struct e1000_adap static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter, - struct e1000_rx_ring *rx_ring) + struct e1000_rx_ring *rx_ring, + int cleaned_count) { struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; @@ -3746,7 +3925,7 @@ e1000_alloc_rx_buffers_ps(struct e1000_a ps_page = &rx_ring->ps_page[i]; ps_page_dma = &rx_ring->ps_page_dma[i]; - while(!buffer_info->skb) { + while (cleaned_count--) { rx_desc = E1000_RX_DESC_PS(*rx_ring, i); for(j = 0; j < PS_PAGE_BUFFERS; j++) { @@ -3793,19 +3976,6 @@ e1000_alloc_rx_buffers_ps(struct e1000_a rx_desc->read.buffer_addr[0] = cpu_to_le64(buffer_info->dma); - if(unlikely((i & ~(E1000_RX_BUFFER_WRITE - 1)) == i)) { - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). */ - wmb(); - /* Hardware increments by 16 bytes, but packet split - * descriptors are 32 bytes...so we increment tail - * twice as much. 
- */ - writel(i<<1, adapter->hw.hw_addr + rx_ring->rdt); - } - if(unlikely(++i == rx_ring->count)) i = 0; buffer_info = &rx_ring->buffer_info[i]; ps_page = &rx_ring->ps_page[i]; @@ -3813,7 +3976,21 @@ } no_buffers: - rx_ring->next_to_use = i; + if (rx_ring->next_to_use != i) { + rx_ring->next_to_use = i; + if(unlikely(i-- == 0)) i = (rx_ring->count - 1); + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). */ + wmb(); + /* Hardware increments by 16 bytes, but packet split + * descriptors are 32 bytes...so we increment tail + * twice as much. + */ + writel(i<<1, adapter->hw.hw_addr + rx_ring->rdt); + } } /** - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html