To handle possible multi segment mbufs, the driver would allocate
a worst case 64k buffer on the stack. Since each Tx queue is
single threaded, better to allocate the buffer from hugepage
with rte_malloc when queue is setup.

The buffer needs to be come from huge pages because the primary
process may start the device but the bounce buffer could be used
in transmit path by secondary process.

Using function rte_pktmbuf_free_bulk is marginally faster here.

Do proper handling of pcap_sendpacket() errors.
The function can return -1 in case of device queue being full.
This should not be counted as an error.

The driver has always handled multi-segment transmit
but the flag was never set in the offload capabilities.

Signed-off-by: Stephen Hemminger <[email protected]>
---
 drivers/net/pcap/pcap_ethdev.c | 114 +++++++++++++++++++--------------
 drivers/net/pcap/pcap_osdep.h  |  14 ++++
 2 files changed, 81 insertions(+), 47 deletions(-)

diff --git a/drivers/net/pcap/pcap_ethdev.c b/drivers/net/pcap/pcap_ethdev.c
index 806451dc99..84da41542b 100644
--- a/drivers/net/pcap/pcap_ethdev.c
+++ b/drivers/net/pcap/pcap_ethdev.c
@@ -12,6 +12,7 @@
 #include <errno.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <unistd.h>
 #include <pcap.h>
 
 #include <rte_cycles.h>
@@ -91,6 +92,9 @@ struct pcap_tx_queue {
        struct queue_stat tx_stat;
        char name[PATH_MAX];
        char type[ETH_PCAP_ARG_MAXLEN];
+
+       /* Temp buffer used for non-linear packets */
+       uint8_t *bounce_buf;
 };
 
 struct pmd_internals {
@@ -385,18 +389,17 @@ static uint16_t
 eth_pcap_tx_dumper(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
        unsigned int i;
-       struct rte_mbuf *mbuf;
        struct pmd_process_private *pp;
        struct pcap_tx_queue *dumper_q = queue;
        uint16_t num_tx = 0;
        uint32_t tx_bytes = 0;
        struct pcap_pkthdr header;
        pcap_dumper_t *dumper;
-       unsigned char temp_data[RTE_ETH_PCAP_SNAPLEN];
-       size_t len, caplen;
+       unsigned char *temp_data;
 
        pp = rte_eth_devices[dumper_q->port_id].process_private;
        dumper = pp->tx_dumper[dumper_q->queue_id];
+       temp_data = dumper_q->bounce_buf;
 
        if (dumper == NULL || nb_pkts == 0)
                return 0;
@@ -404,27 +407,24 @@ eth_pcap_tx_dumper(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
        /* writes the nb_pkts packets to the previously opened pcap file
         * dumper */
        for (i = 0; i < nb_pkts; i++) {
-               mbuf = bufs[i];
+               struct rte_mbuf *mbuf = bufs[i];
+               uint32_t len, caplen;
+               const uint8_t *data;
+
                len = caplen = rte_pktmbuf_pkt_len(mbuf);
-               if (unlikely(!rte_pktmbuf_is_contiguous(mbuf) &&
-                               len > sizeof(temp_data))) {
-                       caplen = sizeof(temp_data);
-               }
 
                calculate_timestamp(&header.ts);
                header.len = len;
                header.caplen = caplen;
-               /* rte_pktmbuf_read() returns a pointer to the data directly
-                * in the mbuf (when the mbuf is contiguous) or, otherwise,
-                * a pointer to temp_data after copying into it.
-                */
-               pcap_dump((u_char *)dumper, &header,
-                       rte_pktmbuf_read(mbuf, 0, caplen, temp_data));
 
-               num_tx++;
-               tx_bytes += caplen;
-               rte_pktmbuf_free(mbuf);
+               data = rte_pktmbuf_read(mbuf, 0, caplen, temp_data);
+               if (likely(data != NULL)) {
+                       pcap_dump((u_char *)dumper, &header, data);
+                       num_tx++;
+                       tx_bytes += caplen;
+               }
        }
+       rte_pktmbuf_free_bulk(bufs, nb_pkts);
 
        /*
         * Since there's no place to hook a callback when the forwarding
@@ -449,71 +449,74 @@ eth_tx_drop(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
        uint32_t tx_bytes = 0;
        struct pcap_tx_queue *tx_queue = queue;
 
-       if (unlikely(nb_pkts == 0))
-               return 0;
-
-       for (i = 0; i < nb_pkts; i++) {
+       for (i = 0; i < nb_pkts; i++)
                tx_bytes += bufs[i]->pkt_len;
-               rte_pktmbuf_free(bufs[i]);
-       }
+
+       rte_pktmbuf_free_bulk(bufs, nb_pkts);
 
        tx_queue->tx_stat.pkts += nb_pkts;
        tx_queue->tx_stat.bytes += tx_bytes;
 
-       return i;
+       return nb_pkts;
 }
 
 /*
- * Callback to handle sending packets through a real NIC.
+ * Send a burst of packets to a pcap device.
+ *
+ * On Linux, pcap_sendpacket() calls send() on a blocking PF_PACKET
+ * socket with default kernel buffer sizes and no TX ring (PACKET_TX_RING).
+ * The send() call only blocks when the kernel socket send buffer is full,
+ * providing limited backpressure.
+ *
+ * On error, pcap_sendpacket() returns non-zero and the loop breaks,
+ * leaving remaining packets unsent.
+ *
+ * Bottom line: backpressure is not an error.
  */
 static uint16_t
 eth_pcap_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
        unsigned int i;
-       int ret;
-       struct rte_mbuf *mbuf;
        struct pmd_process_private *pp;
        struct pcap_tx_queue *tx_queue = queue;
        uint16_t num_tx = 0;
        uint32_t tx_bytes = 0;
        pcap_t *pcap;
-       unsigned char temp_data[RTE_ETH_PCAP_SNAPLEN];
-       size_t len;
+       unsigned char *temp_data;
 
        pp = rte_eth_devices[tx_queue->port_id].process_private;
        pcap = pp->tx_pcap[tx_queue->queue_id];
+       temp_data = tx_queue->bounce_buf;
 
        if (unlikely(nb_pkts == 0 || pcap == NULL))
                return 0;
 
        for (i = 0; i < nb_pkts; i++) {
-               mbuf = bufs[i];
-               len = rte_pktmbuf_pkt_len(mbuf);
-               if (unlikely(!rte_pktmbuf_is_contiguous(mbuf) &&
-                               len > sizeof(temp_data))) {
-                       PMD_LOG(ERR,
-                               "Dropping multi segment PCAP packet. Size (%zd) 
> max size (%zd).",
-                               len, sizeof(temp_data));
-                       rte_pktmbuf_free(mbuf);
+               struct rte_mbuf *mbuf = bufs[i];
+               uint32_t len = rte_pktmbuf_pkt_len(mbuf);
+
+               if (unlikely(!rte_pktmbuf_is_contiguous(mbuf) && len > 
RTE_ETH_PCAP_SNAPSHOT_LEN)) {
+                       PMD_TX_LOG(ERR,
+                               "Dropping multi segment PCAP packet. Size (%u) 
> max size (%u).",
+                               len, RTE_ETH_PCAP_SNAPSHOT_LEN);
+                       tx_queue->tx_stat.err_pkts++;
                        continue;
                }
 
-               /* rte_pktmbuf_read() returns a pointer to the data directly
-                * in the mbuf (when the mbuf is contiguous) or, otherwise,
-                * a pointer to temp_data after copying into it.
-                */
-               ret = pcap_sendpacket(pcap,
-                       rte_pktmbuf_read(mbuf, 0, len, temp_data), len);
-               if (unlikely(ret != 0))
+               if (pcap_sendpacket(pcap, rte_pktmbuf_read(mbuf, 0, len, 
temp_data), len) != 0) {
+                       /* Unfortunately, libpcap collapses transient and hard 
errors. */
+                       PMD_TX_LOG(ERR, "pcap_sendpacket() failed: %s", 
pcap_geterr(pcap));
                        break;
+               }
+
                num_tx++;
                tx_bytes += len;
-               rte_pktmbuf_free(mbuf);
        }
 
+       rte_pktmbuf_free_bulk(bufs, i);
+
        tx_queue->tx_stat.pkts += num_tx;
        tx_queue->tx_stat.bytes += tx_bytes;
-       tx_queue->tx_stat.err_pkts += i - num_tx;
 
        return i;
 }
@@ -753,6 +756,7 @@ eth_dev_info(struct rte_eth_dev *dev,
        dev_info->max_rx_queues = dev->data->nb_rx_queues;
        dev_info->max_tx_queues = dev->data->nb_tx_queues;
        dev_info->min_rx_bufsize = 0;
+       dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
 
        return 0;
 }
@@ -965,7 +969,7 @@ static int
 eth_tx_queue_setup(struct rte_eth_dev *dev,
                uint16_t tx_queue_id,
                uint16_t nb_tx_desc __rte_unused,
-               unsigned int socket_id __rte_unused,
+               unsigned int socket_id,
                const struct rte_eth_txconf *tx_conf __rte_unused)
 {
        struct pmd_internals *internals = dev->data->dev_private;
@@ -973,11 +977,26 @@ eth_tx_queue_setup(struct rte_eth_dev *dev,
 
        pcap_q->port_id = dev->data->port_id;
        pcap_q->queue_id = tx_queue_id;
+       pcap_q->bounce_buf = rte_malloc_socket(NULL, RTE_ETH_PCAP_SNAPSHOT_LEN,
+                                              RTE_CACHE_LINE_SIZE, socket_id);
+       if (pcap_q->bounce_buf == NULL)
+               return -ENOMEM;
+
        dev->data->tx_queues[tx_queue_id] = pcap_q;
 
        return 0;
 }
 
+static void
+eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t tx_queue_id)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+       struct pcap_tx_queue *pcap_q = &internals->tx_queue[tx_queue_id];
+
+       rte_free(pcap_q->bounce_buf);
+       pcap_q->bounce_buf = NULL;
+}
+
 static int
 eth_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
@@ -1018,6 +1037,7 @@ static const struct eth_dev_ops ops = {
        .dev_infos_get = eth_dev_info,
        .rx_queue_setup = eth_rx_queue_setup,
        .tx_queue_setup = eth_tx_queue_setup,
+       .tx_queue_release = eth_tx_queue_release,
        .rx_queue_start = eth_rx_queue_start,
        .tx_queue_start = eth_tx_queue_start,
        .rx_queue_stop = eth_rx_queue_stop,
diff --git a/drivers/net/pcap/pcap_osdep.h b/drivers/net/pcap/pcap_osdep.h
index a0e2b5ace9..fe7399ff9f 100644
--- a/drivers/net/pcap/pcap_osdep.h
+++ b/drivers/net/pcap/pcap_osdep.h
@@ -13,6 +13,20 @@
 extern int eth_pcap_logtype;
 #define RTE_LOGTYPE_ETH_PCAP eth_pcap_logtype
 
+#ifdef RTE_ETHDEV_DEBUG_RX
+#define PMD_RX_LOG(level, ...) \
+       RTE_LOG_LINE_PREFIX(level, ETH_PCAP, "%s() rx: ", __func__, __VA_ARGS__)
+#else
+#define PMD_RX_LOG(...) do { } while (0)
+#endif
+
+#ifdef RTE_ETHDEV_DEBUG_TX
+#define PMD_TX_LOG(level, ...) \
+       RTE_LOG_LINE_PREFIX(level, ETH_PCAP, "%s() tx: ", __func__, __VA_ARGS__)
+#else
+#define PMD_TX_LOG(...) do { } while (0)
+#endif
+
 int osdep_iface_index_get(const char *name);
 int osdep_iface_mac_get(const char *name, struct rte_ether_addr *mac);
 
-- 
2.51.0

Reply via email to