This patch implements selective tx signaling for IPoIB.
It lets the user set the ratio between the number of
sent packets and the number of TX completion signals.
This optimization has the following advantages:
+ increase the packet per second (PPS) rate
+ reduce the number of interrupts related to ipoib tx completions
Since the IB HCA HW executes work requests posted to a QP in-order, we can
assume that a completion of a work request means that all the work
requests posted before it are also completed and hence their
associated resources (skbs in this context) can be recycled.
The current driver implementation asks for a completion signaling for every
sent packet (a ratio of 1).
This patch enables the user to set a higher ratio.
Asking for a completion signal for every n (>1) packets saves the following:
1. less interrupts to the host
2. the amortized cost for tx completion handling is lowered
3. the tx_lock is taken less often
The cost of selective signaling is in the average amount of memory that the
IPoIB
driver consumes since skbs are freed in the TX completion handler (which is now
executed less often).
So, if the current driver holds only a few skbs at any given time (and normally
not more than one) the new driver holds
skbs up to n (the ratio between sent packets and the number of tx completions).
A reasonable value of n
can lead to over-consumption of a few tens of Kbytes, but the real issue is
elsewhere. Applications that set the
socket buffer to a small size (with setsockopt()) may suffer from ENOBUFS
failures
when calling to sendto() or sendmsg(). A good example for this is ping and a
signaling ratio
of 16 packets to 1 completion request. In this case a few successful pings are
followed by an endless sequence
of errors (until ping restarts). The solution is to set n with attention to the
specific user applications and
to use setsockopt() with care (ping for instance, can be run with -S).
Another issue is related to the ipoib_ib_dev_stop() operation. This function
checks that the tail and head
of the tx_ring are equal and if they are not it assumes that there are
uncompleted work requests.
With this patch it is normal that the tail and head of the tx_ring would be
different since we are not always
asking for a completion notification. Since I don't see a way to tell if the
tail/head gap is normal or
due to a failure I only reduce the message severity from warn to dbg if the
condition for expected gap is true.
However, I still see a tiny chance that a completion notification would
arrive after the timeout in ipoib_ib_dev_stop()
expires and then it tries to free the skbs in the tx_ring(). Solutions to that
can be
1. protect the code with a lock - but I started with trying to avoid locks
2. reduce the hazard by adding more to the timeout and calling
test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); in the TX completion
handler to check if ipoib_ib_dev_stop()
had started.
I would be happy to get comments for the last issue and for the rest of the
patch of course.
thanks
> MoniS
ipoib.h | 2 ++
ipoib_ib.c | 39 ++++++++++++++++++++++++++++-----------
ipoib_main.c | 10 +++++++++-
ipoib_verbs.c | 4 ++--
4 files changed, 41 insertions(+), 14 deletions(-)
Signed-off-by: Moni Shoua <[EMAIL PROTECTED]>
---
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2007-01-07
15:39:49.421190295 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib.h 2007-01-07
15:42:33.768824668 +0200
@@ -164,6 +164,7 @@ struct ipoib_dev_priv {
struct ipoib_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
+ unsigned tx_completion_mark;
struct ib_sge tx_sge;
struct ib_send_wr tx_wr;
@@ -335,6 +336,7 @@ static inline void ipoib_unregister_debu
extern int ipoib_sendq_size;
extern int ipoib_recvq_size;
+extern int num_unsignal_tx;
extern struct ib_sa_client ipoib_sa_client;
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-01-07
15:39:49.443186365 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-01-07
19:29:21.885896644 +0200
@@ -256,29 +256,32 @@ static void ipoib_ib_handle_tx_wc(struct
return;
}
- tx_req = &priv->tx_ring[wr_id];
+ do {
+ tx_req = &priv->tx_ring[wr_id];
- ib_dma_unmap_single(priv->ca, tx_req->mapping,
- tx_req->skb->len, DMA_TO_DEVICE);
+ ib_dma_unmap_single(priv->ca, tx_req->mapping,
+ tx_req->skb->len, DMA_TO_DEVICE);
- ++priv->stats.tx_packets;
- priv->stats.tx_bytes += tx_req->skb->len;
+ ++priv->stats.tx_packets;
+ priv->stats.tx_bytes += tx_req->skb->len;
- dev_kfree_skb_any(tx_req->skb);
+ dev_kfree_skb_any(tx_req->skb);
+ } while (wr_id-- > priv->tx_completion_mark);
spin_lock_irqsave(&priv->tx_lock, flags);
- ++priv->tx_tail;
+ priv->tx_tail += wc->wr_id - priv->tx_completion_mark + 1;
if (netif_queue_stopped(dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
netif_wake_queue(dev);
spin_unlock_irqrestore(&priv->tx_lock, flags);
+ priv->tx_completion_mark = (wc->wr_id + 1) & (ipoib_sendq_size - 1);
if (wc->status != IB_WC_SUCCESS &&
wc->status != IB_WC_WR_FLUSH_ERR)
ipoib_warn(priv, "failed send event "
"(status=%d, wrid=%d vend_err %x)\n",
- wc->status, wr_id, wc->vendor_err);
+ wc->status, (unsigned int)wc->wr_id, wc->vendor_err);
}
static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
@@ -309,7 +312,11 @@ static inline int post_send(struct ipoib
u64 addr, int len)
{
struct ib_send_wr *bad_wr;
+ int send_signaled=0;
+ int ret;
+ if ((wr_id & num_unsignal_tx) == num_unsignal_tx)
+ send_signaled=1;
priv->tx_sge.addr = addr;
priv->tx_sge.length = len;
@@ -317,7 +324,11 @@ static inline int post_send(struct ipoib
priv->tx_wr.wr.ud.remote_qpn = qpn;
priv->tx_wr.wr.ud.ah = address;
- return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+ if (send_signaled)
+ priv->tx_wr.send_flags |= IB_SEND_SIGNALED;
+ ret = ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+ priv->tx_wr.send_flags &= ~IB_SEND_SIGNALED;
+ return ret;
}
void ipoib_send(struct net_device *dev, struct sk_buff *skb,
@@ -522,8 +533,14 @@ int ipoib_ib_dev_stop(struct net_device
while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
if (time_after(jiffies, begin + 5 * HZ)) {
-			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
-				   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+			if (recvs_pending(dev) ||
+			    (priv->tx_head - priv->tx_tail) > num_unsignal_tx)
+				ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+					   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+			else
+				ipoib_dbg(priv, "%d sends left in q."
+					  "probably sent without completion notification.\n",
+					  priv->tx_head - priv->tx_tail);
/*
* assume the HW is wedged and just free up
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-01-07
15:39:49.454184399 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-01-07
15:42:33.806817879 +0200
@@ -57,11 +57,15 @@ MODULE_LICENSE("Dual BSD/GPL");
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+int tx_signal_rate __read_mostly = 1;
+int num_unsignal_tx;
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+module_param_named(tx_signal_rate, tx_signal_rate, int, 0444);
+MODULE_PARM_DESC(tx_signal_rate, "Number of tx wr per wc. Must be power of 2");
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
@@ -849,7 +853,7 @@ int ipoib_dev_init(struct net_device *de
goto out_rx_ring_cleanup;
}
- /* priv->tx_head & tx_tail are already 0 */
+ /* tx_completion_mark, priv->tx_head & tx_tail are already 0 */
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
@@ -1179,6 +1183,10 @@ static int __init ipoib_init_module(void
ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
+ tx_signal_rate = roundup_pow_of_two(tx_signal_rate);
+ tx_signal_rate = min(tx_signal_rate, ipoib_sendq_size);
+ tx_signal_rate = max(tx_signal_rate, 1);
+ num_unsignal_tx = tx_signal_rate - 1;
ret = ipoib_register_debugfs();
if (ret)
return ret;
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2007-01-07
15:39:49.481179576 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2007-01-07
15:42:33.832813235 +0200
@@ -164,7 +164,7 @@ int ipoib_transport_dev_init(struct net_
.max_send_sge = 1,
.max_recv_sge = 1
},
- .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_UD
};
@@ -208,7 +208,7 @@ int ipoib_transport_dev_init(struct net_
priv->tx_wr.opcode = IB_WR_SEND;
priv->tx_wr.sg_list = &priv->tx_sge;
priv->tx_wr.num_sge = 1;
- priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+ priv->tx_wr.send_flags = 0;
return 0;
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general