We've had reports from customers of boxes with a week's uptime and 32k MTUs
getting failed order-3 allocs under:

        udp_sendmsg .. ip_append_data .. {sock_wmalloc, sock_alloc_send_skb}

I came up with the following which seems to do the right thing in a trivial
test.  It clamps the size of the first frag's allocation to a single page
if the dev supports _SG.  The rest of the message is copied into page frags
as usual. 

I'm *pretty sure* I got the math right, but this is some twisty, twisty code.
Please yell at me if I've messed something up.

Is this sort of thing mergable?  I'm happy to tweak the patch into whatever form
is acceptable.  Maybe a knob for the largest order that it should allow would
be appropriate?

- z

Index: 2.6.16-mm2-bigmtu/net/ipv4/ip_output.c
===================================================================
--- 2.6.16-mm2-bigmtu.orig/net/ipv4/ip_output.c
+++ 2.6.16-mm2-bigmtu/net/ipv4/ip_output.c
@@ -891,13 +891,26 @@ alloc_new_skb:
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
+
+                       alloclen = fragheaderlen + hh_len + 15;
+
+                       /* avoid multi page allocs */
+                       if ((rt->u.dst.dev->features&NETIF_F_SG) &&
+                           (SKB_DATA_KMALLOC_BYTES(alloclen + datalen)
+                                                               > PAGE_SIZE)) {
+                               datalen = SKB_MAX_ORDER(alloclen, 0);
+                               /* watch out for trailer_len */
+                               if (datalen == length)
+                                       datalen -= 2;
+                       }
+
                        fraglen = datalen + fragheaderlen;
 
                        if ((flags & MSG_MORE) && 
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = mtu;
+                               alloclen += mtu - fragheaderlen;
                        else
-                               alloclen = datalen + fragheaderlen;
+                               alloclen += datalen + fragheaderlen;
 
                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
@@ -908,15 +921,13 @@ alloc_new_skb:
                                alloclen += rt->u.dst.trailer_len;
 
                        if (transhdrlen) {
-                               skb = sock_alloc_send_skb(sk, 
-                                               alloclen + hh_len + 15,
+                               skb = sock_alloc_send_skb(sk, alloclen,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
-                                       skb = sock_wmalloc(sk, 
-                                                          alloclen + hh_len + 15, 1,
+                                       skb = sock_wmalloc(sk, alloclen, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
Index: 2.6.16-mm2-bigmtu/include/linux/skbuff.h
===================================================================
--- 2.6.16-mm2-bigmtu.orig/include/linux/skbuff.h
+++ 2.6.16-mm2-bigmtu/include/linux/skbuff.h
@@ -39,6 +39,8 @@
 
 #define SKB_DATA_ALIGN(X)      (((X) + (SMP_CACHE_BYTES - 1)) & \
                                 ~(SMP_CACHE_BYTES - 1))
+#define SKB_DATA_KMALLOC_BYTES(X) (SKB_DATA_ALIGN(X) + \
+                                 sizeof(struct skb_shared_info))
 #define SKB_MAX_ORDER(X, ORDER)        (((PAGE_SIZE << (ORDER)) - (X) - \
                                  sizeof(struct skb_shared_info)) & \
                                  ~(SMP_CACHE_BYTES - 1))
Index: 2.6.16-mm2-bigmtu/net/core/skbuff.c
===================================================================
--- 2.6.16-mm2-bigmtu.orig/net/core/skbuff.c
+++ 2.6.16-mm2-bigmtu/net/core/skbuff.c
@@ -148,8 +148,7 @@ struct sk_buff *__alloc_skb(unsigned int
                goto out;
 
        /* Get the DATA. Size must match skb_add_mtu(). */
-       size = SKB_DATA_ALIGN(size);
-       data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+       data = ____kmalloc(SKB_DATA_KMALLOC_BYTES(size), gfp_mask);
        if (!data)
                goto nodata;
 
@@ -1486,7 +1485,7 @@ void skb_insert(struct sk_buff *old, str
 void skb_add_mtu(int mtu)
 {
        /* Must match allocation in alloc_skb */
-       mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
+       mtu = SKB_DATA_KMALLOC_BYTES(mtu);
 
        kmem_add_cache_size(mtu);
 }
Index: 2.6.16-mm2-bigmtu/drivers/atm/iphase.c
===================================================================
--- 2.6.16-mm2-bigmtu.orig/drivers/atm/iphase.c
+++ 2.6.16-mm2-bigmtu/drivers/atm/iphase.c
@@ -1306,7 +1306,7 @@ static void rx_dle_intr(struct atm_dev *
           {
              atomic_inc(&vcc->stats->rx_err);
              dev_kfree_skb_any(skb);
-             atm_return(vcc, atm_guess_pdu2truesize(len));
+             atm_return(vcc, SKB_DATA_KMALLOC_BYTES(len));
              goto INCR_DLE;
            }
           // get real pkt length  pwang_test
@@ -1320,7 +1320,7 @@ static void rx_dle_intr(struct atm_dev *
              IF_ERR(printk("rx_dle_intr: Bad  AAL5 trailer %d (skb len %d)", 
                                                             length, skb->len);)
              dev_kfree_skb_any(skb);
-             atm_return(vcc, atm_guess_pdu2truesize(len));
+             atm_return(vcc, SKB_DATA_KMALLOC_BYTES(len));
              goto INCR_DLE;
           }
           skb_trim(skb, length);
Index: 2.6.16-mm2-bigmtu/include/linux/atmdev.h
===================================================================
--- 2.6.16-mm2-bigmtu.orig/include/linux/atmdev.h
+++ 2.6.16-mm2-bigmtu/include/linux/atmdev.h
@@ -418,17 +418,6 @@ void atm_dev_deregister(struct atm_dev *
 void vcc_insert_socket(struct sock *sk);
 
 
-/*
- * This is approximately the algorithm used by alloc_skb.
- *
- */
-
-static inline int atm_guess_pdu2truesize(int size)
-{
-       return (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info));
-}
-
-
 static inline void atm_force_charge(struct atm_vcc *vcc,int truesize)
 {
        atomic_add(truesize, &sk_atm(vcc)->sk_rmem_alloc);
Index: 2.6.16-mm2-bigmtu/net/atm/atm_misc.c
===================================================================
--- 2.6.16-mm2-bigmtu.orig/net/atm/atm_misc.c
+++ 2.6.16-mm2-bigmtu/net/atm/atm_misc.c
@@ -28,7 +28,7 @@ struct sk_buff *atm_alloc_charge(struct 
     gfp_t gfp_flags)
 {
        struct sock *sk = sk_atm(vcc);
-       int guess = atm_guess_pdu2truesize(pdu_size);
+       int guess = SKB_DATA_KMALLOC_BYTES(pdu_size);
 
        atm_force_charge(vcc,guess);
        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to