Hello again,

This patch introduces the use of rcu for the ipv4 established connections 
hashtable, as well as the timewait table since they are closely intertwined.  
This removes 4 atomic operations per packet from the tcp_v4_rcv codepath, 
which helps quite a bit when the other performance barriers in the system 
are removed.  Eliminating the rwlock cache bouncing should also help on SMP 
systems.

By itself, this improves local netperf performance on a P4/HT by ~260Mbit/s 
on average.  With smaller packets (say, ethernet size) the difference should 
be larger.

Note that this patch changes the semantics of __inet_lookup() and 
__inet_lookup_established() to *not* perform a sock_hold().  This way 
we can avoid the atomic inc and dec of the reference count and rely on 
rcu pinning the socket over the scope of the rcu_read_lock_bh() region.

Only minimal fixes for ipv6 and dccp to use the renamed wlock, as I am not 
set up to test them.  I have stared at this patch for a while, and I think 
it's correct, but more eyes are definitely warranted.  Most of the issues 
regarding the state and lifespan of a struct sock are already handled by 
the network layer as the socket can be locked after being retrieved from 
the connection hashtable, we just have to be careful about reinitializing 
fields that the rcu hash list depends on.

                -ben

Before: max 7838.86, min 7771.90, avg 7811.68
 87380  16384  16384    10.01      7793.04   90.56    90.56    1.904   1.904 
 87380  16384  16384    10.01      7806.54   91.61    91.61    1.923   1.923 
 87380  16384  16384    10.00      7819.29   90.80    90.80    1.903   1.903 
 87380  16384  16384    10.00      7815.89   90.70    90.70    1.901   1.901 
 87380  16384  16384    10.01      7771.90   91.66    91.66    1.932   1.932 
 87380  16384  16384    10.00      7831.59   90.20    90.20    1.887   1.887 
 87380  16384  16384    10.01      7796.56   91.26    91.26    1.918   1.918 
 87380  16384  16384    10.01      7838.86   89.26    89.26    1.866   1.866 
 87380  16384  16384    10.01      7835.44   90.56    90.56    1.894   1.894 


After: max 8113.70, min 8026.32, avg 8072.34
 87380  16384  16384    10.01      8045.55   87.11    87.11    1.774   1.774 
 87380  16384  16384    10.01      8065.14   90.86    90.86    1.846   1.846 
 87380  16384  16384    10.00      8077.76   89.85    89.85    1.822   1.822 
 87380  16384  16384    10.00      8026.32   89.80    89.80    1.833   1.833 
 87380  16384  16384    10.01      8108.59   89.81    89.81    1.815   1.815 
 87380  16384  16384    10.01      8034.53   89.01    89.01    1.815   1.815 
 87380  16384  16384    10.00      8113.70   90.45    90.45    1.827   1.827 
 87380  16384  16384    10.00      8111.37   89.90    89.90    1.816   1.816 
 87380  16384  16384    10.01      8077.75   87.96    87.96    1.784   1.784 
 87380  16384  16384    10.00      8062.70   90.25    90.25    1.834   1.834 

                -ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <[EMAIL PROTECTED]>.

Signed-off-by: Benjamin LaHaise <[EMAIL PROTECTED]>
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 25f708f..73b05ab 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -65,7 +65,7 @@ static inline void __inet6_hash(struct i
                sk->sk_hash = hash = inet6_sk_ehashfn(sk);
                hash &= (hashinfo->ehash_size - 1);
                list = &hashinfo->ehash[hash].chain;
-               lock = &hashinfo->ehash[hash].lock;
+               lock = &hashinfo->ehash[hash].wlock;
                write_lock(lock);
        }
 
@@ -98,7 +98,7 @@ static inline struct sock *
        struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
        prefetch(head->chain.first);
-       read_lock(&head->lock);
+       read_lock(&head->wlock);
        sk_for_each(sk, node, &head->chain) {
                /* For IPV6 do the cheaper port and family tests first. */
                if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -118,12 +118,12 @@ static inline struct sock *
                                goto hit;
                }
        }
-       read_unlock(&head->lock);
+       read_unlock(&head->wlock);
        return NULL;
 
 hit:
        sock_hold(sk);
-       read_unlock(&head->lock);
+       read_unlock(&head->wlock);
        return sk;
 }
 
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 135d80f..4cde832 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -39,7 +39,7 @@
  * for the rest.  I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-       rwlock_t          lock;
+       rwlock_t wlock;
        struct hlist_head chain;
 };
 
@@ -224,7 +224,7 @@ static inline void __inet_hash(struct in
                sk->sk_hash = inet_sk_ehashfn(sk);
                head = inet_ehash_bucket(hashinfo, sk->sk_hash);
                list = &head->chain;
-               lock = &head->lock;
+               lock = &head->wlock;
                write_lock(lock);
        }
        __sk_add_node(sk, list);
@@ -255,7 +255,7 @@ static inline void inet_unhash(struct in
                inet_listen_wlock(hashinfo);
                lock = &hashinfo->lhash_lock;
        } else {
-               lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+               lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->wlock;
                write_lock_bh(lock);
        }
 
@@ -279,14 +279,13 @@ extern struct sock *__inet_lookup_listen
 
 /* Optimize the common listener case. */
 static inline struct sock *
-               inet_lookup_listener(struct inet_hashinfo *hashinfo,
+               _inet_lookup_listener(struct inet_hashinfo *hashinfo,
                                     const u32 daddr,
                                     const unsigned short hnum, const int dif)
 {
        struct sock *sk = NULL;
        const struct hlist_head *head;
 
-       read_lock(&hashinfo->lhash_lock);
        head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
        if (!hlist_empty(head)) {
                const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
@@ -298,10 +297,21 @@ static inline struct sock *
                        goto sherry_cache;
                sk = __inet_lookup_listener(head, daddr, hnum, dif);
        }
-       if (sk) {
 sherry_cache:
+       return sk;
+}
+
+static inline struct sock *
+               inet_lookup_listener(struct inet_hashinfo *hashinfo,
+                                    const u32 daddr,
+                                    const unsigned short hnum, const int dif)
+{
+       struct sock *sk;
+
+       read_lock(&hashinfo->lhash_lock);
+       sk = _inet_lookup_listener(hashinfo, daddr, hnum, dif);
+       if (sk)
                sock_hold(sk);
-       }
        read_unlock(&hashinfo->lhash_lock);
        return sk;
 }
@@ -353,7 +363,7 @@ sherry_cache:
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
  *
- * Local BH must be disabled here.
+ * This is an RCU read section, the caller must do an rcu_read_lock_bh().
  */
 static inline struct sock *
        __inet_lookup_established(struct inet_hashinfo *hashinfo,
@@ -371,8 +381,6 @@ static inline struct sock *
        unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
        struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
-       prefetch(head->chain.first);
-       read_lock(&head->lock);
        sk_for_each(sk, node, &head->chain) {
                if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
                        goto hit; /* You sunk my battleship! */
@@ -385,10 +393,8 @@ static inline struct sock *
        }
        sk = NULL;
 out:
-       read_unlock(&head->lock);
        return sk;
 hit:
-       sock_hold(sk);
        goto out;
 }
 
@@ -399,7 +405,7 @@ static inline struct sock *__inet_lookup
 {
        struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, 
daddr,
                                                    hnum, dif);
-       return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+       return sk ? : _inet_lookup_listener(hashinfo, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
@@ -409,9 +415,11 @@ static inline struct sock *inet_lookup(s
 {
        struct sock *sk;
 
-       local_bh_disable();
+       rcu_read_lock_bh();
        sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
-       local_bh_enable();
+       if (sk)
+               sock_hold(sk);
+       rcu_read_unlock_bh();
 
        return sk;
 }
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 1da294c..1341d2e 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -22,6 +22,7 @@
 #include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/rcupdate.h>
 
 #include <net/inet_sock.h>
 #include <net/sock.h>
@@ -134,6 +135,7 @@ struct inet_timewait_sock {
        unsigned long           tw_ttd;
        struct inet_bind_bucket *tw_tb;
        struct hlist_node       tw_death_node;
+       struct rcu_head         tw_rcu_head;
 };
 
 static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
@@ -193,16 +195,15 @@ static inline u32 inet_rcv_saddr(const s
                inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
 }
 
+extern void __inet_twsk_put(struct rcu_head *tw_rcu);
 static inline void inet_twsk_put(struct inet_timewait_sock *tw)
 {
        if (atomic_dec_and_test(&tw->tw_refcnt)) {
-               struct module *owner = tw->tw_prot->owner;
 #ifdef SOCK_REFCNT_DEBUG
                printk(KERN_DEBUG "%s timewait_sock %p released\n",
                       tw->tw_prot->name, tw);
 #endif
-               kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
-               module_put(owner);
+               call_rcu_bh(&tw->tw_rcu_head, __inet_twsk_put);
        }
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 3075803..57e5f6b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -251,6 +251,8 @@ struct sock {
        int                     (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);  
        void                    (*sk_destruct)(struct sock *sk);
+
+       struct rcu_head         sk_rcu_head;
 };
 
 /*
@@ -274,12 +276,12 @@ static inline struct sock *sk_next(const
 
 static inline int sk_unhashed(const struct sock *sk)
 {
-       return hlist_unhashed(&sk->sk_node);
+       return sk->sk_node.pprev == NULL || sk->sk_node.pprev == LIST_POISON2;
 }
 
 static inline int sk_hashed(const struct sock *sk)
 {
-       return sk->sk_node.pprev != NULL;
+       return sk->sk_node.pprev != NULL && sk->sk_node.pprev != LIST_POISON2;
 }
 
 static __inline__ void sk_node_init(struct hlist_node *node)
@@ -289,14 +291,13 @@ static __inline__ void sk_node_init(stru
 
 static __inline__ void __sk_del_node(struct sock *sk)
 {
-       __hlist_del(&sk->sk_node);
+       hlist_del_rcu(&sk->sk_node);
 }
 
 static __inline__ int __sk_del_node_init(struct sock *sk)
 {
        if (sk_hashed(sk)) {
                __sk_del_node(sk);
-               sk_node_init(&sk->sk_node);
                return 1;
        }
        return 0;
@@ -335,7 +336,7 @@ static __inline__ int sk_del_node_init(s
 
 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
-       hlist_add_head(&sk->sk_node, list);
+       hlist_add_head_rcu(&sk->sk_node, list);
 }
 
 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
@@ -346,27 +347,24 @@ static __inline__ void sk_add_node(struc
 
 static __inline__ void __sk_del_bind_node(struct sock *sk)
 {
-       __hlist_del(&sk->sk_bind_node);
+       hlist_del_rcu(&sk->sk_bind_node);
 }
 
 static __inline__ void sk_add_bind_node(struct sock *sk,
                                        struct hlist_head *list)
 {
-       hlist_add_head(&sk->sk_bind_node, list);
+       hlist_add_head_rcu(&sk->sk_bind_node, list);
 }
 
 #define sk_for_each(__sk, node, list) \
-       hlist_for_each_entry(__sk, node, list, sk_node)
+       hlist_for_each_entry_rcu(__sk, node, list, sk_node)
 #define sk_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
                hlist_for_each_entry_from(__sk, node, sk_node)
-#define sk_for_each_continue(__sk, node) \
-       if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
-               hlist_for_each_entry_continue(__sk, node, sk_node)
 #define sk_for_each_safe(__sk, node, tmp, list) \
        hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
-       hlist_for_each_entry(__sk, node, list, sk_bind_node)
+       hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e00811..f152783 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -677,8 +677,9 @@ out_free:
        return NULL;
 }
 
-void sk_free(struct sock *sk)
+void __sk_free(struct rcu_head *rcu)
 {
+       struct sock *sk = container_of(rcu, struct sock, sk_rcu_head);
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;
 
@@ -705,6 +706,11 @@ void sk_free(struct sock *sk)
        module_put(owner);
 }
 
+void sk_free(struct sock *sk)
+{
+       call_rcu_bh(&sk->sk_rcu_head, __sk_free);
+}
+
 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 7098f10..33f6dbf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -612,18 +612,22 @@ static struct sock *dccp_v4_hnd_req(stru
        if (req != NULL)
                return dccp_check_req(sk, skb, req, prev);
 
+       rcu_read_lock_bh();
        nsk = __inet_lookup_established(&dccp_hashinfo,
                                        iph->saddr, dh->dccph_sport,
                                        iph->daddr, ntohs(dh->dccph_dport),
                                        inet_iif(skb));
        if (nsk != NULL) {
                if (nsk->sk_state != DCCP_TIME_WAIT) {
+                       sock_hold(nsk);
                        bh_lock_sock(nsk);
+                       rcu_read_unlock_bh();
                        return nsk;
                }
-               inet_twsk_put((struct inet_timewait_sock *)nsk);
+               rcu_read_unlock_bh();
                return NULL;
        }
+       rcu_read_unlock_bh();
 
        return sk;
 }
@@ -925,6 +929,7 @@ static int dccp_v4_rcv(struct sk_buff *s
 
        /* Step 2:
         *      Look up flow ID in table and get corresponding socket */
+       rcu_read_lock_bh();
        sk = __inet_lookup(&dccp_hashinfo,
                           skb->nh.iph->saddr, dh->dccph_sport,
                           skb->nh.iph->daddr, ntohs(dh->dccph_dport),
@@ -939,9 +944,13 @@ static int dccp_v4_rcv(struct sk_buff *s
        if (sk == NULL) {
                dccp_pr_debug("failed to look up flow ID in table and "
                              "get corresponding socket\n");
+               rcu_read_unlock_bh();
                goto no_dccp_socket;
        }
 
+       sock_hold(sk);
+       rcu_read_unlock_bh();
+
        /* 
         * Step 2:
         *      ... or S.state == TIMEWAIT,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9e97ce6..304d1a9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -986,7 +986,7 @@ static int __init dccp_init(void)
        }
 
        for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
-               rwlock_init(&dccp_hashinfo.ehash[i].lock);
+               rwlock_init(&dccp_hashinfo.ehash[i].wlock);
                INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
        }
 
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 457db99..b3713e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -749,7 +749,7 @@ skip_listen_ht:
                if (i > s_i)
                        s_num = 0;
 
-               read_lock_bh(&head->lock);
+               rcu_read_lock_bh(); /* &head->lock */
                num = 0;
                sk_for_each(sk, node, &head->chain) {
                        struct inet_sock *inet = inet_sk(sk);
@@ -765,7 +765,7 @@ skip_listen_ht:
                            r->id.idiag_dport)
                                goto next_normal;
                        if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-                               read_unlock_bh(&head->lock);
+                               rcu_read_unlock_bh(); /* &head->lock */
                                goto done;
                        }
 next_normal:
@@ -787,14 +787,14 @@ next_normal:
                                    r->id.idiag_dport)
                                        goto next_dying;
                                if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-                                       read_unlock_bh(&head->lock);
+                                       rcu_read_unlock_bh(); /* &head->lock */
                                        goto done;
                                }
 next_dying:
                                ++num;
                        }
                }
-               read_unlock_bh(&head->lock);
+               rcu_read_unlock_bh(); /* &head->lock */
        }
 
 done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3322811..791b9a9 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -185,7 +185,7 @@ static int __inet_check_established(stru
        struct inet_timewait_sock *tw;
 
        prefetch(head->chain.first);
-       write_lock(&head->lock);
+       write_lock(&head->wlock);
 
        /* Check TIME-WAIT sockets first. */
        sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
@@ -213,9 +213,9 @@ unique:
        inet->sport = htons(lport);
        sk->sk_hash = hash;
        BUG_TRAP(sk_unhashed(sk));
-       __sk_add_node(sk, &head->chain);
        sock_prot_inc_use(sk->sk_prot);
-       write_unlock(&head->lock);
+       __sk_add_node(sk, &head->chain);
+       write_unlock(&head->wlock);
 
        if (twp) {
                *twp = tw;
@@ -231,7 +231,7 @@ unique:
        return 0;
 
 not_unique:
-       write_unlock(&head->lock);
+       write_unlock(&head->wlock);
        return -EADDRNOTAVAIL;
 }
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 417f126..c973f1c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,17 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
+void __inet_twsk_put(struct rcu_head *tw_rcu)
+{
+       struct inet_timewait_sock *tw = container_of(tw_rcu,
+                                       struct inet_timewait_sock, tw_rcu_head);
+       struct module *owner = tw->tw_prot->owner;
+       sk_node_init(&tw->tw_node);
+       kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+       module_put(owner);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_put);
+
 /* Must be called with locally disabled BHs. */
 void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo 
*hashinfo)
 {
@@ -22,14 +33,13 @@ void __inet_twsk_kill(struct inet_timewa
        /* Unlink from established hashes. */
        struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, 
tw->tw_hash);
 
-       write_lock(&ehead->lock);
-       if (hlist_unhashed(&tw->tw_node)) {
-               write_unlock(&ehead->lock);
+       write_lock(&ehead->wlock);
+       if (hlist_unhashed(&tw->tw_node) || tw->tw_node.pprev == LIST_POISON2) {
+               write_unlock(&ehead->wlock);
                return;
        }
-       __hlist_del(&tw->tw_node);
-       sk_node_init(&tw->tw_node);
-       write_unlock(&ehead->lock);
+       hlist_del_rcu(&tw->tw_node);
+       write_unlock(&ehead->wlock);
 
        /* Disassociate with bind bucket. */
        bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, 
hashinfo->bhash_size)];
@@ -73,7 +83,7 @@ void __inet_twsk_hashdance(struct inet_t
        inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
        spin_unlock(&bhead->lock);
 
-       write_lock(&ehead->lock);
+       write_lock(&ehead->wlock);
 
        /* Step 2: Remove SK from established hash. */
        if (__sk_del_node_init(sk))
@@ -83,7 +93,7 @@ void __inet_twsk_hashdance(struct inet_t
        inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
        atomic_inc(&tw->tw_refcnt);
 
-       write_unlock(&ehead->lock);
+       write_unlock(&ehead->wlock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 00aa80e..1e34030 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2072,7 +2072,7 @@ void __init tcp_init(void)
                                        0);
        tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
        for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
-               rwlock_init(&tcp_hashinfo.ehash[i].lock);
+               rwlock_init(&tcp_hashinfo.ehash[i].wlock);
                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
        }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4eb903d..7690263 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -918,7 +918,7 @@ exit:
        return NULL;
 }
 
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static noinline struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff 
*skb)
 {
        struct tcphdr *th = skb->h.th;
        struct iphdr *iph = skb->nh.iph;
@@ -930,18 +930,22 @@ static struct sock *tcp_v4_hnd_req(struc
        if (req)
                return tcp_check_req(sk, skb, req, prev);
 
+       rcu_read_lock_bh();
        nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
                                        th->source, skb->nh.iph->daddr,
                                        ntohs(th->dest), inet_iif(skb));
 
        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
+                       sock_hold(nsk);
                        bh_lock_sock(nsk);
+                       rcu_read_unlock_bh();
                        return nsk;
                }
-               inet_twsk_put((struct inet_timewait_sock *)nsk);
+               rcu_read_unlock_bh();
                return NULL;
        }
+       rcu_read_unlock_bh();
 
 #ifdef CONFIG_SYN_COOKIES
        if (!th->rst && !th->syn && th->ack)
@@ -1035,6 +1039,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
        struct sock *sk;
        int ret;
 
+       rcu_read_lock_bh();
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;
 
@@ -1071,7 +1076,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
        sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
                           skb->nh.iph->daddr, ntohs(th->dest),
                           inet_iif(skb));
-
        if (!sk)
                goto no_tcp_socket;
 
@@ -1097,7 +1101,7 @@ process:
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
 
-       sock_put(sk);
+       rcu_read_unlock_bh();
 
        return ret;
 
@@ -1113,15 +1117,18 @@ bad_packet:
        }
 
 discard_it:
+       rcu_read_unlock_bh();
+
        /* Discard frame. */
        kfree_skb(skb);
        return 0;
 
 discard_and_relse:
-       sock_put(sk);
+       rcu_read_unlock_bh();
        goto discard_it;
 
 do_time_wait:
+       sock_hold(sk);
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
@@ -1424,7 +1431,6 @@ static void *established_get_first(struc
                /* We can reschedule _before_ having picked the target: */
                cond_resched_softirq();
 
-               read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
                        if (sk->sk_family != st->family) {
                                continue;
@@ -1441,7 +1447,6 @@ static void *established_get_first(struc
                        rc = tw;
                        goto out;
                }
-               read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;
        }
 out:
@@ -1468,16 +1473,14 @@ get_tw:
                        cur = tw;
                        goto out;
                }
-               read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
                st->state = TCP_SEQ_STATE_ESTABLISHED;
 
-               /* We can reschedule between buckets: */
+               /* We can reschedule _before_ having picked the target: */
                cond_resched_softirq();
 
-               if (++st->bucket < tcp_hashinfo.ehash_size) {
-                       read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+               if (++st->bucket < tcp_hashinfo.ehash_size)
                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
-               } else {
+               else {
                        cur = NULL;
                        goto out;
                }
@@ -1520,7 +1523,7 @@ static void *tcp_get_idx(struct seq_file
 
        if (!rc) {
                inet_listen_unlock(&tcp_hashinfo);
-               local_bh_disable();
+               rcu_read_lock_bh();
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                rc        = established_get_idx(seq, pos);
        }
@@ -1553,7 +1556,7 @@ static void *tcp_seq_next(struct seq_fil
                rc = listening_get_next(seq, v);
                if (!rc) {
                        inet_listen_unlock(&tcp_hashinfo);
-                       local_bh_disable();
+                       rcu_read_lock_bh();
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        rc        = established_get_first(seq);
                }
@@ -1584,9 +1587,7 @@ static void tcp_seq_stop(struct seq_file
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
        case TCP_SEQ_STATE_ESTABLISHED:
-               if (v)
-                       read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
-               local_bh_enable();
+               rcu_read_unlock_bh();
                break;
        }
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 4154f3a..3e6edf9 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -101,7 +101,7 @@ static int __inet6_check_established(str
        struct inet_timewait_sock *tw;
 
        prefetch(head->chain.first);
-       write_lock(&head->lock);
+       write_lock(&head->wlock);
 
        /* Check TIME-WAIT sockets first. */
        sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
@@ -133,7 +133,7 @@ unique:
        __sk_add_node(sk, &head->chain);
        sk->sk_hash = hash;
        sock_prot_inc_use(sk->sk_prot);
-       write_unlock(&head->lock);
+       write_unlock(&head->wlock);
 
        if (twp != NULL) {
                *twp = tw;
@@ -148,7 +148,7 @@ unique:
        return 0;
 
 not_unique:
-       write_unlock(&head->lock);
+       write_unlock(&head->wlock);
        return -EADDRNOTAVAIL;
 }
 
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to