Hello again,

This patch introduces the use of RCU for the ipv4 established connections
hashtable, as well as for the timewait table, since the two are closely
intertwined. This removes 4 atomic operations per packet from the
tcp_v4_rcv() codepath, which helps quite a bit once the other performance
barriers in the system are removed. Eliminating the rwlock cacheline
bouncing should also help on SMP systems.
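To make the accounting concrete, here is a rough before/after sketch of
the receive-side lookup. This is illustration only, not code from the
patch; find_established() stands in for the real hash chain walk.

/* Before: two atomic ops on the bucket rwlock, plus the refcount pair
 * that keeps the socket alive once the lock is dropped. */
static struct sock *rcv_lookup_locked(struct inet_ehash_bucket *head)
{
        struct sock *sk;

        read_lock(&head->lock);                 /* atomic op 1 */
        sk = find_established(head);            /* hypothetical chain walk */
        if (sk)
                sock_hold(sk);                  /* atomic op 2 */
        read_unlock(&head->lock);               /* atomic op 3 */
        return sk;      /* receive path ends in sock_put(sk): atomic op 4 */
}

/* After: the RCU read section pins the socket over the whole receive
 * path, so no atomic operations or shared cacheline writes are needed. */
static struct sock *rcv_lookup_rcu(struct inet_ehash_bucket *head)
{
        struct sock *sk;

        rcu_read_lock_bh();
        sk = find_established(head);            /* hypothetical chain walk */
        /* ... process the packet; sk cannot be freed before the matching
         * rcu_read_unlock_bh() in the caller ... */
        return sk;
}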
By itself, this improves local netperf performance on a P4/HT by
~260 Mbit/s on average. With smaller (say, Ethernet-sized) packets the
difference should be larger.

Note that this patch changes the semantics of __inet_lookup() and
__inet_lookup_established() to *not* perform a sock_hold(). This way we
can avoid the atomic increment and decrement of the reference count, and
instead rely on RCU pinning the socket over the scope of the
rcu_read_lock_bh() region. ipv6 and dccp get only the minimal fixes
needed for the renamed wlock, as I am not set up to test them.

I have stared at this patch for a while, and I think it's correct, but
more eyes are definitely warranted. Most of the issues regarding the
state and lifespan of a struct sock are already handled by the network
layer, since the socket can be locked after being retrieved from the
connection hashtable; we just have to be careful about reinitializing
fields that the RCU hash list depends on.

		-ben

netperf TCP_STREAM results (columns: recv/send socket size and send
message size in bytes, elapsed time in secs, throughput in 10^6 bits/sec,
local/remote CPU utilization in %, local/remote service demand in us/KB):

Before: max 7838.86, min 7771.90, avg 7811.68

87380  16384  16384    10.01    7793.04  90.56  90.56  1.904  1.904
87380  16384  16384    10.01    7806.54  91.61  91.61  1.923  1.923
87380  16384  16384    10.00    7819.29  90.80  90.80  1.903  1.903
87380  16384  16384    10.00    7815.89  90.70  90.70  1.901  1.901
87380  16384  16384    10.01    7771.90  91.66  91.66  1.932  1.932
87380  16384  16384    10.00    7831.59  90.20  90.20  1.887  1.887
87380  16384  16384    10.01    7796.56  91.26  91.26  1.918  1.918
87380  16384  16384    10.01    7838.86  89.26  89.26  1.866  1.866
87380  16384  16384    10.01    7835.44  90.56  90.56  1.894  1.894

After: max 8113.70, min 8026.32, avg 8072.34

87380  16384  16384    10.01    8045.55  87.11  87.11  1.774  1.774
87380  16384  16384    10.01    8065.14  90.86  90.86  1.846  1.846
87380  16384  16384    10.00    8077.76  89.85  89.85  1.822  1.822
87380  16384  16384    10.00    8026.32  89.80  89.80  1.833  1.833
87380  16384  16384    10.01    8108.59  89.81  89.81  1.815  1.815
87380  16384  16384    10.01    8034.53  89.01  89.01  1.815  1.815
87380  16384  16384    10.00    8113.70  90.45  90.45  1.827  1.827
87380  16384  16384    10.00    8111.37  89.90  89.90  1.816  1.816
87380  16384  16384    10.01    8077.75  87.96  87.96  1.784  1.784
87380  16384  16384    10.00    8062.70  90.25  90.25  1.834  1.834
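To spell out the changed contract: a caller of the lookup functions now
looks like this (a sketch, not code from the patch):

	rcu_read_lock_bh();
	sk = __inet_lookup(hashinfo, saddr, sport, daddr, hnum, dif);
	/* sk is pinned by the RCU read section, not by a refcount. */
	if (sk)
		sock_hold(sk);	/* only needed if sk outlives the section */
	rcu_read_unlock_bh();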
Signed-off-by: Benjamin LaHaise <[EMAIL PROTECTED]>

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 25f708f..73b05ab 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -65,7 +65,7 @@ static inline void __inet6_hash(struct i
 		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
 		hash &= (hashinfo->ehash_size - 1);
 		list = &hashinfo->ehash[hash].chain;
-		lock = &hashinfo->ehash[hash].lock;
+		lock = &hashinfo->ehash[hash].wlock;
 		write_lock(lock);
 	}
 
@@ -98,7 +98,7 @@ static inline struct sock *
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(&head->wlock);
 	sk_for_each(sk, node, &head->chain) {
 		/* For IPV6 do the cheaper port and family tests first. */
 		if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -118,12 +118,12 @@ static inline struct sock *
 			goto hit;
 		}
 	}
-	read_unlock(&head->lock);
+	read_unlock(&head->wlock);
 	return NULL;
 
 hit:
 	sock_hold(sk);
-	read_unlock(&head->lock);
+	read_unlock(&head->wlock);
 	return sk;
 }
 
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 135d80f..4cde832 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -39,7 +39,7 @@
  * for the rest.  I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-	rwlock_t	  lock;
+	rwlock_t	  wlock;
 	struct hlist_head chain;
 };
 
@@ -224,7 +224,7 @@ static inline void __inet_hash(struct in
 		sk->sk_hash = inet_sk_ehashfn(sk);
 		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
 		list = &head->chain;
-		lock = &head->lock;
+		lock = &head->wlock;
 		write_lock(lock);
 	}
 	__sk_add_node(sk, list);
@@ -255,7 +255,7 @@ static inline void inet_unhash(struct in
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
-		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->wlock;
 		write_lock_bh(lock);
 	}
 
@@ -279,14 +279,13 @@ extern struct sock *__inet_lookup_listen
 
 /* Optimize the common listener case. */
 static inline struct sock *
-		inet_lookup_listener(struct inet_hashinfo *hashinfo,
+		_inet_lookup_listener(struct inet_hashinfo *hashinfo,
 				     const u32 daddr,
 				     const unsigned short hnum, const int dif)
 {
 	struct sock *sk = NULL;
 	const struct hlist_head *head;
 
-	read_lock(&hashinfo->lhash_lock);
 	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
 	if (!hlist_empty(head)) {
 		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
@@ -298,10 +297,21 @@ static inline struct sock *
 			goto sherry_cache;
 		sk = __inet_lookup_listener(head, daddr, hnum, dif);
 	}
-	if (sk) {
 sherry_cache:
+	return sk;
+}
+
+static inline struct sock *
+		inet_lookup_listener(struct inet_hashinfo *hashinfo,
+				     const u32 daddr,
+				     const unsigned short hnum, const int dif)
+{
+	struct sock *sk;
+
+	read_lock(&hashinfo->lhash_lock);
+	sk = _inet_lookup_listener(hashinfo, daddr, hnum, dif);
+	if (sk)
 		sock_hold(sk);
-	}
 	read_unlock(&hashinfo->lhash_lock);
 	return sk;
 }
@@ -353,7 +363,7 @@ sherry_cache:
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
  *
- * Local BH must be disabled here.
+ * This is an RCU read section, the caller must do an rcu_read_lock_bh().
  */
 static inline struct sock *
 	__inet_lookup_established(struct inet_hashinfo *hashinfo,
@@ -371,8 +381,6 @@ static inline struct sock *
 	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
-	prefetch(head->chain.first);
-	read_lock(&head->lock);
 	sk_for_each(sk, node, &head->chain) {
 		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
 			goto hit; /* You sunk my battleship! */
@@ -385,10 +393,8 @@ static inline struct sock *
 	}
 	sk = NULL;
 out:
-	read_unlock(&head->lock);
 	return sk;
 hit:
-	sock_hold(sk);
 	goto out;
 }
 
@@ -399,7 +405,7 @@ static inline struct sock *__inet_lookup
 {
 	struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport,
						    daddr, hnum, dif);
-	return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+	return sk ? : _inet_lookup_listener(hashinfo, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
@@ -409,9 +415,11 @@ static inline struct sock *inet_lookup(s
 {
 	struct sock *sk;
 
-	local_bh_disable();
+	rcu_read_lock_bh();
 	sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
-	local_bh_enable();
+	if (sk)
+		sock_hold(sk);
+	rcu_read_unlock_bh();
 
 	return sk;
 }
 
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 1da294c..1341d2e 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -22,6 +22,7 @@
 #include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/rcupdate.h>
 
 #include <net/inet_sock.h>
 #include <net/sock.h>
@@ -134,6 +135,7 @@ struct inet_timewait_sock {
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
 	struct hlist_node	tw_death_node;
+	struct rcu_head		tw_rcu_head;
 };
 
 static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
@@ -193,16 +195,15 @@ static inline u32 inet_rcv_saddr(const s
 	       inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
 }
 
+extern void __inet_twsk_put(struct rcu_head *tw_rcu);
 static inline void inet_twsk_put(struct inet_timewait_sock *tw)
 {
 	if (atomic_dec_and_test(&tw->tw_refcnt)) {
-		struct module *owner = tw->tw_prot->owner;
 #ifdef SOCK_REFCNT_DEBUG
 		printk(KERN_DEBUG "%s timewait_sock %p released\n",
 		       tw->tw_prot->name, tw);
 #endif
-		kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
-		module_put(owner);
+		call_rcu_bh(&tw->tw_rcu_head, __inet_twsk_put);
 	}
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 3075803..57e5f6b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -251,6 +251,8 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
 	void			(*sk_destruct)(struct sock *sk);
+
+	struct rcu_head		sk_rcu_head;
 };
 
 /*
@@ -274,12 +276,12 @@ static inline struct sock *sk_next(const
 
 static inline int sk_unhashed(const struct sock *sk)
 {
-	return hlist_unhashed(&sk->sk_node);
+	return sk->sk_node.pprev == NULL || sk->sk_node.pprev == LIST_POISON2;
 }
 
 static inline int sk_hashed(const struct sock *sk)
 {
-	return sk->sk_node.pprev != NULL;
+	return sk->sk_node.pprev != NULL && sk->sk_node.pprev != LIST_POISON2;
 }
 
 static __inline__ void sk_node_init(struct hlist_node *node)
@@ -289,14 +291,13 @@ static __inline__ void sk_node_init(stru
 
 static __inline__ void __sk_del_node(struct sock *sk)
 {
-	__hlist_del(&sk->sk_node);
+	hlist_del_rcu(&sk->sk_node);
 }
 
 static __inline__ int __sk_del_node_init(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		__sk_del_node(sk);
-		sk_node_init(&sk->sk_node);
 		return 1;
 	}
 	return 0;
@@ -335,7 +336,7 @@ static __inline__ int sk_del_node_init(s
 
 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
-	hlist_add_head(&sk->sk_node, list);
+	hlist_add_head_rcu(&sk->sk_node, list);
 }
 
 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
@@ -346,27 +347,24 @@ static __inline__ void sk_add_node(struc
 
 static __inline__ void __sk_del_bind_node(struct sock *sk)
 {
-	__hlist_del(&sk->sk_bind_node);
+	hlist_del_rcu(&sk->sk_bind_node);
 }
 
 static __inline__ void sk_add_bind_node(struct sock *sk,
					struct hlist_head *list)
 {
-	hlist_add_head(&sk->sk_bind_node, list);
+	hlist_add_head_rcu(&sk->sk_bind_node, list);
 }
 
 #define sk_for_each(__sk, node, list) \
-	hlist_for_each_entry(__sk, node, list, sk_node)
+	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
-#define sk_for_each_continue(__sk, node) \
-	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
-		hlist_for_each_entry_continue(__sk, node, sk_node)
 #define sk_for_each_safe(__sk, node, tmp, list) \
 	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
-	hlist_for_each_entry(__sk, node, list, sk_bind_node)
+	hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
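The changes to sk_unhashed() and sk_hashed() above follow directly from
the semantics of hlist_del_rcu(), which is roughly (simplified from
include/linux/list.h):

static inline void hlist_del_rcu(struct hlist_node *n)
{
	__hlist_del(n);
	n->pprev = LIST_POISON2;
	/* n->next is deliberately left intact, so that a reader standing
	 * on n while it is removed can keep walking the chain. */
}

Because ->next must stay valid for concurrent readers, the node cannot
be reinitialized at unhash time -- which is why __sk_del_node_init()
loses its sk_node_init() call, and why a pprev of LIST_POISON2 now has
to count as "unhashed". The reinit is deferred until it is safe, e.g.
into the RCU callback for timewait sockets below.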
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e00811..f152783 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -677,8 +677,9 @@ out_free:
 	return NULL;
 }
 
-void sk_free(struct sock *sk)
+void __sk_free(struct rcu_head *rcu)
 {
+	struct sock *sk = container_of(rcu, struct sock, sk_rcu_head);
 	struct sk_filter *filter;
 	struct module *owner = sk->sk_prot_creator->owner;
 
@@ -705,6 +706,11 @@ void sk_free(struct sock *sk)
 	module_put(owner);
 }
 
+void sk_free(struct sock *sk)
+{
+	call_rcu_bh(&sk->sk_rcu_head, __sk_free);
+}
+
 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
 	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
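This is the standard deferred-reclamation shape -- an rcu_head embedded
in the object, container_of() in the callback -- and the same pattern is
used for timewait sockets via __inet_twsk_put() below. In generic form
(struct foo is illustrative only, not real code):

struct foo {
	/* ... object state ... */
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	kfree(f);	/* runs only after a BH grace period elapses */
}

static void foo_free(struct foo *f)
{
	/* Every rcu_read_lock_bh() section that could still see f is
	 * guaranteed to have completed before foo_free_rcu() runs. */
	call_rcu_bh(&f->rcu, foo_free_rcu);
}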
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 7098f10..33f6dbf 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -612,18 +612,22 @@ static struct sock *dccp_v4_hnd_req(stru
 	if (req != NULL)
 		return dccp_check_req(sk, skb, req, prev);
 
+	rcu_read_lock_bh();
 	nsk = __inet_lookup_established(&dccp_hashinfo,
					iph->saddr, dh->dccph_sport,
					iph->daddr, ntohs(dh->dccph_dport),
					inet_iif(skb));
 	if (nsk != NULL) {
 		if (nsk->sk_state != DCCP_TIME_WAIT) {
+			sock_hold(nsk);
 			bh_lock_sock(nsk);
+			rcu_read_unlock_bh();
 			return nsk;
 		}
-		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		rcu_read_unlock_bh();
 		return NULL;
 	}
+	rcu_read_unlock_bh();
 
 	return sk;
 }
@@ -925,6 +929,7 @@ static int dccp_v4_rcv(struct sk_buff *s
 	/* Step 2:
 	 *	Look up flow ID in table and get corresponding socket */
+	rcu_read_lock_bh();
 	sk = __inet_lookup(&dccp_hashinfo,
			   skb->nh.iph->saddr, dh->dccph_sport,
			   skb->nh.iph->daddr, ntohs(dh->dccph_dport),
@@ -939,9 +944,13 @@ static int dccp_v4_rcv(struct sk_buff *s
 	if (sk == NULL) {
 		dccp_pr_debug("failed to look up flow ID in table and "
			      "get corresponding socket\n");
+		rcu_read_unlock_bh();
 		goto no_dccp_socket;
 	}
 
+	sock_hold(sk);
+	rcu_read_unlock_bh();
+
 	/*
	 * Step 2:
	 *	... or S.state == TIMEWAIT,
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9e97ce6..304d1a9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -986,7 +986,7 @@ static int __init dccp_init(void)
 	}
 
 	for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
-		rwlock_init(&dccp_hashinfo.ehash[i].lock);
+		rwlock_init(&dccp_hashinfo.ehash[i].wlock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
 	}
 
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 457db99..b3713e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -749,7 +749,7 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;
 
-		read_lock_bh(&head->lock);
+		rcu_read_lock_bh(); /* &head->lock */
 		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
@@ -765,7 +765,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(&head->lock);
+				rcu_read_unlock_bh(); /* &head->lock */
 				goto done;
 			}
 next_normal:
@@ -787,14 +787,14 @@ next_normal:
				    r->id.idiag_dport)
					goto next_dying;
				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(&head->lock);
+					rcu_read_unlock_bh(); /* &head->lock */
					goto done;
				}
next_dying:
				++num;
			}
 		}
-		read_unlock_bh(&head->lock);
+		rcu_read_unlock_bh(); /* &head->lock */
 	}
 
 done:
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3322811..791b9a9 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -185,7 +185,7 @@ static int __inet_check_established(stru
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(&head->wlock);
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
@@ -213,9 +213,9 @@ unique:
 	inet->sport = htons(lport);
 	sk->sk_hash = hash;
 	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	__sk_add_node(sk, &head->chain);
+	write_unlock(&head->wlock);
 
 	if (twp) {
 		*twp = tw;
@@ -231,7 +231,7 @@ unique:
 	return 0;
 
 not_unique:
-	write_unlock(&head->lock);
+	write_unlock(&head->wlock);
 	return -EADDRNOTAVAIL;
 }
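Note the reordering in the unique: path above: the socket is now linked
into the chain as the very last step, after all of its fields have been
written. hlist_add_head_rcu() provides the write barrier that makes this
safe for lockless readers; it is roughly (simplified from
include/linux/list.h):

static inline void hlist_add_head_rcu(struct hlist_node *n,
				      struct hlist_head *h)
{
	struct hlist_node *first = h->first;

	n->next = first;
	n->pprev = &h->first;
	smp_wmb();	/* stores above are visible before n is reachable */
	h->first = n;
	if (first)
		first->pprev = &n->next;
}

A reader that finds the new node via head->chain is therefore guaranteed
to see sk->sk_hash and inet->sport already set; publishing the node any
earlier could let a lockless lookup match a half-initialized socket.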
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 417f126..c973f1c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,17 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
+void __inet_twsk_put(struct rcu_head *tw_rcu)
+{
+	struct inet_timewait_sock *tw = container_of(tw_rcu,
+				struct inet_timewait_sock, tw_rcu_head);
+	struct module *owner = tw->tw_prot->owner;
+	sk_node_init(&tw->tw_node);
+	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+	module_put(owner);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_put);
+
 /* Must be called with locally disabled BHs. */
 void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
 {
@@ -22,14 +33,13 @@ void __inet_twsk_kill(struct inet_timewa
 	/* Unlink from established hashes. */
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo,
							    tw->tw_hash);
 
-	write_lock(&ehead->lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
+	write_lock(&ehead->wlock);
+	if (hlist_unhashed(&tw->tw_node) || tw->tw_node.pprev == LIST_POISON2) {
+		write_unlock(&ehead->wlock);
 		return;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
+	hlist_del_rcu(&tw->tw_node);
+	write_unlock(&ehead->wlock);
 
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -73,7 +83,7 @@ void __inet_twsk_hashdance(struct inet_t
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);
 
-	write_lock(&ehead->lock);
+	write_lock(&ehead->wlock);
 
 	/* Step 2: Remove SK from established hash. */
 	if (__sk_del_node_init(sk))
@@ -83,7 +93,7 @@ void __inet_twsk_hashdance(struct inet_t
 	inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
 	atomic_inc(&tw->tw_refcnt);
 
-	write_unlock(&ehead->lock);
+	write_unlock(&ehead->wlock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 00aa80e..1e34030 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2072,7 +2072,7 @@ void __init tcp_init(void)
					0);
 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
-		rwlock_init(&tcp_hashinfo.ehash[i].lock);
+		rwlock_init(&tcp_hashinfo.ehash[i].wlock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4eb903d..7690263 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -918,7 +918,7 @@ exit:
 	return NULL;
 }
 
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static noinline struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcphdr *th = skb->h.th;
 	struct iphdr *iph = skb->nh.iph;
@@ -930,18 +930,22 @@ static struct sock *tcp_v4_hnd_req(struc
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
+	rcu_read_lock_bh();
 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
+			sock_hold(nsk);
 			bh_lock_sock(nsk);
+			rcu_read_unlock_bh();
 			return nsk;
 		}
-		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		rcu_read_unlock_bh();
 		return NULL;
 	}
+	rcu_read_unlock_bh();
 
 #ifdef CONFIG_SYN_COOKIES
 	if (!th->rst && !th->syn && th->ack)
@@ -1035,6 +1039,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	struct sock *sk;
 	int ret;
 
+	rcu_read_lock_bh();
 	if (skb->pkt_type != PACKET_HOST)
 		goto discard_it;
 
@@ -1071,7 +1076,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
-
 	if (!sk)
 		goto no_tcp_socket;
 
@@ -1097,7 +1101,7 @@ process:
 			sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
 
-	sock_put(sk);
+	rcu_read_unlock_bh();
 
 	return ret;
 
@@ -1113,15 +1117,18 @@ bad_packet:
 	}
 
 discard_it:
+	rcu_read_unlock_bh();
+
 	/* Discard frame. */
 	kfree_skb(skb);
 	return 0;
 
 discard_and_relse:
-	sock_put(sk);
+	rcu_read_unlock_bh();
 	goto discard_it;
 
 do_time_wait:
+	sock_hold(sk);
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
@@ -1424,7 +1431,6 @@ static void *established_get_first(struc
 		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
 
-		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
@@ -1441,7 +1447,6 @@ static void *established_get_first(struc
 			rc = tw;
 			goto out;
 		}
-		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
 out:
@@ -1468,16 +1473,14 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
-		/* We can reschedule between buckets: */
+		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
 
-		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+		if (++st->bucket < tcp_hashinfo.ehash_size)
 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
-		} else {
+		else {
 			cur = NULL;
 			goto out;
 		}
@@ -1520,7 +1523,7 @@ static void *tcp_get_idx(struct seq_file
 
 	if (!rc) {
 		inet_listen_unlock(&tcp_hashinfo);
-		local_bh_disable();
+		rcu_read_lock_bh();
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		rc	  = established_get_idx(seq, pos);
 	}
@@ -1553,7 +1556,7 @@ static void *tcp_seq_next(struct seq_fil
 		rc = listening_get_next(seq, v);
 		if (!rc) {
 			inet_listen_unlock(&tcp_hashinfo);
-			local_bh_disable();
+			rcu_read_lock_bh();
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			rc	  = established_get_first(seq);
 		}
@@ -1584,9 +1587,7 @@ static void tcp_seq_stop(struct seq_file
 		break;
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
-		if (v)
-			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
-		local_bh_enable();
+		rcu_read_unlock_bh();
 		break;
 	}
 }
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 4154f3a..3e6edf9 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -101,7 +101,7 @@ static int __inet6_check_established(str
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(&head->wlock);
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
@@ -133,7 +133,7 @@ unique:
 	__sk_add_node(sk, &head->chain);
 	sk->sk_hash = hash;
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(&head->wlock);
 
 	if (twp != NULL) {
 		*twp = tw;
@@ -148,7 +148,7 @@ unique:
 	return 0;
 
 not_unique:
-	write_unlock(&head->lock);
+	write_unlock(&head->wlock);
 	return -EADDRNOTAVAIL;
 }