On Thu, Mar 09, 2006 at 09:43:00AM -0800, Benjamin LaHaise ([EMAIL PROTECTED]) wrote:
> On Thu, Mar 09, 2006 at 01:18:26PM +0300, Evgeniy Polyakov wrote:
> > Ok, I hacked quite a bit in the patch, but I think nothing major was
> > changed, basically patch rejects.
> > And I'm now unable to bind to 0.0.0.0 address, i.e. bind() does not
> > fail, but all connections are refused.
> > Bind to machine's IP works fine.
> 
> Odd.  Can you fire off a copy of your backport to me?  I'll see if I can
> spot any quirks.  bind to 0.0.0.0 seems to work for me (ssh, portmap and
> a few test programs worked), but maybe something different is being done
> in your case.
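For orientation before the long diff: the patch below drops the per-bucket ehash rwlock from the lookup read side and runs the established/listener lookups under rcu_read_lock_bh() instead, taking the socket reference while still inside the RCU read section and deferring sk_free()/inet_twsk_put() through call_rcu_bh(). A minimal sketch of the resulting reader pattern follows; my_lookup() and its argument list are only an illustration of that pattern, not part of the patch itself:

/*
 * Sketch only: the RCU reader pattern the diff below introduces.
 * my_lookup() is a hypothetical wrapper; the real changes are made in
 * inet_lookup(), tcp_v4_rcv(), dccp_v4_rcv() and friends.
 */
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/inet_hashtables.h>

static struct sock *my_lookup(struct inet_hashinfo *hashinfo,
			      u32 saddr, u16 sport, u32 daddr, u16 dport,
			      int dif)
{
	struct sock *sk;

	rcu_read_lock_bh();	/* replaces local_bh_disable() + read_lock(&head->lock) */
	sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
	if (sk)
		sock_hold(sk);	/* take the reference before leaving the RCU section */
	rcu_read_unlock_bh();
	return sk;
}

The write side keeps the (renamed) per-bucket wlock for insertions and removals; only the readers go lockless, which is why the final socket and timewait frees have to be RCU-deferred.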
Sure. inet6_hashtable is missed.

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -72,7 +72,7 @@ static inline struct sock *
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(&head->wlock);
 	sk_for_each(sk, node, &head->chain) {
 		/* For IPV6 do the cheaper port and family tests first. */
 		if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ static inline struct sock *
 			goto hit;
 		}
 	}
-	read_unlock(&head->lock);
+	read_unlock(&head->wlock);
 	return NULL;
 
 hit:
 	sock_hold(sk);
-	read_unlock(&head->lock);
+	read_unlock(&head->wlock);
 	return sk;
 }
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -38,7 +38,7 @@
  * for the rest.  I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-	rwlock_t	  lock;
+	rwlock_t	  wlock;
 	struct hlist_head chain;
 };
 
@@ -243,7 +243,7 @@ static inline void __inet_hash(struct in
 		sk->sk_hash = inet_sk_ehashfn(sk);
 		head = inet_ehash_bucket(hashinfo, sk->sk_hash);
 		list = &head->chain;
-		lock = &head->lock;
+		lock = &head->wlock;
 		write_lock(lock);
 	}
 	__sk_add_node(sk, list);
@@ -274,7 +274,7 @@ static inline void inet_unhash(struct in
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
-		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->wlock;
 		write_lock_bh(lock);
 	}
 
@@ -298,14 +298,13 @@ extern struct sock *__inet_lookup_listen
 
 /* Optimize the common listener case. */
 static inline struct sock *
-	inet_lookup_listener(struct inet_hashinfo *hashinfo,
+	_inet_lookup_listener(struct inet_hashinfo *hashinfo,
 			     const u32 daddr,
 			     const unsigned short hnum, const int dif)
 {
 	struct sock *sk = NULL;
 	const struct hlist_head *head;
 
-	read_lock(&hashinfo->lhash_lock);
 	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
 	if (!hlist_empty(head)) {
 		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
@@ -317,10 +316,21 @@ static inline struct sock *
 			goto sherry_cache;
 		sk = __inet_lookup_listener(head, daddr, hnum, dif);
 	}
-	if (sk) {
 sherry_cache:
+	return sk;
+}
+
+static inline struct sock *
+	inet_lookup_listener(struct inet_hashinfo *hashinfo,
+			     const u32 daddr,
+			     const unsigned short hnum, const int dif)
+{
+	struct sock *sk;
+
+	read_lock(&hashinfo->lhash_lock);
+	sk = _inet_lookup_listener(hashinfo, daddr, hnum, dif);
+	if (sk)
 		sock_hold(sk);
-	}
 	read_unlock(&hashinfo->lhash_lock);
 	return sk;
 }
@@ -372,7 +382,7 @@ sherry_cache:
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
  * not check it for lookups anymore, thanks Alexey. -DaveM
  *
- * Local BH must be disabled here.
+ * This is an RCU read section, the caller must do an rcu_read_lock_bh().
  */
 static inline struct sock *
 	__inet_lookup_established(struct inet_hashinfo *hashinfo,
@@ -390,8 +400,6 @@ static inline struct sock *
 	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
 
-	prefetch(head->chain.first);
-	read_lock(&head->lock);
 	sk_for_each(sk, node, &head->chain) {
 		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
 			goto hit; /* You sunk my battleship! */
@@ -404,10 +412,8 @@ static inline struct sock *
 	}
 	sk = NULL;
 out:
-	read_unlock(&head->lock);
 	return sk;
 hit:
-	sock_hold(sk);
 	goto out;
 }
 
@@ -418,7 +424,7 @@ static inline struct sock *__inet_lookup
 {
 	struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr,
						    hnum, dif);
-	return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+	return sk ? : _inet_lookup_listener(hashinfo, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
@@ -428,9 +434,11 @@ static inline struct sock *inet_lookup(s
 {
 	struct sock *sk;
 
-	local_bh_disable();
+	rcu_read_lock_bh();
 	sk = __inet_lookup(hashinfo, saddr, sport, daddr,
			   ntohs(dport), dif);
-	local_bh_enable();
+	if (sk)
+		sock_hold(sk);
+	rcu_read_unlock_bh();
 
 	return sk;
 }
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -23,6 +23,7 @@
 #include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/rcupdate.h>
 
 #include <net/sock.h>
 #include <net/tcp_states.h>
@@ -132,6 +133,7 @@ struct inet_timewait_sock {
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
 	struct hlist_node	tw_death_node;
+	struct rcu_head		tw_rcu_head;
 };
 
 static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
@@ -191,16 +193,15 @@ static inline u32 inet_rcv_saddr(const s
 		inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
 }
 
+extern void __inet_twsk_put(struct rcu_head *tw_rcu);
 static inline void inet_twsk_put(struct inet_timewait_sock *tw)
 {
 	if (atomic_dec_and_test(&tw->tw_refcnt)) {
-		struct module *owner = tw->tw_prot->owner;
 #ifdef SOCK_REFCNT_DEBUG
 		printk(KERN_DEBUG "%s timewait_sock %p released\n",
 		       tw->tw_prot->name, tw);
 #endif
-		kmem_cache_free(tw->tw_prot->twsk_slab, tw);
-		module_put(owner);
+		call_rcu_bh(&tw->tw_rcu_head, __inet_twsk_put);
 	}
 }
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -255,6 +255,8 @@ struct sock {
 	int			(*sk_backlog_rcv)(struct sock *sk,
						  struct sk_buff *skb);
 	void			(*sk_destruct)(struct sock *sk);
+
+	struct rcu_head		sk_rcu_head;
 };
 
 /*
@@ -278,12 +280,12 @@ static inline struct sock *sk_next(const
 
 static inline int sk_unhashed(const struct sock *sk)
 {
-	return hlist_unhashed(&sk->sk_node);
+	return sk->sk_node.pprev == NULL || sk->sk_node.pprev == LIST_POISON2;
 }
 
 static inline int sk_hashed(const struct sock *sk)
 {
-	return sk->sk_node.pprev != NULL;
+	return sk->sk_node.pprev != NULL && sk->sk_node.pprev != LIST_POISON2;
 }
 
@@ -293,14 +295,13 @@ static __inline__ void sk_node_init(stru
 static __inline__ void __sk_del_node(struct sock *sk)
 {
-	__hlist_del(&sk->sk_node);
+	hlist_del_rcu(&sk->sk_node);
 }
 
 static __inline__ int __sk_del_node_init(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		__sk_del_node(sk);
-		sk_node_init(&sk->sk_node);
 		return 1;
 	}
 	return 0;
 }
@@ -339,7 +340,7 @@ static __inline__ int sk_del_node_init(s
 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
-	hlist_add_head(&sk->sk_node, list);
+	hlist_add_head_rcu(&sk->sk_node, list);
 }
 
 static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
@@ -350,27 +351,24 @@ static __inline__ void sk_add_node(struc
 static __inline__ void __sk_del_bind_node(struct sock *sk)
 {
-	__hlist_del(&sk->sk_bind_node);
+	hlist_del_rcu(&sk->sk_bind_node);
 }
 
 static __inline__ void sk_add_bind_node(struct sock *sk,
					struct hlist_head *list)
 {
-	hlist_add_head(&sk->sk_bind_node, list);
+	hlist_add_head_rcu(&sk->sk_bind_node, list);
 }
 
 #define sk_for_each(__sk, node, list) \
-	hlist_for_each_entry(__sk, node, list, sk_node)
+	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
 #define sk_for_each_from(__sk, node) \
	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
		hlist_for_each_entry_from(__sk, node, sk_node)
-#define sk_for_each_continue(__sk, node) \
-	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
-		hlist_for_each_entry_continue(__sk, node, sk_node)
 #define sk_for_each_safe(__sk, node, tmp, list) \
	hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
 #define sk_for_each_bound(__sk, node, list) \
-	hlist_for_each_entry(__sk, node, list, sk_bind_node)
+	hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
 
 /* Sock flags */
 enum sock_flags {
diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -686,8 +686,9 @@ out_free:
 	return NULL;
 }
 
-void sk_free(struct sock *sk)
+void __sk_free(struct rcu_head *rcu)
 {
+	struct sock *sk = container_of(rcu, struct sock, sk_rcu_head);
 	struct sk_filter *filter;
 	struct module *owner = sk->sk_prot_creator->owner;
@@ -714,6 +715,11 @@ void sk_free(struct sock *sk)
 	module_put(owner);
 }
 
+void sk_free(struct sock *sk)
+{
+	call_rcu_bh(&sk->sk_rcu_head, __sk_free);
+}
+
 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
 	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -805,18 +805,22 @@ static struct sock *dccp_v4_hnd_req(stru
 	if (req != NULL)
 		return dccp_check_req(sk, skb, req, prev);
 
+	rcu_read_lock_bh();
 	nsk = __inet_lookup_established(&dccp_hashinfo,
					iph->saddr, dh->dccph_sport,
					iph->daddr, ntohs(dh->dccph_dport),
					inet_iif(skb));
 	if (nsk != NULL) {
 		if (nsk->sk_state != DCCP_TIME_WAIT) {
+			sock_hold(nsk);
 			bh_lock_sock(nsk);
+			rcu_read_unlock_bh();
 			return nsk;
 		}
-		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		rcu_read_unlock_bh();
 		return NULL;
 	}
+	rcu_read_unlock_bh();
 
 	return sk;
 }
@@ -1113,6 +1117,7 @@ int dccp_v4_rcv(struct sk_buff *skb)
 	/* Step 2:
	 *	Look up flow ID in table and get corresponding socket
	 */
+	rcu_read_lock_bh();
 	sk = __inet_lookup(&dccp_hashinfo,
			   skb->nh.iph->saddr, dh->dccph_sport,
			   skb->nh.iph->daddr, ntohs(dh->dccph_dport),
@@ -1127,9 +1132,13 @@ int dccp_v4_rcv(struct sk_buff *skb)
 	if (sk == NULL) {
 		dccp_pr_debug("failed to look up flow ID in table and "
			      "get corresponding socket\n");
+		rcu_read_unlock_bh();
 		goto no_dccp_socket;
 	}
 
+	sock_hold(sk);
+	rcu_read_unlock_bh();
+
 	/*
	 * Step 2:
	 *	... or S.state == TIMEWAIT,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -812,7 +812,7 @@ static int __init dccp_init(void)
 	}
 
 	for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
-		rwlock_init(&dccp_hashinfo.ehash[i].lock);
+		rwlock_init(&dccp_hashinfo.ehash[i].wlock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
 	}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -671,7 +671,7 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;
 
-		read_lock_bh(&head->lock);
+		rcu_read_lock_bh(); /* &head->lock */
 		num = 0;
 		sk_for_each(sk, node, &head->chain) {
@@ -687,7 +687,7 @@ skip_listen_ht:
 			if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
 				goto next_normal;
 			if (inet_diag_dump_sock(skb, sk, cb) < 0) {
-				read_unlock_bh(&head->lock);
+				rcu_read_unlock_bh(); /* &head->lock */
 				goto done;
 			}
 next_normal:
@@ -708,14 +708,14 @@ next_normal:
				    r->id.idiag_dport)
					goto next_dying;
				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
-					read_unlock_bh(&head->lock);
+					rcu_read_unlock_bh(); /* &head->lock */
					goto done;
				}
next_dying:
				++num;
			}
 		}
-		read_unlock_bh(&head->lock);
+		rcu_read_unlock_bh(); /* &head->lock */
 	}
done:
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,17 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
+void __inet_twsk_put(struct rcu_head *tw_rcu)
+{
+	struct inet_timewait_sock *tw = container_of(tw_rcu,
+			struct inet_timewait_sock, tw_rcu_head);
+	struct module *owner = tw->tw_prot->owner;
+	sk_node_init(&tw->tw_node);
+	kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+	module_put(owner);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_put);
+
 /* Must be called with locally disabled BHs. */
 void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
 {
@@ -22,14 +33,13 @@ void __inet_twsk_kill(struct inet_timewa
 	/* Unlink from established hashes. */
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
 
-	write_lock(&ehead->lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
+	write_lock(&ehead->wlock);
+	if (hlist_unhashed(&tw->tw_node) || tw->tw_node.pprev == LIST_POISON2) {
+		write_unlock(&ehead->wlock);
 		return;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
+	hlist_del_rcu(&tw->tw_node);
+	write_unlock(&ehead->wlock);
 
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -73,7 +83,7 @@ void __inet_twsk_hashdance(struct inet_t
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);
 
-	write_lock(&ehead->lock);
+	write_lock(&ehead->wlock);
 
 	/* Step 2: Remove SK from established hash. */
 	if (__sk_del_node_init(sk))
@@ -83,7 +93,7 @@ void __inet_twsk_hashdance(struct inet_t
 	inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
 	atomic_inc(&tw->tw_refcnt);
 
-	write_unlock(&ehead->lock);
+	write_unlock(&ehead->wlock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2342,7 +2342,7 @@ void __init tcp_init(void)
					0);
 	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
 	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
-		rwlock_init(&tcp_hashinfo.ehash[i].lock);
+		rwlock_init(&tcp_hashinfo.ehash[i].wlock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -136,7 +136,7 @@ static int __tcp_v4_check_established(st
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(&head->wlock);
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
@@ -192,7 +192,7 @@ unique:
 	BUG_TRAP(sk_unhashed(sk));
 	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(&head->wlock);
 
 	if (twp) {
 		*twp = tw;
@@ -208,7 +208,7 @@ unique:
 	return 0;
 
not_unique:
-	write_unlock(&head->lock);
+	write_unlock(&head->wlock);
 	return -EADDRNOTAVAIL;
 }
@@ -1077,7 +1077,7 @@ exit:
 	return NULL;
 }
 
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static noinline struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcphdr *th = skb->h.th;
 	struct iphdr *iph = skb->nh.iph;
@@ -1089,18 +1089,22 @@ static struct sock *tcp_v4_hnd_req(struc
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
+	rcu_read_lock_bh();
 	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
					th->source, skb->nh.iph->daddr,
					ntohs(th->dest), inet_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
+			sock_hold(nsk);
 			bh_lock_sock(nsk);
+			rcu_read_unlock_bh();
 			return nsk;
 		}
-		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		rcu_read_unlock_bh();
 		return NULL;
 	}
+	rcu_read_unlock_bh();
 
#ifdef CONFIG_SYN_COOKIES
 	if (!th->rst && !th->syn && th->ack)
@@ -1194,6 +1198,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	struct sock *sk;
 	int ret;
 
+	rcu_read_lock_bh();
 	if (skb->pkt_type != PACKET_HOST)
 		goto discard_it;
@@ -1230,7 +1235,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
			   skb->nh.iph->daddr, ntohs(th->dest),
			   inet_iif(skb));
-
 	if (!sk)
 		goto no_tcp_socket;
@@ -1261,7 +1265,7 @@ process:
 		bh_unlock_sock(sk);
 	}
 
-	sock_put(sk);
+	rcu_read_unlock_bh();
 
 	return ret;
@@ -1277,15 +1281,18 @@ bad_packet:
 	}
 
discard_it:
+	rcu_read_unlock_bh();
+
 	/* Discard frame. */
 	kfree_skb(skb);
 	return 0;
 
discard_and_relse:
-	sock_put(sk);
+	rcu_read_unlock_bh();
 	goto discard_it;
 
do_time_wait:
+	sock_hold(sk);
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
@@ -1597,7 +1604,6 @@ static void *established_get_first(struc
 		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
 
-		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
 			}
@@ -1614,7 +1620,6 @@ static void *established_get_first(struc
 			rc = tw;
 			goto out;
 		}
-		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
out:
@@ -1641,16 +1646,14 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
-		/* We can reschedule between buckets: */
+		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
-		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+		if (++st->bucket < tcp_hashinfo.ehash_size)
 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
-		} else {
+		else {
 			cur = NULL;
 			goto out;
 		}
@@ -1693,7 +1696,7 @@ static void *tcp_get_idx(struct seq_file
 	if (!rc) {
 		inet_listen_unlock(&tcp_hashinfo);
-		local_bh_disable();
+		rcu_read_lock_bh();
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		rc = established_get_idx(seq, pos);
 	}
@@ -1726,7 +1729,7 @@ static void *tcp_seq_next(struct seq_fil
 		rc = listening_get_next(seq, v);
 		if (!rc) {
 			inet_listen_unlock(&tcp_hashinfo);
-			local_bh_disable();
+			rcu_read_lock_bh();
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			rc = established_get_first(seq);
 		}
@@ -1757,9 +1760,7 @@ static void tcp_seq_stop(struct seq_file
 		break;
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
-		if (v)
-			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
-		local_bh_enable();
+		rcu_read_unlock_bh();
 		break;
 	}
 }

> 		-ben

-- 
	Evgeniy Polyakov
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html