On Thu, Mar 09, 2006 at 09:43:00AM -0800, Benjamin LaHaise ([EMAIL PROTECTED])
wrote:
> On Thu, Mar 09, 2006 at 01:18:26PM +0300, Evgeniy Polyakov wrote:
> > Ok, I hacked quite a bit on the patch, but I think nothing major was
> > changed; it was mostly fixing patch rejects.
> > And I'm now unable to bind to the 0.0.0.0 address, i.e. bind() does not
> > fail, but all connections are refused.
> > Binding to the machine's IP works fine.
>
> Odd. Can you fire off a copy of your backport to me? I'll see if I can
> spot any quirks. Binding to 0.0.0.0 seems to work for me (ssh, portmap and
> a few test programs worked), but maybe something different is being done
> in your case.
Sure.
The inet6 hashtable conversion is still missing.
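For reference, the failing case is just a trivial listener bound to INADDR_ANY
(a minimal sketch, not part of the patch; the port number is arbitrary):
bind() and listen() succeed, but a peer's connect() is refused, while the same
program bound to the machine's real address behaves normally.

/*
 * Hypothetical reproduction, not part of the patch: listen on INADDR_ANY
 * and accept one connection.  With the backport applied, connects to this
 * listener are refused; binding to the machine's real address works.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd, c;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);	/* 0.0.0.0 */
	addr.sin_port = htons(12345);			/* arbitrary test port */

	/* bind() and listen() succeed in both cases. */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 5) < 0) {
		perror("bind/listen");
		return 1;
	}

	/* connect() from a peer: refused with 0.0.0.0, works with the real IP. */
	c = accept(fd, NULL, NULL);
	if (c < 0) {
		perror("accept");
	} else {
		printf("accepted connection\n");
		close(c);
	}
	close(fd);
	return 0;
}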
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -72,7 +72,7 @@ static inline struct sock *
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
prefetch(head->chain.first);
- read_lock(&head->lock);
+ read_lock(&head->wlock);
sk_for_each(sk, node, &head->chain) {
/* For IPV6 do the cheaper port and family tests first. */
if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ static inline struct sock *
goto hit;
}
}
- read_unlock(&head->lock);
+ read_unlock(&head->wlock);
return NULL;
hit:
sock_hold(sk);
- read_unlock(&head->lock);
+ read_unlock(&head->wlock);
return sk;
}
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -38,7 +38,7 @@
* for the rest. I'll experiment with dynamic table growth later.
*/
struct inet_ehash_bucket {
- rwlock_t lock;
+ rwlock_t wlock;
struct hlist_head chain;
};
@@ -243,7 +243,7 @@ static inline void __inet_hash(struct in
sk->sk_hash = inet_sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
- lock = &head->lock;
+ lock = &head->wlock;
write_lock(lock);
}
__sk_add_node(sk, list);
@@ -274,7 +274,7 @@ static inline void inet_unhash(struct in
inet_listen_wlock(hashinfo);
lock = &hashinfo->lhash_lock;
} else {
- lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+ lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->wlock;
write_lock_bh(lock);
}
@@ -298,14 +298,13 @@ extern struct sock *__inet_lookup_listen
/* Optimize the common listener case. */
static inline struct sock *
- inet_lookup_listener(struct inet_hashinfo *hashinfo,
+ _inet_lookup_listener(struct inet_hashinfo *hashinfo,
const u32 daddr,
const unsigned short hnum, const int dif)
{
struct sock *sk = NULL;
const struct hlist_head *head;
- read_lock(&hashinfo->lhash_lock);
head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
if (!hlist_empty(head)) {
const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
@@ -317,10 +316,21 @@ static inline struct sock *
goto sherry_cache;
sk = __inet_lookup_listener(head, daddr, hnum, dif);
}
- if (sk) {
sherry_cache:
+ return sk;
+}
+
+static inline struct sock *
+ inet_lookup_listener(struct inet_hashinfo *hashinfo,
+ const u32 daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *sk;
+
+ read_lock(&hashinfo->lhash_lock);
+ sk = _inet_lookup_listener(hashinfo, daddr, hnum, dif);
+ if (sk)
sock_hold(sk);
- }
read_unlock(&hashinfo->lhash_lock);
return sk;
}
@@ -372,7 +382,7 @@ sherry_cache:
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
* not check it for lookups anymore, thanks Alexey. -DaveM
*
- * Local BH must be disabled here.
+ * This is an RCU read section, the caller must do an rcu_read_lock_bh().
*/
static inline struct sock *
__inet_lookup_established(struct inet_hashinfo *hashinfo,
@@ -390,8 +400,6 @@ static inline struct sock *
unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
- prefetch(head->chain.first);
- read_lock(&head->lock);
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -404,10 +412,8 @@ static inline struct sock *
}
sk = NULL;
out:
- read_unlock(&head->lock);
return sk;
hit:
- sock_hold(sk);
goto out;
}
@@ -418,7 +424,7 @@ static inline struct sock *__inet_lookup
{
struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport,
daddr,
hnum, dif);
- return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+ return sk ? : _inet_lookup_listener(hashinfo, daddr, hnum, dif);
}
static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
@@ -428,9 +434,11 @@ static inline struct sock *inet_lookup(s
{
struct sock *sk;
- local_bh_disable();
+ rcu_read_lock_bh();
sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
+ if (sk)
+ sock_hold(sk);
+ rcu_read_unlock_bh();
return sk;
}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -23,6 +23,7 @@
#include <linux/timer.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -132,6 +133,7 @@ struct inet_timewait_sock {
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
+ struct rcu_head tw_rcu_head;
};
static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
@@ -191,16 +193,15 @@ static inline u32 inet_rcv_saddr(const s
inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
}
+extern void __inet_twsk_put(struct rcu_head *tw_rcu);
static inline void inet_twsk_put(struct inet_timewait_sock *tw)
{
if (atomic_dec_and_test(&tw->tw_refcnt)) {
- struct module *owner = tw->tw_prot->owner;
#ifdef SOCK_REFCNT_DEBUG
printk(KERN_DEBUG "%s timewait_sock %p released\n",
tw->tw_prot->name, tw);
#endif
- kmem_cache_free(tw->tw_prot->twsk_slab, tw);
- module_put(owner);
+ call_rcu_bh(&tw->tw_rcu_head, __inet_twsk_put);
}
}
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -255,6 +255,8 @@ struct sock {
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk);
+
+ struct rcu_head sk_rcu_head;
};
/*
@@ -278,12 +280,12 @@ static inline struct sock *sk_next(const
static inline int sk_unhashed(const struct sock *sk)
{
- return hlist_unhashed(&sk->sk_node);
+ return sk->sk_node.pprev == NULL || sk->sk_node.pprev == LIST_POISON2;
}
static inline int sk_hashed(const struct sock *sk)
{
- return sk->sk_node.pprev != NULL;
+ return sk->sk_node.pprev != NULL && sk->sk_node.pprev != LIST_POISON2;
}
static __inline__ void sk_node_init(struct hlist_node *node)
@@ -293,14 +295,13 @@ static __inline__ void sk_node_init(stru
static __inline__ void __sk_del_node(struct sock *sk)
{
- __hlist_del(&sk->sk_node);
+ hlist_del_rcu(&sk->sk_node);
}
static __inline__ int __sk_del_node_init(struct sock *sk)
{
if (sk_hashed(sk)) {
__sk_del_node(sk);
- sk_node_init(&sk->sk_node);
return 1;
}
return 0;
@@ -339,7 +340,7 @@ static __inline__ int sk_del_node_init(s
static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
- hlist_add_head(&sk->sk_node, list);
+ hlist_add_head_rcu(&sk->sk_node, list);
}
static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
@@ -350,27 +351,24 @@ static __inline__ void sk_add_node(struc
static __inline__ void __sk_del_bind_node(struct sock *sk)
{
- __hlist_del(&sk->sk_bind_node);
+ hlist_del_rcu(&sk->sk_bind_node);
}
static __inline__ void sk_add_bind_node(struct sock *sk,
struct hlist_head *list)
{
- hlist_add_head(&sk->sk_bind_node, list);
+ hlist_add_head_rcu(&sk->sk_bind_node, list);
}
#define sk_for_each(__sk, node, list) \
- hlist_for_each_entry(__sk, node, list, sk_node)
+ hlist_for_each_entry_rcu(__sk, node, list, sk_node)
#define sk_for_each_from(__sk, node) \
if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
hlist_for_each_entry_from(__sk, node, sk_node)
-#define sk_for_each_continue(__sk, node) \
- if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
- hlist_for_each_entry_continue(__sk, node, sk_node)
#define sk_for_each_safe(__sk, node, tmp, list) \
hlist_for_each_entry_safe(__sk, node, tmp, list, sk_node)
#define sk_for_each_bound(__sk, node, list) \
- hlist_for_each_entry(__sk, node, list, sk_bind_node)
+ hlist_for_each_entry_rcu(__sk, node, list, sk_bind_node)
/* Sock flags */
enum sock_flags {
diff --git a/net/core/sock.c b/net/core/sock.c
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -686,8 +686,9 @@ out_free:
return NULL;
}
-void sk_free(struct sock *sk)
+void __sk_free(struct rcu_head *rcu)
{
+ struct sock *sk = container_of(rcu, struct sock, sk_rcu_head);
struct sk_filter *filter;
struct module *owner = sk->sk_prot_creator->owner;
@@ -714,6 +715,11 @@ void sk_free(struct sock *sk)
module_put(owner);
}
+void sk_free(struct sock *sk)
+{
+ call_rcu_bh(&sk->sk_rcu_head, __sk_free);
+}
+
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -805,18 +805,22 @@ static struct sock *dccp_v4_hnd_req(stru
if (req != NULL)
return dccp_check_req(sk, skb, req, prev);
+ rcu_read_lock_bh();
nsk = __inet_lookup_established(&dccp_hashinfo,
iph->saddr, dh->dccph_sport,
iph->daddr, ntohs(dh->dccph_dport),
inet_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
+ sock_hold(nsk);
bh_lock_sock(nsk);
+ rcu_read_unlock_bh();
return nsk;
}
- inet_twsk_put((struct inet_timewait_sock *)nsk);
+ rcu_read_unlock_bh();
return NULL;
}
+ rcu_read_unlock_bh();
return sk;
}
@@ -1113,6 +1117,7 @@ int dccp_v4_rcv(struct sk_buff *skb)
/* Step 2:
* Look up flow ID in table and get corresponding socket */
+ rcu_read_lock_bh();
sk = __inet_lookup(&dccp_hashinfo,
skb->nh.iph->saddr, dh->dccph_sport,
skb->nh.iph->daddr, ntohs(dh->dccph_dport),
@@ -1127,9 +1132,13 @@ int dccp_v4_rcv(struct sk_buff *skb)
if (sk == NULL) {
dccp_pr_debug("failed to look up flow ID in table and "
"get corresponding socket\n");
+ rcu_read_unlock_bh();
goto no_dccp_socket;
}
+ sock_hold(sk);
+ rcu_read_unlock_bh();
+
/*
* Step 2:
* ... or S.state == TIMEWAIT,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -812,7 +812,7 @@ static int __init dccp_init(void)
}
for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
- rwlock_init(&dccp_hashinfo.ehash[i].lock);
+ rwlock_init(&dccp_hashinfo.ehash[i].wlock);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -671,7 +671,7 @@ skip_listen_ht:
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
+ rcu_read_lock_bh(); /* &head->lock */
num = 0;
sk_for_each(sk, node, &head->chain) {
@@ -687,7 +687,7 @@ skip_listen_ht:
if (r->id.idiag_dport != inet->dport &&
r->id.idiag_dport)
goto next_normal;
if (inet_diag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ rcu_read_unlock_bh(); /* &head->lock */
goto done;
}
next_normal:
@@ -708,14 +708,14 @@ next_normal:
r->id.idiag_dport)
goto next_dying;
if (inet_diag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
+ rcu_read_unlock_bh(); /* &head->lock */
goto done;
}
next_dying:
++num;
}
}
- read_unlock_bh(&head->lock);
+ rcu_read_unlock_bh(); /* &head->lock */
}
done:
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,17 @@
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+void __inet_twsk_put(struct rcu_head *tw_rcu)
+{
+ struct inet_timewait_sock *tw = container_of(tw_rcu,
+ struct inet_timewait_sock, tw_rcu_head);
+ struct module *owner = tw->tw_prot->owner;
+ sk_node_init(&tw->tw_node);
+ kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+ module_put(owner);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_put);
+
/* Must be called with locally disabled BHs. */
void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
{
@@ -22,14 +33,13 @@ void __inet_twsk_kill(struct inet_timewa
/* Unlink from established hashes. */
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo,
tw->tw_hash);
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ write_lock(&ehead->wlock);
+ if (hlist_unhashed(&tw->tw_node) || tw->tw_node.pprev == LIST_POISON2) {
+ write_unlock(&ehead->wlock);
return;
}
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ hlist_del_rcu(&tw->tw_node);
+ write_unlock(&ehead->wlock);
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num,
hashinfo->bhash_size)];
@@ -73,7 +83,7 @@ void __inet_twsk_hashdance(struct inet_t
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ write_lock(&ehead->wlock);
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
@@ -83,7 +93,7 @@ void __inet_twsk_hashdance(struct inet_t
inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
atomic_inc(&tw->tw_refcnt);
- write_unlock(&ehead->lock);
+ write_unlock(&ehead->wlock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2342,7 +2342,7 @@ void __init tcp_init(void)
0);
tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
- rwlock_init(&tcp_hashinfo.ehash[i].lock);
+ rwlock_init(&tcp_hashinfo.ehash[i].wlock);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -136,7 +136,7 @@ static int __tcp_v4_check_established(st
struct inet_timewait_sock *tw;
prefetch(head->chain.first);
- write_lock(&head->lock);
+ write_lock(&head->wlock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
@@ -192,7 +192,7 @@ unique:
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(&head->wlock);
if (twp) {
*twp = tw;
@@ -208,7 +208,7 @@ unique:
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(&head->wlock);
return -EADDRNOTAVAIL;
}
@@ -1077,7 +1077,7 @@ exit:
return NULL;
}
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static noinline struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = skb->h.th;
struct iphdr *iph = skb->nh.iph;
@@ -1089,18 +1089,22 @@ static struct sock *tcp_v4_hnd_req(struc
if (req)
return tcp_check_req(sk, skb, req, prev);
+ rcu_read_lock_bh();
nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
th->source, skb->nh.iph->daddr,
ntohs(th->dest), inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
+ sock_hold(nsk);
bh_lock_sock(nsk);
+ rcu_read_unlock_bh();
return nsk;
}
- inet_twsk_put((struct inet_timewait_sock *)nsk);
+ rcu_read_unlock_bh();
return NULL;
}
+ rcu_read_unlock_bh();
#ifdef CONFIG_SYN_COOKIES
if (!th->rst && !th->syn && th->ack)
@@ -1194,6 +1198,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
struct sock *sk;
int ret;
+ rcu_read_lock_bh();
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1230,7 +1235,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
skb->nh.iph->daddr, ntohs(th->dest),
inet_iif(skb));
-
if (!sk)
goto no_tcp_socket;
@@ -1261,7 +1265,7 @@ process:
bh_unlock_sock(sk);
}
- sock_put(sk);
+ rcu_read_unlock_bh();
return ret;
@@ -1277,15 +1281,18 @@ bad_packet:
}
discard_it:
+ rcu_read_unlock_bh();
+
/* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
- sock_put(sk);
+ rcu_read_unlock_bh();
goto discard_it;
do_time_wait:
+ sock_hold(sk);
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
@@ -1597,7 +1604,6 @@ static void *established_get_first(struc
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -1614,7 +1620,6 @@ static void *established_get_first(struc
rc = tw;
goto out;
}
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -1641,16 +1646,14 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
- /* We can reschedule between buckets: */
+ /* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- if (++st->bucket < tcp_hashinfo.ehash_size) {
- read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ if (++st->bucket < tcp_hashinfo.ehash_size)
sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
- } else {
+ else {
cur = NULL;
goto out;
}
@@ -1693,7 +1696,7 @@ static void *tcp_get_idx(struct seq_file
if (!rc) {
inet_listen_unlock(&tcp_hashinfo);
- local_bh_disable();
+ rcu_read_lock_bh();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
}
@@ -1726,7 +1729,7 @@ static void *tcp_seq_next(struct seq_fil
rc = listening_get_next(seq, v);
if (!rc) {
inet_listen_unlock(&tcp_hashinfo);
- local_bh_disable();
+ rcu_read_lock_bh();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_first(seq);
}
@@ -1757,9 +1760,7 @@ static void tcp_seq_stop(struct seq_file
break;
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
- if (v)
- read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
- local_bh_enable();
+ rcu_read_unlock_bh();
break;
}
}
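
To recap the contract the hunks above establish (an illustrative summary
mirroring inet_lookup() and inet_unhash()/sk_free() in the patch, not
additional changes): established-hash lookups now run inside
rcu_read_lock_bh()/rcu_read_unlock_bh() and take their own reference before
leaving the section; writers still serialize on the renamed per-bucket wlock
and unlink with hlist_del_rcu(); and the final freeing of sockets and
timewait sockets is deferred through call_rcu_bh(), so a concurrent lockless
reader never touches freed memory.

/* Illustrative recap of the pattern used above; not additional patch code. */

/* Reader side: look up under an RCU-bh section, take the reference inside it. */
rcu_read_lock_bh();
sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
if (sk)
	sock_hold(sk);
rcu_read_unlock_bh();

/* Writer side: unlink under the per-bucket wlock, defer the free past readers. */
lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->wlock;
write_lock_bh(lock);
__sk_del_node(sk);		/* now hlist_del_rcu() */
write_unlock_bh(lock);
sk_free(sk);			/* reached via sock_put(); now call_rcu_bh(..., __sk_free) */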
> -ben
--
Evgeniy Polyakov