On Wed, Jun 20, 2007 at 12:17:04AM -0400, C. Scott Ananian wrote: > I'm working on a patch to implement RDNSS options > in Router Advertisement messages in IPv6. (Draft RFC at: > http://tools.ietf.org/html/draft-jeong-dnsop-ipv6-dns-discovery-12 > Support is already in radvd.) > > I don't quite understand how 'struct rt6_info' allocation/deallocation > and locking are happening. For example, where are rt6_info structs > deallocated? I couldn't find a call to any sort of free in any of the > places I expected. When I'm writing to rt6_info during > autoconfiguration, how do I ensure that it is not concurrently mutated > or deallocated? It didn't seem like there was a per-struct lock, but > none of the coarser locks I found seemed quite right. > > Any help or pointers you could give would be appreciated. My > (partial, unfinished) patch is appended, so you can get an idea of > what I'm doing.
The rt6_info struct seems to be protected by RCU via the fib structures, so I would suggest taking a look at the files in Documentation/RCU in a recent Linux-kernel source tree if you have not already done so. The basic trick is that an "RCU read-side critical section" (which begins with rcu_read_lock() and ends with rcu_read_unlock()) prevents any subsequent "grace period" from completing before the RCU read-side critical section completes. Primitives like synchronize_rcu() (AKA synchronize_net()) wait for a grace period to complete. So if you remove an element from an RCU-protected data structure and then execute synchronize_rcu(), you will be guaranteed that no readers hold references to the removed element after return from synchronize_rcu(). The upshot is that a read-mostly data structure can use coarse-grained locking to guard updates. Readers can often avoid any synchronization instructions whatsoever, though it looks like some of the rt6_info code paths may use reference counting in conjunction with RCU. Thanx, Paul > Thanks! 
> --scott > --------- > > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ip6_fib.h > linux-2.6.22-rc5/include/net/ip6_fib.h > --- linux-2.6.22-rc5-orig/include/net/ip6_fib.h 2007-06-16 > 22:09:12.000000000 -0400 > +++ linux-2.6.22-rc5/include/net/ip6_fib.h 2007-06-19 > 12:00:57.000000000 -0400 > @@ -79,6 +79,7 @@ struct rt6key > }; > > struct fib6_table; > +struct rdns6_info; > > struct rt6_info > { > @@ -105,6 +106,8 @@ struct rt6_info > struct rt6key rt6i_src; > > u8 rt6i_protocol; > + > + struct rdns6_info *rt6i_rdnss; > }; > > static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ip6_rdnss.h > linux-2.6.22-rc5/include/net/ip6_rdnss.h > --- linux-2.6.22-rc5-orig/include/net/ip6_rdnss.h 1969-12-31 > 19:00:00.000000000 -0500 > +++ linux-2.6.22-rc5/include/net/ip6_rdnss.h 2007-06-19 > 16:42:26.000000000 -0400 > @@ -0,0 +1,27 @@ > +#ifndef _NET_IP6_RDNSS_H > +#define _NET_IP6_RDNSS_H > + > +#ifdef __KERNEL__ > + > +#include <linux/in6.h> > + > +struct nd_opt_rdnss { > + __u8 type; > + __u8 length; > + __u16 reserved; > + __be32 lifetime; > + struct in6_addr rdnss[1]; /* 1 or more */ > +}; > + > +struct rdns6_info { > + struct rdns6_info * next; > + struct in6_addr rdnss; > + __u32 lifetime; > + unsigned long expires; > +}; > + > +extern void rdns6_rcv(struct inet6_dev *dev, struct rt6_info *rt, > + struct nd_opt_rdnss **opts, int opt_cnt); > + > +#endif > +#endif > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ndisc.h > linux-2.6.22-rc5/include/net/ndisc.h > --- linux-2.6.22-rc5-orig/include/net/ndisc.h 2007-06-16 > 22:09:12.000000000 -0400 > +++ linux-2.6.22-rc5/include/net/ndisc.h 2007-06-18 > 15:30:00.000000000 -0400 > @@ -24,6 +24,7 @@ enum { > ND_OPT_MTU = 5, /* RFC2461 */ > __ND_OPT_ARRAY_MAX, > ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ > + ND_OPT_RDNSS_INFO = 25, /* draft/radvd */ > __ND_OPT_MAX > }; > > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/Makefile 
> linux-2.6.22-rc5/net/ipv6/Makefile > --- linux-2.6.22-rc5-orig/net/ipv6/Makefile 2007-06-16 > 22:09:12.000000000 -0400 > +++ linux-2.6.22-rc5/net/ipv6/Makefile 2007-06-18 16:39:02.000000000 > -0400 > @@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_ou > route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ > raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ > exthdrs.o sysctl_net_ipv6.o datagram.o \ > - ip6_flowlabel.o inet6_connection_sock.o > + ip6_flowlabel.o inet6_connection_sock.o ip6_rdnss.o > > ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ > xfrm6_output.o > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/ip6_rdnss.c > linux-2.6.22-rc5/net/ipv6/ip6_rdnss.c > --- linux-2.6.22-rc5-orig/net/ipv6/ip6_rdnss.c 1969-12-31 > 19:00:00.000000000 -0500 > +++ linux-2.6.22-rc5/net/ipv6/ip6_rdnss.c 2007-06-19 > 19:01:04.000000000 -0400 > @@ -0,0 +1,260 @@ > +/* > + * Recursive DNS Server autoconfiguration for IPv6 > + * Linux INET6 implementation. > + * > + * Authors: > + * C. Scott Ananian <[EMAIL PROTECTED]> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + */ > + > +#include <linux/errno.h> > +#include <linux/types.h> > +#include <linux/timer.h> > +#include <linux/spinlock.h> > + > +#include <linux/in6.h> > +#include <linux/ipv6.h> > +#include <linux/icmpv6.h> > + > +#include <net/ipv6.h> > +#include <net/ip6_fib.h> > +#include <net/ip6_rdnss.h> > + > +#define INFINITY_LIFE_TIME 0xFFFFFFFF > +/* the maximum number of recursive DNS servers we'll remember per > + * router. we have to set some limit to prevent an easy DoS, and > + * 3 DNS servers seems to be standard practice. 
*/ > +#define __RDNS6_MAX_ENTRIES 3 > + > +static void rdns6_expire(unsigned long); > + > +static DEFINE_TIMER(rdns6_chk_timer, rdns6_expire, 0, 0); > +static DEFINE_SPINLOCK(rdns6_expire_lock); > + > +static int rdns6_update_entry(struct rdns6_info **p, struct in6_addr *addr, > + uint32_t lifetime) { > + int changed = false; > + /* if lifetime is zero, delete this entry */ > + if (lifetime == 0) { > + struct rdns6_info *r6i = (*p); > + *p = (*p)->next; > + kfree(r6i); > + return true; > + } > + /* otherwise, update lifetime and expiration time. */ > + if (lifetime > (*p)->lifetime) > + (*p)->lifetime = lifetime; > + if ((*p)->lifetime != INFINITY_LIFE_TIME) { > + unsigned long nexpires = jiffies + lifetime * HZ; > + if (time_before((*p)->expires, nexpires)) > + (*p)->expires = nexpires; > + } > + return changed; > +} > + > +/* According to the draft RFC, if we need to delete an entry, "delete the > + * entry with the smallest expiration time that will expire first". */ > +static int rdns6_cmp_entry(struct rdns6_info *a, struct rdns6_info *b) { > + if ( a->lifetime != b->lifetime ) > + return a->lifetime < b->lifetime ? -1 : 1; > + if (time_before( a->expires, b->expires )) > + return -1; > + if (time_after( a->expires, b->expires )) > + return 1; > + return 0; > +} > + > +/* Look for an entry in the DNS server list which is 'worse' than this one; > + * delete it if found. */ > +static int rdns6_expire_worse(struct rt6_info *rt, struct rdns6_info > *nentry){ > + struct rdns6_info **worst = NULL, **p; > + for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) { > + if (worst==NULL || > + rdns6_cmp_entry(*worst, *p) < 0) > + worst = p; > + } > + if (worst && rdns6_cmp_entry(*worst, nentry) < 0) { > + struct rdns6_info *r6i = (*worst); > + *worst = (*worst)->next; /* delete it */ > + kfree(r6i); > + return true; > + } > + return false; > +} > + > +/* Create a new rdns6_info entry. 
*/ > +static struct rdns6_info *rdns6_create_entry(struct in6_addr *addr, > + uint32_t lifetime) { > + struct rdns6_info *result; > + result = kzalloc(sizeof(*result), GFP_KERNEL); > + if (result) { > + ipv6_addr_copy(&(result->rdnss), addr); > + result->lifetime = lifetime; > + result->expires = (lifetime==INFINITY_LIFE_TIME) ? 0 : > + jiffies + lifetime * HZ; > + } > + return result; > +} > + > +/* Process a newly-received RDNSS option from a RAdv message. */ > +void rdns6_rcv(struct inet6_dev *dev, struct rt6_info *rt, > + struct nd_opt_rdnss **opts, int opt_cnt) { > + struct rdns6_info **p, **insert_point; > + int i, j, changed = false, num_entries = 0, dont_need_expires = true; > + unsigned long next_expiry; > + uint32_t lifetime; > + /* first, count the # of dns server list entries we've already got */ > + for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) { > + num_entries++; > + } > + /* now let's process all the RDNSS options in the RA */ > + insert_point = &(rt->rt6i_rdnss); /* add to the start of the list */ > + for (i=0; i<opt_cnt; i++) { > + int len = opts[i]->length << 3; > + if (len < sizeof(struct nd_opt_rdnss)) { > + printk(KERN_WARNING > + "ICMPv6 RA: bad RDNSS option length\n"); > + continue; > + } > + lifetime = opts[i]->lifetime; > + printk(KERN_WARNING > + "Got an RDNSS message via RA, lifetime: %u\n", > + lifetime); > + for (j=0; (j+1)*sizeof(struct in6_addr) <= len-8; j++) { > + struct in6_addr *addr = &(opts[i]->rdnss[j]); > + /* find this entry in the list. */ > + struct rdns6_info **p; > + for (p = &(rt->rt6i_rdnss); > + *p != NULL; > + p = &((*p)->next)) { > + if (ipv6_addr_equal(addr, &((*p)->rdnss))) > + break; > + } > + if (*p) { > + /* we found an existing entry, update it. */ > + if (rdns6_update_entry(p, addr, lifetime)) > + changed = true; > + if (lifetime == 0) > + num_entries--; > + } else if (lifetime) { > + /* no existing entry. make one. 
*/ > + struct rdns6_info *nentry = > + rdns6_create_entry(addr, lifetime); > + /* make room if we must (and if we can) */ > + if (num_entries >= __RDNS6_MAX_ENTRIES) { > + /* see if we can expire an entry */ > + if (rdns6_expire_worse(rt, nentry)) > + num_entries--; > + } > + /* if we have room now, add an entry. */ > + if (num_entries < __RDNS6_MAX_ENTRIES) { > + nentry->next = *insert_point; > + *insert_point = nentry; > + insert_point = &(nentry->next); > + changed = true; > + num_entries++; > + } > + } > + } > + } > + /* okay, we're done looking at this batch of options. */ > + /* find earliest expiration time */ > + for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) { > + if ((*p)->lifetime != INFINITY_LIFE_TIME) { > + if (dont_need_expires || > + time_before((*p)->expires, next_expiry)) { > + next_expiry = (*p)->expires; > + dont_need_expires = false; > + } > + } > + } > + /* reset expiration timer */ > + if (dont_need_expires) > + del_timer(&rdns6_chk_timer); > + else > + mod_timer(&rdns6_chk_timer, next_expiry); > + > + /* notify userland if our DNS list has changed */ > + if (changed) > + inet6_ifinfo_notify(RTM_NEWLINK, dev); > + > + /* DEBUGGING */ > + printk(KERN_WARNING "RDNSS RA from gateway > %x:%x:%x:%x:%x:%x:%x:%x\n", > + ntohs(rt->rt6i_gateway.s6_addr16[0]), > + ntohs(rt->rt6i_gateway.s6_addr16[1]), > + ntohs(rt->rt6i_gateway.s6_addr16[2]), > + ntohs(rt->rt6i_gateway.s6_addr16[3]), > + ntohs(rt->rt6i_gateway.s6_addr16[4]), > + ntohs(rt->rt6i_gateway.s6_addr16[5]), > + ntohs(rt->rt6i_gateway.s6_addr16[6]), > + ntohs(rt->rt6i_gateway.s6_addr16[7])); > + for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) { > + printk(KERN_WARNING " - %x:%x:%x:%x:%x:%x:%x:%x " > + "(lifetime %d)\n", > + ntohs((*p)->rdnss.s6_addr16[0]), > + ntohs((*p)->rdnss.s6_addr16[1]), > + ntohs((*p)->rdnss.s6_addr16[2]), > + ntohs((*p)->rdnss.s6_addr16[3]), > + ntohs((*p)->rdnss.s6_addr16[4]), > + ntohs((*p)->rdnss.s6_addr16[5]), > + 
ntohs((*p)->rdnss.s6_addr16[6]), > + ntohs((*p)->rdnss.s6_addr16[7]), > + (*p)->lifetime); > + } > +} > + > +static void rdns6_expire(unsigned long _ignore) { > + struct rdns6_info **p; > + unsigned long now; > + > + //spin_lock_bh(&rdns6_expire_lock); > + now = jiffies; > + > + del_timer(&rdns6_chk_timer); > + > + /* find expired DNS entries & delete them */ > + for (p = &(rt->rt6i_rdnss); *p != NULL; ) { > + if (time_before((*p)->expires, now)) { > + struct rdns6_info *r6i = (*p); > + *p = (*p)->next; > + kfree(r6i); > + continue; > + } > + p = &((*p)->next); > + } > + /* reset */ > + add_timer(&rdns6_chk_timer); > +} > +/** > + notes on draft: > + server list should be kept per-router so that the resolv.conf doesn't > + ping-pong when two routers are broadcasting RAs. > + > + DNS timeout: like RA, router is responsible for broadcasting w/ > + time < timeout. What if about to expire? Can/should give RS? > + > + use fib6_clean_all to implement rdns6_expire? this will walk all > routes. > + (maybe overkill) > + > + bug: how to lock rt6_info while we're mutating dns entries? > + > + bug: how to update timer appropriately; when we modify one rt6_info, > + we don't want to scan all. so only shorten timer (which means sometimes > + we'll trigger timer when it's not needed). At some point > + we need to del_timer (when?) > + > + bug: when route is deleted (RA times out?) we need to free the > + DNS server list. (can't find where the rt6_info is deallocated?) > + > + bug: use expire_lock to ensure we don't run expiry multiple times > + concurrently. > + > + xxx: implement appropriate fill message to export the server list > + via netlink. > + > + xxx: use round_jiffies? 
> +*/ > diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/ndisc.c > linux-2.6.22-rc5/net/ipv6/ndisc.c > --- linux-2.6.22-rc5-orig/net/ipv6/ndisc.c 2007-06-16 > 22:09:12.000000000 -0400 > +++ linux-2.6.22-rc5/net/ipv6/ndisc.c 2007-06-19 16:02:36.000000000 -0400 > @@ -15,6 +15,8 @@ > /* > * Changes: > * > + * C. Scott Ananian : RDNSS-in-RA support. > + * > * Lars Fenneberg : fixed MTU setting on receipt > * of an RA. > * > @@ -75,6 +77,7 @@ > #include <net/protocol.h> > #include <net/ndisc.h> > #include <net/ip6_route.h> > +#include <net/ip6_rdnss.h> > #include <net/addrconf.h> > #include <net/icmp.h> > > @@ -155,12 +158,16 @@ struct neigh_table nd_tbl = { > }; > > /* ND options */ > +#define __ND_OPT_RDNSS_MAX 6 /* 3 new servers + 3 cancellations */ > + > struct ndisc_options { > struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; > #ifdef CONFIG_IPV6_ROUTE_INFO > struct nd_opt_hdr *nd_opts_ri; > struct nd_opt_hdr *nd_opts_ri_end; > #endif > + int nd_opts_rdnss_cnt; > + struct nd_opt_hdr *nd_opts_rdnss[__ND_OPT_RDNSS_MAX]; > }; > > #define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] > @@ -266,6 +273,12 @@ static struct ndisc_options *ndisc_parse > ndopts->nd_opts_ri = nd_opt; > break; > #endif > + case ND_OPT_RDNSS_INFO: > + /* limit # of RDNSS options accepted to prevent DoS > */ > + if (ndopts->nd_opts_rdnss_cnt < __ND_OPT_RDNSS_MAX) > + ndopts->nd_opts_rdnss > + [ndopts->nd_opts_rdnss_cnt++]= > nd_opt; > + break; > default: > /* > * Unknown options must be silently ignored, > @@ -1045,7 +1058,36 @@ static void ndisc_router_discovery(struc > /* > * Remember the managed/otherconf flags from most recently > * received RA message (RFC 2462) -- yoshfuji > */ > + /* From RFC2462, section 5.5.3: > + On receipt of a valid Router Advertisement (as defined in > + [DISCOVERY]), a host copies the value of the advertisement's M bit > + into ManagedFlag. 
If the value of ManagedFlag changes from FALSE to > + TRUE, and the host is not already running the stateful address > + autoconfiguration protocol, the host should invoke the stateful > + address autoconfiguration protocol, requesting both address > + information and other information. If the value of the ManagedFlag > + changes from TRUE to FALSE, the host should continue running the > + stateful address autoconfiguration, i.e., the change in the value of > + the ManagedFlag has no effect. If the value of the flag stays > + unchanged, no special action takes place. In particular, a host MUST > + NOT reinvoke stateful address configuration if it is already > + participating in the stateful protocol as a result of an earlier > + advertisement. > + > + An advertisement's O flag field is processed in an analogous manner. > + A host copies the value of the O flag into OtherConfigFlag. If the > + value of OtherConfigFlag changes from FALSE to TRUE, the host should > + invoke the stateful autoconfiguration protocol, requesting > + information (excluding addresses if ManagedFlag is set to FALSE). If > + the value of the OtherConfigFlag changes from TRUE to FALSE, the host > + should continue running the stateful address autoconfiguration > + protocol, i.e., the change in the value of OtherConfigFlag has no > + effect. If the value of the flag stays unchanged, no special action > + takes place. In particular, a host MUST NOT reinvoke stateful > + configuration if it is already participating in the stateful protocol > + as a result of an earlier advertisement. > + */ > in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | > IF_RA_OTHERCONF)) | > (ra_msg->icmph.icmp6_addrconf_managed ? 
> @@ -1187,6 +1232,12 @@ skip_defrtr: > } > #endif > > + if (rt && ndopts.nd_opts_rdnss_cnt) { > + rdns6_rcv(in6_dev, rt, > + (struct nd_opt_rdnss **) ndopts.nd_opts_rdnss, > + ndopts.nd_opts_rdnss_cnt); > + } > + > if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { > struct nd_opt_hdr *p; > for (p = ndopts.nd_opts_pi; > > -- > ( http://cscott.net/ ) > - > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html