On Wed, Jun 20, 2007 at 12:17:04AM -0400, C. Scott Ananian wrote:
> I'm working on a patch to implement RDNSS options
> in Router Advertisement messages in IPv6.  (Draft RFC at:
> http://tools.ietf.org/html/draft-jeong-dnsop-ipv6-dns-discovery-12
> Support is already in radvd.)
> 
> I don't quite understand how 'struct rt6_info' allocation/deallocation
> and locking are happening.   For example, where are rt6_info
> deallocated?  I couldn't find a call to any sort of free in any of the
> places I expected.  When I'm writing to rt6_info during
> autoconfiguration, how do I ensure that it is not concurrently mutated
> or deallocated?  It didn't seem like there was a per-struct lock, but
> none of the coarser locks I found seemed quite right.
> 
> Any help or pointers you could give would be appreciated.  My
> (partial, unfinished) patch is appended, so you can get an idea of
> what I'm doing.

The rt6_info struct seems to be protected by RCU via the fib structures,
so I would suggest taking a look at the files in Documentation/RCU in
a recent Linux-kernel source tree if you have not already done so.

The basic trick is that an "RCU read-side critical section" (which
begins with rcu_read_lock() and ends with rcu_read_unlock()) prevents
any subsequent "grace period" from completing before the RCU read-side
critical section completes.  Primitives like synchronize_rcu() (AKA
synchronize_net()) wait for a grace period to complete.  So if you
remove an element from an RCU-protected data structure and then execute
synchronize_rcu(), you will be guaranteed that no readers hold references
to the removed element after return from synchronize_rcu().

The upshot is that a read-mostly data structure can use coarse-grained
locking to guard updates.  Readers can often avoid any synchronization
instructions whatsoever, though it looks like some of the rt6_info
code paths may use reference counting in conjunction with RCU.

                                                Thanx, Paul

> Thanks!
> --scott
> ---------
> 
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ip6_fib.h
> linux-2.6.22-rc5/include/net/ip6_fib.h
> --- linux-2.6.22-rc5-orig/include/net/ip6_fib.h       2007-06-16
> 22:09:12.000000000 -0400
> +++ linux-2.6.22-rc5/include/net/ip6_fib.h    2007-06-19 
> 12:00:57.000000000 -0400
> @@ -79,6 +79,7 @@ struct rt6key
> };
> 
> struct fib6_table;
> +struct rdns6_info;
> 
> struct rt6_info
> {
> @@ -105,6 +106,8 @@ struct rt6_info
>       struct rt6key                   rt6i_src;
> 
>       u8                              rt6i_protocol;
> +
> +        struct rdns6_info               *rt6i_rdnss;
> };
> 
> static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ip6_rdnss.h
> linux-2.6.22-rc5/include/net/ip6_rdnss.h
> --- linux-2.6.22-rc5-orig/include/net/ip6_rdnss.h     1969-12-31
> 19:00:00.000000000 -0500
> +++ linux-2.6.22-rc5/include/net/ip6_rdnss.h  2007-06-19 
> 16:42:26.000000000 -0400
> @@ -0,0 +1,27 @@
> +#ifndef _NET_IP6_RDNSS_H
> +#define _NET_IP6_RDNSS_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/in6.h>
> +
> +struct nd_opt_rdnss {
> +     __u8                    type;
> +     __u8                    length;
> +     __u16                   reserved;
> +     __be32                  lifetime;
> +     struct in6_addr         rdnss[1];       /* 1 or more */
> +};
> +
> +struct rdns6_info {
> +     struct rdns6_info *     next;
> +     struct in6_addr         rdnss;
> +     __u32                   lifetime;
> +     unsigned long           expires;
> +};
> +
> +extern void  rdns6_rcv(struct inet6_dev *dev, struct rt6_info *rt,
> +                       struct nd_opt_rdnss **opts, int opt_cnt);
> +
> +#endif
> +#endif
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/include/net/ndisc.h
> linux-2.6.22-rc5/include/net/ndisc.h
> --- linux-2.6.22-rc5-orig/include/net/ndisc.h 2007-06-16
> 22:09:12.000000000 -0400
> +++ linux-2.6.22-rc5/include/net/ndisc.h      2007-06-18 
> 15:30:00.000000000 -0400
> @@ -24,6 +24,7 @@ enum {
>       ND_OPT_MTU = 5,                 /* RFC2461 */
>       __ND_OPT_ARRAY_MAX,
>       ND_OPT_ROUTE_INFO = 24,         /* RFC4191 */
> +     ND_OPT_RDNSS_INFO = 25,         /* draft/radvd */
>       __ND_OPT_MAX
> };
> 
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/Makefile
> linux-2.6.22-rc5/net/ipv6/Makefile
> --- linux-2.6.22-rc5-orig/net/ipv6/Makefile   2007-06-16 
> 22:09:12.000000000 -0400
> +++ linux-2.6.22-rc5/net/ipv6/Makefile        2007-06-18 16:39:02.000000000 
> -0400
> @@ -8,7 +8,7 @@ ipv6-objs :=  af_inet6.o anycast.o ip6_ou
>               route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
>               raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
>               exthdrs.o sysctl_net_ipv6.o datagram.o \
> -             ip6_flowlabel.o inet6_connection_sock.o
> +             ip6_flowlabel.o inet6_connection_sock.o ip6_rdnss.o
> 
> ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
>       xfrm6_output.o
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/ip6_rdnss.c
> linux-2.6.22-rc5/net/ipv6/ip6_rdnss.c
> --- linux-2.6.22-rc5-orig/net/ipv6/ip6_rdnss.c        1969-12-31
> 19:00:00.000000000 -0500
> +++ linux-2.6.22-rc5/net/ipv6/ip6_rdnss.c     2007-06-19 
> 19:01:04.000000000 -0400
> @@ -0,0 +1,260 @@
> +/*
> + *   Recursive DNS Server autoconfiguration for IPv6
> + *      Linux INET6 implementation.
> + *
> + *   Authors:
> + *   C. Scott Ananian        <[EMAIL PROTECTED]>
> + *
> + *   This program is free software; you can redistribute it and/or
> + *      modify it under the terms of the GNU General Public License
> + *      as published by the Free Software Foundation; either version
> + *      2 of the License, or (at your option) any later version.
> + */
> +
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +#include <linux/timer.h>
> +#include <linux/spinlock.h>
> +
> +#include <linux/in6.h>
> +#include <linux/ipv6.h>
> +#include <linux/icmpv6.h>
> +
> +#include <net/ipv6.h>
> +#include <net/ip6_fib.h>
> +#include <net/ip6_rdnss.h>
> +
> +#define INFINITY_LIFE_TIME   0xFFFFFFFF
> +/* the maximum number of recursive DNS servers we'll remember per
> + * router.  we have to set some limit to prevent an easy DoS, and
> + * 3 DNS servers seems to be standard practice. */
> +#define __RDNS6_MAX_ENTRIES  3
> +
> +static void rdns6_expire(unsigned long);
> +
> +static DEFINE_TIMER(rdns6_chk_timer, rdns6_expire, 0, 0);
> +static DEFINE_SPINLOCK(rdns6_expire_lock);
> +
> +static int rdns6_update_entry(struct rdns6_info **p, struct in6_addr *addr,
> +                           uint32_t lifetime) {
> +     int changed = false;
> +     /* if lifetime is zero, delete this entry */
> +     if (lifetime == 0) {
> +             struct rdns6_info *r6i = (*p);
> +             *p = (*p)->next;
> +             kfree(r6i);
> +             return true;
> +     }
> +     /* otherwise, update lifetime and expiration time. */
> +     if (lifetime > (*p)->lifetime)
> +             (*p)->lifetime = lifetime;
> +     if ((*p)->lifetime != INFINITY_LIFE_TIME) {
> +             unsigned long nexpires = jiffies + lifetime * HZ;
> +             if (time_before((*p)->expires, nexpires))
> +                     (*p)->expires = nexpires;
> +     }
> +     return changed;
> +}
> +
> +/* According to the draft RFC, if we need to delete an entry, "delete the
> + * entry with the smallest expiration time that will expire first". */
> +static int rdns6_cmp_entry(struct rdns6_info *a, struct rdns6_info *b) {
> +     if ( a->lifetime != b->lifetime )
> +             return a->lifetime < b->lifetime ? -1 : 1;
> +     if (time_before( a->expires, b->expires ))
> +             return -1;
> +     if (time_after( a->expires, b->expires ))
> +             return 1;
> +     return 0;
> +}
> +
> +/* Look for an entry in the DNS server list which is 'worse' than this one;
> + * delete it if found. */
> +static int rdns6_expire_worse(struct rt6_info *rt, struct rdns6_info 
> *nentry){
> +     struct rdns6_info **worst = NULL, **p;
> +     for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) {
> +             if (worst==NULL ||
> +                 rdns6_cmp_entry(*worst, *p) < 0)
> +                     worst = p;
> +     }
> +     if (worst && rdns6_cmp_entry(*worst, nentry) < 0) {
> +             struct rdns6_info *r6i = (*worst);
> +             *worst = (*worst)->next; /* delete it */
> +             kfree(r6i);
> +             return true;
> +     }
> +     return false;
> +}
> +
> +/* Create a new rdns6_info entry. */
> +static struct rdns6_info *rdns6_create_entry(struct in6_addr *addr,
> +                                          uint32_t lifetime) {
> +     struct rdns6_info *result;
> +     result = kzalloc(sizeof(*result), GFP_KERNEL);
> +     if (result) {
> +             ipv6_addr_copy(&(result->rdnss), addr);
> +             result->lifetime = lifetime;
> +             result->expires = (lifetime==INFINITY_LIFE_TIME) ? 0 :
> +                     jiffies + lifetime * HZ;
> +     }
> +     return result;
> +}
> +
> +/* Process a newly-received RDNSS option from a RAdv message. */
> +void rdns6_rcv(struct inet6_dev *dev, struct rt6_info *rt,
> +            struct nd_opt_rdnss **opts, int opt_cnt) {
> +     struct rdns6_info **p, **insert_point;
> +     int i, j, changed = false, num_entries = 0, dont_need_expires = true;
> +     unsigned long next_expiry;
> +     uint32_t lifetime;
> +     /* first, count the # of dns server list entries we've already got */
> +     for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) {
> +             num_entries++;
> +     }
> +     /* now let's process all the RDNSS options in the RA */
> +     insert_point = &(rt->rt6i_rdnss); /* add to the start of the list */
> +     for (i=0; i<opt_cnt; i++) {
> +             int len = opts[i]->length << 3;
> +             if (len < sizeof(struct nd_opt_rdnss)) {
> +                     printk(KERN_WARNING
> +                            "ICMPv6 RA: bad RDNSS option length\n");
> +                     continue;
> +             }
> +             lifetime = opts[i]->lifetime;
> +             printk(KERN_WARNING
> +                    "Got an RDNSS message via RA, lifetime: %u\n",
> +                    lifetime);
> +             for (j=0; (j+1)*sizeof(struct in6_addr) <= len-8; j++) {
> +                     struct in6_addr *addr = &(opts[i]->rdnss[j]);
> +                     /* find this entry in the list. */
> +                     struct rdns6_info **p;
> +                     for (p = &(rt->rt6i_rdnss);
> +                          *p != NULL;
> +                          p = &((*p)->next)) {
> +                             if (ipv6_addr_equal(addr, &((*p)->rdnss)))
> +                                     break;
> +                     }
> +                     if (*p) {
> +                             /* we found an existing entry, update it. */
> +                             if (rdns6_update_entry(p, addr, lifetime))
> +                                     changed = true;
> +                             if (lifetime == 0)
> +                                     num_entries--;
> +                     } else if (lifetime) {
> +                             /* no existing entry. make one. */
> +                             struct rdns6_info *nentry =
> +                                     rdns6_create_entry(addr, lifetime);
> +                             /* make room if we must (and if we can) */
> +                             if (num_entries >= __RDNS6_MAX_ENTRIES) {
> +                                     /* see if we can expire an entry */
> +                                     if (rdns6_expire_worse(rt, nentry))
> +                                             num_entries--;
> +                             }
> +                             /* if we have room now, add an entry. */
> +                             if (num_entries < __RDNS6_MAX_ENTRIES) {
> +                                     nentry->next = *insert_point;
> +                                     *insert_point = nentry;
> +                                     insert_point = &(nentry->next);
> +                                     changed = true;
> +                                     num_entries++;
> +                             }
> +                     }
> +             }
> +     }
> +     /* okay, we're done looking at this batch of options. */
> +     /* find earliest expiration time */
> +     for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) {
> +             if ((*p)->lifetime != INFINITY_LIFE_TIME) {
> +                     if (dont_need_expires ||
> +                         time_before((*p)->expires, next_expiry)) {
> +                             next_expiry = (*p)->expires;
> +                             dont_need_expires = false;
> +                     }
> +             }
> +     }
> +     /* reset expiration timer */
> +     if (dont_need_expires)
> +             del_timer(&rdns6_chk_timer);
> +     else
> +             mod_timer(&rdns6_chk_timer, next_expiry);
> +
> +     /* notify userland if our DNS list has changed */
> +     if (changed)
> +             inet6_ifinfo_notify(RTM_NEWLINK, dev);
> +
> +     /* DEBUGGING */
> +     printk(KERN_WARNING "RDNSS RA from gateway 
> %x:%x:%x:%x:%x:%x:%x:%x\n",
> +            ntohs(rt->rt6i_gateway.s6_addr16[0]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[1]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[2]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[3]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[4]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[5]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[6]),
> +            ntohs(rt->rt6i_gateway.s6_addr16[7]));
> +     for (p = &(rt->rt6i_rdnss); *p != NULL; p = &((*p)->next)) {
> +             printk(KERN_WARNING " - %x:%x:%x:%x:%x:%x:%x:%x "
> +                    "(lifetime %d)\n",
> +                    ntohs((*p)->rdnss.s6_addr16[0]),
> +                    ntohs((*p)->rdnss.s6_addr16[1]),
> +                    ntohs((*p)->rdnss.s6_addr16[2]),
> +                    ntohs((*p)->rdnss.s6_addr16[3]),
> +                    ntohs((*p)->rdnss.s6_addr16[4]),
> +                    ntohs((*p)->rdnss.s6_addr16[5]),
> +                    ntohs((*p)->rdnss.s6_addr16[6]),
> +                    ntohs((*p)->rdnss.s6_addr16[7]),
> +                    (*p)->lifetime);
> +     }
> +}
> +
> +static void rdns6_expire(unsigned long _ignore) {
> +     struct rdns6_info **p;
> +     unsigned long now;
> +     
> +     //spin_lock_bh(&rdns6_expire_lock);
> +     now = jiffies;
> +
> +     del_timer(&rdns6_chk_timer);
> +
> +     /* find expired DNS entries & delete them */
> +     for (p = &(rt->rt6i_rdnss); *p != NULL; ) {
> +             if (time_before((*p)->expires, now)) {
> +                     struct rdns6_info *r6i = (*p);
> +                     *p = (*p)->next;
> +                     kfree(r6i);
> +                     continue;
> +             }
> +             p = &((*p)->next);
> +     }
> +     /* reset */
> +     add_timer(&rdns6_chk_timer);
> +}
> +/**
> + notes on draft:
> +   server list should be kept per-router so that the resolv.conf doesn't
> +   ping-pong when two routers are broadcasting RAs.
> +
> +   DNS timeout: like RA, router is responsible for broadcasting w/
> +   time < timeout.  What if about to expire?  Can/should give RS?
> +
> +   use fib6_clean_all to implement rdns6_expire?  this will walk all routes.
> +   (maybe overkill)
> +
> +   bug: how to lock rt6_info while we're mutating dns entries?
> +
> +   bug: how to update timer appropriately; when we modify one rt6_info,
> +   we don't want to scan all.  so only shorten timer (which means sometimes
> +   we'll trigger timer when it's not needed).  At some point
> +   we need to del_timer (when?)
> +
> +   bug: when route is deleted (RA times out?) we need to free the
> +   DNS server list.  (can't find where the rt6_info is deallocated?)
> +
> +   bug: use expire_lock to ensure we don't run expiry multiple times
> +   concurrently.
> +
> +   xxx: implement appropriate fill message to export the server list
> +   via netlink.
> +
> +   xxx: use round_jiffies?
> +*/
> diff -ruHpN -X dontdiff linux-2.6.22-rc5-orig/net/ipv6/ndisc.c
> linux-2.6.22-rc5/net/ipv6/ndisc.c
> --- linux-2.6.22-rc5-orig/net/ipv6/ndisc.c    2007-06-16 
> 22:09:12.000000000 -0400
> +++ linux-2.6.22-rc5/net/ipv6/ndisc.c 2007-06-19 16:02:36.000000000 -0400
> @@ -15,6 +15,8 @@
> /*
>  *    Changes:
>  *
> + *   C. Scott Ananian                :       RDNSS-in-RA support.
> + *
>  *    Lars Fenneberg                  :       fixed MTU setting on receipt
>  *                                            of an RA.
>  *
> @@ -75,6 +77,7 @@
> #include <net/protocol.h>
> #include <net/ndisc.h>
> #include <net/ip6_route.h>
> +#include <net/ip6_rdnss.h>
> #include <net/addrconf.h>
> #include <net/icmp.h>
> 
> @@ -155,12 +158,16 @@ struct neigh_table nd_tbl = {
> };
> 
> /* ND options */
> +#define __ND_OPT_RDNSS_MAX   6 /* 3 new servers + 3 cancellations */
> +
> struct ndisc_options {
>       struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
> #ifdef CONFIG_IPV6_ROUTE_INFO
>       struct nd_opt_hdr *nd_opts_ri;
>       struct nd_opt_hdr *nd_opts_ri_end;
> #endif
> +     int                nd_opts_rdnss_cnt;
> +     struct nd_opt_hdr *nd_opts_rdnss[__ND_OPT_RDNSS_MAX];
> };
> 
> #define nd_opts_src_lladdr    nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
> @@ -266,6 +273,12 @@ static struct ndisc_options *ndisc_parse
>                               ndopts->nd_opts_ri = nd_opt;
>                       break;
> #endif
> +             case ND_OPT_RDNSS_INFO:
> +                     /* limit # of RDNSS options accepted to prevent DoS 
> */
> +                     if (ndopts->nd_opts_rdnss_cnt < __ND_OPT_RDNSS_MAX)
> +                             ndopts->nd_opts_rdnss
> +                                     [ndopts->nd_opts_rdnss_cnt++]= 
> nd_opt;
> +                     break;
>               default:
>                       /*
>                        * Unknown options must be silently ignored,
> @@ -1045,7 +1058,36 @@ static void ndisc_router_discovery(struc
>       /*
>        * Remember the managed/otherconf flags from most recently
>        * received RA message (RFC 2462) -- yoshfuji
>        */
> +     /* From RFC2462, section 5.5.3:
> +       On receipt of a valid Router Advertisement (as defined in
> +   [DISCOVERY]), a host copies the value of the advertisement's M bit
> +   into ManagedFlag. If the value of ManagedFlag changes from FALSE to
> +   TRUE, and the host is not already running the stateful address
> +   autoconfiguration protocol, the host should invoke the stateful
> +   address autoconfiguration protocol, requesting both address
> +   information and other information.  If the value of the ManagedFlag
> +   changes from TRUE to FALSE, the host should continue running the
> +   stateful address autoconfiguration, i.e., the change in the value of
> +   the ManagedFlag has no effect.  If the value of the flag stays
> +   unchanged, no special action takes place. In particular, a host MUST
> +   NOT reinvoke stateful address configuration if it is already
> +   participating in the stateful protocol as a result of an earlier
> +   advertisement.
> +
> +   An advertisement's O flag field is processed in an analogous manner.
> +   A host copies the value of the O flag into OtherConfigFlag. If the
> +   value of OtherConfigFlag changes from FALSE to TRUE, the host should
> +   invoke the stateful autoconfiguration protocol, requesting
> +   information (excluding addresses if ManagedFlag is set to FALSE).  If
> +   the value of the OtherConfigFlag changes from TRUE to FALSE, the host
> +   should continue running the stateful address autoconfiguration
> +   protocol, i.e., the change in the value of OtherConfigFlag has no
> +   effect. If the value of the flag stays unchanged, no special action
> +   takes place. In particular, a host MUST NOT reinvoke stateful
> +   configuration if it is already participating in the stateful protocol
> +   as a result of an earlier advertisement.
> +     */
>       in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
>                               IF_RA_OTHERCONF)) |
>                               (ra_msg->icmph.icmp6_addrconf_managed ?
> @@ -1187,6 +1232,12 @@ skip_defrtr:
>       }
> #endif
> 
> +     if (rt && ndopts.nd_opts_rdnss_cnt) {
> +             rdns6_rcv(in6_dev, rt,
> +                       (struct nd_opt_rdnss **) ndopts.nd_opts_rdnss,
> +                       ndopts.nd_opts_rdnss_cnt);
> +     }
> +
>       if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
>               struct nd_opt_hdr *p;
>               for (p = ndopts.nd_opts_pi;
> 
> --
>                         ( http://cscott.net/ )
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to