Hi, My company has also started having what appears to be the same problem, since we upgraded our embedded system to linux kernel 3.16.
I tried applying the suggested fix of READ_ONCE (and also had to add in the necessary code to compiler.h as 3.16 didn't have it) and unfortunately it did not fix the issue at all. Unfortunately we do not have an easy reproduction method, and do not know precisely what is going on in the system when the issue occurs. We know it is a multicast UDP packet but that is about it. For us, the crash happens during a critical stage in our system initialisation, making additional debugging and instrumentation difficult. Our reproduction rate is approximately 1 out of 100 test runs; testing overnight we will usually see 3-5 instances of the crash happening. All our attempts to increase the reproduction rate, or reproduce the issue in a simpler/more controlled way have failed. Because we have customised the linux kernel, in some places radically, we assumed this was just a problem only we were seeing, so we were trying to fix it ourselves. Now that this appears to be a generic problem upstream, we've simply disabled UDP early demux in our system (since it's a new optimisation that we have lived without up till now) and will wait for this issue to be fixed upstream instead. So I'm sharing the debug patch I've written to help gather data on what is going on in the system, and some of the output we've gotten from the debug, in case this is useful for anyone else who is seeing this problem or would like to try and fix it. Feel free to ask questions, I'm not sure how much help I can be but will do my best. We'll be happy to assist in testing any proposed fixes. I also have some more examples of kernel oops and debug output if that could be useful, although the debug is from earlier iterations of the patch so that historical output is not as detailed as the output generated by the latest version of the patch attached here. Thanks, Greg Hoggarth
From: Gregory Hoggarth <gregory.hogga...@alliedtelesis.co.nz> dst tracking debug patch The debugging works by adding poisoned function pointers to a dst structure when it is cleaned up by the garbage collector. Then, at several points where these pointers are de-referenced, check if they hold the poisoned value. If so, it means the dst that is trying to be referenced has gone through the garbage collector, ie a 'use-after-free' scenario. Instead of letting the system access the garbage pointer and crash, pre-empty the de-reference and panic allowing us to print out the debug information. At many places in the code where dst structures are manipulated, add tracking information about the current dst structure to an external array that can store 100 dst structures. This method of storing the data is not actually very safe, as each new dst structure allocated will consume a slot in the array. If many dst structures are created quickly, this will quickly wrap-around and destroy valid data. A linked-list implementation would be better. --- include/net/dst.h | 145 +++++++++++++++++++++++++++++++++++++++++++++++++--- include/net/sock.h | 2 + net/core/dst.c | 109 +++++++++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 2 + 4 files changed, 248 insertions(+), 10 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 273c15b..f746999 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -103,8 +103,106 @@ struct dst_entry { struct rt6_info *rt6_next; struct dn_route __rcu *dn_next; }; + + // dst tracking ID + int track_id; +}; + +#include <linux/fs.h> +#include <asm/uaccess.h> +#include <linux/buffer_head.h> +#include <linux/time.h> + +#define DST_LIST_SZ 25 +extern int dst_latest; +static char dst_filename[50]; + +struct dst_track { + unsigned long address[DST_LIST_SZ]; + unsigned int action[DST_LIST_SZ]; + unsigned int refCount[DST_LIST_SZ]; + struct timespec tvs[DST_LIST_SZ]; + + unsigned int index; + unsigned int reuseMagic; }; +extern struct dst_track track_array[100]; + +static char *dst_list_action[16] = +{ + "Free", "Hold", "Clon", "Aloc", "AlNr", + "ReAl", "ReNr", "Grbg", "NUse", "NoRf", + "Chld", "Zero", "SkBf", "Sach", "Gotn", + "NSwp" +}; + +struct file* file_open1(const char* path, int flags, int rights); +void file_close1(struct file* file); +int file_write1(struct file* file, unsigned long long offset, unsigned char* data, unsigned int size); + +#define DST_TRACKING(TYPE) \ +do { \ + struct dst_track *track = &track_array[dst->track_id]; \ + int index = track->index; \ + if (index < DST_LIST_SZ) \ + { \ + track->address[index] = \ + (unsigned long) __builtin_return_address(0); \ + track->action[index] = (TYPE); \ + track->refCount[index] = dst->__refcnt.counter; \ + getrawmonotonic (&track->tvs[index]); \ + track->index++; \ + } \ +} while (0) + +#define DST_PANIC \ +do { \ + int k, i, len = 0; \ + struct dst_track *track; \ + char *buf = kmalloc(10240, GFP_ATOMIC); \ + struct file* file; \ + unsigned long long pos = 0; \ + struct timeval now; \ + do_gettimeofday(&now); \ + sprintf (dst_filename, "/flash/dst-%d.log", (int) now.tv_sec); \ + k = dst->track_id; \ + file = file_open1 (dst_filename, O_RDWR | O_CREAT | O_APPEND, 0644); \ + \ + if (file == NULL) \ + panic ("file pointer %p", file); \ + \ + pr_emerg("Used/freed dst %d 0x%pK %s %d ", k, dst, \ + __FUNCTION__, __LINE__); \ + for (track = &track_array[k]; k != dst->track_id +1; k--) \ + { \ + len = sprintf (buf, "\n\n== dst %d ==\n", k); \ + pr_emerg ("%s", buf); \ + file_write1 (file, pos, buf, len); \ + pos += len; \ + for (i=0; i< track->index; i++) \ + { \ + len = sprintf (buf, "%d.%ld: r%d %s %p %pS\n", \ + (int) track->tvs[i].tv_sec % 1000, \ + track->tvs[i].tv_nsec,track->refCount[i], \ + dst_list_action[track->action[i]], \ + (void *)track->address[i], \ + (void *)track->address[i]); \ + \ + pr_emerg ("%s", buf); \ + file_write1 (file, pos, buf, len); \ + pos += len; \ + } \ + /* Wrap around */ \ + if (k == 0) \ + { \ + k = 100; \ + } \ + } \ + file_close1(file); \ + panic("Panic!!"); \ +} while (0) + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); extern const u32 dst_default_metrics[]; @@ -238,15 +336,25 @@ dst_metric_locked(const struct dst_entry *dst, int metric) return dst_metric(dst, RTAX_LOCK) & (1<<metric); } -static inline void dst_hold(struct dst_entry *dst) -{ - /* - * If your kernel compilation stops here, please check - * __pad_to_align_refcnt declaration in struct dst_entry - */ - BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); - atomic_inc(&dst->__refcnt); -} +void dst_hold(struct dst_entry *dst); + +// Replace with a regular function instead of using this static inline one. +// +//static inline void dst_hold(struct dst_entry *dst) +//{ +// /* +// * If your kernel compilation stops here, please check +// * __pad_to_align_refcnt declaration in struct dst_entry +// */ +// BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); +// atomic_inc(&dst->__refcnt); +// if ( dst->useListIndx < DST_LIST_SZ) +// { +// *(dst->useList + dst->useListIndx) = (unsigned long ) __builtin_return_address(0); +// *(dst->actionList + dst->useListIndx) = 1; +// dst->useListIndx++; +// } +//} static inline void dst_use(struct dst_entry *dst, unsigned long time) { @@ -259,12 +367,17 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { dst->__use++; dst->lastuse = time; + DST_TRACKING (8); } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) + { atomic_inc(&dst->__refcnt); + DST_TRACKING (2); + } + return dst; } @@ -392,6 +505,7 @@ static inline void dst_free(struct dst_entry *dst) if (dst->obsolete > 0) return; if (!atomic_read(&dst->__refcnt)) { + DST_TRACKING (7); dst = dst_destroy(dst); if (!dst) return; @@ -465,6 +579,10 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) /* Output packet to network from transport. */ static inline int dst_output_sk(struct sock *sk, struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + if ((int) dst->output == 0xCAFEBABE) + DST_PANIC; + return skb_dst(skb)->output(sk, skb); } static inline int dst_output(struct sk_buff *skb) @@ -475,13 +593,22 @@ static inline int dst_output(struct sk_buff *skb) /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + + if ((int) dst->input == 0xDEADBEEF) + DST_PANIC; + return skb_dst(skb)->input(skb); } static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (dst->obsolete) + { + if((int) dst->ops == 0xFACE1357) + DST_PANIC; dst = dst->ops->check(dst, cookie); + } return dst; } diff --git a/include/net/sock.h b/include/net/sock.h index 715c4b0..4676d9b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1731,6 +1731,8 @@ sk_dst_get(struct sock *sk) dst = rcu_dereference(sk->sk_dst_cache); if (dst && !atomic_inc_not_zero(&dst->__refcnt)) dst = NULL; + else + DST_TRACKING (14); rcu_read_unlock(); return dst; } diff --git a/net/core/dst.c b/net/core/dst.c index 7d0e962..6ad2470 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -23,6 +23,47 @@ #include <net/dst.h> +struct dst_track track_array[100]; +int dst_latest = 0; + +EXPORT_SYMBOL(track_array); +EXPORT_SYMBOL(dst_latest); + +struct file* file_open1(const char* path, int flags, int rights) { + struct file* filp = NULL; + mm_segment_t oldfs; + int err = 0; + + oldfs = get_fs(); + set_fs(get_ds()); + filp = filp_open(path, flags, rights); + set_fs(oldfs); + printk ("NLARG file pointer is %p", filp); + if(IS_ERR(filp)) { + err = PTR_ERR(filp); + return NULL; + } + return filp; +} + +void file_close1(struct file* file) { + filp_close(file, NULL); +} + +int file_write1(struct file* file, unsigned long long offset, unsigned char* data, unsigned int size) { + mm_segment_t oldfs; + int ret; + + oldfs = get_fs(); + set_fs(get_ds()); + + ret = vfs_write(file, data, size, &offset); + + set_fs(oldfs); + return ret; +} + + /* * Theory of operations: * 1) We use a list, protected by a spinlock, to add @@ -189,6 +230,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; + struct dst_track *track; if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) @@ -206,6 +248,42 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->expires = 0UL; dst->path = dst; dst->from = NULL; + + /* Work out what entry in the tracking array to use; wraps around at 100 + */ + dst->track_id = dst_latest; + track = &(track_array[dst_latest]); + dst_latest++; + if (dst_latest == 100) + dst_latest = 0; + memset (&track_array[dst_latest], 0, sizeof (struct dst_track)); + + /* This is messy. Not using DST_TRACKING because this does additional + * handling based on whether the dst is 'reused' or not. + */ + if (track->reuseMagic == 0xDEADBEEF) + { + if (track->index > DST_LIST_SZ) + track->index = 0; + } + else + track->index = 0; + + if (initial_ref > 0) + track->action[track->index] = 3; + else + track->action[track->index] = 5; + + if (track->reuseMagic == 0xDEADBEEF) + track->action[track->index] = track->action[track->index] +1; + + track->address[track->index] = (unsigned long) __builtin_return_address(0); + getrawmonotonic (&track->tvs[track->index]); + track->refCount[track->index] = dst->__refcnt.counter; + + track->index++; + track->reuseMagic = 0xDEADBEEF; + #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -265,6 +343,8 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); again: + DST_TRACKING (7); + child = dst->child; if (!(dst->flags & DST_NOCOUNT)) @@ -276,6 +356,12 @@ again: dev_put(dst->dev); kmem_cache_free(dst->ops->kmem_cachep, dst); + dst->input = (void *) 0xDEADBEEF; + dst->output = (void *) 0xCAFEBABE; + dst->ops = (void *) 0xFACE1357; + dst->path = (void *) 0xBEEF2468; + dst->from = (void *) 0xA0FACADE; + dst = child; if (dst) { int nohash = dst->flags & DST_NOHASH; @@ -283,7 +369,10 @@ again: if (atomic_dec_and_test(&dst->__refcnt)) { /* We were real parent of this dst, so kill child. */ if (nohash) + { + DST_TRACKING (10); goto again; + } } else { /* Child is still referenced, return it for freeing. */ if (nohash) @@ -310,9 +399,15 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); + if (newrefcnt < 0) + DST_PANIC; + DST_TRACKING (0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + { + DST_TRACKING (11); call_rcu(&dst->rcu_head, dst_destroy_rcu); + } } } EXPORT_SYMBOL(dst_release); @@ -373,6 +468,7 @@ void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) skb_dst_set(skb, dst); } else { skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + DST_TRACKING (9); } } EXPORT_SYMBOL(__skb_dst_set_noref); @@ -445,3 +541,14 @@ void __init dst_init(void) { register_netdevice_notifier(&dst_dev_notifier); } + +void dst_hold(struct dst_entry *dst) +{ + /* + * If your kernel compilation stops here, please check + * __pad_to_align_refcnt declaration in struct dst_entry + */ + BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); + atomic_inc(&dst->__refcnt); + DST_TRACKING (1); +} diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 55bf23f..cc5c691 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1777,6 +1777,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp_sk_rx_dst_set(sk, dst); + else + DST_TRACKING (15); ret = udp_queue_rcv_skb(sk, skb); sock_put(sk);
From: Gregory Hoggarth <gregory.hogga...@alliedtelesis.co.nz> dst READ_ONCE fix Suggested fix for udp early demux issue, for linux kernel 3.16. Did not resolve the issue in our testing --- include/linux/compiler.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 2 +- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d5ad7b1..f892c09 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -186,6 +186,64 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) #endif +#include <uapi/linux/types.h> + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8 *)res = *(volatile __u8 *)p; break; + case 2: *(__u16 *)res = *(volatile __u16 *)p; break; + case 4: *(__u32 *)res = *(volatile __u32 *)p; break; + case 8: *(__u64 *)res = *(volatile __u64 *)p; break; + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8 *)p = *(__u8 *)res; break; + case 2: *(volatile __u16 *)p = *(__u16 *)res; break; + case 4: *(volatile __u32 *)p = *(__u32 *)res; break; + case 8: *(volatile __u64 *)p = *(__u64 *)res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the + * compiler is aware of some particular ordering. One way to make the + * compiler aware of ordering is to put the two invocations of READ_ONCE, + * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * + * In contrast to ACCESS_ONCE these two macros will also work on aggregate + * data types like structs or unions. If the size of the accessed data + * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a + * compile-time warning. + * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ + +#define READ_ONCE(x) \ + ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) + +#define WRITE_ONCE(x, val) \ + ({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; }) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d5a866..55bf23f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1978,7 +1978,7 @@ void udp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; - dst = sk->sk_rx_dst; + dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0);
example debug output
Description: example debug output