Hi all,

When the periodic route cache flush is done, this machine suffers a lot and eventually triggers the "soft lockup" message.
dst_run_gc() scans a possibly huge list of dst_entries, eventually freeing some of them (less than 1%), but it holds the dst_lock spinlock for the whole scan. It then rearms a timer to redo the full scan 1/10 s later. The slowdown can last one minute or so, and needless to say the machine is not really usable in the meantime.

I already looked at the code and am testing a patch (included at the end of this mail) that limits the number of entries scanned at each dst_run_gc() call and does not hold dst_lock during the scan. But then I don't know how to address the dst_dev_event() problem, since we must be able to dst_ifdown() all entries attached to one dev. I am wondering how to really solve the problem. Could a workqueue be used here instead of a timer? (A rough sketch of that idea follows the patch.)

Aug 16 06:21:37 SRV1 kernel: BUG: soft lockup detected on CPU#0!
Aug 16 06:21:37 SRV1 kernel:
Aug 16 06:21:37 SRV1 kernel: Call Trace:
Aug 16 06:21:37 SRV1 kernel:  <IRQ>  [<ffffffff802286f0>] wake_up_process+0x10/0x20
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80251e09>] softlockup_tick+0xe9/0x110
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803cd380>] dst_run_gc+0x0/0x140
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff802376f3>] run_local_timers+0x13/0x20
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff802379c7>] update_process_times+0x57/0x90
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80216034>] smp_local_timer_interrupt+0x34/0x60
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff802165cc>] smp_apic_timer_interrupt+0x5c/0x80
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff8020a816>] apic_timer_interrupt+0x66/0x70
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803cd3d3>] dst_run_gc+0x53/0x140
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803cd3c6>] dst_run_gc+0x46/0x140
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80237148>] run_timer_softirq+0x148/0x1c0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff8023340c>] __do_softirq+0x6c/0xe0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff8020ad6c>] call_softirq+0x1c/0x30
Aug 16 06:21:37 SRV1 kernel:  <EOI>  [<ffffffff8020cb34>] do_softirq+0x34/0x90
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff802331cf>] local_bh_enable_ip+0x3f/0x60
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80422913>] _spin_unlock_bh+0x13/0x20
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803dfde8>] rt_garbage_collect+0x1d8/0x320
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803cd4dd>] dst_alloc+0x1d/0xa0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803e1433>] __ip_route_output_key+0x573/0x800
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803c02e2>] sock_common_recvmsg+0x32/0x50
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803e16dc>] ip_route_output_flow+0x1c/0x60
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80400160>] tcp_v4_connect+0x150/0x610
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803ebf07>] inet_bind_bucket_create+0x17/0x60
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff8040cd16>] inet_stream_connect+0xa6/0x2c0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803c0bdf>] lock_sock_nested+0xcf/0xe0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803be551>] sys_connect+0x71/0xa0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803eee3f>] tcp_setsockopt+0x1f/0x30
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803c030f>] sock_common_setsockopt+0xf/0x20
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff803be4bd>] sys_setsockopt+0x9d/0xc0
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff8028881e>] sys_ioctl+0x5e/0x80
Aug 16 06:21:37 SRV1 kernel:  [<ffffffff80209c4e>] system_call+0x7e/0x83

Preliminary patch:

--- linux-2.6.22/net/core/dst.c
+++ linux-2.6.22-ed/net/core/dst.c
@@ -28,6 +28,9 @@
  * 4) All operations modify state, so a spinlock is used.
  */
 static struct dst_entry *dst_garbage_list;
+static struct dst_entry *dst_garbage_wrk;
+static struct dst_entry *dst_garbage_inuse;
+static int dst_gc_running;
 #if RT_CACHE_DEBUG >= 2
 static atomic_t dst_total = ATOMIC_INIT(0);
 #endif
@@ -42,26 +45,42 @@ static void dst_run_gc(unsigned long dummy)
 {
-	int delayed = 0;
-	int work_performed;
-	struct dst_entry * dst, **dstp;
-
-	if (!spin_trylock(&dst_lock)) {
-		mod_timer(&dst_gc_timer, jiffies + HZ/10);
-		return;
-	}
-
+	int quota = 1000;
+	int work_performed = 0;
+	struct dst_entry *dst, *next;
+	struct dst_entry *first = NULL, *last = NULL;
+#if RT_CACHE_DEBUG >= 2
+	unsigned long t0 = jiffies;
+#endif
+	spin_lock(&dst_lock);
+	if (dst_gc_running)
+		goto out;
 	del_timer(&dst_gc_timer);
-	dstp = &dst_garbage_list;
-	work_performed = 0;
-	while ((dst = *dstp) != NULL) {
-		if (atomic_read(&dst->__refcnt)) {
-			dstp = &dst->next;
-			delayed++;
+	if (!dst_garbage_wrk) {
+		dst_garbage_wrk = dst_garbage_list;
+		dst_garbage_list = dst_garbage_inuse;
+		dst_garbage_inuse = NULL;
+	}
+	/*
+	 * before releasing dst_lock, tell others we are currently running
+	 * so they wont start timer or mess dst_garbage_wrk
+	 */
+	dst_gc_running = 1;
+	next = dst_garbage_wrk;
+	spin_unlock(&dst_lock);
+
+	while ((dst = next) != NULL) {
+		next = dst->next;
+		if (likely(atomic_read(&dst->__refcnt))) {
+			if (unlikely(last == NULL))
+				last = dst;
+			dst->next = first;
+			first = dst;
+			if (--quota == 0)
+				break;
 			continue;
 		}
-		*dstp = dst->next;
-		work_performed = 1;
+		work_performed++;

 		dst = dst_destroy(dst);
 		if (dst) {
@@ -77,16 +96,26 @@
 				continue;

 			___dst_free(dst);
-			dst->next = *dstp;
-			*dstp = dst;
-			dstp = &dst->next;
+			dst->next = next;
+			next = dst;
 		}
 	}
-	if (!dst_garbage_list) {
+	dst_garbage_wrk = next;
+
+	spin_lock(&dst_lock);
+	dst_gc_running = 0;
+	if (last != NULL) {
+		last->next = dst_garbage_inuse;
+		dst_garbage_inuse = first;
+	}
+	/*
+	 * If all lists are empty, no need to rearm a timer
+	 */
+	if (!dst_garbage_list && !dst_garbage_wrk && !dst_garbage_inuse) {
 		dst_gc_timer_inc = DST_GC_MAX;
 		goto out;
 	}
-	if (!work_performed) {
+	if (!work_performed && !dst_garbage_wrk) {
 		if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
 			dst_gc_timer_expires = DST_GC_MAX;
 		dst_gc_timer_inc += DST_GC_INC;
@@ -95,8 +124,12 @@
 		dst_gc_timer_expires = DST_GC_MIN;
 	}
 #if RT_CACHE_DEBUG >= 2
-	printk("dst_total: %d/%d %ld\n",
-	       atomic_read(&dst_total), delayed, dst_gc_timer_expires);
+	if (net_msg_warn)
+		printk(KERN_DEBUG "dst_run_gc: "
+			"dst_total=%d quota=%d perf=%d expires=%ld elapsed=%lu\n",
+			atomic_read(&dst_total),
+			quota, work_performed,
+			dst_gc_timer_expires, jiffies - t0);
 #endif
 	/* if the next desired timer is more than 4 seconds in the future
 	 * then round the timer to whole seconds
@@ -157,7 +190,7 @@
 	___dst_free(dst);
 	dst->next = dst_garbage_list;
 	dst_garbage_list = dst;
-	if (dst_gc_timer_inc > DST_GC_INC) {
+	if (dst_gc_timer_inc > DST_GC_INC && !dst_gc_running) {
 		dst_gc_timer_inc = DST_GC_INC;
 		dst_gc_timer_expires = DST_GC_MIN;
 		mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
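To make the workqueue question more concrete, here is a rough, untested sketch of what I have in mind (not a patch): everything below (dst_gc_task, dst_gc_work, the quota value of 1000) is invented for illustration, __dst_free() would have to schedule the work instead of arming dst_gc_timer, and this alone does not solve the dst_dev_event() problem.

/*
 * Rough sketch only: run the dst garbage scan from a delayed work
 * (process context) instead of a timer (softirq context), so we can
 * drop dst_lock while walking the list and cond_resched() between
 * batches. Needs <linux/workqueue.h>.
 */
static void dst_gc_task(struct work_struct *unused);
static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);

static void dst_gc_task(struct work_struct *unused)
{
	struct dst_entry *dst, *next;
	int quota = 1000;	/* arbitrary batch size */

	/* grab the whole pending list, then walk it without dst_lock */
	spin_lock_bh(&dst_lock);
	next = dst_garbage_list;
	dst_garbage_list = NULL;
	spin_unlock_bh(&dst_lock);

	while ((dst = next) != NULL) {
		next = dst->next;

		if (atomic_read(&dst->__refcnt)) {
			/* still in use: give it back for a later pass */
			spin_lock_bh(&dst_lock);
			dst->next = dst_garbage_list;
			dst_garbage_list = dst;
			spin_unlock_bh(&dst_lock);
		} else {
			dst = dst_destroy(dst);
			if (dst) {
				/* NOHASH and still referenced, as in dst_run_gc() */
				if (dst->obsolete > 1)
					continue;
				___dst_free(dst);
				dst->next = next;
				next = dst;
			}
		}

		/* process context: we may yield the cpu between batches */
		if (--quota <= 0) {
			cond_resched();
			quota = 1000;
		}
	}

	spin_lock_bh(&dst_lock);
	if (dst_garbage_list)
		schedule_delayed_work(&dst_gc_work, DST_GC_MIN);
	spin_unlock_bh(&dst_lock);
}

For dst_dev_event(), the notifier could perhaps flush_scheduled_work() and then walk dst_garbage_list under dst_lock as it does today before calling dst_ifdown() on every entry, but that is exactly the part I am not sure about, hence the question.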