Seems to me that the bug is in function
  bdi_position_ratio()
within file
  mm/page-writeback.c
The internal variable declaration is
  long long pos_ratio;
and its calculation overflows. Maybe changing the declaration to
u64 would help. But pos_ratio is also used as the return value without
any bounds checks, even though the function is declared to return
unsigned long.

I do not yet understand what bdi_position_ratio() is meant to do, so
I cannot yet offer patches.

---

What I did:

I added many lines like
  BUG_ON(pos_ratio<0);
to the kernel sources. Running that kernel and creating my files with
  n=0; while [ $n -lt 99 ]; do dd bs=1M count=1024 if=/dev/zero of=x$n; (( n = $n + 1 )); done &
I got the following after about 15 files had been created:
/bin/bash: line 1:  2755 Segmentation fault      dd bs=1M count=1024 if=/dev/zero of=x$n
Message from syslogd@zeno at Sat Dec 15 19:46:37 2012 ...
zeno kernel: ------------[ cut here ]------------
zeno kernel: invalid opcode: 0000 [#1] SMP 
...
and in the logs:

Dec 15 19:46:37 zeno kernel: ------------[ cut here ]------------
Dec 15 19:46:37 zeno kernel: kernel BUG at mm/page-writeback.c:569!
Dec 15 19:46:37 zeno kernel: invalid opcode: 0000 [#1] SMP 
Dec 15 19:46:37 zeno kernel: Modules linked in: nfsd exportfs quota_v2 
quota_tree fuse joydev usb_storage coretemp crc32c_intel aesni_intel sg cryptd 
sr_mod aes_i586 aes_generic 8250_pnp evdev i2c_i801 8250 serial_core processor 
thermal_sys button
Dec 15 19:46:37 zeno kernel: 
Dec 15 19:46:37 zeno kernel: Pid: 2755, comm: dd Not tainted 
3.2.32-pk06.08-i386t02 #1 Supermicro X9DR3-F/X9DR3-F
Dec 15 19:46:37 zeno kernel: EIP: 0060:[<c107bf30>] EFLAGS: 00010282 CPU: 0
Dec 15 19:46:37 zeno kernel: EIP is at bdi_position_ratio.isra.16+0x220/0x230
Dec 15 19:46:37 zeno kernel: EAX: fffaadbc EBX: 00000524 ECX: fffaadbc EDX: 
760dae6b
Dec 15 19:46:37 zeno kernel: ESI: 00000524 EDI: ea673c18 EBP: d6235d2c ESP: 
d6235d00
Dec 15 19:46:37 zeno kernel:  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Dec 15 19:46:37 zeno kernel: Process dd (pid: 2755, ti=d6234000 task=d607bb10 
task.ti=d6234000)
Dec 15 19:46:37 zeno kernel: Stack:
Dec 15 19:46:37 zeno kernel:  d6235d1c 000280cd 33b036ce 00000098 0000047f 
760db26b fffaadbc 0000007a
Dec 15 19:46:37 zeno kernel:  00000004 00000546 d5de809c d6235db0 c107c963 
000004f1 00000523 00000546
Dec 15 19:46:37 zeno kernel:  00000000 00140669 00000007 00000000 d5de80bc 
00032afc 00000000 d5e83800
Dec 15 19:46:37 zeno kernel: Call Trace:
Dec 15 19:46:37 zeno kernel:  [<c107c963>] 
balance_dirty_pages_ratelimited_nr+0x253/0x520
Dec 15 19:46:37 zeno kernel:  [<c10747cf>] 
generic_file_buffered_write+0x16f/0x210
Dec 15 19:46:37 zeno kernel:  [<c1075f7d>] __generic_file_aio_write+0x24d/0x4b0
Dec 15 19:46:37 zeno kernel:  [<c1076240>] generic_file_aio_write+0x60/0xc0
Dec 15 19:46:37 zeno kernel:  [<c10a2fa7>] do_sync_write+0xb7/0xf0
Dec 15 19:46:37 zeno kernel:  [<c1036455>] ? irq_exit+0x55/0x60
Dec 15 19:46:37 zeno kernel:  [<c10a2ef0>] ? wait_on_retry_sync_kiocb+0x50/0x50
Dec 15 19:46:37 zeno kernel:  [<c10a3aa7>] vfs_write+0x87/0x170
Dec 15 19:46:37 zeno kernel:  [<c10a2ef0>] ? wait_on_retry_sync_kiocb+0x50/0x50
Dec 15 19:46:37 zeno kernel:  [<c10a3da8>] sys_write+0x38/0x70
Dec 15 19:46:37 zeno kernel:  [<c160fd14>] sysenter_do_call+0x12/0x26
Dec 15 19:46:37 zeno kernel: Code: 55 ff ff ff 0f 0b 90 8d 74 26 00 0f a4 cb 03 
c1 e1 03 e9 74 ff ff ff 8d 74 26 00 89 d0 31 d2 f7 75 10 89 c6 e9 59 ff ff ff 
0f 0b <0f> 0b 0f 0b 0f 0b 31 c0 e9 5d ff ff ff 8d 76 00 55 89 e5 83 ec 
Dec 15 19:46:37 zeno kernel: EIP: [<c107bf30>] 
bdi_position_ratio.isra.16+0x220/0x230 SS:ESP 0068:d6235d00
Dec 15 19:46:37 zeno kernel: ---[ end trace c9c79e2ba8a36130 ]---

Relevant part of file  mm/page-writeback.c :

   525  static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
   526                                          unsigned long thresh,
   527                                          unsigned long bg_thresh,
   528                                          unsigned long dirty,
   529                                          unsigned long bdi_thresh,
   530                                          unsigned long bdi_dirty)
   531  {
   532          unsigned long write_bw = bdi->avg_write_bandwidth;
   533          unsigned long freerun = dirty_freerun_ceiling(thresh, 
bg_thresh);
   534          unsigned long limit = hard_dirty_limit(thresh);
   535          unsigned long x_intercept;
   536          unsigned long setpoint;         /* dirty pages' target balance 
point */
   537          unsigned long bdi_setpoint;
   538          unsigned long span;
   539          long long pos_ratio;            /* for scaling up/down the rate 
limit */
   540          long x;
   541  
   542          if (unlikely(dirty >= limit))
   543                  return 0;
   544  
   545          /*
   546           * global setpoint
   547           *
   548           *                           setpoint - dirty 3
   549           *        f(dirty) := 1.0 + (----------------)
   550           *                           limit - setpoint
   551           *
   552           * it's a 3rd order polynomial that subjects to
   553           *
   554           * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably 
fast
   555           * (2) f(setpoint) = 1.0 => the balance point
   556           * (3) f(limit)    = 0   => the hard limit
   557           * (4) df/dx      <= 0   => negative feedback control
   558           * (5) the closer to setpoint, the smaller |df/dx| (and the 
reverse)
   559           *     => fast response on large errors; small oscillation near 
setpoint
   560           */
   561          setpoint = (freerun + limit) / 2;
   562          x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
   563                      limit - setpoint + 1);
   564  BUG_ON(x<0);
   565          pos_ratio = x;
   566          pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
   567          pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
   568          pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
   569  BUG_ON(pos_ratio<0);
   570  
   571          /*
   572           * We have computed basic pos_ratio above based on global 
situation. If
   573           * the bdi is over/under its share of dirty pages, we want to 
scale
   574           * pos_ratio further down/up. That is done by the following 
mechanism.
   575           */
   576  
   577          /*
   578           * bdi setpoint
   579           *
   580           *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
   581           *
   582           *                        x_intercept - bdi_dirty
   583           *                     := --------------------------
   584           *                        x_intercept - bdi_setpoint
   585           *
   586           * The main bdi control line is a linear function that subjects 
to
   587           *
   588           * (1) f(bdi_setpoint) = 1.0
   589           * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
   590           *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
   591           *
   592           * For single bdi case, the dirty pages are observed to 
fluctuate
   593           * regularly within range
   594           *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
   595           * for various filesystems, where (2) can yield in a reasonable 
12.5%
   596           * fluctuation range for pos_ratio.
   597           *
   598           * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate 
up to its
   599           * own size, so move the slope over accordingly and choose a 
slope that
   600           * yields 100% pos_ratio fluctuation on suddenly doubled 
bdi_thresh.
   601           */
   602          if (unlikely(bdi_thresh > thresh))
   603                  bdi_thresh = thresh;
   604          /*
   605           * It's very possible that bdi_thresh is close to 0 not because 
the
   606           * device is slow, but that it has remained inactive for long 
time.
   607           * Honour such devices a reasonable good (hopefully IO 
efficient)
   608           * threshold, so that the occasional writes won't be blocked 
and active
   609           * writes can rampup the threshold quickly.
   610           */
   611          bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
   612          /*
   613           * scale global setpoint to bdi's:
   614           *      bdi_setpoint = setpoint * bdi_thresh / thresh
   615           */
   616          x = div_u64((u64)bdi_thresh << 16, thresh + 1);
   617  BUG_ON(x<0);
   618          bdi_setpoint = setpoint * (u64)x >> 16;
   619          /*
   620           * Use span=(8*write_bw) in single bdi case as indicated by
   621           * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD 
case.
   622           *
   623           *        bdi_thresh                    thresh - bdi_thresh
   624           * span = ---------- * (8 * write_bw) + ------------------- * 
bdi_thresh
   625           *          thresh                            thresh
   626           */
   627          span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
   628          x_intercept = bdi_setpoint + span;
   629  
   630          if (bdi_dirty < x_intercept - span / 4) {
   631                  pos_ratio = div_u64(pos_ratio * (x_intercept - 
bdi_dirty),
   632                                      x_intercept - bdi_setpoint + 1);
   633          } else
   634                  pos_ratio /= 4;
   635  BUG_ON(pos_ratio<0);
   636  
   637          /*
   638           * bdi reserve area, safeguard against dirty pool underrun and 
disk idle
   639           * It may push the desired control point of global dirty pages 
higher
   640           * than setpoint.
   641           */
   642          x_intercept = bdi_thresh / 2;
   643          if (bdi_dirty < x_intercept) {
   644                  if (bdi_dirty > x_intercept / 8)
   645                          pos_ratio = div_u64(pos_ratio * x_intercept, 
bdi_dirty);
   646                  else
   647                          pos_ratio *= 8;
   648          }
   649  BUG_ON(pos_ratio<0);
   650  
   651          return pos_ratio;
   652  }

Cheers, Paul

Paul Szabo   p...@maths.usyd.edu.au   http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics   University of Sydney    Australia


-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org

Reply via email to