You have been subscribed to a public bug: Problem Description: Migration Guest running with IO stress crashed@security_file_permission+0xf4/0x160 after a couple of migrations.
Steps to re-create: Source host - boslcp3 Destination host - boslcp4 1.boslcp3 & boslcp4 installed with latest kernel root@boslcp3:~# uname -a Linux boslcp3 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 2018 ppc64le ppc64le ppc64le GNU/Linux root@boslcp3:~# root@boslcp4:~# uname -a Linux boslcp4 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 2018 ppc64le ppc64le ppc64le GNU/Linux root@boslcp3:~# 2. Installed guest boslcp3g1 with kernel and started LTP run from boslcp3 host root@boslcp3g1:~# uname -a Linux boslcp3g1 4.15.0-15-generic #16+bug166877 SMP Wed Apr 18 14:47:30 CDT 2018 ppc64le ppc64le ppc64le GNU/Linux 3. Started migrating boslcp3g1 guest from source to destination & viceversa. 4. After couple of migrations it crashed at boslcp4 & enters into xmon 8:mon> t [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160 [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120 [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0 [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110 [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c --- Exception: c01 (System Call) at 000071f1779fe280 SP (7fffe99ece50) is in userspace 8:mon> S msr = 8000000000001033 sprg0 = 0000000000000000 pvr = 00000000004e1202 sprg1 = c000000007a85800 dec = 00000000591e3e03 sprg2 = c000000007a85800 sp = c0000004f8a234a0 sprg3 = 0000000000010008 toc = c0000000016eae00 dar = 000000000000023c srr0 = c0000000000c355c srr1 = 8000000000001033 dsisr = 40000000 dscr = 0000000000000000 ppr = 0010000000000000 pir = 00000011 amr = 0000000000000000 uamor = 0000000000000000 dpdes = 0000000000000000 tir = 0000000000000000 cir = 00000000 fscr = 0500000000000180 tar = 0000000000000000 pspb = 00000000 mmcr0 = 0000000080000000 mmcr1 = 0000000000000000 mmcr2 = 0000000000000000 pmc1 = 00000000 pmc2 = 00000000 pmc3 = 00000000 pmc4 = 00000000 mmcra = 0000000000000000 siar = 0000000000000000 pmc5 = 0000026c sdar = 0000000000000000 sier = 0000000000000000 pmc6 
= 00000861 ebbhr = 0000000000000000 ebbrr = 0000000000000000 bescr = 0000000000000000 iamr = 4000000000000000 pidr = 0000000000000034 tidr = 0000000000000000 cpu 0x8: Vector: 700 (Program Check) at [c0000004f8a23220] pc: c0000000000e4854: xmon_core+0x1f24/0x3520 lr: c0000000000e4850: xmon_core+0x1f20/0x3520 sp: c0000004f8a234a0 msr: 8000000000041033 current = 0xc0000004f89faf00 paca = 0xc000000007a85800 softe: 0 irq_happened: 0x01 pid = 24028, comm = top Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 (Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 4.15.0-20.21-generic 4.15.17) cpu 0x8: Exception 700 (Program Check) in xmon, returning to main loop [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160 [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120 [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0 [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110 [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c --- Exception: c01 (System Call) at 000071f1779fe280 SP (7fffe99ece50) is in userspace 8:mon> r R00 = c00000000043b7fc R16 = 0000000000000000 R01 = c0000004f8a23c90 R17 = ffffffffffffff70 R02 = c0000000016eae00 R18 = 00000a51b4bebfc8 R03 = c000000279557200 R19 = 00007fffe99edbb0 R04 = c0000003242499c0 R20 = 00000a51b4c04db0 R05 = 0000000000020000 R21 = 00000a51b4c20e90 R06 = 0000000000000004 R22 = 0000000000040f00 R07 = ffffff8100000000 R23 = 00000a51b4c06560 R08 = ffffff8000000000 R24 = ffffffffffffff80 R09 = 0000000000000000 R25 = 00000a51b4bec2b8 R10 = 0000000000000000 R26 = 000071f177bb0b20 R11 = 0000000000000000 R27 = 0000000000000000 R12 = 0000000000002000 R28 = c000000279557200 R13 = c000000007a85800 R29 = c0000004c7734210 R14 = 0000000000000000 R30 = 0000000000000000 R15 = 0000000000000000 R31 = c0000003242499c0 pc = c00000000043b808 __fsnotify_parent+0x88/0x1a0 cfar= c0000000003f9e78 dget_parent+0xe8/0x150 lr = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0 
msr = 8000000000009033 cr = 28002222 ctr = c0000000006252b0 xer = 0000000000000000 trap = 300 dar = 000000000000023c dsisr = 40000000 8:mon> e cpu 0x8: Vector: 300 (Data Access) at [c0000004f8a23a10] pc: c00000000043b808: __fsnotify_parent+0x88/0x1a0 lr: c00000000043b7fc: __fsnotify_parent+0x7c/0x1a0 sp: c0000004f8a23c90 msr: 8000000000009033 dar: 23c dsisr: 40000000 current = 0xc0000004f89faf00 paca = 0xc000000007a85800 softe: 0 irq_happened: 0x01 pid = 24028, comm = top Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 (Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 4.15.0-20.21-generic 4.15.17) 6. Guest enters into xmon after migrating from boslcp3 to boslcp4. > > 8:mon> t > [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160 > [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120 > [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0 > [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110 > [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c > --- Exception: c01 (System Call) at 000071f1779fe280 > SP (7fffe99ece50) is in userspace > 8:mon> r > R00 = c00000000043b7fc R16 = 0000000000000000 > R01 = c0000004f8a23c90 R17 = ffffffffffffff70 > R02 = c0000000016eae00 R18 = 00000a51b4bebfc8 > R03 = c000000279557200 R19 = 00007fffe99edbb0 > R04 = c0000003242499c0 R20 = 00000a51b4c04db0 > R05 = 0000000000020000 R21 = 00000a51b4c20e90 > R06 = 0000000000000004 R22 = 0000000000040f00 > R07 = ffffff8100000000 R23 = 00000a51b4c06560 > R08 = ffffff8000000000 R24 = ffffffffffffff80 > R09 = 0000000000000000 R25 = 00000a51b4bec2b8 > R10 = 0000000000000000 R26 = 000071f177bb0b20 > R11 = 0000000000000000 R27 = 0000000000000000 > R12 = 0000000000002000 R28 = c000000279557200 > R13 = c000000007a85800 R29 = c0000004c7734210 > R14 = 0000000000000000 R30 = 0000000000000000 > R15 = 0000000000000000 R31 = c0000003242499c0 > pc = c00000000043b808 __fsnotify_parent+0x88/0x1a0 > cfar= 
c0000000003f9e78 dget_parent+0xe8/0x150 > lr = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0 > msr = 8000000000009033 cr = 28002222 > ctr = c0000000006252b0 xer = 0000000000000000 trap = 300 > dar = 000000000000023c dsisr = 40000000 > BUG_ON in jbd2_journal_write_metadata_buffer I've included xmon crash data from a more recent crash, this time a BUG_ON in fs/jbd2/journal.c:jbd2_journal_write_metadata_buffer(): int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr) { int need_copy_out = 0; int done_copy_out = 0; int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing * transaction is allowed to write it, so nobody else is allowed * to do any IO. * * akpm: except if we're journalling data, and write() output is * also part of a shared mapping, and another thread has * decided to launch a writepage() against this buffer. */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); This is not the same as the original bug, but I suspect they are part of a class of issues we're hitting while running under very particular circumstances which might not generally be seen during normal operation and triggering various corner cases. As such I think it makes sense to group them under this bug for the time being. The general workload is running IO-heavy disk workloads on large guests (20GB memory, 16 vcpus) with SAN-based storage, and then performing migration during the workload. During migration we begin to see a high occurrence of rcu_sched stall warnings, and after 1-3 hours of operations we hit filesystem-related crashes like the ones posted. 
We've seen this with 2 separate FC cards, emulex and qlogic, where we invoke QEMU through libvirt as: LC_ALL=C PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin QEMU_AUDIO_DRV=none /usr/bin/qemu-system-ppc64 -name guest=boslcp3g1,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-7-boslcp3g1/master-key.aes -machine pseries-2.10,accel=kvm,usb=off,dump-guest-core=off,max-cpu-compat=power9 -cpu host -m 20480 -realtime mlock=off -smp 16,maxcpus=32,sockets=4,cores=8,threads=1 -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/7-boslcp3g1,size=10737418240 -numa node,nodeid=0,cpus=0-7,memdev=ram-node0 -object memory-backend-ram,id=ram-node1,size=10737418240 -numa node,nodeid=1,cpus=8-15,memdev=ram-node1 -uuid bd110ed9-dcfc-4470-b4ae-6166a56819f0 -display none -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-7-boslcp3g1/monitor.sock,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -boot menu=on,strict=on -device spapr-pci-host-bridge,index=1,id=pci.1 -device nec-usb-xhci,id=usb,bus=pci.0,addr=0x3 -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x2 -drive file=/home/bionic-server-ppc64el.iso,format=raw,if=none,id=drive-scsi0-0-0-2,readonly=on,cache=none -device scsi-cd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-0-2,id=scsi0-0-0-2 -drive file=/dev/disk/by-id/dm-uuid-part1-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk0,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -drive file=/dev/disk/by-id/dm-uuid-part2-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk1,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk1,id=virtio-disk1 -drive file=/dev/disk/by-id/dm-uuid-part3-mpath- 
3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk2,cache=none -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk2,id=virtio-disk2 -netdev tap,fd=27,id=hostnet0,vhost=on,vhostfd=30 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:72:d2:69,bus=pci.0,addr=0x1,bootindex=2 -chardev pty,id=charserial0 -device spapr-vty,chardev=charserial0,id=serial0,reg=0x30000000 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on I will attach the libvirt XML separately. IBM is requesting general filesystem expertise from Canonical, if available, as we continue debugging... ** Affects: linux (Ubuntu) Importance: Undecided Assignee: Ubuntu on IBM Power Systems Bug Triage (ubuntu-power-triage) Status: New ** Tags: architecture-ppc64le bugnameltc-167290 severity-high targetmilestone-inin1804 -- ISST-LTE:KVM:Ubuntu1804:BostonLC:boslcp3g1: Migration guest running with IO stress crashed@security_file_permission+0xf4/0x160. https://bugs.launchpad.net/bugs/1768115 You received this bug notification because you are a member of Kernel Packages, which is subscribed to linux in Ubuntu. -- Mailing list: https://launchpad.net/~kernel-packages Post to : kernel-packages@lists.launchpad.net Unsubscribe : https://launchpad.net/~kernel-packages More help : https://help.launchpad.net/ListHelp