You have been subscribed to a public bug:

Problem Description: A guest under live migration, running with IO stress,
crashed at security_file_permission+0xf4/0x160 after a couple of migrations.

Steps to re-create:

Source host - boslcp3
Destination host - boslcp4

1. boslcp3 & boslcp4 installed with the latest kernel:
root@boslcp3:~# uname -a
Linux boslcp3 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 2018 
ppc64le ppc64le ppc64le GNU/Linux
root@boslcp3:~#

root@boslcp4:~# uname -a
Linux boslcp4 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 2018 
ppc64le ppc64le ppc64le GNU/Linux
root@boslcp4:~#

2. Installed guest boslcp3g1 with the kernel below and started an LTP run
from the boslcp3 host:

root@boslcp3g1:~# uname -a
Linux boslcp3g1 4.15.0-15-generic #16+bug166877 SMP Wed Apr 18 14:47:30 CDT 
2018 ppc64le ppc64le ppc64le GNU/Linux

3. Started migrating the boslcp3g1 guest from source to destination and
vice versa (a sketch of the migration loop is below).
4. After a couple of migrations the guest crashed on boslcp4 and entered xmon.
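
The migrations were driven in a loop while LTP ran inside the guest. A
minimal sketch of such a loop (assuming virsh-managed guests and SSH
access between the hosts; the flags and sleep interval are illustrative,
not the exact commands used):

    #!/bin/sh
    # Ping-pong live migration of the guest between the two hosts.
    GUEST=boslcp3g1
    SRC=boslcp3
    DST=boslcp4
    while true; do
        virsh migrate --live "$GUEST" qemu+ssh://"$DST"/system || break
        sleep 60
        ssh "$DST" virsh migrate --live "$GUEST" qemu+ssh://"$SRC"/system || break
        sleep 60
    done

The xmon output from the crash on boslcp4: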

8:mon> t
[c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
[c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
[c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
[c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
[c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
--- Exception: c01 (System Call) at 000071f1779fe280
SP (7fffe99ece50) is in userspace
8:mon> S
msr    = 8000000000001033  sprg0 = 0000000000000000
pvr    = 00000000004e1202  sprg1 = c000000007a85800
dec    = 00000000591e3e03  sprg2 = c000000007a85800
sp     = c0000004f8a234a0  sprg3 = 0000000000010008
toc    = c0000000016eae00  dar   = 000000000000023c
srr0   = c0000000000c355c  srr1  = 8000000000001033 dsisr  = 40000000
dscr   = 0000000000000000  ppr   = 0010000000000000 pir    = 00000011
amr    = 0000000000000000  uamor = 0000000000000000
dpdes  = 0000000000000000  tir   = 0000000000000000 cir    = 00000000
fscr   = 0500000000000180  tar   = 0000000000000000 pspb   = 00000000
mmcr0  = 0000000080000000  mmcr1 = 0000000000000000 mmcr2  = 0000000000000000
pmc1   = 00000000 pmc2 = 00000000  pmc3 = 00000000  pmc4   = 00000000
mmcra  = 0000000000000000   siar = 0000000000000000 pmc5   = 0000026c
sdar   = 0000000000000000   sier = 0000000000000000 pmc6   = 00000861
ebbhr  = 0000000000000000  ebbrr = 0000000000000000 bescr  = 0000000000000000
iamr   = 4000000000000000
pidr   = 0000000000000034  tidr  = 0000000000000000
cpu 0x8: Vector: 700 (Program Check) at [c0000004f8a23220]
    pc: c0000000000e4854: xmon_core+0x1f24/0x3520
    lr: c0000000000e4850: xmon_core+0x1f20/0x3520
    sp: c0000004f8a234a0
   msr: 8000000000041033
  current = 0xc0000004f89faf00
  paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
    pid   = 24028, comm = top
Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 
4.15.0-20.21-generic 4.15.17)
cpu 0x8: Exception 700 (Program Check) in xmon, returning to main loop
[c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
[c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
[c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
[c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
[c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
--- Exception: c01 (System Call) at 000071f1779fe280
SP (7fffe99ece50) is in userspace
8:mon> r
R00 = c00000000043b7fc   R16 = 0000000000000000
R01 = c0000004f8a23c90   R17 = ffffffffffffff70
R02 = c0000000016eae00   R18 = 00000a51b4bebfc8
R03 = c000000279557200   R19 = 00007fffe99edbb0
R04 = c0000003242499c0   R20 = 00000a51b4c04db0
R05 = 0000000000020000   R21 = 00000a51b4c20e90
R06 = 0000000000000004   R22 = 0000000000040f00
R07 = ffffff8100000000   R23 = 00000a51b4c06560
R08 = ffffff8000000000   R24 = ffffffffffffff80
R09 = 0000000000000000   R25 = 00000a51b4bec2b8
R10 = 0000000000000000   R26 = 000071f177bb0b20
R11 = 0000000000000000   R27 = 0000000000000000
R12 = 0000000000002000   R28 = c000000279557200
R13 = c000000007a85800   R29 = c0000004c7734210
R14 = 0000000000000000   R30 = 0000000000000000
R15 = 0000000000000000   R31 = c0000003242499c0
pc  = c00000000043b808 __fsnotify_parent+0x88/0x1a0
cfar= c0000000003f9e78 dget_parent+0xe8/0x150
lr  = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0
msr = 8000000000009033   cr  = 28002222
ctr = c0000000006252b0   xer = 0000000000000000   trap =  300
dar = 000000000000023c   dsisr = 40000000
8:mon> e
cpu 0x8: Vector: 300 (Data Access) at [c0000004f8a23a10]
    pc: c00000000043b808: __fsnotify_parent+0x88/0x1a0
    lr: c00000000043b7fc: __fsnotify_parent+0x7c/0x1a0
    sp: c0000004f8a23c90
   msr: 8000000000009033
   dar: 23c
 dsisr: 40000000
  current = 0xc0000004f89faf00
  paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
    pid   = 24028, comm = top
Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 
4.15.0-20.21-generic 4.15.17)
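
For what it's worth, the faulting address here (dar = 0x23c) is a small
constant, which is what a load through a NULL (or zeroed/stale) base
pointer produces: the fault address is simply the member's offset within
the structure. A contrived userspace illustration of that arithmetic (the
struct and offset are hypothetical, not the real fsnotify layout):

    #include <stdio.h>
    #include <stddef.h>

    /* Hypothetical layout placing a member at offset 0x23c. */
    struct example {
        char pad[0x23c];
        unsigned long field;
    };

    int main(void)
    {
        struct example *p = NULL;
        /* A load of p->field would touch NULL + offsetof(struct example,
         * field) = 0x23c, matching the dar reported by xmon above. */
        printf("offset of field: %#zx\n", offsetof(struct example, field));
        return 0;
    }

This suggests __fsnotify_parent() (or dget_parent(), per the cfar) is
chasing a NULL or stale dentry/inode pointer rather than hitting random
memory corruption.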

5. The guest enters xmon after migrating from boslcp3 to boslcp4 (traces above).


BUG_ON in jbd2_journal_write_metadata_buffer

I've included xmon crash data from a more recent crash, this time a
BUG_ON in fs/jbd2/journal.c:jbd2_journal_write_metadata_buffer():

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                  struct journal_head  *jh_in,
                                  struct buffer_head **bh_out,
                                  sector_t blocknr)
{
        int need_copy_out = 0;
        int done_copy_out = 0;
        int do_escape = 0;
        char *mapped_data;
        struct buffer_head *new_bh;
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
        journal_t *journal = transaction->t_journal;

        /*
         * The buffer really shouldn't be locked: only the current committing
         * transaction is allowed to write it, so nobody else is allowed
         * to do any IO.
         *
         * akpm: except if we're journalling data, and write() output is
         * also part of a shared mapping, and another thread has
         * decided to launch a writepage() against this buffer.
         */
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
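
For context, J_ASSERT_BH() boils down to a BUG_ON(). From
include/linux/jbd2.h (as I read the 4.15-era source; worth re-checking
against the exact tree):

    #define J_ASSERT(assert)        BUG_ON(!(assert))
    #define J_ASSERT_BH(bh, expr)   J_ASSERT(expr)

So hitting this assertion means buffer_jbddirty(bh_in) evaluated false for
a buffer the committing transaction was about to write out, i.e. the
jbddirty state was lost or the journal head was clobbered somewhere
between the buffer being queued and the commit writing it.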


This is not the same crash as the original bug, but I suspect both belong
to a class of issues we're hitting under very particular circumstances,
ones that would not generally be seen during normal operation, and that
trigger various corner cases. As such, I think it makes sense to group
them under this bug for the time being.

The general workload is running IO-heavy disk workloads on large guests
(20GB memory, 16 vCPUs) with SAN-based storage, and then performing
migration during the workload. During migration we begin to see a high
occurrence of rcu_sched stall warnings, and after 1-3 hours of operation
we hit filesystem-related crashes like the ones posted. We've seen this
with two separate FC cards, Emulex and QLogic. We invoke QEMU through
libvirt as:

LC_ALL=C \
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
QEMU_AUDIO_DRV=none \
/usr/bin/qemu-system-ppc64 \
  -name guest=boslcp3g1,debug-threads=on \
  -S \
  -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-7-boslcp3g1/master-key.aes \
  -machine pseries-2.10,accel=kvm,usb=off,dump-guest-core=off,max-cpu-compat=power9 \
  -cpu host \
  -m 20480 \
  -realtime mlock=off \
  -smp 16,maxcpus=32,sockets=4,cores=8,threads=1 \
  -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/7-boslcp3g1,size=10737418240 \
  -numa node,nodeid=0,cpus=0-7,memdev=ram-node0 \
  -object memory-backend-ram,id=ram-node1,size=10737418240 \
  -numa node,nodeid=1,cpus=8-15,memdev=ram-node1 \
  -uuid bd110ed9-dcfc-4470-b4ae-6166a56819f0 \
  -display none \
  -no-user-config \
  -nodefaults \
  -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-7-boslcp3g1/monitor.sock,server,nowait \
  -mon chardev=charmonitor,id=monitor,mode=control \
  -rtc base=utc \
  -no-shutdown \
  -boot menu=on,strict=on \
  -device spapr-pci-host-bridge,index=1,id=pci.1 \
  -device nec-usb-xhci,id=usb,bus=pci.0,addr=0x3 \
  -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x2 \
  -drive file=/home/bionic-server-ppc64el.iso,format=raw,if=none,id=drive-scsi0-0-0-2,readonly=on,cache=none \
  -device scsi-cd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-scsi0-0-0-2,id=scsi0-0-0-2 \
  -drive file=/dev/disk/by-id/dm-uuid-part1-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk0,cache=none \
  -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 \
  -drive file=/dev/disk/by-id/dm-uuid-part2-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk1,cache=none \
  -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk1,id=virtio-disk1 \
  -drive file=/dev/disk/by-id/dm-uuid-part3-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-disk2,cache=none \
  -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk2,id=virtio-disk2 \
  -netdev tap,fd=27,id=hostnet0,vhost=on,vhostfd=30 \
  -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:72:d2:69,bus=pci.0,addr=0x1,bootindex=2 \
  -chardev pty,id=charserial0 \
  -device spapr-vty,chardev=charserial0,id=serial0,reg=0x30000000 \
  -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 \
  -msg timestamp=on

I will attach the libvirt XML separately.

IBM is requesting general filesystem expertise from Canonical, if
available, as we continue debugging.

** Affects: linux (Ubuntu)
     Importance: Undecided
     Assignee: Ubuntu on IBM Power Systems Bug Triage (ubuntu-power-triage)
         Status: New


** Tags: architecture-ppc64le bugnameltc-167290 severity-high 
targetmilestone-inin1804
-- 
ISST-LTE:KVM:Ubuntu1804:BostonLC:boslcp3g1: Migration guest running with IO 
stress crashed@security_file_permission+0xf4/0x160.
https://bugs.launchpad.net/bugs/1768115