------- Comment From pavra...@in.ibm.com 2018-03-29 11:31 EDT-------
(In reply to comment #10)
> I built a Bionic test kernel with the three commits mentioned in the bug
> description.  The test kernel can be downloaded from:
> http://kernel.ubuntu.com/~jsalisbury/lp1758206
>
> Can you test this kernel and see if it resolves this bug?
>
> Note, to test this kernel, you need to install both the linux-image and
> linux-image-extra .deb packages.
>
> Thanks in advance!

Tried with the given kernel; kexec still failed. Please find the logs below.

root@ltc-wspoon4:~# ppc64_cpu --smt
SMT is off
root@ltc-wspoon4:~# kdump-config show
DUMP_MODE:        kdump
USE_KDUMP:        1
KDUMP_SYSCTL:     kernel.panic_on_oops=1
KDUMP_COREDIR:    /var/crash
crashkernel addr:
/var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.15.0-12-generic
kdump initrd:
/var/lib/kdump/initrd.img: symbolic link to 
/var/lib/kdump/initrd.img-4.15.0-12-generic
current state:    ready to kdump

kexec command:
/sbin/kexec -p --command-line="root=UUID=0266024d-8ea3-4132-ad62-b49befd6f8d9 
ro quiet splash nr_cpus=1 systemd.unit=kdump-tools.service irqpoll noirqdistrib 
nousb" --initrd=/var/lib/kdump/initrd.img /var/lib/kdump/vmlinuz
root@ltc-wspoon4:~# echo "c" > /proc/sysrq-trigger
[  951.567597] sysrq: SysRq : This sysrq operation is disabled.
root@ltc-wspoon4:~# echo 1 > /proc/sys/kernel/sysrq
root@ltc-wspoon4:~# echo "c" > /proc/sysrq-trigger
[  968.396522] sysrq: SysRq : Trigger a crash
[  968.396558] Unable to handle kernel paging request for data at address 
0x00000000
[  968.396602] Faulting instruction address: 0xc0000000007ec768
[  968.396640] Oops: Kernel access of bad area, sig: 11 [#1]
[  968.396670] LE SMP NR_CPUS=2048 NUMA PowerNV
[  968.396703] Modules linked in: idt_89hpesx(E) at24 uio_pdrv_genirq ofpart 
cmdlinepart powernv_flash mtd uio ibmpowernv ipmi_powernv vmx_crypto 
ipmi_devintf ipmi_msghandler opal_prd crct10dif_vpmsum sch_fq_codel ip_tables 
x_tables autofs4 ast i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect 
sysimgblt fb_sys_fops ahci crc32c_vpmsum drm tg3 libahci
[  968.396893] CPU: 28 PID: 3086 Comm: bash Tainted: G            E    
4.15.0-12-generic #13~lp1758206
[  968.396944] NIP:  c0000000007ec768 LR: c0000000007ed6a8 CTR: c0000000007ec740
[  968.396989] REGS: c0000000054fb9f0 TRAP: 0300   Tainted: G            E     
(4.15.0-12-generic)
[  968.397040] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28222222  
XER: 20040000
[  968.397090] CFAR: c0000000007ed6a4 DAR: 0000000000000000 DSISR: 42000000 
SOFTE: 1
[  968.397090] GPR00: c0000000007ed6a8 c0000000054fbc70 c0000000016eaf00 
0000000000000063
[  968.397090] GPR04: c000001ff76bce18 c000001ff76d4368 9000000000009033 
000000000000000a
[  968.397090] GPR08: 0000000000000007 0000000000000001 0000000000000000 
9000000000001003
[  968.397090] GPR12: c0000000007ec740 c000000007a33400 00000a463c88ae48 
0000000000000000
[  968.397090] GPR16: 00000a462439e9f0 00000a4624431998 00000a46244319d0 
00000a4624468204
[  968.397090] GPR20: 0000000000000000 0000000000000001 0000000000000000 
00007ffff9ecd164
[  968.397090] GPR24: 00007ffff9ecd160 00000a462446afc4 c0000000015e9968 
0000000000000002
[  968.397090] GPR28: 0000000000000063 0000000000000007 c000000001572a9c 
c0000000015e9d08
[  968.397486] NIP [c0000000007ec768] sysrq_handle_crash+0x28/0x30
[  968.397524] LR [c0000000007ed6a8] __handle_sysrq+0xf8/0x2c0
[  968.397554] Call Trace:
[  968.397571] [c0000000054fbc70] [c0000000007ed688] __handle_sysrq+0xd8/0x2c0 
(unreliable)
[  968.397618] [c0000000054fbd10] [c0000000007edeb4] 
write_sysrq_trigger+0x64/0x90
[  968.397664] [c0000000054fbd40] [c00000000047dfe8] proc_reg_write+0x88/0xd0
[  968.397703] [c0000000054fbd70] [c0000000003d131c] __vfs_write+0x3c/0x70
[  968.397742] [c0000000054fbd90] [c0000000003d1578] vfs_write+0xd8/0x220
[  968.397781] [c0000000054fbde0] [c0000000003d1898] SyS_write+0x68/0x110
[  968.397821] [c0000000054fbe30] [c00000000000b184] system_call+0x58/0x6c
[  968.397857] Instruction dump:
[  968.397881] 4bfff9f1 4bfffe50 3c4c00f0 3842e7c0 7c0802a6 60000000 39200001 
3d42001c
[  968.397929] 394a6db0 912a0000 7c0004ac 39400000 <992a0000> 4e800020 3c4c00f0 
3842e790
[  968.397979] ---[ end trace 42b5936ebd77f0df ]---
[  969.403420]
[  969.403499] Sending IPI to other CPUs
[  970.[ 9304.282854548,5] OPAL: Switch to big-endian OS
699527] IPI c[ 9308.106771743,5] OPAL: Switch to little-endian OS
[ 9309.438684420,3] PHB#0000[0:0]: CRESET: Unexpected slot state 00000102, 
resetting...
[ 9310.039758053,3] PHB#0000[0:0]: Timeout waiting for DLP PG reset !
[ 9310.039836165,3] PHB#0000[0:0]: Initialization failed
[ 9312.102310864,3] PHB#0001[0:1]: Timeout waiting for DLP PG reset !
[ 9312.102386083,3] PHB#0001[0:1]: Initialization failed
[ 9314.164868252,3] PHB#0002[0:2]: Timeout waiting for DLP PG reset !
[ 9314.165418307,3] PHB#0002[0:2]: Initialization failed
[ 9316.116455526,3] PHB#0003[0:3]: CRESET: Unexpected slot state 00000102, 
resetting...
[ 9316.229566014,3] PHB#0003[0:3]: Timeout waiting for DLP PG reset !
[ 9316.229647769,3] PHB#0003[0:3]: Initialization failed
[ 9318.292115865,3] PHB#0004[0:4]: Timeout waiting for DLP PG reset !
[ 9318.294132075,3] PHB#0004[0:4]: Initialization failed
[ 9320.356607557,3] PHB#0005[0:5]: Timeout waiting for DLP PG reset !
[ 9320.357808219,3] PHB#0005[0:5]: Initialization failed
[ 9322.308528728,3] PHB#0030[8:0]: CRESET: Unexpected slot state 00000102, 
resetting...
[ 9322.421406022,3] PHB#0030[8:0]: Timeout waiting for DLP PG reset !
[ 9322.422552792,3] PHB#0030[8:0]: Initialization failed
[ 9324.372530715,3] PHB#0033[8:3]: CRESET: Unexpected slot state 00000102, 
resetting...
[ 9324.485399371,3] PHB#0033[8:3]: Timeout waiting for DLP PG reset !
[ 9324.485730350,3] PHB#0033[8:3]: Initialization failed
[ 9326.436229911,3] PHB#0034[8:4]: CRESET: Unexpected slot state 00000102, 
resetting...
[ 9328.510462286,3] PHB#0035[8:5]: Timeout waiting for DLP PG reset !
[ 9328.511713428,3] PHB#0035[8:5]: Initialization failed
omplete
[  972.402103] kexec: Starting switchover sequence.
[    0.753802] integrity: Unable to open file: /etc/keys/x509_ima.der (-2)
[    0.753808] integrity: Unable to open file: /etc/keys/x509_evm.der (-2)
[    0.830129] vio vio: uevent: failed to send synthetic uevent

Gave up waiting for root file system device.  Common problems:
- Boot args (cat /proc/cmdline)
- Check rootdelay= (did the system wait long enough?)
- Missing modules (cat /proc/modules; ls /dev)
ALERT!  UUID=0266024d-8ea3-4132-ad62-b49befd6f8d9 does not exist.  Dropping to 
a shell!

BusyBox v1.27.2 (Ubuntu 1:1.27.2-2ubuntu3) built-in shell (ash)
Enter 'help' for a list of built-in commands.

(initramfs)
(initramfs)
(initramfs)
(initramfs)
(initramfs)

-- 
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to linux in Ubuntu.
https://bugs.launchpad.net/bugs/1758206

Title:
  Ubuntu 18.04 [ WSP DD2.2 with stop4 and stop5 enabled ]: kdump fails
  to capture dump when smt=2 or off.

Status in The Ubuntu-power-systems project:
  In Progress
Status in linux package in Ubuntu:
  In Progress
Status in linux source package in Bionic:
  In Progress

Bug description:
  ---Problem Description---

  Ubuntu 18.04 [ WSP DD2.2 with stop4 and stop5 enabled ]: kdump fails
  to capture dump when smt=2 or off.

  ---Environment--
  Kernel Build:  4.15.0-13-generic
  System Name :  ltc-wspoon4
  Model/Type  :  P9
  Platform    :  BML

  ---Steps to reproduce--

  1. Configure kdump.
  2. Set smt=off
  # ppc64_cpu --smt=off
  3. Trigger a crash.
  echo 1 > /proc/sys/kernel/sysrq
  echo "c" > /proc/sysrq-trigger

  ---Logs----

  root@ltc-wspoon4:~# dpkg -l|grep kexec
  ii  kexec-tools                         1:2.0.16-1ubuntu1                 
ppc64el      tools to support fast kexec reboots
  root@ltc-wspoon4:~# makedumpfile -v
  makedumpfile: version 1.6.3 (released on 29 Jan 2018)
  lzo   enabled
  snappy        disabled

  
  [  285.519832] [c000001fe2d83de0] [c0000000003d1898] SyS_write+0x68/0x110
  [  285.519926] [c000001fe2d83e30] [c00000000000b184] system_call+0x58/0x6c
  [  285.520007] Instruction dump:
  [  285.520053] 4bfff9f1 4bfffe50 3c4c00f0 3842e800 7c0802a6 60000000 39200001 
3d42001c 
  [  285.520158] 394a6db0 912a0000 7c0004ac 39400000 <992a0000> 4e800020 
3c4c00f0 3842e7d0 
  [  285.520261] ---[ end trace 90a666dc7ca6f0ec ]---
  [  286.525787] 
  [  286.525883] Sending IPI to other CPUs
  [  28[  401.296284048,5] OPAL: Switch to big-endian OS
  [  402.297026662,3] OPAL: CPU 0x1 not in OPAL !
  6.851284] IPI complete
  [  403.455520784,3] OPAL: CPU 0x1 not in OPAL !nce.
  [  403.455569636,5] OPAL: Switch to little-endian OS
  [  404.455711332,3] OPAL: CPU 0x1 not in OPAL !
  [  404.470276386,3] PHB#0000[0:0]: CRESET: Unexpected slot state 00000102, 
resetting...
  [  413.140065625,3] PHB#0003[0:3]: CRESET: Unexpected slot state 00000102, 
resetting...
  [  421.393193605,3] PHB#0030[8:0]: CRESET: Unexpected slot state 00000102, 
resetting...
  [  423.353977316,3] PHB#0033[8:3]: CRESET: Unexpected slot state 00000102, 
resetting...
  [  425.314547966,3] PHB#0034[8:4]: CRESET: Unexpected slot state 00000102, 
resetting...

  [    5.004718] Processor 1 is stuck.
  [   10.007584] Processor 2 is stuck.
  [   15.010425] Processor 3 is stuck.
  [   16.135550] integrity: Unable to open file: /etc/keys/x509_ima.der (-2)
  [   16.135554] integrity: Unable to open file: /etc/keys/x509_evm.der (-2)
  [   16.250952] vio vio: uevent: failed to send synthetic uevent

  
  --== Welcome to Hostboot hostboot-5fc3b52/hbicore.bin ==--

    4.52180|secure|SecureROM valid - enabling functionality
    4.53193|secure|Booting in non-secure mode.
    6.00924|Booting from SBE side 0 on master proc=00050000

  
  There could be a firmware issue here, but the kernel patches below still
  need to be included to ensure that the kdump kernel captures the dump
  successfully when SMT is set to 2 or off.

  
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=04b9c96eae72d862726f2f4bfcec2078240c33c5
  ("powerpc/crash: Remove the test for cpu_online in the IPI callback")

  
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4145f358644b970fcff293c09fdcc7939e8527d2
  ("powernv/kdump: Fix cases where the kdump kernel can get HMI's")

  
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=910961754572a2f4c83ad7e610d180
  ("powerpc/kdump: Fix powernv build break when KEXEC_CORE=n")

  Thanks
  Hari

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1758206/+subscriptions

-- 
Mailing list: https://launchpad.net/~kernel-packages
Post to     : kernel-packages@lists.launchpad.net
Unsubscribe : https://launchpad.net/~kernel-packages
More help   : https://help.launchpad.net/ListHelp

Reply via email to