Linux 内核调试 kdump vmcore
1. kdump介绍
Linux内核发生崩溃时,kdump会生成一个内核转储文件vmcore。通过分析vmcore可以找出内核崩溃的原因.
crash是一个被广泛应用的内核崩溃转储文件分析工具.
使用crash调试内核转储文件,需要安装crash工具和内核调试信息包kernel-debuginfo.
2. 安装kdump
需要安装 crash 和 kexec-tools 两个软件包, 一般在系统镜像文件中就有相对应的rpm包 (也可以通过 yum install kexec-tools crash 安装)
3. 配置kdump
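vim /boot/grub/menu.lst: 设置 crashkernel=auto (使用GRUB2的系统, 如CentOS/RHEL 7, 改为在 /etc/default/grub 的 GRUB_CMDLINE_LINUX 中加入 crashkernel=auto, 然后执行 grub2-mkconfig -o /boot/grub2/grub.cfg; 修改后需要重启系统才能生效)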
vim /etc/kdump.conf: path /var/crash (core文件产生的目录)
4. 启动kdump
systemctl start kdump
5. 安装kernel-debuginfo
下载内核版本对应的文件
kernel-debuginfo-3.10.0-957.el7.x86_64.rpm
kernel-debuginfo-common-x86_64-3.10.0-957.el7.x86_64.rpm
6. 分析vmcore
abrt-cli list
crash /usr/lib/debug/lib/modules/3.10.0-957.el7.x86_64/vmlinux vmcore
crash> bt
PID: 7473 TASK: ffff9027d874bf40 CPU: 0 COMMAND: "cat"
#0 [ffff9026d0ea3638] machine_kexec at ffffffffbd060b2a
#1 [ffff9026d0ea3698] __crash_kexec at ffffffffbd113402
#2 [ffff9026d0ea3768] crash_kexec at ffffffffbd1134f0
#3 [ffff9026d0ea3780] oops_end at ffffffffbd717778
#4 [ffff9026d0ea37a8] no_context at ffffffffbd706f98
#5 [ffff9026d0ea37f8] __bad_area_nosemaphore at ffffffffbd70702f
#6 [ffff9026d0ea3848] bad_area_nosemaphore at ffffffffbd7071a0
#7 [ffff9026d0ea3858] __do_page_fault at ffffffffbd71a730
#8 [ffff9026d0ea38c0] do_page_fault at ffffffffbd71a925
#9 [ffff9026d0ea38f0] page_fault at ffffffffbd716768
[exception RIP: strcmp+32]
RIP: ffffffffbd353d20 RSP: ffff9026d0ea39a0 RFLAGS: 00010202
RAX: 000000000000002f RBX: ffff90240da5a080 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff9026cd27fc11
RBP: ffff9026d0ea39a0 R8: 00000000004b1de2 R9: ffff9026cd27fc10
R10: ffff90253fc01d00 R11: ffffc0428c349fc0 R12: 0000000000000001
R13: ffff9026cd27fc10 R14: 0000000000000001 R15: ffff9027c16f1580
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#10 [ffff9026d0ea39a8] send_log at ffffffffc0c22fd5 [xx]
#11 [ffff9026d0ea3ac0] user_file at ffffffffc0c0c571 [xx]
#12 [ffff9026d0ea3f00] sys_open at ffffffffc0c56670 [xxt]
#13 [ffff9026d0ea3f50] system_call_fastpath at ffffffffbd71f7d5
RIP: 00007f2e860c2a30 RSP: 00007fff6d8755a8 RFLAGS: 00010202
RAX: 0000000000000002 RBX: 00007fff6d875868 RCX: 000000000060bc60
RDX: 1fffffffffff0000 RSI: 0000000000000000 RDI: 00007fff6d876293
RBP: 0000000000001000 R8: 0000000000000000 R9: 0000000000000000
R10: 00007fff6d875020 R11: 0000000000000246 R12: 0000000000402644
R13: 0000000000010000 R14: 0000000000000000 R15: 0000000000000000
ORIG_RAX: 0000000000000002 CS: 0033 SS: 002b
crash> dis -l ffffffffbd353d20
/usr/src/debug/kernel-3.10.0-862.el7/linux-3.10.0-862.el7.x86_64/lib/string.c: 253
0xffffffffbd353d20 <strcmp+32>: cmp -0x1(%rsi),%al
crash> dis -l ffffffffc0c22fd5
0xffffffffc0c22fd5 <send_log+901>: test %eax,%eax
可以看到问题出现在send_log+901 (该地址是send_log中调用strcmp之后的返回地址)
901 = 0x385
反汇编xx.ko
objdump -S xx.ko > tmp
打开tmp文件,查看汇编代码,找到send_log+0x385位置
16fc5: 4d 8b 6f 50 mov 0x50(%r15),%r13
else if (result > 0) node = node->rb_right;
16fc9: 48 8b 73 50 mov 0x50(%rbx),%rsi
if (result < 0) node = node->rb_left;
16fcd: 4c 89 ef mov %r13,%rdi
16fd0: e8 00 00 00 00 callq 16fd5 <send_log+0x385>
16fd5: 85 c0 test %eax,%eax
16fd7: 78 57 js 17030 <send_log+0x3e0>
else if (result > 0) node = node->rb_right;
16fd9: 75 43 jne 1701e <send_log+0x3ce>
if (result < 0) node = node->rb_left;
16fdb: 4d 85 ed test %r13,%r13
16fde: 66 90 xchg %ax,%ax
16fe0: 74 10 je 16ff2 <send_log+0x3a2>
16fe2: 4c 89 ef mov %r13,%rdi
else if (result > 0) node = node->rb_right;
16fe5: e8 00 00 00 00 callq 16fea <send_log+0x39a>
result = req->op_result - data->req.op_result;
16fea: 49 c7 47 50 00 00 00 movq $0x0,0x50(%r15)
16ff1: 00
if (result < 0) node = node->rb_left;
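崩溃时RSI=0000000000000001, 而strcmp+32处的指令 cmp -0x1(%rsi),%al 访问的是地址0, 说明传给strcmp的第二个参数 (16fc9处从0x50(%rbx)取出的值) 是空指针.
可以看出问题出现在16fd0处的strcmp调用, 对应的源码位置在
if (result < 0) node = node->rb_left;
附近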
crash 简单的命令
crash> help
files mach repeat timer
alias foreach mod runq tree
ascii fuser mount search union
bt gdb net set vm
btop help p sig vtop
dev ipcs ps struct waitq
dis irq pte swap whatis
eval kmem ptob sym wr
exit list ptov sys q
extend log rd task