Details
-
Bug
-
Resolution: Cannot Reproduce
-
Major
-
None
-
Lustre 2.1.6
-
None
-
Bull environment
-
3
-
15033
Description
One of our customers hit a kernel NULL pointer dereference in __iget.
The backtrace is as follows:
PID: 29825 TASK: ffff88044c49e7b0 CPU: 3 COMMAND: "ll_ost_583"
[...]
[exception RIP: __iget+45]
RIP: ffffffff81180cfd RSP: ffff88044c517ac0 RFLAGS: 00010246
RAX: ffff880040aa5550 RBX: ffff880040aa5540 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88040c7bd3a9 RDI: ffff880040aa5540
RBP: ffff88044c517ac0 R8: 00000000fffffff3 R9: 00000000fffffff6
R10: 0000000000000008 R11: 0000000000000096 R12: ffff8800b59f0a80
R13: ffff88040c7bd300 R14: ffff8804243b22f8 R15: 000000000000000b
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff88044c517ac8] igrab at ffffffff81180fd8
#10 [ffff88044c517ae8] filter_lvbo_init at ffffffffa0bdc795 [obdfilter]
#11 [ffff88044c517b18] ldlm_resource_get at ffffffffa07c33a4 [ptlrpc]
#12 [ffff88044c517b88] ldlm_lock_create at ffffffffa07bcb85 [ptlrpc]
#13 [ffff88044c517bd8] ldlm_handle_enqueue0 at ffffffffa07e40a4 [ptlrpc]
#14 [ffff88044c517c48] ldlm_handle_enqueue at ffffffffa07e4ef6 [ptlrpc]
#15 [ffff88044c517c88] ost_handle at ffffffffa0964e83 [ost]
#16 [ffff88044c517da8] ptlrpc_main at ffffffffa08134e6 [ptlrpc]
#17 [ffff88044c517f48] kernel_thread at ffffffff8100412a
The crash occurs here:
crash> dis __iget
0xffffffff81180cd0 <__iget>: push %rbp
0xffffffff81180cd1 <__iget+1>: mov %rsp,%rbp
0xffffffff81180cd4 <__iget+4>: nopl 0x0(%rax,%rax,1)
0xffffffff81180cd9 <__iget+9>: mov 0x48(%rdi),%eax
0xffffffff81180cdc <__iget+12>: test %eax,%eax
0xffffffff81180cde <__iget+14>: jne 0xffffffff81180d30 <__iget+96>
0xffffffff81180ce0 <__iget+16>: lock incl 0x48(%rdi)
0xffffffff81180ce4 <__iget+20>: testq $0x107,0x218(%rdi)
0xffffffff81180cef <__iget+31>: jne 0xffffffff81180d22 <__iget+82>
0xffffffff81180cf1 <__iget+33>: mov 0x18(%rdi),%rdx
0xffffffff81180cf5 <__iget+37>: mov 0x10(%rdi),%rcx
0xffffffff81180cf9 <__iget+41>: lea 0x10(%rdi),%rax
0xffffffff81180cfd <__iget+45>: mov %rdx,0x8(%rcx) <=== HERE
which corresponds to:
if (!(inode->i_state & (I_DIRTY|I_SYNC)))
        list_move(&inode->i_list, &inode_in_use);
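%rcx holds inode->i_list.next (loaded from offset 0x10 of the inode, while %rax holds &inode->i_list) and should be a valid list pointer, but it is NULL, so the store to next->prev faults.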
Looking at the inode structure, the leading fields are all zero:
struct inode {
i_hash = {
next = 0x0,
pprev = 0x0
},
i_list = {
next = 0x0,
prev = 0x0
},
i_sb_list = {
next = 0x0,
prev = 0x0
},
i_dentry = {
next = 0x0,
prev = 0x0
},
i_ino = 0,
i_count = {
counter = 1
},
i_nlink = 0,
....
The dentry structure from which the inode address was obtained looks OK:
crash> struct dentry ffff88040c7bd300
struct dentry {
d_count = {
counter = 1
},
d_flags = 8,
d_lock = {
raw_lock = {
slock = 2555943
}
},
d_mounted = -559087616,
d_inode = 0xffff880040aa5540,
d_hash = {
next = 0xffff88039a004f18,
pprev = 0xffff8803b4f8c558
},
d_parent = 0xffff8804235ea9c0,
d_name = {
hash = 72921089,
len = 9,
name = 0xffff88040c7bd3a0 "120408088"
},
d_lru = {
next = 0xffff88040c7bd400,
prev = 0xffff88040c7bd280
},
d_u = {
d_child = {
next = 0xffff88040c31dc10,
prev = 0xffff88054d37b950
},
d_rcu = {
next = 0xffff88040c31dc10,
func = 0xffff88054d37b950
}
},
d_subdirs = {
next = 0xffff88040c7bd360,
prev = 0xffff88040c7bd360
},
d_alias = {
next = 0xffff880040aa5570,
prev = 0xffff880040aa5570
},
d_time = 0,
d_op = 0x0,
d_sb = 0xffff880bc74dd400,
d_fsdata = 0x0,
d_iname = "120408088\000\000\000\000\000\000\000\000\000\b\000\000\000\000\000\000\000\000\000\000\000\000"
}
and is consistent with its parent directory:
crash> struct dentry.d_name ffff8804235ea9c0
d_name = {
hash = 2243934,
len = 3,
name = 0xffff8804235eaa60 "d24"
}
Can you help determine how this corruption happened?