Details
-
Bug
-
Resolution: Fixed
-
Critical
-
Lustre 2.4.0
-
github.com/chaos/lustre tag 2.3.58-14chaos
PPC client
-
3
-
7278
Description
A PowerPC64 Sequoia LAC node panicked when I ran the following test program:
#include <stdio.h> #include <errno.h> #include <sys/ioctl.h> #include <string.h> #include <sys/stat.h> #include <fcntl.h> #include <lustre/lustre_user.h> int main(int argc, char **argv) { int rc = -1; int mdtidx = 0; int fd; char *path = argv[1]; if (argc != 2) { fprintf(stderr, "Usage: %s <path>\n", argv[0]); goto out; } fd = open(path, O_RDONLY); if (fd < 0) { fprintf(stderr, "open() error on %s: %s\n", path, strerror(errno)); goto out; } rc = ioctl(fd, LL_IOC_GET_MDTIDX, &mdtidx); if (rc < 0) { fprintf(stderr, "ioctl() error: %s\n", strerror(errno)); goto out; } printf("mdtidx %d\n", mdtidx); out: return rc; }
Here is the backtrace from crash:
PID: 4522 TASK: c000000f565f6900 CPU: 29 COMMAND: "a.out" #0 [c000000f4c2632c0] .crash_kexec at c0000000000e5bf4 #1 [c000000f4c2634c0] .die at c0000000000309d8 #2 [c000000f4c263570] .bad_page_fault at c000000000043378 #3 [c000000f4c2635f0] handle_page_fault at c000000000005228 Data Access error [300] exception frame: R0: 0000000000000000 R1: c000000f4c2638e0 R2: d0000000127283e0 R3: c000000e0d6f6f00 R4: c000000de5ea1700 R5: 0000000000000000 R6: c000000de5ea1838 R7: 0000000000000001 R8: 0000000000000720 R9: 0000000000000000 R10: 2b94515100000000 R11: 0000000000003000 R12: d0000000136b27f0 R13: c000000001006d80 R14: 000000001012b3dc R15: 0000000000000000 R16: 0000000000000000 R17: 0000000010129c58 R18: 0000000010129bf8 R19: 000000001012b948 R20: 0000000000000000 R21: 000000001012daf0 R22: c000000f4b772480 R23: 00000000400466af R24: c000000e046d33f8 R25: 0000000000000000 R26: c000000e0d6f6f00 R27: d00000000cc74b48 R28: c000000de5ea1700 R29: d00000000cc74b48 R30: d000000012726ee0 R31: c000000f4c2638e0 NIP: d000000012701940 MSR: 8000000000009032 OR3: d0000000136e7588 CTR: d0000000127018c0 LR: d0000000136166ec XER: 0000000020000010 CCR: 0000000024000428 MQ: 0000000000000001 DAR: 0000000000000000 DSISR: 0000000042000000 Syscall Result: 0000000000000000 #4 [c000000f4c2638e0] .mdc_getattr at d000000012701940 [mdc] [Link Register ] [c000000f4c2638e0] .ll_get_mdt_idx at d0000000136166ec #5 [c000000f4c263990] .ll_get_mdt_idx at d0000000136166ec [lustre] (unreliable) #6 [c000000f4c263a60] .ll_dir_ioctl at d0000000136234c4 [lustre] #7 [c000000f4c263c00] .vfs_ioctl at c0000000001d7f24 #8 [c000000f4c263c90] .do_vfs_ioctl at c0000000001d8170 #9 [c000000f4c263d80] .sys_ioctl at c0000000001d8954 #10 [c000000f4c263e30] syscall_exit at c000000000008564 syscall [c01] exception frame: R0: 0000000000000036 R1: 00000fffffffedb0 R2: 0000008053993268 R3: 0000000000000003 R4: 00000000400466af R5: 00000fffffffeea0 R6: 0000000000004000 R7: 00000080538c91f0 R8: 800000000200f032 R9: 
0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 00000080537ac250 NIP: 00000080538cfc70 MSR: 800000000200f032 OR3: 0000000000000003 CTR: 00000080538cfbd0 LR: 000000001000082c XER: 0000000000000010 CCR: 0000000042000428 MQ: 0000000000000001 DAR: 00000080538c91dc DSISR: 0000000040000000 Syscall Result: 0000000000000003
Console panic message:
Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xd000000012701940 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=1024 NUMA pSeries Modules linked in: xt_owner nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack mgc(U) lustre(U) mdc(U) fid(U) fld(U) lov(U) osc(U) ptlrpc(U) obdclass(U) lvfs(U) nfs fscache lockd auth_rpcgss nfs_acl ko2iblnd(U) lnet(U) sha512_generic sha256_generic libcfs(U) sunrpc ipt_LOG xt_multiport iptable_filter ip_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 uinput raid1 sg ses enclosure mlx4_ib ib_sa ib_mad ib_core mlx4_en mlx4_core e1000e ehea ext4 jbd2 mbcache raid456 async_pq async_xor xor async_raid6_recov raid6_pq async_memcpy async_tx sd_mod crc_t10dif ipr dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan] NIP: d000000012701940 LR: d0000000136166ec CTR: d0000000127018c0 REGS: c000000f4c263660 TRAP: 0300 Not tainted (2.6.32-348.1chaos.bgq62.ppc64) MSR: 8000000000009032 <EE,ME,IR,DR> CR: 24000428 XER: 20000010 DAR: 0000000000000000, DSISR: 0000000042000000 TASK = c000000f565f6900[4522] 'a.out' THREAD: c000000f4c260000 CPU: 29 GPR00: 0000000000000000 c000000f4c2638e0 d0000000127283e0 c000000e0d6f6f00 GPR04: c000000de5ea1700 0000000000000000 c000000de5ea1838 0000000000000001 GPR08: 0000000000000720 0000000000000000 2b94515100000000 0000000000003000 GPR12: d0000000136b27f0 c000000001006d80 000000001012b3dc 0000000000000000 GPR16: 0000000000000000 0000000010129c58 0000000010129bf8 000000001012b948 GPR20: 0000000000000000 000000001012daf0 c000000f4b772480 00000000400466af GPR24: c000000e046d33f8 0000000000000000 c000000e0d6f6f00 d00000000cc74b48 GPR28: c000000de5ea1700 d00000000cc74b48 d000000012726ee0 c000000f4c2638e0 NIP [d000000012701940] .mdc_getattr+0x80/0x3d0 [mdc] LR [d0000000136166ec] .ll_get_mdt_idx+0x1ac/0x8d0 [lustre] Call Trace: [c000000f4c2638e0] [c000000f4c263990] 0xc000000f4c263990 (unreliable) 
[c000000f4c263990] [d0000000136166ec] .ll_get_mdt_idx+0x1ac/0x8d0 [lustre] [c000000f4c263a60] [d0000000136234c4] .ll_dir_ioctl+0x1d14/0x8080 [lustre] [c000000f4c263c00] [c0000000001d7f24] .vfs_ioctl+0x54/0x140 [c000000f4c263c90] [c0000000001d8170] .do_vfs_ioctl+0x90/0x7c0 [c000000f4c263d80] [c0000000001d8954] .SyS_ioctl+0xb4/0xd0 [c000000f4c263e30] [c000000000008564] syscall_exit+0x0/0x40 Instruction dump: 7fa85840 41dd02e0 eb7e8028 801b0000 780907e1 41820014 e93e8030 80090000 7809ffe3 408201fc 38000000 7f43d378 <f8190000> 48012c3d e8410028 e89e8048
Source code information:
(gdb) l *(mdc_getattr+0x80) 0x11940 is in mdc_getattr (/builddir/build/BUILD/lustre-2.3.58/lustre/mdc/mdc_request.c:211). 206 /builddir/build/BUILD/lustre-2.3.58/lustre/mdc/mdc_request.c: No such file or directory. in /builddir/build/BUILD/lustre-2.3.58/lustre/mdc/mdc_request.c (gdb)
204 int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, 205 struct ptlrpc_request **request) 206 { 207 struct ptlrpc_request *req; 208 int rc; 209 ENTRY; 210 211 *request = NULL; 212 req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); 213 if (req == NULL) 214 RETURN(-ENOMEM);