Details
-
Bug
-
Resolution: Fixed
-
Critical
-
None
-
Lustre 2.2.0
-
[root@n-mds1 ~]# cat /proc/fs/lustre/version
lustre: 2.2.0
kernel: patchless_client
build: 2.2.0-RC2--PRISTINE-2.6.32-220.4.2.el6_lustre.x86_64
[root@n-mds1 ~]# uname -r
2.6.32-220.4.2.el6_lustre.x86_64
[root@n-mds1 ~]# rpm -qa|grep lustre
lustre-ldiskfs-3.3.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
lustre-2.2.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
kernel-firmware-2.6.32-220.4.2.el6_lustre.x86_64
lustre-modules-2.2.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
kernel-headers-2.6.32-220.4.2.el6_lustre.x86_64
kernel-2.6.32-220.4.2.el6_lustre.x86_64
kernel-devel-2.6.32-220.4.2.el6_lustre.x86_64
[root@n-mds1 ~]# cat /proc/fs/lustre/version
lustre: 2.2.0
kernel: patchless_client
build: 2.2.0-RC2--PRISTINE-2.6.32-220.4.2.el6_lustre.x86_64
[root@n-mds1 ~]# uname -r
2.6.32-220.4.2.el6_lustre.x86_64
[root@n-mds1 ~]# rpm -qa|grep lustre
lustre-ldiskfs-3.3.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
lustre-2.2.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
kernel-firmware-2.6.32-220.4.2.el6_lustre.x86_64
lustre-modules-2.2.0-2.6.32_220.4.2.el6_lustre.x86_64.x86_64
kernel-headers-2.6.32-220.4.2.el6_lustre.x86_64
kernel-2.6.32-220.4.2.el6_lustre.x86_64
kernel-devel-2.6.32-220.4.2.el6_lustre.x86_64
Description
We recently experienced two MDS crashes on our Lustre installation.
I've attached the netconsole output of both crashes (that's all I got: there is nothing in the syslog, and I wasn't able to take a screenshot of the console output because the crashed MDS had already been power-cycled by its failover partner).
No, I've got what I need, thanks.
A bit of analysis here:
0000000000004d26 <osd_trans_stop+0x56> mov 0x50(%rbx),%r12
0000000000004d2a <osd_trans_stop+0x5a> test %r12,%r12
0000000000004d2d <osd_trans_stop+0x5d> je 0000000000004e82 <osd_trans_stop+0x1b2>
0000000000004d33 <osd_trans_stop+0x63> movzbl 0x28(%r12),%eax
0000000000004d39 <osd_trans_stop+0x69> movzbl 0x4c(%rbx),%edx
0000000000004d3d <osd_trans_stop+0x6d> and $0xfffffffe,%eax
0000000000004d40 <osd_trans_stop+0x70> and $0x1,%edx
0000000000004d43 <osd_trans_stop+0x73> or %edx,%eax
0000000000004d45 <osd_trans_stop+0x75> mov %al,0x28(%r12)
0000000000004d4a <osd_trans_stop+0x7a> mov (%r12),%rax
So rbx contains a pointer to oh:
(gdb) p/x sizeof(struct thandle)
$2 = 0x50
struct osd_thandle {
struct thandle ot_super;
handle_t *ot_handle;
0000000000004db3 <osd_trans_stop+0xe3> mov (%rbx),%rax
0000000000004db6 <osd_trans_stop+0xe6> test %rax,%rax
0000000000004db9 <osd_trans_stop+0xe9> je 0000000000004dc4 <osd_trans_stop+0xf4>
0000000000004dbb <osd_trans_stop+0xeb> mov 0x8(%rax),%rax
0000000000004dbf <osd_trans_stop+0xef> testb $0x1,(%rax)
These lines implement:
if (lu_device_is_md(&th->th_dev->dd_lu_dev)) {
RAX: 0006000100000002 is supposed to be ld_type (and 0x8(%rax) is ld_type->ldt_tags)
IOW, thandle was broken and pointing to garbage instead of a device.
Now the question is what broke that.