Details
-
Bug
-
Resolution: Fixed
-
Major
-
Lustre 2.0.0
-
None
-
3
-
4756
Description
At both TGCC and Tera100 we have recently experienced 3 different LBUGs of the same kind/family :
_ "(genops.c:757:class_export_put()) ASSERTION(cfs_atomic_read(&exp->exp_refcount) < 0x5a5a5a) failed", on a TGCC MDS.
_ "(genops.c:911:class_import_get()) ASSERTION(cfs_atomic_read(&import->imp_refcount) < 0x5a5a5a) failed", on a T100 Client.
_ "(genops.c:925:class_import_put()) ASSERTION(cfs_atomic_read(&imp->imp_refcount) < 0x5a5a5a) failed", on another T100 Client.
In each case, I have been able to confirm that the xxx_refcount value triggering the assert was valid and not poisoned: it simply reflected a huge number of references (at least 0x5a5a5a, i.e. about 5.9 million) due to high/slow activity.
Looking at the relevant source code, it seems that all 3 of these assertions/LBUGs, and also 2 other ones:
_ lustre/include/lustre_log.h:449 LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < 0x5a5a5a);
_ lustre/obdclass/llog_obd.c:139 LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < 0x5a5a5a);
are wrong with respect to the 4 cfs_atomic_t variables/fields they check, and should at least be changed as per the following patch:
============================================================
[root@curie1 lustre-2.0.0.1] # diff -urN lustre/include/lustre_log.h lustre/include/lustre_log.h.bfi
--- lustre/include/lustre_log.h 2010-08-04 13:13:04.000000000 +0200
+++ lustre/include/lustre_log.h.bfi 2011-12-13 14:38:56.071839517 +0100
@@ -446,7 +446,7 @@
if (ctxt == NULL)
return;
LASSERT(cfs_atomic_read(&ctxt->loc_refcount) > 0);
- LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < 0x5a5a5a);
+ LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < LI_POISON);
CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
cfs_atomic_read(&ctxt->loc_refcount) - 1);
__llog_ctxt_put(ctxt);
[root@curie1 lustre-2.0.0.1] # diff -urN lustre/obdclass/genops.c lustre/obdclass/genops.c.bfi
--- lustre/obdclass/genops.c 2010-08-04 13:13:03.000000000 +0200
+++ lustre/obdclass/genops.c.bfi 2011-12-13 14:39:46.961868491 +0100
@@ -754,7 +754,7 @@
CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
cfs_atomic_read(&exp->exp_refcount) - 1);
LASSERT(cfs_atomic_read(&exp->exp_refcount) > 0);
- LASSERT(cfs_atomic_read(&exp->exp_refcount) < 0x5a5a5a);
+ LASSERT(cfs_atomic_read(&exp->exp_refcount) < LI_POISON);
if (cfs_atomic_dec_and_test(&exp->exp_refcount)) {
LASSERT(!cfs_list_empty(&exp->exp_obd_chain));
@@ -908,7 +908,7 @@
struct obd_import *class_import_get(struct obd_import *import)
{
LASSERT(cfs_atomic_read(&import->imp_refcount) >= 0);
- LASSERT(cfs_atomic_read(&import->imp_refcount) < 0x5a5a5a);
+ LASSERT(cfs_atomic_read(&import->imp_refcount) < LI_POISON);
cfs_atomic_inc(&import->imp_refcount);
CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
cfs_atomic_read(&import->imp_refcount),
@@ -922,7 +922,7 @@
ENTRY;
LASSERT(cfs_atomic_read(&imp->imp_refcount) > 0);
- LASSERT(cfs_atomic_read(&imp->imp_refcount) < 0x5a5a5a);
+ LASSERT(cfs_atomic_read(&imp->imp_refcount) < LI_POISON);
LASSERT(cfs_list_empty(&imp->imp_zombie_chain));
CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
[root@curie1 lustre-2.0.0.1] # diff -urN lustre/obdclass/llog_obd.c lustre/obdclass/llog_obd.c.bfi
--- lustre/obdclass/llog_obd.c 2010-08-04 13:13:03.000000000 +0200
+++ lustre/obdclass/llog_obd.c.bfi 2011-12-13 14:40:13.921587979 +0100
@@ -136,7 +136,7 @@
/*
* Banlance the ctxt get when calling llog_cleanup()
*/
- LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < 0x5a5a5a);
+ LASSERT(cfs_atomic_read(&ctxt->loc_refcount) < LI_POISON);
LASSERT(cfs_atomic_read(&ctxt->loc_refcount) > 1);
llog_ctxt_put(ctxt);
[root@curie1 lustre-2.0.0.1] #
============================================================
and may need to be enhanced by also checking that other fields in the same struct are not poisoned.
Attachments
Issue Links
- is duplicated by
-
LU-1765 client LBUGs with ASSERTION(atomic_read(&import->imp_refcount) < 0x5a5a5a) failed
- Resolved
- Trackbacks
-
Changelog 2.1 Changes from version 2.1.0 to version 2.1.1 Server support for kernels: 2.6.18-274.12.1.el5 (RHEL5) 2.6.32-220.el6 (RHEL6) Client support for unpatched kernels: 2.6.18-274.12.1.el5 (RHEL5) 2.6.32-220.el6 (RHEL6) 2.6.32.360....
-
Changelog 2.2 version 2.2.0 Support for networks: o2iblnd OFED 1.5.4 Server support for kernels: 2.6.32-220.4.2.el6 (RHEL6) Client support for unpatched kernels: 2.6.18-274.18.1.el5 (RHEL5) 2.6.32-220.4.2.el6 (RHEL6) 2.6.32.360....