<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:19:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
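<!--
A minimal illustration of the 'field' parameter described above. The URL shape below is an
assumption (the standard JIRA XML issue view endpoint), not taken from this document:
requesting only the issue key and summary for this issue might look like
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-8663/LU-8663.xml?field=key&field=summary
-->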
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8663] LustreError: 106058:0:(hash.c:554:cfs_hash_bd_del_locked()) ASSERTION( bd-&gt;bd_bucket-&gt;hsb_count &gt; 0 ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-8663</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While the filesystem was mounted and active, we began power cycling OSSs to verify that failover worked properly.&lt;/p&gt;

&lt;p&gt;Several OSS nodes crashed, with traces like this one:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2016-09-30 14:40:14 [11785.975211] BUG: unable to handle kernel paging request at 00000000deadbeef
2016-09-30 14:40:14 [11785.984272] IP: [&amp;lt;ffffffff81334259&amp;gt;] memset+0x9/0xb0
2016-09-30 14:40:14 [11785.986998] LustreError: 106058:0:(hash.c:554:cfs_hash_bd_del_locked()) ASSERTION( bd-&amp;gt;bd_bucket-&amp;gt;hsb_count &amp;gt; 0 ) failed:
2016-09-30 14:40:14 [11785.986999] LustreError: 106058:0:(hash.c:554:cfs_hash_bd_del_locked()) LBUG
2016-09-30 14:40:14 [11785.987000] Pid: 106058, comm: ldlm_bl_10
2016-09-30 14:40:14 [11785.987000]
2016-09-30 14:40:14 [11786.490434] Call Trace:
2016-09-30 14:40:14 [11786.493875]  [&amp;lt;ffffffffa0d2e00d&amp;gt;] ? ofd_lvbo_free+0x4d/0xe0 [ofd]
2016-09-30 14:40:14 [11786.501409]  [&amp;lt;ffffffffa1099643&amp;gt;] ldlm_resource_putref_locked+0x133/0x430 [ptlrpc]
2016-09-30 14:40:14 [11786.510593]  [&amp;lt;ffffffffa1099952&amp;gt;] ldlm_res_hop_put_locked+0x12/0x20 [ptlrpc]
2016-09-30 14:40:14 [11786.519183]  [&amp;lt;ffffffffa08d1b74&amp;gt;] cfs_hash_for_each_relax+0x1b4/0x3d0 [libcfs]
2016-09-30 14:40:14 [11786.527976]  [&amp;lt;ffffffffa1096d60&amp;gt;] ? cleanup_resource+0x370/0x370 [ptlrpc]
2016-09-30 14:40:14 [11786.536291]  [&amp;lt;ffffffffa1096d60&amp;gt;] ? cleanup_resource+0x370/0x370 [ptlrpc]
2016-09-30 14:40:14 [11786.544587]  [&amp;lt;ffffffffa08d4dc5&amp;gt;] cfs_hash_for_each_nolock+0x75/0x1c0 [libcfs]
2016-09-30 14:40:14 [11786.553382]  [&amp;lt;ffffffffa1094eb0&amp;gt;] ldlm_namespace_cleanup+0x30/0xc0 [ptlrpc]
2016-09-30 14:40:14 [11786.561886]  [&amp;lt;ffffffffa1095d5f&amp;gt;] __ldlm_namespace_free+0x5f/0x5c0 [ptlrpc]
2016-09-30 14:40:14 [11786.570385]  [&amp;lt;ffffffffa0c660e4&amp;gt;] ? lfsck_instance_find+0x74/0xb0 [lfsck]
2016-09-30 14:40:14 [11786.578678]  [&amp;lt;ffffffff8169d015&amp;gt;] ? mutex_lock+0x25/0x42
2016-09-30 14:40:14 [11786.585329]  [&amp;lt;ffffffffa0c6a0a8&amp;gt;] ? lfsck_stop+0x1b8/0x4f0 [lfsck]
2016-09-30 14:40:14 [11786.592951]  [&amp;lt;ffffffff811e5fd6&amp;gt;] ? kmem_cache_alloc_trace+0x226/0x250
2016-09-30 14:40:14 [11786.600978]  [&amp;lt;ffffffffa109631a&amp;gt;] ldlm_namespace_free_prior+0x5a/0x210 [ptlrpc]
2016-09-30 14:40:14 [11786.609869]  [&amp;lt;ffffffffa0d1089a&amp;gt;] ofd_device_fini+0x8a/0x2a0 [ofd]
2016-09-30 14:40:14 [11786.617527]  [&amp;lt;ffffffffa0a0a21c&amp;gt;] class_cleanup+0x8dc/0xd70 [obdclass]
2016-09-30 14:40:14 [11786.625561]  [&amp;lt;ffffffffa0a0cbfc&amp;gt;] class_process_config+0x1e2c/0x2f70 [obdclass]
2016-09-30 14:40:14 [11786.634454]  [&amp;lt;ffffffff811e5a63&amp;gt;] ? __kmalloc+0x233/0x280
2016-09-30 14:40:14 [11786.641219]  [&amp;lt;ffffffffa0a0611b&amp;gt;] ? lustre_cfg_new+0x8b/0x400 [obdclass]
2016-09-30 14:40:14 [11786.649424]  [&amp;lt;ffffffffa0a0de2f&amp;gt;] class_manual_cleanup+0xef/0x810 [obdclass]
2016-09-30 14:40:14 [11786.658007]  [&amp;lt;ffffffffa0a3fece&amp;gt;] server_put_super+0x8de/0xcd0 [obdclass]
2016-09-30 14:40:14 [11786.666272]  [&amp;lt;ffffffff81209572&amp;gt;] generic_shutdown_super+0x72/0xf0
2016-09-30 14:40:14 [11786.673841]  [&amp;lt;ffffffff81209942&amp;gt;] kill_anon_super+0x12/0x20
2016-09-30 14:40:14 [11786.680720]  [&amp;lt;ffffffffa0a11592&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
2016-09-30 14:40:14 [11786.688840]  [&amp;lt;ffffffff81209cf9&amp;gt;] deactivate_locked_super+0x49/0x60
2016-09-30 14:40:14 [11786.696457]  [&amp;lt;ffffffff8120a2f6&amp;gt;] deactivate_super+0x46/0x60
2016-09-30 14:40:14 [11786.703375]  [&amp;lt;ffffffff812282c5&amp;gt;] mntput_no_expire+0xc5/0x120
2016-09-30 14:40:14 [11786.710372]  [&amp;lt;ffffffff81229440&amp;gt;] SyS_umount+0xa0/0x3b0
2016-09-30 14:40:14 [11786.716770]  [&amp;lt;ffffffff816aa4c9&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-2.8.0_0.0.llnlpreview.41-2.ch6.x86_64</environment>
        <key id="40242">LU-8663</key>
            <summary>LustreError: 106058:0:(hash.c:554:cfs_hash_bd_del_locked()) ASSERTION( bd-&gt;bd_bucket-&gt;hsb_count &gt; 0 ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Mon, 3 Oct 2016 06:46:38 +0000</created>
                <updated>Thu, 16 Feb 2017 19:46:56 +0000</updated>
                            <resolved>Sun, 18 Dec 2016 13:55:14 +0000</resolved>
                <due></due>
                <votes>0</votes>
                <watches>7</watches>
                    <comments>
                            <comment id="168015" author="ofaaland" created="Mon, 3 Oct 2016 06:48:00 +0000"  >&lt;p&gt;Looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7157&quot; title=&quot;sanity test_27z: cfs_hash_bd_del_locked()) ASSERTION( bd-&amp;gt;bd_bucket-&amp;gt;hsb_count &amp;gt; 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7157&quot;&gt;LU-7157&lt;/a&gt; but I wasn&apos;t sure whether to put notes from a production system issue into a ticket about a test failure.&lt;/p&gt;</comment>
                            <comment id="168016" author="ofaaland" created="Mon, 3 Oct 2016 06:54:17 +0000"  >&lt;p&gt;Console output preceeding this assertion:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
2016-09-30 14:39:50 [11762.576750] Lustre: 99059:0:(client.c:2063:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1475271484/real 1475271484]  req@ffff881782684500 x1546922312654596/t0(0) o400-&amp;gt;MGC172.19.3.1@o2ib600@172.19.3.1@o2ib600:26/25 lens 224/224 e 0 to 1 dl 1475271590 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
2016-09-30 14:39:50 [11762.611814] Lustre: 99059:0:(client.c:2063:ptlrpc_expire_one_request()) Skipped 5 previous similar messages
2016-09-30 14:39:50 [11762.623781] LustreError: 166-1: MGC172.19.3.1@o2ib600: Connection to MGS (at 172.19.3.1@o2ib600) was lost; in progress operations using this service will fail
2016-09-30 14:40:13 [11785.220067] LNetError: 94499:0:(o2iblnd_cb.c:3134:kiblnd_check_txs_locked()) Timed out tx: tx_queue, 3 seconds
2016-09-30 14:40:13 [11785.232466] LNetError: 94499:0:(o2iblnd_cb.c:3197:kiblnd_check_conns()) Timed out RDMA with 172.19.3.1@o2ib600 (53): c: 0, oc: 0, rc: 8
2016-09-30 14:40:13 [11785.249479] Lustre: Failing over lsh-OST0002
2016-09-30 14:40:13 [11785.266567] Lustre: lsh-OST0002: Not available for connect from 192.168.136.240@o2ib27 (stopping)
2016-09-30 14:40:13 [11785.279740] LustreError: 124427:0:(ldlm_lockd.c:2368:ldlm_cancel_handler()) ldlm_cancel from 192.168.135.99@o2ib27 arrived at 1475271613 with bad export cookie 9157528843916976698
2016-09-30 14:40:13 [11785.301432] LustreError: 124427:0:(ldlm_lock.c:2597:ldlm_lock_dump_handle()) ### ### ns: filter-lsh-OST0002_UUID lock: ffff881f8fe6ea00/0x7f1613f702807cf5 lrc: 3/0,0 mode: PW/PW res: [0xc0466a:0x0:0x0].0x0 rrc: 1 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) flags: 0x40000000000000 nid: 192.168.135.99@o2ib27 remote: 0x320df3ff8e6289ab expref: 1777 pid: 99124 timeout: 0 lvb_type: 0
2016-09-30 14:40:14 [11785.787090] Lustre: lsh-OST0002: Not available for connect from 192.168.134.161@o2ib27 (stopping)
2016-09-30 14:40:14 [11785.791589] LustreError: 104316:0:(ldlm_lockd.c:2368:ldlm_cancel_handler()) ldlm_cancel from 192.168.134.141@o2ib27 arrived at 1475271614 with bad export cookie 9157528843916977104
2016-09-30 14:40:14 [11785.791591] LustreError: 104316:0:(ldlm_lockd.c:2368:ldlm_cancel_handler()) Skipped 2 previous similar messages
2016-09-30 14:40:14 [11785.834020] Lustre: Skipped 74 previous similar messages
2016-09-30 14:40:14 [11785.975211] BUG: unable to handle kernel paging request at 00000000deadbeef
2016-09-30 14:40:14 [11785.984272] IP: [&amp;lt;ffffffff81334259&amp;gt;] memset+0x9/0xb0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="168017" author="ofaaland" created="Mon, 3 Oct 2016 06:55:45 +0000"  >&lt;p&gt;O/S on the OSS is RHEL 7.3 derivative.&lt;/p&gt;</comment>
                            <comment id="168020" author="bfaccini" created="Mon, 3 Oct 2016 08:05:46 +0000"  >&lt;p&gt;Olaf, do you mean umount OSTs by power-cycling OSSs ?&lt;br/&gt;
And yes it definitely looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7157&quot; title=&quot;sanity test_27z: cfs_hash_bd_del_locked()) ASSERTION( bd-&amp;gt;bd_bucket-&amp;gt;hsb_count &amp;gt; 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7157&quot;&gt;LU-7157&lt;/a&gt; which seems still to be investigated.&lt;/p&gt;</comment>
                            <comment id="168042" author="ofaaland" created="Mon, 3 Oct 2016 14:01:06 +0000"  >&lt;p&gt;Bruno,&lt;br/&gt;
I&apos;ll have to get the sysadmin to describe what he was doing in more detail. You&apos;re right, there&apos;s SyS_umount at the bottom of the stack, so he must have been running umount; possibly HA did this on his behalf.&lt;/p&gt;</comment>
                            <comment id="168048" author="charr" created="Mon, 3 Oct 2016 14:53:04 +0000"  >&lt;p&gt;It&apos;s difficult to say what was happening at the time of the crash as I was working on sibling OSSs and rebooting a couple of them several times. I believe the MDS node was rebooted as well. When I looked at the status of the other OSS nodes (which had been up and healthy), I noticed these crashes.&lt;/p&gt;</comment>
                            <comment id="168082" author="pjones" created="Mon, 3 Oct 2016 17:17:12 +0000"  >&lt;p&gt;Oleg is looking into this&lt;/p&gt;</comment>
                            <comment id="168183" author="green" created="Tue, 4 Oct 2016 15:07:59 +0000"  >&lt;p&gt;I believe this is a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6304&quot; title=&quot;crash on umount in cleanup_resource&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6304&quot;&gt;&lt;del&gt;LU-6304&lt;/del&gt;&lt;/a&gt; and the patch is here: &lt;a href=&quot;http://review.whamcloud.com/13908&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/13908&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="178308" author="pjones" created="Sun, 18 Dec 2016 13:55:14 +0000"  >&lt;p&gt;This is believed to be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6304&quot; title=&quot;crash on umount in cleanup_resource&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6304&quot;&gt;&lt;del&gt;LU-6304&lt;/del&gt;&lt;/a&gt; which has been landed for 2.8.1&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                    <issuelinktype id="10010">
                        <name>Duplicate</name>
                        <outwardlinks description="duplicates">
                            <issuelink>
                                <issuekey id="28881">LU-6304</issuekey>
                            </issuelink>
                        </outwardlinks>
                    </issuelinktype>
                </issuelinks>
                <attachments>
                </attachments>
                <subtasks>
                </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyq8v:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>