<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:34:55 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10419] LFSCK fails to start, hangs systems. </title>
                <link>https://jira.whamcloud.com/browse/LU-10419</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We do OSS failover, trigger LFSCK:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl lfsck_start -M soaked-MDT0000 -s 1000 -t all -A&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The lfsck start hangs, lfsck is not started, the clients wedge in state &apos;comp&apos;, and the entire system wedges. I have dumped Lustre logs from all MDS nodes (attached). I have crash-dumped all the MDT nodes, and the dumps are available on Spirit. lfsck_layout is unkillable. &lt;/p&gt;</description>
                <environment>Soak performance cluster - Lustre version=2.10.2_4_gb151f34</environment>
        <key id="49947">LU-10419</key>
            <summary>LFSCK fails to start, hangs systems. </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 20 Dec 2017 19:17:45 +0000</created>
                <updated>Wed, 1 Aug 2018 21:13:47 +0000</updated>
                            <resolved>Thu, 14 Jun 2018 03:55:08 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.10.2</version>
                    <version>Lustre 2.10.3</version>
                                    <fixVersion>Lustre 2.12.0</fixVersion>
                    <fixVersion>Lustre 2.10.5</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="216890" author="jgmitter" created="Wed, 20 Dec 2017 20:56:35 +0000"  >&lt;p&gt;Assigning this to Fan Yong so that it is in his queue when he returns from vacation.&lt;/p&gt;</comment>
                            <comment id="217679" author="gerrit" created="Mon, 8 Jan 2018 08:14:01 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30768&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30768&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: no delay for notify RPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d8827a8ce44db121f80223dc7189e32f5bf3fd45&lt;/p&gt;</comment>
                            <comment id="217916" author="cliffw" created="Wed, 10 Jan 2018 19:26:58 +0000"  >&lt;p&gt;We are hitting this issue on b2_10.3 RC1 - need the patch ported over there&lt;/p&gt;</comment>
                            <comment id="217945" author="gerrit" created="Thu, 11 Jan 2018 02:38:00 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30831&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30831&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: no delay for notify RPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e20f5d402a8a6547544b33d4890b6910e7cf9f95&lt;/p&gt;</comment>
                            <comment id="218461" author="cliffw" created="Wed, 17 Jan 2018 22:22:45 +0000"  >&lt;p&gt;The patch for master is way out of date, uses an old kernel, and contains old bugs (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10459&quot; title=&quot;LBUG o2iblnd_cb.c:991:kiblnd_check_sends_locked()) ASSERTION( conn-&amp;gt;ibc_nsends_posted &amp;lt;= conn-&amp;gt;ibc_queue_depth ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10459&quot;&gt;&lt;del&gt;LU-10459&lt;/del&gt;&lt;/a&gt;). Can you move the patch to the tip of current master, so that it is testable? &lt;/p&gt;</comment>
                            <comment id="218476" author="yong.fan" created="Thu, 18 Jan 2018 03:45:07 +0000"  >&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/30768/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/30768/&lt;/a&gt; set 2 is against the latest master, here is the Jenkins build:&lt;br/&gt;
&lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/53764/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/53764/&lt;/a&gt; &lt;/p&gt;</comment>
                            <comment id="219073" author="gerrit" created="Thu, 25 Jan 2018 04:46:19 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30768/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30768/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: no delay for notify RPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 39816213632cf9083530f1a8b644459d13e3c980&lt;/p&gt;</comment>
                            <comment id="219100" author="pjones" created="Thu, 25 Jan 2018 04:55:36 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="221590" author="cliffw" created="Fri, 23 Feb 2018 17:55:58 +0000"  >&lt;p&gt;Seeing this again on a DNE-enabled system. version=2.10.57_58_gf24340c&lt;br/&gt;
I can crash dump systems if desired&lt;/p&gt;</comment>
                            <comment id="221621" author="yong.fan" created="Sat, 24 Feb 2018 07:41:21 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=cliffw&quot; class=&quot;user-hover&quot; rel=&quot;cliffw&quot;&gt;cliffw&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Where can I get related logs?&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="221831" author="cliffw" created="Tue, 27 Feb 2018 17:18:22 +0000"  >&lt;p&gt;Logs are on spirit /scratch/logs/syslogs and /scratch/logs/console. The crash dumps are in /scratch/dumps on spirit. &lt;/p&gt;</comment>
                            <comment id="222032" author="gerrit" created="Thu, 1 Mar 2018 06:32:37 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31475&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31475&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: skip dead target&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: be9f2eedf5039fa6308460aca6a84daa6b8003b1&lt;/p&gt;</comment>
                            <comment id="222630" author="cliffw" created="Tue, 6 Mar 2018 21:11:18 +0000"  >&lt;p&gt;With the current patch, lfsck does not stop. Currently also having mount timeouts. I have crashed dumped soak-8 while lfsck was hanging, logs are available on spirit.&lt;br/&gt;
/scratch/dumps/soak-8.spirit.hpdd.intel.com/10.10.1.108-2018-03-06-19:16:47&lt;/p&gt;</comment>
                            <comment id="222834" author="gerrit" created="Thu, 8 Mar 2018 17:36:02 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31475/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31475/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: skip dead target&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 012834c5e7c7be50ff117cee4ac473d7fee4294d&lt;/p&gt;</comment>
                            <comment id="222846" author="pjones" created="Thu, 8 Mar 2018 17:41:44 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="222905" author="gerrit" created="Fri, 9 Mar 2018 00:19:54 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31600&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31600&lt;/a&gt;&lt;br/&gt;
Subject: Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: skip dead target&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1387fa1c012dfdf5eb4f90efeb06edd45788064f&lt;/p&gt;</comment>
                            <comment id="222906" author="gerrit" created="Fri, 9 Mar 2018 00:20:07 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31600/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31600/&lt;/a&gt;&lt;br/&gt;
Subject: Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: skip dead target&quot;&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9ba637b8949b1b8a5f2506e654a9b62d5c0cc245&lt;/p&gt;</comment>
                            <comment id="223480" author="yong.fan" created="Tue, 13 Mar 2018 07:20:17 +0000"  >&lt;blockquote&gt;

&lt;p&gt;With the current patch, lfsck does not stop. Currently also having mount timeouts. I have crashed dumped soak-8 while lfsck was hanging, logs are available on spirit.&lt;br/&gt;
/scratch/dumps/soak-8.spirit.hpdd.intel.com/10.10.1.108-2018-03-06-19:16:47&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;The LFSCK master engine was blocked when sending an&#160;OUT_ATTR_GET RPC to MDT2, which may be offline or in recovery. We expect lfsck_stop() to wake up the blocked LFSCK engines and make them exit, but we only signal (SIGINT)&#160;the LFSCK assistant engines and forget to do that&#160;for the LFSCK master engine.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;So the trouble is not related to the patch &lt;a href=&quot;https://review.whamcloud.com/31475/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31475/&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I will make another patch to notify the master engine when lfsck_stop() is called.&lt;/p&gt;</comment>
                            <comment id="223481" author="gerrit" created="Tue, 13 Mar 2018 07:20:43 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31627&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31627&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: single master engine when stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e3e7d1a41711cfb0a12b941a88bf8c0bf3b4cc89&lt;/p&gt;</comment>
                            <comment id="229534" author="gerrit" created="Thu, 14 Jun 2018 03:53:37 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31627/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31627/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: signal master engine when stop&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 1ece380412efd5dba2a8c345830f0456a4922301&lt;/p&gt;</comment>
                            <comment id="231245" author="gerrit" created="Wed, 1 Aug 2018 16:34:41 +0000"  >&lt;p&gt;John L. Hammond (jhammond@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30831/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30831/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10419&quot; title=&quot;LFSCK fails to start, hangs systems. &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10419&quot;&gt;&lt;del&gt;LU-10419&lt;/del&gt;&lt;/a&gt; lfsck: no delay for notify RPC&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 9fef9ad10b26a4338c22105e66308ead5408173e&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="48483">LU-10036</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="29030" name="soak-10.lustre.log.gz" size="2692934" author="cliffw" created="Wed, 20 Dec 2017 19:17:40 +0000"/>
                            <attachment id="29029" name="soak-11.lustre.log.gz" size="2323326" author="cliffw" created="Wed, 20 Dec 2017 19:17:39 +0000"/>
                            <attachment id="29032" name="soak-8.lustre.log.gz" size="2247048" author="cliffw" created="Wed, 20 Dec 2017 19:17:40 +0000"/>
                            <attachment id="29031" name="soak-9.lustre.log.gz" size="2445080" author="cliffw" created="Wed, 20 Dec 2017 19:17:39 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzpz3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>