<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:13:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8017] All Nodes report NOT HEALTHY, system is healthy</title>
                <link>https://jira.whamcloud.com/browse/LU-8017</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Current build installed; &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/38245/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/38245/&lt;/a&gt;&lt;br/&gt;
This issue has persisted for the last two builds. &lt;br/&gt;
After mounting the filesystem, all nodes report NOT HEALTHY in /proc/fs/lustre/health_check. &lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;pdsh -g server &apos;lctl get_param health_check&apos; |dshbak -c&lt;br/&gt;
----------------&lt;br/&gt;
lola-&lt;span class=&quot;error&quot;&gt;&amp;#91;2-11&amp;#93;&lt;/span&gt;&lt;br/&gt;
----------------&lt;br/&gt;
health_check=healthy&lt;br/&gt;
NOT HEALTHY&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;The filesystem otherwise operates normally, jobs run, results are created.&lt;br/&gt;
We were using the health_check as part of our monitoring - this has been discontinued. &lt;br/&gt;
We are uncertain as to the cause, as all operations we can test work fine, and no errors are reported. &lt;/p&gt;</description>
                <environment></environment>
        <key id="36123">LU-8017</key>
            <summary>All Nodes report NOT HEALTHY, system is healthy</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="simmonsja">James A Simmons</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 13 Apr 2016 17:51:15 +0000</created>
                <updated>Wed, 7 Dec 2016 20:51:31 +0000</updated>
                            <resolved>Tue, 7 Jun 2016 21:07:54 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="148769" author="adilger" created="Wed, 13 Apr 2016 18:03:34 +0000"  >&lt;p&gt;This looks like a bug introduced by &lt;a href=&quot;http://review.whamcloud.com/16933&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16933&lt;/a&gt; &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6215&quot; title=&quot;Sync Lustre external tree with lustre linux kernel client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6215&quot;&gt;&lt;del&gt;LU-6215&lt;/del&gt;&lt;/a&gt; lprocfs: handle seq_printf api change&quot;.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lustre/obdclass/linux/linux-module.c
@@ -275,7 +277,7 @@ &lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; obd_proc_health_seq_show(struct seq_file *m, void *data)
        read_unlock(&amp;amp;obd_dev_lock);
 
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (healthy)
-               &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; seq_printf(m, &lt;span class=&quot;code-quote&quot;&gt;&quot;healthy\n&quot;&lt;/span&gt;);
+               seq_puts(m, &lt;span class=&quot;code-quote&quot;&gt;&quot;healthy\n&quot;&lt;/span&gt;);
 
        seq_printf(m, &lt;span class=&quot;code-quote&quot;&gt;&quot;NOT HEALTHY\n&quot;&lt;/span&gt;);
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It should still have returned after printing &quot;healthy&quot; instead of continuing to &quot;NOT HEALTHY&quot;.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (healthy) {
               seq_puts(m, &lt;span class=&quot;code-quote&quot;&gt;&quot;healthy\n&quot;&lt;/span&gt;);
               &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; 0;
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="148770" author="di.wang" created="Wed, 13 Apr 2016 18:05:14 +0000"  >&lt;p&gt;Even in a healthy environment, it still shows &quot;NOT HEALTHY&quot;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@testnode tests]# MDSCOUNT=4 sh llmount.sh 
Stopping clients: testnode /mnt/lustre (opts:)
Stopping clients: testnode /mnt/lustre2 (opts:)
Loading modules from /work/lustre-new/lustre-release/lustre/tests/..
........
[root@testnode tests]# cat /proc/fs/lustre/health_check 
healthy
NOT HEALTHY
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I checked the code, and it looks like a typo in the code&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;static int obd_proc_health_seq_show(struct seq_file *m, void *data)
{
    ............
        if (healthy)
                seq_puts(m, &quot;healthy\n&quot;);
                                     ---------------------------&amp;gt; probably else is missing here.
        seq_printf(m, &quot;NOT HEALTHY\n&quot;);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="148772" author="cliffw" created="Wed, 13 Apr 2016 18:08:02 +0000"  >&lt;p&gt;This also indicates a test miss, since we &lt;em&gt;should&lt;/em&gt; check health_check in auto test. &lt;/p&gt;</comment>
                            <comment id="148773" author="adilger" created="Wed, 13 Apr 2016 18:10:24 +0000"  >&lt;p&gt;I&apos;d be fine with an &quot;else&quot; also.&lt;/p&gt;

&lt;p&gt;Please also add a test that this is working properly.&lt;/p&gt;</comment>
                            <comment id="148802" author="cliffw" created="Wed, 13 Apr 2016 19:59:52 +0000"  >&lt;p&gt;I think QA team can do this. &lt;/p&gt;</comment>
                            <comment id="148856" author="simmonsja" created="Thu, 14 Apr 2016 00:47:56 +0000"  >&lt;p&gt;Oops, missed fixing up a sed change. Will fix. Sorry I didn&apos;t add a test with this patch but it is a really good idea. This way we can see if the upstream client will also behave properly. Which test should it go into?&lt;/p&gt;</comment>
                            <comment id="148860" author="gerrit" created="Thu, 14 Apr 2016 01:01:44 +0000"  >&lt;p&gt;James Simmons (uja.ornl@yahoo.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/19537&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19537&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8017&quot; title=&quot;All Nodes report NOT HEALTHY, system is healthy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8017&quot;&gt;&lt;del&gt;LU-8017&lt;/del&gt;&lt;/a&gt; obd: report correct health state of a node&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ad8fb18517e31bc3878b13319c428b27c7279c16&lt;/p&gt;</comment>
                            <comment id="148865" author="adilger" created="Thu, 14 Apr 2016 02:40:35 +0000"  >&lt;p&gt;The test can go into sanity.sh. It should be enough to have a simple test that checks for &quot;healthy&quot; visible on all nodes and all services, AND that &quot;NOT HEALTHY&quot; is not present.  &lt;/p&gt;</comment>
                            <comment id="151430" author="gerrit" created="Sun, 8 May 2016 17:40:28 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/19537/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19537/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8017&quot; title=&quot;All Nodes report NOT HEALTHY, system is healthy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8017&quot;&gt;&lt;del&gt;LU-8017&lt;/del&gt;&lt;/a&gt; obd: report correct health state of a node&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: c28933602a6971739cb5ec3a1e920409ff19b01e&lt;/p&gt;</comment>
                            <comment id="151433" author="simmonsja" created="Sun, 8 May 2016 18:20:48 +0000"  >&lt;p&gt;Patch has landed.&lt;/p&gt;</comment>
                            <comment id="151745" author="bogl" created="Tue, 10 May 2016 23:12:56 +0000"  >&lt;p&gt;running a client build that has the new additional check added in test-framework.sh by this fix on a server that has the old problem about returning incorrect health status causes failures from test-framework nearly everywhere.   do we need some version test around the health check in test-framework to avoid phony failures in interop?&lt;/p&gt;</comment>
                            <comment id="151804" author="simmonsja" created="Wed, 11 May 2016 01:28:45 +0000"  >&lt;p&gt;Do you have an example log of the failure? Also what does lctl get_param health_check show?&lt;/p&gt;</comment>
                            <comment id="151832" author="adilger" created="Wed, 11 May 2016 09:01:29 +0000"  >&lt;p&gt;Bob, which specific versions are you testing?  I thought this only affected 2.8.51 and was fixed in 2.8.52, and we typically do not test interop between point releases during development?  Was the 16933 patch backported to some maintenance branch?  In that case, the right answer is to also backport 19537 to that same branch, or any HA system that checks &lt;tt&gt;health_check&lt;/tt&gt; will fail.&lt;/p&gt;</comment>
                            <comment id="151853" author="bogl" created="Wed, 11 May 2016 14:31:30 +0000"  >&lt;p&gt;Andreas,&lt;br/&gt;
client is from master, v2.8.52 built yesterday.  It has the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8017&quot; title=&quot;All Nodes report NOT HEALTHY, system is healthy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8017&quot;&gt;&lt;del&gt;LU-8017&lt;/del&gt;&lt;/a&gt; fix.&lt;br/&gt;
servers are also from master, but older.  also v2.8.52, but don&apos;t have the fix.&lt;/p&gt;

&lt;p&gt;James,&lt;br/&gt;
example error from runtests:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;subsystem_debug=all -lnet -lnd -pinger
Setup mgs, mdt, osts
Starting mds1:   /dev/sdb /mnt/mds1
 runtests test_1: @@@@@@ FAIL: mds1 is in a unhealthy state 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:4769:error()
  = /usr/lib64/lustre/tests/test-framework.sh:1281:mount_facet()
  = /usr/lib64/lustre/tests/test-framework.sh:1344:start()
  = /usr/lib64/lustre/tests/test-framework.sh:3649:setupall()
  = /usr/lib64/lustre/tests/runtests:90:test_1()
  = /usr/lib64/lustre/tests/test-framework.sh:5033:run_one()
  = /usr/lib64/lustre/tests/test-framework.sh:5072:run_one_logged()
  = /usr/lib64/lustre/tests/test-framework.sh:4919:run_test()
  = /usr/lib64/lustre/tests/runtests:135:main()
Dumping lctl log to /tmp/test_logs/2016-05-10/154232/runtests.test_1.*.1462920188.log
Resetting fail_loc on all nodes...done.
FAIL 1 (32s)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;lctl get_param health_check on servers show:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lctl get_param health_check
health_check=healthy
NOT HEALTHY
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="151854" author="adilger" created="Wed, 11 May 2016 14:38:14 +0000"  >&lt;p&gt;Bob, in that case it wouldn&apos;t even be &lt;em&gt;possible&lt;/em&gt; to have a version check, even if we did that for development versions, since they both have the same version number. I would just update the old nodes and move on. &lt;/p&gt;</comment>
                            <comment id="151856" author="bogl" created="Wed, 11 May 2016 14:38:40 +0000"  >&lt;p&gt;I can avoid the failure by commenting out or deleting the new health check in test-framework.sh on the client.&lt;br/&gt;
Pretty sure I could also avoid the fail by installing a newer build on servers too.&lt;br/&gt;
Wasn&apos;t sure how exposed we are to hitting this sort of fail in general, in interop between master and other versions.&lt;/p&gt;

&lt;p&gt;Andreas, I will take your advice.&lt;/p&gt;</comment>
                            <comment id="151862" author="simmonsja" created="Wed, 11 May 2016 14:54:55 +0000"  >&lt;p&gt;Thankfully the window of failure with broken server version is very small.&lt;/p&gt;</comment>
                            <comment id="154978" author="simmonsja" created="Tue, 7 Jun 2016 20:43:23 +0000"  >&lt;p&gt;Shall we close this ticket again?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="36381">LU-8066</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy7xb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>