<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:10:48 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-828] Lustre Client Unstable</title>
                <link>https://jira.whamcloud.com/browse/LU-828</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Hi ,&lt;/p&gt;







&lt;p&gt;We have lustre 2.0 at our setup with 2 mds servers and 6 oss servers.&lt;/p&gt;







&lt;p&gt;Facing an issue where &quot;lfs check servers&quot; output varies from node to node.&lt;/p&gt;



&lt;p&gt;Please find below the outputs of two different clients took at the same time.&lt;/p&gt;



&lt;p&gt;Request your help in solving this issue.&lt;/p&gt;



&lt;p&gt;Also find attached var log messages of the node having this error.&lt;/p&gt;















&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root &amp;lt;at&amp;gt; cn367 ~&amp;#93;&lt;/span&gt;# lfs check servers&lt;/p&gt;



&lt;p&gt;scratch-OST0011-osc-ffff810c01a9c000 active.&lt;/p&gt;







&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root &amp;lt;at&amp;gt; mgmt00 ~&amp;#93;&lt;/span&gt;# lfs check servers&lt;/p&gt;



&lt;p&gt;error: check &apos;scratch-OST0011-osc-ffff810c056c4000&apos; Resource temporarily unavailable&lt;/p&gt;






&lt;p&gt;Thanks &amp;amp; Regards,&lt;br/&gt;
N.Chakravarthy.&lt;/p&gt;</description>
                <environment>RHEL 5.5&lt;br/&gt;
2* MDS servers, 6*OSS servers handling 3OST each (HW config: Dual Intel Westmere processor with 6 cores each, 24GB Memory)&lt;br/&gt;
no. of clients: 368</environment>
        <key id="12379">LU-828</key>
            <summary>Lustre Client Unstable</summary>
                <type id="3" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11318&amp;avatarType=issuetype">Task</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="prabhu.chakra">Chakravarthy N</reporter>
                        <labels>
                            <label>o2iblnd</label>
                    </labels>
                <created>Wed, 9 Nov 2011 00:38:55 +0000</created>
                <updated>Fri, 27 Jan 2012 18:41:05 +0000</updated>
                            <resolved>Fri, 27 Jan 2012 18:41:05 +0000</resolved>
                                    <version>Lustre 2.0.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>0</watches>
                                                                            <comments>
                            <comment id="22718" author="cliffw" created="Wed, 9 Nov 2011 00:47:19 +0000"  >&lt;p&gt;Check the system logs for the two nodes, it is quite possible for one client to be happy and a second client to be unable to reach a particular server. This can be caused by network issues or other errors Check system logs and dmesg for both clients, and the OSS involved, there should be some addition errors that will help you sort out the situation. &lt;/p&gt;</comment>
                            <comment id="22719" author="prabhu.chakra" created="Wed, 9 Nov 2011 01:32:03 +0000"  >&lt;p&gt;Thanks for your reply...&lt;/p&gt;

&lt;p&gt;I could see errors in logs as below... Does it means that it&apos;s due to mcast packet drops in infiniband network?&lt;/p&gt;

&lt;p&gt;ADDRCONF(NETDEV_UP): ib0: link is not ready&lt;br/&gt;
ib0: enabling connected mode will cause multicast packet drops&lt;br/&gt;
ib0: mtu &amp;gt; 4092 will cause multicast packet drops.&lt;br/&gt;
ib0: mtu &amp;gt; 4092 will cause multicast packet drops.&lt;br/&gt;
ib1: enabling connected mode will cause multicast packet drops&lt;br/&gt;
ib1: mtu &amp;gt; 4092 will cause multicast packet drops.&lt;br/&gt;
ib1: mtu &amp;gt; 4092 will cause multicast packet drops.&lt;/p&gt;

&lt;p&gt;LustreError: 11-0: an error occurred while communicating with 10.2.2.187@o2ib. The mds_getxattr operation failed with -95&lt;br/&gt;
LustreError: 11-0: an error occurred while communicating with 10.2.2.187@o2ib. The mds_getxattr operation failed with -95&lt;br/&gt;
LustreError: 11-0: an error occurred while communicating with 10.2.2.187@o2ib. The mds_getxattr operation failed with -95&lt;br/&gt;
LustreError: 11-0: an error occurred while communicating with 10.2.2.187@o2ib. The mds_getxattr operation failed with -95&lt;br/&gt;
LustreError: 11-0: an error occurred while communicating with 10.2.2.187@o2ib. The mds_getxattr operation failed with -95&lt;/p&gt;</comment>
                            <comment id="22723" author="prabhu.chakra" created="Wed, 9 Nov 2011 02:54:47 +0000"  >&lt;p&gt;The only difference i could see is this node has two IB interfaces configured in load balancing mode, the lnet entry is &quot;options lnet networks=o2ib0&quot;. Hope it&apos;s ok.&lt;/p&gt;</comment>
                            <comment id="22724" author="prabhu.chakra" created="Wed, 9 Nov 2011 05:05:37 +0000"  >&lt;p&gt;Just wanted to update on this issue, whenever we do &quot;ls&quot; on the lustre fs then only it&apos;s showing this error. We&apos;ve checked the same with the non problematic node as well the issue remains...&lt;/p&gt;

&lt;p&gt;Please suggest...&lt;/p&gt;</comment>
                            <comment id="23000" author="prabhu.chakra" created="Mon, 14 Nov 2011 02:06:41 +0000"  >&lt;p&gt;Appreciate your help ASAP, since the entire production is on toss...&lt;/p&gt;

&lt;p&gt;Please do the needful.&lt;/p&gt;</comment>
                            <comment id="23020" author="ravibadiger24" created="Tue, 15 Nov 2011 01:51:54 +0000"  >&lt;p&gt;Sample compute node syslog messages where issue is noticed&lt;/p&gt;</comment>
                            <comment id="23021" author="ravibadiger24" created="Tue, 15 Nov 2011 01:53:11 +0000"  >&lt;p&gt;Syslog messages of lustre MGS,MDS,OSS Server nodes for /home lustre filesystem &lt;/p&gt;</comment>
                            <comment id="23022" author="ravibadiger24" created="Tue, 15 Nov 2011 01:53:45 +0000"  >&lt;p&gt;Syslog messages of lustre MGS,MDS,OSS Server nodes for /scratch lustre filesystem &lt;/p&gt;</comment>
                            <comment id="23023" author="ravibadiger24" created="Tue, 15 Nov 2011 02:07:02 +0000"  >&lt;p&gt;Hi Cliff,&lt;/p&gt;

&lt;p&gt;I have uploaded following 3 archive files having Syslog messages of client node where these problem is noticed, and MGS,MDS,OSS server nodes messages for both /home /scratch lustre filesystems. Could you please quickly glance through the logs to see what is the cause for the issue and unexpected symptoms (like Resource temporarily unavailable and ls,cat cmd hangs etc) reported here.&lt;/p&gt;

&lt;p&gt;cn363-computenode_messages.rar&lt;br/&gt;
HOME_MESSAGES.rar&lt;br/&gt;
SCRATCH_MESSAGES.rar&lt;/p&gt;


&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root &amp;lt;at&amp;gt; cn367 ~&amp;#93;&lt;/span&gt;# lfs check servers&lt;br/&gt;
scratch-OST0011-osc-ffff810c01a9c000 active.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root &amp;lt;at&amp;gt; mgmt00 ~&amp;#93;&lt;/span&gt;# lfs check servers&lt;br/&gt;
error: check &apos;scratch-OST0011-osc-ffff810c056c4000&apos; Resource temporarily unavailable&lt;/p&gt;

&lt;p&gt;Additional symptoms are, &lt;br/&gt;
When we try to ls dir /scratch/qgr/R8 dir it stucks. And resource become unavailable.&lt;br/&gt;
We tried with ls --color=none , in this case we are able to list the files.&lt;br/&gt;
But when we do ls -l --color=none it again stucks.&lt;br/&gt;
we are not able to cat the file on which ls -l stucks others are working fine.&lt;/p&gt;

&lt;p&gt;Thanks &amp;amp; Regards&lt;br/&gt;
-Raghu&lt;/p&gt;</comment>
                            <comment id="23028" author="cliffw" created="Tue, 15 Nov 2011 10:51:05 +0000"  >&lt;p&gt;If your infiniband network is dropping packets, that would cause this issue. &lt;/p&gt;</comment>
                            <comment id="23463" author="prabhu.chakra" created="Mon, 28 Nov 2011 09:25:22 +0000"  >&lt;p&gt;Cliff,&lt;/p&gt;

&lt;p&gt;Just an update on this issue...&lt;/p&gt;

&lt;p&gt;We&apos;ve taken the downtime of the entire lustre and &quot;ls, du&quot; everything started working..&lt;/p&gt;

&lt;p&gt;To my understanding the recovery, open files and caching in the client has solved the issue.&lt;/p&gt;

&lt;p&gt;Could you please suggest some permanent solution for this issue like clearing the cache, close open files automatically without doing lfsck?&lt;/p&gt;

&lt;p&gt;Appreciate your early help on this..&lt;/p&gt;</comment>
                            <comment id="23484" author="prabhu.chakra" created="Tue, 29 Nov 2011 01:05:41 +0000"  >&lt;p&gt;Cliff,&lt;/p&gt;

&lt;p&gt;Appreciate your suggestions on this, since we are in a bad shape... Please do the needful.&lt;/p&gt;</comment>
                            <comment id="27530" author="adilger" created="Fri, 27 Jan 2012 18:40:49 +0000"  >&lt;p&gt;Since you are an unsupported customer, the only thing I can suggest is that you upgrade to the latest Lustre 2.1.0 release to determine if this is fixing your problem.  To get Whamcloud support for your system, please contact info@whamcloud.com for more information.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="10615" name="HOME_MESSAGES.rar" size="980068" author="ravibadiger24" created="Tue, 15 Nov 2011 01:53:11 +0000"/>
                            <attachment id="10616" name="SCRATCH_MESSAGES.rar" size="1487698" author="ravibadiger24" created="Tue, 15 Nov 2011 01:53:45 +0000"/>
                            <attachment id="10614" name="cn363-computenode_messages.rar" size="66965" author="ravibadiger24" created="Tue, 15 Nov 2011 01:51:54 +0000"/>
                            <attachment id="10599" name="performance.zip" size="62668" author="prabhu.chakra" created="Wed, 9 Nov 2011 00:38:55 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10040" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic</customfieldname>
                        <customfieldvalues>
                                        <label>client</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw0rb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10203</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>