<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:32:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10132] IO execvp errors 2.10 client/EE3.1.1 server</title>
                <link>https://jira.whamcloud.com/browse/LU-10132</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Our users are seeing consistent IO errors in very early attempts to scale jobs that access executables on Lustre volumes.  The errors hang the entire job.   These instances are 2.10 client and EE 3.1.1 server scenarios.  I&apos;ve been able to replicate the errors on several different filesystems.  The errors do not occur when loading the EE 3.1.1/2.7 client stack to run the same jobs.  Is there any expectation of 2.10 client compatibility with EE 3.1.1 servers?&lt;/p&gt;


&lt;p&gt;Example job:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$ mpirun -perhost 1 -np 12 -host ekf067,ekf082,ekf087,ekf095,ekf194,ekf195,ekf355,ekf358,ekf359,ekf361,ekf364 -PSM2 /lfs/lfs11/tgeerdes/hello
Hello world from processor ekf087, rank 2 out of 12 processors
Hello world from processor ekf067, rank 11 out of 12 processors
Hello world from processor ekf082, rank 1 out of 12 processors
Hello world from processor ekf195, rank 5 out of 12 processors
Hello world from processor ekf095, rank 3 out of 12 processors
Hello world from processor ekf194, rank 4 out of 12 processors
Hello world from processor ekf355, rank 6 out of 12 processors
Hello world from processor ekf358, rank 7 out of 12 processors
Hello world from processor ekf364, rank 10 out of 12 processors
Hello world from processor ekf361, rank 9 out of 12 processors
Hello world from processor ekf359, rank 8 out of 12 processors
Hello world from processor ekf067, rank 0 out of 12 processors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;$mpirun -perhost 2 -np 24 -host ekf067,ekf082,ekf087,ekf095,ekf194,ekf195,ekf355,ekf358,ekf359,ekf361,ekf364 -PSM2 /lfs/lfs11/tgeerdes/hello
[proxy:0:5@ekf195] HYDU_create_process (../../utils/launch/launch.c:825): execvp error on file /lfs/lfs11/tgeerdes/hello (Input/output error)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="48752">LU-10132</key>
            <summary>IO execvp errors 2.10 client/EE3.1.1 server</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="tgeerdes">Trent Geerdes</reporter>
                        <labels>
                    </labels>
                <created>Fri, 13 Oct 2017 17:38:59 +0000</created>
                <updated>Wed, 15 Nov 2017 12:25:02 +0000</updated>
                            <resolved>Wed, 1 Nov 2017 08:32:23 +0000</resolved>
                                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.2</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="211081" author="pjones" created="Fri, 13 Oct 2017 17:42:37 +0000"  >&lt;p&gt;Hi Trent&lt;/p&gt;

&lt;p&gt;We would expect this combination to interoperate and it is included in our regular release testing. When you say 2.10, do you mean 2.10.0 or 2.10.1 clients?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="211082" author="tgeerdes" created="Fri, 13 Oct 2017 18:07:42 +0000"  >&lt;p&gt;Hi Peter,&lt;br/&gt;
2.10.0 clients and this is on Endeavour.&lt;br/&gt;
Also the only FS I&apos;m not able to replicate it on is our highest performing SSD based EE 3.1.1 FS.  Smaller SSD based and large HDD based all exhibit the issue.&lt;/p&gt;</comment>
                            <comment id="211094" author="jhammond" created="Fri, 13 Oct 2017 21:04:50 +0000"  >&lt;p&gt;Hi Trent, can you attach logs from the clients that experienced the error, the MDT(s), and any OSTs that contained stripes from the executable?&lt;/p&gt;</comment>
                            <comment id="211095" author="tgeerdes" created="Fri, 13 Oct 2017 21:18:56 +0000"  >&lt;p&gt;Client, MDT, and OSTs don&apos;t log any messages related to the failures.  Just the typical, unrelated client disconnects and reconnects.&lt;/p&gt;</comment>
                            <comment id="211185" author="jhammond" created="Mon, 16 Oct 2017 15:40:03 +0000"  >&lt;p&gt;How easy is it to reproduce this?&lt;/p&gt;</comment>
                            <comment id="211186" author="tgeerdes" created="Mon, 16 Oct 2017 15:43:02 +0000"  >&lt;p&gt;Pretty easy.  Many of our customers have hit it and I didn&apos;t have trouble reproducing when trying.  &lt;/p&gt;</comment>
                            <comment id="211192" author="jhammond" created="Mon, 16 Oct 2017 15:58:33 +0000"  >&lt;p&gt;Could you run the following on each client:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug_mb=256
lctl set_param debug=&quot;vfstrace rpctrace dlmtrace net neterror ha trace&quot;
lctl clear
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Then run your reproducer&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;date +%s
mpirun -perhost 2 -np 24 -host ...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Then run &lt;tt&gt;lctl dk &amp;gt; &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10132&quot; title=&quot;IO execvp errors 2.10 client/EE3.1.1 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10132&quot;&gt;&lt;del&gt;INTL-313&lt;/del&gt;&lt;/a&gt;.log&lt;/tt&gt; on one of the clients where execvp() fails and attach the log file here?&lt;/p&gt;

&lt;p&gt;You will probably want to restore the original settings of &lt;tt&gt;debug&lt;/tt&gt; and &lt;tt&gt;debug_mb&lt;/tt&gt; afterwards.&lt;/p&gt;</comment>
                            <comment id="211307" author="tgeerdes" created="Tue, 17 Oct 2017 17:48:10 +0000"  >&lt;p&gt;I&apos;ve set the params and captured the log.  Attached.&lt;/p&gt;</comment>
                            <comment id="211324" author="jhammond" created="Tue, 17 Oct 2017 19:57:24 +0000"  >&lt;p&gt;Trent, do you mind if I move this to the LU project?&lt;/p&gt;

&lt;p&gt;Thanks to your logs I can reproduce it locally and now understand where this is coming from.&lt;/p&gt;</comment>
                            <comment id="211325" author="tgeerdes" created="Tue, 17 Oct 2017 19:59:12 +0000"  >&lt;p&gt;Sure, go ahead.  Thank you.&lt;/p&gt;</comment>
                            <comment id="211328" author="jhammond" created="Tue, 17 Oct 2017 20:26:02 +0000"  >&lt;p&gt;This is from:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ll_xattr_cache_refill(struct inode *inode)
{
        ...
        &lt;span class=&quot;code-comment&quot;&gt;/* Matched but no cache? Cancelled on error by a parallel refill. */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(req == NULL)) {
                CDEBUG(D_CACHE, &lt;span class=&quot;code-quote&quot;&gt;&quot;cancelled by a parallel getxattr\n&quot;&lt;/span&gt;);
                ll_intent_drop_lock(&amp;amp;oit);
                GOTO(err_unlock, rc = -EIO);
        }
        ...
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Looking at the code here, we should be returning &lt;tt&gt;-EAGAIN&lt;/tt&gt; instead of &lt;tt&gt;-EIO&lt;/tt&gt; so that &lt;tt&gt;ll_getxattr_common()&lt;/tt&gt; will handle the race.&lt;/p&gt;

&lt;p&gt;This affects master as well.&lt;/p&gt;</comment>
                            <comment id="211334" author="gerrit" created="Tue, 17 Oct 2017 20:36:15 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29654&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29654&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10132&quot; title=&quot;IO execvp errors 2.10 client/EE3.1.1 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10132&quot;&gt;&lt;del&gt;LU-10132&lt;/del&gt;&lt;/a&gt; llite: handle xattr cache refill race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9cc8c3a20c547ec75325dde3dd17f4b1dcc66348&lt;/p&gt;</comment>
                            <comment id="212064" author="gerrit" created="Thu, 26 Oct 2017 14:05:14 +0000"  >&lt;p&gt;Minh Diep (minh.diep@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29795&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29795&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10132&quot; title=&quot;IO execvp errors 2.10 client/EE3.1.1 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10132&quot;&gt;&lt;del&gt;LU-10132&lt;/del&gt;&lt;/a&gt; llite: handle xattr cache refill race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: f9a2f8bc817829416646dc7d3ea3add16055cefe&lt;/p&gt;</comment>
                            <comment id="212490" author="gerrit" created="Wed, 1 Nov 2017 04:57:41 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29654/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29654/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10132&quot; title=&quot;IO execvp errors 2.10 client/EE3.1.1 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10132&quot;&gt;&lt;del&gt;LU-10132&lt;/del&gt;&lt;/a&gt; llite: handle xattr cache refill race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 3dcb7d098759614ae7deb532e1555bd82dac7936&lt;/p&gt;</comment>
                            <comment id="212499" author="pjones" created="Wed, 1 Nov 2017 08:32:23 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="212504" author="gerrit" created="Wed, 1 Nov 2017 12:40:19 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29795/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29795/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10132&quot; title=&quot;IO execvp errors 2.10 client/EE3.1.1 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10132&quot;&gt;&lt;del&gt;LU-10132&lt;/del&gt;&lt;/a&gt; llite: handle xattr cache refill race&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 78a5d681932e30797775aa10d22fc25b20aa58f7&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28483" name="INTL-313.log" size="5911732" author="tgeerdes" created="Tue, 17 Oct 2017 17:47:21 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzlvj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>