<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:48:39 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11984] Intermittent file create or rm fail with  EINVAL</title>
                <link>https://jira.whamcloud.com/browse/LU-11984</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;mdtest intermittently fails and reports EINVAL error when trying to create or remove a file. &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdtest-1.8.3 was launched with 1024 total task(s) on 64 nodes
Command line used: /g/g0/faaland1/projects/mdtest/mdtest/mdtest -d /p/lquake/faaland1/lustre-212-reconnects -n 1024 -F -u -v
Path: /p/lquake/faaland1                                                                                                    
FS: 1867.3 TiB   Used FS: 34.2%   Inodes: 765.8 Mi   Used Inodes: 57.1%                                                     
1024 tasks, 1048576 files

 Operation               Duration              Rate
   ---------               --------              ----
 * iteration 1 02/20/2019 13:37:43 *                 
   Tree creation     :      0.076 sec,     13.191 ops/sec

02/20/2019 13:39:00: Process 158(opal119): FAILED in create_remove_items_helper, unable to unlink file file.mdtest.158.223 (cwd=/p/lquake/faaland1/lustre-212-reconnects/#test-dir.0/mdtest_tree.158.0): Invalid argument                                                                         
--------------------------------------------------------------------------                                                                       
MPI_ABORT was invoked on rank 158 in communicator MPI_COMM_WORLD                                                                                 with errorcode 1.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Seen with:&lt;br/&gt;
no DoM&lt;br/&gt;
no PFL&lt;br/&gt;
16 MDTs in the file system, but directory mdtest is using is not striped.&lt;br/&gt;
64 nodes x 16 ppn&lt;/p&gt;</description>
                <environment>linux 3.10.0-957.1.3.1chaos.ch6.x86_64&lt;br/&gt;
lustre-2.12.0_1.chaos-1.ch6.x86_64&lt;br/&gt;
Clients OmniPath &amp;lt;-&amp;gt; routers &amp;lt;-&amp;gt; Servers mlx5&lt;br/&gt;
See &lt;a href=&quot;https://github.com/LLNL/lustre/releases&quot;&gt;https://github.com/LLNL/lustre/releases&lt;/a&gt; for contents of 2.12.0_1.chaos.</environment>
        <key id="54939">LU-11984</key>
            <summary>Intermittent file create or rm fail with  EINVAL</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="pfarrell">Patrick Farrell</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 21 Feb 2019 01:54:19 +0000</created>
                <updated>Wed, 13 Mar 2019 16:29:31 +0000</updated>
                            <resolved>Wed, 13 Mar 2019 16:29:05 +0000</resolved>
                                    <version>Lustre 2.12.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="242400" author="ofaaland" created="Thu, 21 Feb 2019 01:55:51 +0000"  >&lt;p&gt;All on MDT0002&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@opal67:toss-4452]# lfs getdirstripe /p/lquake/faaland1/lustre-212-reconnects
lmv_stripe_count: 0 lmv_stripe_offset: 2 lmv_hash_type: none
[root@opal67:toss-4452]# lfs getdirstripe /p/lquake/faaland1/lustre-212-reconnects/#test-dir.0/
lmv_stripe_count: 0 lmv_stripe_offset: 2 lmv_hash_type: none
[root@opal67:toss-4452]# lfs getdirstripe /p/lquake/faaland1/lustre-212-reconnects/#test-dir.0/mdtest_tree.550.0
lmv_stripe_count: 0 lmv_stripe_offset: 2 lmv_hash_type: none
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242401" author="ofaaland" created="Thu, 21 Feb 2019 02:00:57 +0000"  >&lt;p&gt;Reproduced with +rpctrace and gathered debug log from the client, opal67.  Log attached as dk.opal67.1550709082.gz.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdtest-1.8.3 was launched with 1024 total task(s) on 64 nodes
Command line used: /g/g0/faaland1/projects/mdtest/mdtest/mdtest -d /p/lquake/faaland1/lustre-212-reconnects -n 1024 -F -u -v                                                                                                  
Path: /p/lquake/faaland1                                                                                       
FS: 1867.3 TiB   Used FS: 34.2%   Inodes: 757.4 Mi   Used Inodes: 57.7%                                        
1024 tasks, 1048576 files
   Operation               Duration              Rate
   ---------               --------              ----
 * iteration 1 02/20/2019 16:26:38 *                 
   Tree creation     :      0.077 sec,     13.047 ops/sec

02/20/2019 16:27:23: Process 550(opal67): FAILED in create_remove_items_helper, unable to create file file.mdtest.550.791 (cwd=/p/lquake/faaland1/lustre-212-reconnects/#test-dir.0/mdtest_tree.550.0): Invalid argument
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242402" author="ofaaland" created="Thu, 21 Feb 2019 02:01:55 +0000"  >&lt;p&gt;Not sure if it&apos;s relevant, but the debug log from opal67 includes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;dk.opal67.1550709082:00000002:00100000:13.0:1550708843.710600:0:172721:0:(mdc_locks.c:627:mdc_finish_enqueue()) @@@ op: 3 disposition: 17, status: -22  req@ffff888518842400 x1625568471279792/t0(0) o101-&amp;gt;lquake-MDT0002-mdc-ffff8895b3f6f800@172.19.1.113@o2ib100:12/10 lens 720/560 e 0 to 0 dl 1550708949 ref 1 fl Complete:R/0/0 rc 301/301
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="242406" author="pfarrell" created="Thu, 21 Feb 2019 03:57:00 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;It&apos;s definitely relevant, but unfortunately, all the client logs show us is that the server didn&apos;t like your open.&lt;/p&gt;

&lt;p&gt;Can you get dmesg and the output of lctl dk from the MDS you&apos;re using here?&#160; Start with that, but if you can reproduce with any of trace, inode or info (or all) on the MDS, that would be great.&#160; Inode and info are lighter, but likely less helpful.&lt;/p&gt;</comment>
                            <comment id="242464" author="ofaaland" created="Thu, 21 Feb 2019 18:14:27 +0000"  >&lt;p&gt;Patrick,&lt;/p&gt;

&lt;p&gt;I attached dk.jet3.1550709103.gz which is from the node running MDT0002 at the time.&#160; This is from yesterday, so does not have any of the debug flags you asked for set.&#160; When I get time on that machine again, hopefully today, I&apos;ll try again with your flags.&#160; Thank you.&lt;/p&gt;</comment>
                            <comment id="242466" author="pfarrell" created="Thu, 21 Feb 2019 18:31:17 +0000"  >&lt;p&gt;Unfortunately, the MDS dk log starts just a few seconds after that failure:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 00000100:00100000:6.0:1550708942.344015:0:56668:0:(ptlrpcd.c:409:ptlrpcd_check()) transfer 1 async RPCs [1-&amp;gt;0]
1550708942 &amp;lt;--

00000002:00100000:13.0:1550708843.710600:0:172721:0:(mdc_locks.c:627:mdc_finish_enqueue()) @@@ op: 3 disposition: 17, status: -22 req@ffff888518842400 x1625568471279792/t0(0) o101-&amp;gt;lquake-MDT0002-mdc-ffff8895b3f6f800@172.19.1.113@o2ib100:12/10 lens 720/560 e 0 to 0 dl 1550708949 ref 1 fl Complete:R/0/0 rc 301/301

1550708843 &amp;lt;--&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So it contains no useful info.&lt;/p&gt;

&lt;p&gt;Dmesg from the MDS might have something...?&lt;/p&gt;</comment>
                            <comment id="242491" author="ofaaland" created="Fri, 22 Feb 2019 00:23:43 +0000"  >&lt;blockquote&gt;&lt;p&gt;Unfortunately, the MDS dk log starts just a few seconds after that failure:&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Argh, I should have seen that.&lt;/p&gt;

&lt;p&gt;There&apos;s nothing in dmesg from the MDS anywhere near the time of the failure.&#160;&#160; I should be able to get the cluster back later today or tomorrow to try again.&lt;/p&gt;</comment>
                            <comment id="242525" author="pfarrell" created="Fri, 22 Feb 2019 16:13:35 +0000"  >&lt;p&gt;No problem.&lt;/p&gt;

&lt;p&gt;All right, I&apos;ll wait for more from you.&#160; Thanks.&lt;/p&gt;</comment>
                            <comment id="242907" author="aboyko" created="Wed, 27 Feb 2019 04:20:07 +0000"  >&lt;p&gt;This probably duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt;. We saw unlink fail with invalid argument during mdtest regular.&lt;/p&gt;

&lt;p&gt;@Olaf Faaland, could you check with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt; patch? it was landed to master yesterday.&lt;/p&gt;</comment>
                            <comment id="243069" author="pfarrell" created="Thu, 28 Feb 2019 20:00:14 +0000"  >&lt;p&gt;Alex,&lt;/p&gt;

&lt;p&gt;Thank you very much!&#160; This does indeed look like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt;.&#160; &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ofaaland&quot; class=&quot;user-hover&quot; rel=&quot;ofaaland&quot;&gt;ofaaland&lt;/a&gt;, when you get a chance, it would be good to try that out.&lt;/p&gt;</comment>
                            <comment id="243526" author="ofaaland" created="Fri, 8 Mar 2019 02:15:50 +0000"  >&lt;p&gt;Hi Patrick,&lt;/p&gt;

&lt;p&gt;I got the cluster back.&#160; I applied the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt; to Lustre 2.12.0 and am using that build on both client and server.  Creates now fail, but much more consistently and with different symptoms.&#160; The user process gets back ENOENT instead of EINVAL. On the server&apos;s console is a lustre error, which did not occur before.  It is:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 49910:0:(lod_lov.c:896:lod_gen_component_ea()) lquake-MDT0001-mdtlov: Can not locate [0x700000bd5:0x16:0x0]: rc = -2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This was produced running a single-node x 16tpp mdtest, without DoM or PFL.&lt;/p&gt;

&lt;p&gt;This seems to me like a different problem entirely, so I am not uploading the debug logs.  If you agree it&apos;s distinct, I can create a new ticket and put them there.&lt;/p&gt;</comment>
                            <comment id="243851" author="pfarrell" created="Wed, 13 Mar 2019 16:26:47 +0000"  >&lt;p&gt;Whoops, missed this update...&#160; But this most recent report is (your new ticket)&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12063&quot; title=&quot;mktemp fails with ENOENT and MDS log reports lod_gen_component_ea() Can not locate [0x700000bd9:0x56:0x0]: rc = -2&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12063&quot;&gt;&lt;del&gt;LU-12063&lt;/del&gt;&lt;/a&gt;.&#160; Let&apos;s close this one out as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt; and move discussion there.&lt;/p&gt;</comment>
                            <comment id="243852" author="pfarrell" created="Wed, 13 Mar 2019 16:29:05 +0000"  >&lt;p&gt;Duplicate of&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11827&quot; title=&quot;Race between llog_cat_declare_add_rec and llog_cat_current_log&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11827&quot;&gt;&lt;del&gt;LU-11827&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="54399">LU-11827</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32058" name="dk.jet3.1550709103.gz" size="3622306" author="ofaaland" created="Thu, 21 Feb 2019 18:11:57 +0000"/>
                            <attachment id="32049" name="dk.opal67.1550709082.gz" size="9617411" author="ofaaland" created="Thu, 21 Feb 2019 01:58:22 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00bzb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>