<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:28:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9735] Sles12Sp2 and 2.9 getcwd() sometimes fails</title>
                <link>https://jira.whamcloud.com/browse/LU-9735</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This is a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9208&quot; title=&quot;getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9208&quot;&gt;&lt;del&gt;LU-9208&lt;/del&gt;&lt;/a&gt;. Opening this case for tracking for nasa. We start to see this once we updated the clients to Sles12SP2 and lustre2.9&lt;/p&gt;

&lt;p&gt;Using the test code provided in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9208&quot; title=&quot;getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9208&quot;&gt;&lt;del&gt;LU-9208&lt;/del&gt;&lt;/a&gt; (miranda), I was able to reproduce the bug on a single node.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Iteration =    868, Run Time =     0.9614 sec., Transfer Rate =   120.7790 10e+06 Bytes/sec/proc
Iteration =    869, Run Time =     1.5308 sec., Transfer Rate =    75.8561 10e+06 Bytes/sec/proc
forrtl: severe (121): Cannot access current working directory &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; unit 10012, file &lt;span class=&quot;code-quote&quot;&gt;&quot;Unknown&quot;&lt;/span&gt;
Image              PC                Routine            Line        Source             
miranda            0000000000409F29  Unknown               Unknown  Unknown
miranda            00000000004169D2  Unknown               Unknown  Unknown
miranda            0000000000404045  Unknown               Unknown  Unknown
miranda            0000000000402FDE  Unknown               Unknown  Unknown
libc.so.6          00002AAAAB5B96E5  Unknown               Unknown  Unknown
miranda            0000000000402EE9  Unknown               Unknown  Unknown
MPT ERROR: MPI_COMM_WORLD rank 12 has terminated without calling MPI_Finalize()
	aborting job

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;I was able to capture some debug logs I have attached to the case. I was unable to reproduce it using &quot;+trace&quot;. But will continue to try.&lt;/p&gt;</description>
                <environment></environment>
        <key id="47106">LU-9735</key>
            <summary>Sles12Sp2 and 2.9 getcwd() sometimes fails</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="simmonsja">James A Simmons</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                            <label>ORNL</label>
                    </labels>
                <created>Wed, 5 Jul 2017 06:57:59 +0000</created>
                <updated>Wed, 24 Jul 2019 17:36:25 +0000</updated>
                            <resolved>Wed, 24 Jul 2019 17:14:47 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.4</fixVersion>
                                        <due></due>
                            <votes>1</votes>
                                    <watches>24</watches>
                                                                            <comments>
                            <comment id="201006" author="pjones" created="Wed, 5 Jul 2017 13:27:29 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please assist with this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="201012" author="bobijam" created="Wed, 5 Jul 2017 14:20:46 +0000"  >&lt;p&gt;Is NFS involved? I haven&apos;t found any -2 (ENOENT) error in the log.&lt;/p&gt;</comment>
                            <comment id="201057" author="mhanafi" created="Wed, 5 Jul 2017 21:16:57 +0000"  >&lt;p&gt;NFS is not involved. I&apos;ll try to get additional debugging to catch the ENOENT.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="201140" author="mhanafi" created="Thu, 6 Jul 2017 11:42:02 +0000"  >&lt;p&gt;&#160;I am unable to&#160;reproduce with -1 debugging. What is the min debugging level need&#160;to diag the issue?&lt;/p&gt;</comment>
                            <comment id="201143" author="bobijam" created="Thu, 6 Jul 2017 11:49:11 +0000"  >&lt;p&gt;How about start with &quot;trace inode info other dentry rpctrace vfstrace console&quot; ? And if possible add &quot;dlmtrace&quot; as well.&lt;/p&gt;</comment>
                            <comment id="201144" author="mhanafi" created="Thu, 6 Jul 2017 12:08:01 +0000"  >&lt;p&gt;Uploaded miranda.debug.1499341246.gz but I still didn&apos;t see any ENOENT Error. This was the error on the client&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Iteration =   9649, Run Time =     2.5712 sec., Transfer Rate =    45.1620 10e+06 Bytes/sec/proc
forrtl: severe (121): Cannot access current working directory &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; unit 10013, file &lt;span class=&quot;code-quote&quot;&gt;&quot;Unknown&quot;&lt;/span&gt;
Image              PC                Routine            Line        Source             
miranda            0000000000409F29  Unknown               Unknown  Unknown
miranda            00000000004169D2  Unknown               Unknown  Unknown
miranda            0000000000404045  Unknown               Unknown  Unknown
miranda            0000000000402FDE  Unknown               Unknown  Unknown
libc.so.6          00002AAAAB5B96E5  Unknown               Unknown  Unknown
miranda            0000000000402EE9  Unknown               Unknown  Unknown
MPT ERROR: MPI_COMM_WORLD rank 13 has terminated without calling MPI_Finalize()
        aborting job

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201147" author="bobijam" created="Thu, 6 Jul 2017 12:23:05 +0000"  >&lt;p&gt;Is it possible to locate where does &quot;miranda            0000000000409F29  Unknown               Unknown  Unknown&quot; call in miranda_io.F ?&lt;/p&gt;</comment>
                            <comment id="201148" author="bobijam" created="Thu, 6 Jul 2017 12:27:33 +0000"  >&lt;p&gt;And what files does miranda_io.F created/write/read ? I don&apos;t know fortran well.&lt;/p&gt;</comment>
                            <comment id="201156" author="mhanafi" created="Thu, 6 Jul 2017 12:53:39 +0000"  >&lt;p&gt;attached object dump of miranda.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="201176" author="bobijam" created="Thu, 6 Jul 2017 14:02:58 +0000"  >&lt;p&gt;The calling path should be like this&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;402fd9 callq 402ff0 &amp;lt;MAIN__&amp;gt;
404040 callq 415ea0 &amp;lt;for_open&amp;gt;
416a27 callq 44f410 &amp;lt;for__reopen_file&amp;gt;
44f48b callq  44f550 &amp;lt;for__compute_filename&amp;gt;
4509cd callq  402cb0 &amp;lt;getcwd@plt&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;it seems that the getcwd is called from the open() for write, and the open() is trying to get the full path name of the file.&lt;/p&gt;</comment>
                            <comment id="201228" author="mhanafi" created="Thu, 6 Jul 2017 19:00:36 +0000"  >&lt;p&gt;That is correct. Fortran open always calls getcwd to get the full path before doing the actual open.&lt;/p&gt;

&lt;p&gt;I uploaded new debug log to ftp site: /uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9735&quot; title=&quot;Sles12Sp2 and 2.9 getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9735&quot;&gt;&lt;del&gt;LU-9735&lt;/del&gt;&lt;/a&gt;/miranda.debug.1499351104.gz&lt;/p&gt;

&lt;p&gt;What about a debug patch to help diag the issue.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;The files written are all called &apos;miranda_io.out.100xx&apos; where xx is the rank number.&lt;/p&gt;</comment>
                            <comment id="201278" author="bobijam" created="Fri, 7 Jul 2017 01:37:30 +0000"  >&lt;p&gt;In what path did you start the test program? Is it possible that during the test run, your working path got some change temporary?&lt;/p&gt;</comment>
                            <comment id="201283" author="bobijam" created="Fri, 7 Jul 2017 02:08:15 +0000"  >&lt;p&gt;The log shows that miranda_io.out.10013 has been going through the read phase while no write happens on it. And other miranda_io.out.xxxx has the write phase happened.  &lt;/p&gt;

&lt;p&gt;The thing is getcwd only determine the current working directory, is it under Lustre file system? If not, Lustre log cannot know the reason why it fails. &lt;/p&gt;</comment>
                            <comment id="201370" author="mhanafi" created="Fri, 7 Jul 2017 16:36:09 +0000"  >&lt;p&gt;yes the working directory is on the lustre file system.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;This is similar issue as &lt;del&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-645&quot; title=&quot;getcwd fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-645&quot;&gt;&lt;del&gt;LU-645&lt;/del&gt;&lt;/a&gt;&lt;/del&gt;. Our LD_PRELOAD workaround works in this case as well.&lt;br/&gt;
 I wrote a system tap script to trace &lt;br/&gt;
 kernel.function(&quot;SyS_getcwd@../fs/dcache.c:3254&quot;)&lt;/p&gt;

&lt;p&gt;Here you can see at the failure it returns &lt;br/&gt;
 0xfffffffffffffffe&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0xfffffffffffffffe
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x7fffffff5a67 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0x1c
call=buf=0x8bc040 size=0x1000
&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=&lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=0xb


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201381" author="jaylan" created="Fri, 7 Jul 2017 17:56:35 +0000"  >&lt;p&gt;error of 0x1c is ENOSPC, error of 0xb is EAGAIN&lt;br/&gt;
#define ENOSPC          28      /* No space left on device */&lt;br/&gt;
#define EAGAIN          11      /* Try again */&lt;/p&gt;</comment>
                            <comment id="201397" author="mhanafi" created="Fri, 7 Jul 2017 19:05:44 +0000"  >&lt;p&gt;return=0xfffffffffffffffe is -2&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;SyS_getcwd      0 miranda.debug(26848): -&amp;gt; buf=0x7fffffff5a67 size=0x1000
 0x2aaaab676d92 : getcwd+0x72/0x6d0 [/lib64/libc-2.22.so]
 0x452762 : for__compute_filename+0x1482/0x16d0 [/home7/mhanafi/LUSTRE/BUGS/GETPWD/miranda.debug]
 0x453a89 : for__open_proc+0xc9/0x34b0 [/home7/mhanafi/LUSTRE/BUGS/GETPWD/miranda.debug]
Returning from:  0xffffffff81225fa0 : sys_getcwd+0x0/0x190 [kernel]
Returning to  :  0xffffffff81618e1e : tracesys_phase2+0x84/0x89 [kernel]
 0xffffffff8101ae0c : try_stack_unwind+0x17c/0x190 [kernel]
 0xffffffff81019ce4 : dump_trace+0x64/0x380 [kernel]
 0xffffffffa1ae1664 [stap_971bda5852e0877040783dde29bd60d_26789+0xe664/0x0]
 0xffffffffa1ae2315 [stap_971bda5852e0877040783dde29bd60d_26789+0xf315/0x0]
 0xffffffffa1ae24d0 [stap_971bda5852e0877040783dde29bd60d_26789+0xf4d0/0x0]
 0xffffffff8105c18b : trampoline_handler+0x11b/0x1d0 [kernel]
 0xffffffff8105bcae : kretprobe_trampoline+0x25/0x57 [kernel]
 0xffffffff81618e1e : tracesys_phase2+0x84/0x89 [kernel] (inexact)
SyS_getcwd    135 miranda.debug(26848): &amp;lt;- &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=-2 


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201436" author="mhanafi" created="Sat, 8 Jul 2017 06:33:12 +0000"  >&lt;p&gt;I think this is what is happening.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;SYSCALL_DEFINE2(getcwd, &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; __user *, buf, unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt;, size)
{
    &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; error;
    struct path pwd, root;
    &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; *page = __getname();

    &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!page)
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -ENOMEM;

    rcu_read_lock();
    get_fs_root_and_pwd_rcu(current-&amp;gt;fs, &amp;amp;root, &amp;amp;pwd);

    error = -ENOENT;
    &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!d_unlinked(pwd.dentry)) {

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In the getcwd syscall between&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    get_fs_root_and_pwd_rcu(current-&amp;gt;fs, &amp;amp;root, &amp;amp;pwd);

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!d_unlinked(pwd.dentry)) {

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The dentry is getting dropped. From the systemtap trace you can see here that .d_iname=&quot;1stripe&quot; is dropped and getcwd errors out.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;__d_drop 363  dentry={.d_flags=2621646, .d_seq={.sequence=2787}, .d_hash={.next=0x0, .pprev=0xffffc90002553280}, .d_parent=0xffff88105c8bc918, .d_name={&amp;lt;union&amp;gt;={&amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.hash=1222078897, .len=7}, .hash_len=31286849969}, .name=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;}, .d_inode=0xffff880dd6b2c550, .d_iname=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;, .d_lockref={&amp;lt;union&amp;gt;={.lock_count=120259084289, &amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.lock={&amp;lt;union&amp;gt;={.rlock={.raw_lock={.val={.counter=1}}}}}, .count=28}}}, .d_op=0xffffffffa0ef8240, .d_sb=0xffff880816ab6800, .d_time=0, .d_fsdata=0xffff880e403b3740, .d_lru={.next=0x

__d_drop 363  dentry={.d_flags=142, .d_seq={.sequence=1}, .d_hash={.next=0x0, .pprev=0x0}, .d_parent=0xffff88105c8bc918, .d_name={&amp;lt;union&amp;gt;={&amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.hash=1222078897, .len=7}, .hash_len=31286849969}, .name=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;}, .d_inode=0x0, .d_iname=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;, .d_lockref={&amp;lt;union&amp;gt;={.lock_count=4294967297, &amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.lock={&amp;lt;union&amp;gt;={.rlock={.raw_lock={.val={.counter=1}}}}}, .count=1}}}, .d_op=0xffffffffa0ef8240, .d_sb=0xffff880816ab6800, .d_time=0, .d_fsdata=0x0, .d_lru={.next=0xffff88105c85ebd8, .prev=0xffff88105c85ebd8}, .d_child={

__d_drop 363  dentry={.d_flags=142, .d_seq={.sequence=2}, .d_hash={.next=0x0, .pprev=0x0}, .d_parent=0xffff88105c8bc918, .d_name={&amp;lt;union&amp;gt;={&amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.hash=1222078897, .len=7}, .hash_len=31286849969}, .name=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;}, .d_inode=0x0, .d_iname=&lt;span class=&quot;code-quote&quot;&gt;&quot;1stripe&quot;&lt;/span&gt;, .d_lockref={&amp;lt;union&amp;gt;={.lock_count=18446743523953737729, &amp;lt;&lt;span class=&quot;code-keyword&quot;&gt;class&lt;/span&gt;&amp;gt;={.lock={&amp;lt;union&amp;gt;={.rlock={.raw_lock={.val={.counter=1}}}}}, .count=-128}}}, .d_op=0xffffffffa0ef8240, .d_sb=0xffff880816ab6800, .d_time=0, .d_fsdata=0x0, .d_lru={.next=0xffff88105c85ebd8, .prev=0xffff88105c85ebd8

SyS_getcwd      0 miranda.debug(354): &amp;lt;- &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;=-2 /nobackupp8/mhanafi/1stripe
 0x2aaaab676d92 [/lib64/libc-2.22.so+0xddd92/0x39f000]
Returning from:  0xffffffff81225fa0 : sys_getcwd+0x0/0x190 [kernel]
Returning to  :  0xffffffff81618e1e : tracesys_phase2+0x84/0x89 [kernel]
 0xffffffff8101ae0c : try_stack_unwind+0x17c/0x190 [kernel]
 0xffffffff81019ce4 : dump_trace+0x64/0x380 [kernel]
 0xffffffffa1155b09 [stap_257b7a801b68bce2a60161ccda5d2586_4_302+0x12b09/0x0]
 0xffffffffa1156865 [stap_257b7a801b68bce2a60161ccda5d2586_4_302+0x13865/0x0]
 0xffffffffa1156a20 [stap_257b7a801b68bce2a60161ccda5d2586_4_302+0x13a20/0x0]
 0xffffffff8105c18b : trampoline_handler+0x11b/0x1d0 [kernel]
 0xffffffff8105bcae : kretprobe_trampoline+0x25/0x57 [kernel]
 0xffffffff81618e1e : tracesys_phase2+0x84/0x89 [kernel] (inexact)
Pass 5: run completed in 0usr/180sys/11238real ms.

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="201468" author="bobijam" created="Mon, 10 Jul 2017 03:10:40 +0000"  >&lt;p&gt;Would  you mind trying this patch?&lt;/p&gt;</comment>
                            <comment id="201481" author="mhanafi" created="Mon, 10 Jul 2017 05:38:05 +0000"  >&lt;p&gt;No patch was attached.&lt;/p&gt;</comment>
                            <comment id="201506" author="bobijam" created="Mon, 10 Jul 2017 12:22:21 +0000"  >&lt;p&gt;Can you see  &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/27536/27536_unoptimize-atomic_open-of-negative-dentry.patch&quot; title=&quot;unoptimize-atomic_open-of-negative-dentry.patch attached to LU-9735&quot;&gt;unoptimize-atomic_open-of-negative-dentry.patch&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;/p&gt;</comment>
                            <comment id="201568" author="mhanafi" created="Mon, 10 Jul 2017 18:00:36 +0000"  >&lt;p&gt;I can see the attached patch now. But is it not in Gerrit?&lt;/p&gt;</comment>
                            <comment id="201606" author="mhanafi" created="Mon, 10 Jul 2017 22:16:25 +0000"  >&lt;p&gt;We tried the patch and the code failed the same way.&lt;/p&gt;

&lt;p&gt;&#160;Could this be related to &lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-8891?&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-8891?&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="201740" author="green" created="Wed, 12 Jul 2017 04:54:01 +0000"  >&lt;p&gt;So it looks like the sequence of events unfolds like this:&lt;br/&gt;
Process1: open_create(&quot;1stripe/somefilename&quot;); -&amp;gt; this in turn goes to cancel the update lock we have for &quot;1stripe&quot;. But the lock is shared with lookup lock, so cancelling it means the dentry is dropped.&lt;br/&gt;
Process2: does getcwd - revalidates the &quot;1stripe&quot; first and then checks validity.&lt;/p&gt;

&lt;p&gt;In the bigger log it&apos;s all happening at 1499341246.216525&lt;br/&gt;
Process 1 is 34167 and process 2 is 34165:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000080:00200000:12.0:1499341246.216524:0:34165:0:(dcache.c:357:ll_revalidate_nd()) VFS Op:name=1stripe, flags=17
00000080:00002000:14.0:1499341246.216525:0:34167:0:(llite_internal.h:1388:d_lustre_invalidate()) invalidate dentry 1stripe (ffff8808fcbfdd98) parent ffff8808fcbfdf18 inode ffff8810414180d0 refc 265
00000080:00000001:12.0:1499341246.216525:0:34165:0:(dcache.c:360:ll_revalidate_nd()) Process leaving (rc=1 : 1 : 1)
00000080:00000001:12.0:1499341246.216525:0:34165:0:(file.c:3695:ll_inode_permission()) Process entered
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This sounds kind of a normal race - I don&apos;t see why dentry could not be invalidated between lookup and check and then revalidated later (also explains why NFS would trip this at times).&lt;br/&gt;
I see no easy way to quash this without a kernel patch and even that likely would not be too easy and we&apos;d need to get Al Viro behind it I imagine.&lt;/p&gt;</comment>
                            <comment id="202392" author="mhanafi" created="Mon, 17 Jul 2017 20:35:52 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="202679" author="bobijam" created="Wed, 19 Jul 2017 06:28:54 +0000"  >&lt;p&gt;We&apos;re still investigating it.&lt;/p&gt;</comment>
                            <comment id="202812" author="mhanafi" created="Wed, 19 Jul 2017 20:11:14 +0000"  >&lt;p&gt;Btw, I did get it to fail with sles12sp2 and lustre2.10.&lt;/p&gt;</comment>
                            <comment id="202998" author="jay" created="Thu, 20 Jul 2017 22:19:12 +0000"  >&lt;p&gt;The problem is clear as Oleg mentioned. It doesn&apos;t revalidate dentry in the sys call of getcwd() so we need to patch the kernel. I can&apos;t think of any workaround solution at this moment.&lt;/p&gt;</comment>
                            <comment id="204691" author="ndauchy" created="Mon, 7 Aug 2017 18:47:43 +0000"  >&lt;p&gt;For reference, uploaded the code for the getcwd() retry hack we are using with LD_PRELOAD to work around this problem.&#160; We have not noticed any significant performance impacts with this approach.&#160; Perhaps similar code can be added to Lustre itself.&lt;/p&gt;

&lt;p&gt;Built with something like:&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;gcc -Wall -c -fPIC -DPIC getcwdHack.c -o getcwdHack.lo&lt;/tt&gt;&lt;br/&gt;
 &lt;tt&gt;gcc -Wall -shared getcwdHack.lo -ldl -Wl,-soname -Wl,libgetcwdHack.so -o libgetcwdHack.so&lt;/tt&gt;&lt;/p&gt;

&lt;p&gt;Run with something like:&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;export LD_PRELOAD=`pwd`/libgetcwdHack.so&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="206925" author="sparschauer" created="Wed, 30 Aug 2017 13:29:00 +0000"  >&lt;p&gt;The retry mechanism has to be applied to Intel/SGI MPI code (libmpifort.so). The string &quot;forrtl&quot; is included in it and it is the only runtime library which calls getcwd(). So only MPI Fortran code is affected.&lt;/p&gt;

&lt;p&gt;openmpi and mvapich2 are not affected (nothing calls getcwd() there).&lt;/p&gt;</comment>
                            <comment id="206927" author="sparschauer" created="Wed, 30 Aug 2017 13:43:08 +0000"  >&lt;p&gt;The kernel maintains a dentry lookup hash list. If getcwd() returns a dentry which is not in that list, then -ENOENT is returned. This is the case at initialization time of &lt;tt&gt;struct hlist_bl_node&lt;/tt&gt; and when it has been deleted from that list. This works pretty well with local file systems.&lt;/p&gt;

&lt;p&gt;Can someone explain how Lustre handles the local dentry lookup hash list?&lt;br/&gt;
Is it rebuilt from time to time?&lt;br/&gt;
Is it possible that getcwd() can be called before the dentry is added to the list?&lt;/p&gt;</comment>
                            <comment id="206928" author="sparschauer" created="Wed, 30 Aug 2017 13:49:38 +0000"  >&lt;p&gt;Since kernel 3.12, getcwd() performance has been improved a lot by the way. Spin locks have been replaced with RCU.&lt;/p&gt;</comment>
                            <comment id="207091" author="green" created="Thu, 31 Aug 2017 17:24:18 +0000"  >&lt;p&gt;Sebastien:&lt;br/&gt;
The way Lustre handles the local dentry lookup hash is the same as any other local filesystem. It is not possible for getcwd to be called before the entry was added to the list because the lookup places it there.&lt;/p&gt;

&lt;p&gt;The problem at hand is the entry could be removed from the list between lookup and getcwd. On normal filesystems that&apos;s possible if the entry was unlinked meanwhile. On Lustre (and to a degree on NFS) additional way to get the entry dropped from the list is because the entry became stale (i.e. somebody attempted a conflicting operation that revoked the lock of that entry so it was invalidated). Since getcwd does not revalidate entries it might return false errors in those cases.&lt;/p&gt;</comment>
                            <comment id="207747" author="sparschauer" created="Thu, 7 Sep 2017 10:03:56 +0000"  >&lt;p&gt;I&apos;ve talked to our NFS developer Neil Brown meanwhile. getcwd() shouldn&apos;t need to revalidate the dentry. NFS code is not affected - confirmed by our customer. Also upstream Lustre code doesn&apos;t seem to be affected - Neil checked the source.&lt;br/&gt;
If you can provide us your affected Lustre source, then he offers to have a look at it.&lt;/p&gt;</comment>
                            <comment id="207751" author="sparschauer" created="Thu, 7 Sep 2017 11:41:40 +0000"  >&lt;p&gt;I think I&apos;ve found the Lustre 2.9.0 bug for SLES12-SP2. ll_set_fs_pwd() only supports the fs-&amp;gt;lock spin lock or the fs-&amp;gt;lock write lock but not the RCU locking done by getcwd() in kernel 4.4.&lt;/p&gt;</comment>
                            <comment id="207886" author="gerrit" created="Fri, 8 Sep 2017 13:59:54 +0000"  >&lt;p&gt;Bobi Jam (bobijam@hotmail.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/28907&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28907&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9735&quot; title=&quot;Sles12Sp2 and 2.9 getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9735&quot;&gt;&lt;del&gt;LU-9735&lt;/del&gt;&lt;/a&gt; compat: heed the fs_struct::seq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 898c57e49743e5bf57b5fb689484c71fc909bb61&lt;/p&gt;</comment>
                            <comment id="207892" author="bobijam" created="Fri, 8 Sep 2017 14:20:35 +0000"  >&lt;p&gt;Would you try this patch?&lt;/p&gt;</comment>
                            <comment id="207969" author="adilger" created="Sat, 9 Sep 2017 17:31:26 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sparschauer&quot; class=&quot;user-hover&quot; rel=&quot;sparschauer&quot;&gt;sparschauer&lt;/a&gt;, I also saw the following email on the linux-fsdevel mailing list from Neil Brown:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;Subject: Re: &lt;span class=&quot;error&quot;&gt;&amp;#91;PATCH&amp;#93;&lt;/span&gt; d_move() vs d_unhashed() race: retry under d_lock&lt;/p&gt;

&lt;p&gt;On Fri, Sep 08 2017, Goldwyn Rodrigues wrote:&lt;br/&gt;
&amp;gt;&lt;br/&gt;
&amp;gt; This is a follow-up of Alexey&apos;s patch at &lt;a href=&quot;https://patchwork.kernel.org/patch/9455345/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://patchwork.kernel.org/patch/9455345/&lt;/a&gt;&lt;br/&gt;
&amp;gt; with suggestions proposed by Al Viro.&lt;br/&gt;
&amp;gt; &lt;br/&gt;
&amp;gt; d_move() and d_unhashed() may race because there is a small window&lt;br/&gt;
&amp;gt; where the dentry is unhashed. This may result in ENOENT (for getcwd).&lt;br/&gt;
&amp;gt; This must be checked under d_lock. However, in order to keep the fast&lt;br/&gt;
&amp;gt; path, perform the d_unhashed without d_lock first, and in the unlikely&lt;br/&gt;
&amp;gt; event that it succeeds, perform the check again under d_lock.&lt;/p&gt;

&lt;p&gt;For your consideration, here is an alternate patch which - I believe -&lt;br/&gt;
achieves the same end.  I think this approach is a little more robust,&lt;br/&gt;
but there isn&apos;t a lot in it - Goldwyn&apos;s is arguably simpler so might be&lt;br/&gt;
better for that reason.&lt;/p&gt;

&lt;p&gt;NeilBrown&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The actual patch is at &lt;a href=&quot;https://patchwork.kernel.org/patch/9945159/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://patchwork.kernel.org/patch/9945159/&lt;/a&gt; &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;From dfaa166e2afaed051c388dc9f43d1468020b5e22 Mon Sep 17 00:00:00 2001
From: NeilBrown &amp;lt;neilb@suse.com&amp;gt;
Date: Fri, 8 Sep 2017 16:03:42 +1000
Subject: [PATCH] VFS: close race between getcwd() and d_move()

d_move() will call __d_drop() and then __d_rehash()
on the dentry being moved.  This creates a small window
when the dentry appears to be unhashed.  Many tests
of d_unhashed() are made under -&amp;gt;d_lock and so are safe
from racing with this window, but some aren&apos;t.
In particular, getcwd() calls d_unlinked() (which calls
d_unhashed()) without d_lock protection, so it can race.

This race has been seen in practice with lustre, which uses d_move() as
part of name lookup.  See:
   https://jira.hpdd.intel.com/browse/LU-9735
It could race with a regular rename(), and result in ENOENT instead
of either the &apos;before&apos; or &apos;after&apos; name.

We could fix this race by taking d_lock and rechecking when
d_unhashed() reports true.  Alternately we can remove the window,
which is the approach this patch takes.

When __d_drop and __d_rehash are used to move a dentry, an extra
flag is passed which causes d_hash.pprev to not be cleared, and
to not be tested.

Signed-off-by: NeilBrown &amp;lt;neilb@suse.com&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="212035" author="bogl" created="Thu, 26 Oct 2017 06:03:23 +0000"  >&lt;p&gt;A fix for this problem is now shipped in the latest kernel version for sles12sp3.  The description of the fix is as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;From: NeilBrown &amp;lt;neilb@suse.com&amp;gt;
Subject: getcwd: Close race with d_move called by lustre
Patch-mainline: Not yet, under development
References: bsc#1052593

When lustre invalidates a dentry (e.g. due to a recalled lock) and then
revalidates it, ll_splice_alias() will call d_move() to move the old alias
to the name of a new one.
This will d_drop then d_rehash the old dentry, creating a small window
when the dentry is unhashed.
If getcwd is run at this time, it might incorrectly think that
the dentry really is unhashed, and so return ENOENT.

This is a bug in lustre (it shouldn&apos;t call d_move()) but we can work
around it in getcwd by  taking the d_lock to avoid the race.
First we test without the lock as the common case does not involve
any race.  If we find that the dentry appears to be unhashed, we take
the lock and check again.

Signed-off-by: Neil Brown &amp;lt;neilb@suse.com&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="212067" author="pjones" created="Thu, 26 Oct 2017 14:24:47 +0000"  >&lt;p&gt;So can we safely close this as &quot;not a bug&quot; from a Lustre perspective safe in the knowledge that this will be fixed in current version of SLES?&lt;/p&gt;</comment>
                            <comment id="212071" author="bogl" created="Thu, 26 Oct 2017 14:28:31 +0000"  >&lt;p&gt;I would vote for &apos;yes&apos;, but it&apos;s only fixed for new versions of sles12sp3.  not fixed for anything older.  Do other commenters in this ticket have opinions?&lt;/p&gt;

&lt;p&gt;Has there been any feedback on the other proposed fix &lt;a href=&quot;https://review.whamcloud.com/28907&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28907&lt;/a&gt; ?&lt;/p&gt;</comment>
                            <comment id="212073" author="sparschauer" created="Thu, 26 Oct 2017 14:47:18 +0000"  >&lt;p&gt;The patch has been submitted from SLE11-SP3-LTSS to SLE15. New kernels containing the fix will be released soon.&lt;/p&gt;</comment>
                            <comment id="212097" author="mhanafi" created="Thu, 26 Oct 2017 16:51:29 +0000"  >&lt;p&gt;What about the above comment&#160;&lt;/p&gt;

&lt;p&gt;&quot;This is a bug in lustre (it shouldn&apos;t call d_move())&quot;&lt;/p&gt;</comment>
                            <comment id="212114" author="jaylan" created="Thu, 26 Oct 2017 18:35:19 +0000"  >&lt;p&gt;We carry #28907 in our nas-2.10.x and it fixed our problem. It probably would be 4-6 months before we can upgrade to sles12sp3.&lt;/p&gt;

&lt;p&gt;Would #28907 conflict with SUSE&apos;s workaround in sles12sp3?&lt;/p&gt;

&lt;p&gt;Neil Brown, while proposing that workaround, thought it was a bug in lustre. I think we should have a valid fix.&lt;/p&gt;
</comment>
                            <comment id="219496" author="gerrit" created="Wed, 31 Jan 2018 05:51:48 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/28907/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/28907/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9735&quot; title=&quot;Sles12Sp2 and 2.9 getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9735&quot;&gt;&lt;del&gt;LU-9735&lt;/del&gt;&lt;/a&gt; compat: heed the fs_struct::seq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: fff1163fdb41190b59adb8d90919e0adf37f68fb&lt;/p&gt;</comment>
                            <comment id="219548" author="mdiep" created="Wed, 31 Jan 2018 15:23:54 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="219550" author="gerrit" created="Wed, 31 Jan 2018 15:26:06 +0000"  >&lt;p&gt;Minh Diep (minh.diep@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/31106&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31106&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9735&quot; title=&quot;Sles12Sp2 and 2.9 getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9735&quot;&gt;&lt;del&gt;LU-9735&lt;/del&gt;&lt;/a&gt; compat: heed the fs_struct::seq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 6c21f76ae7ef6cb8004dd60db87583c101b50aa6&lt;/p&gt;</comment>
                            <comment id="220653" author="gerrit" created="Fri, 9 Feb 2018 21:35:26 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/31106/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/31106/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9735&quot; title=&quot;Sles12Sp2 and 2.9 getcwd() sometimes fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9735&quot;&gt;&lt;del&gt;LU-9735&lt;/del&gt;&lt;/a&gt; compat: heed the fs_struct::seq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 030b15004d3acf6b98c198263fcca232129568cc&lt;/p&gt;</comment>
                            <comment id="227504" author="srcc" created="Tue, 8 May 2018 15:20:15 +0000"  >&lt;p&gt;Hi!&lt;/p&gt;

&lt;p&gt;As an additional datapoint, we&apos;d like to report that we&apos;ve been seeing this exact same behavior with the latest Maintenance Release (2.10.3) and the latest available CentOS 7.4 kernel&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# uname -a
Linux sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; 3.10.0-693.21.1.el7.x86_64 #1 SMP Wed Mar 7 19:03:37 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

# cat /sys/fs/lustre/version
2.10.3 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Symptoms are the same stack as initially reported, and happened while running VASP jobs:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-quote&quot;&gt;&quot;forrtl: severe (121): Cannot access current working directory &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; unit 7, file &quot;&lt;/span&gt;Unknown&quot;
Image              PC                Routine            Line        Source
vasp_gam           000000000140B496  Unknown               Unknown  Unknown
vasp_gam           000000000142511E  Unknown               Unknown  Unknown
vasp_gam           000000000091665F  Unknown               Unknown  Unknown
vasp_gam           0000000000CFE655  Unknown               Unknown  Unknown
vasp_gam           00000000012AF330  Unknown               Unknown  Unknown
vasp_gam           0000000000408D1E  Unknown               Unknown  Unknown
libc-2.17.so       00007F839E16FC05  __libc_start_main     Unknown  Unknown
vasp_gam           0000000000408C29  Unknown               Unknown  Unknown&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The &quot;try-again&quot; workaround provided by&#160;@Nathan&#160;works great and we&apos;re recommending our users to use it for now. With the&#160;libgetcwdHack.so&#160;library LD_PRELOADed, the application generates this kind of log:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190701]: failed: size 4096,
buf 0x7fffd6c0959b, ret (nil): No such file or directory
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190701]: succeeded at &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; 2
of 10: size 4096, buf 0x7fffd6c0959b, ret 0x7fffd6c0959b, path
/scratch/users/freitas/chemical_reactions/vasp_simulations/C_fixed_V/07_restart_3
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190695]: failed: size 4096,
buf 0x7ffe51196e9b, ret (nil): No such file or directory
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190695]: succeeded at &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; 2
of 10: size 4096, buf 0x7ffe51196e9b, ret 0x7ffe51196e9b, path
/scratch/users/freitas/chemical_reactions/vasp_simulations/C_fixed_V/07_restart_3
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190699]: failed: size 4096,
buf 0x7ffdb3f1f31b, ret (nil): No such file or directory
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190699]: succeeded at &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; 2
of 10: size 4096, buf 0x7ffdb3f1f31b, ret 0x7ffdb3f1f31b, path
/scratch/users/freitas/chemical_reactions/vasp_simulations/C_fixed_V/07_restart_3
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190697]: failed: size 4096,
buf 0x7ffe9713df9b, ret (nil): No such file or directory
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190697]: succeeded at &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; 2
of 10: size 4096, buf 0x7ffe9713df9b, ret 0x7ffe9713df9b, path
/scratch/users/freitas/chemical_reactions/vasp_simulations/C_fixed_V/07_restart_3
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190695]: failed: size 4096,
buf 0x7ffe51196e9b, ret (nil): No such file or directory
NF: getcwd: mpi rank -1, host sh-104-39.&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;: [190695]: succeeded at &lt;span class=&quot;code-keyword&quot;&gt;try&lt;/span&gt; 2
of 10: size 4096, buf 0x7ffe51196e9b, ret 0x7ffe51196e9b, path
/scratch/users/freitas/chemical_reactions/vasp_simulations/C_fixed_V/07_restart_3
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;which seem very indicative of the same error.&lt;/p&gt;

&lt;p&gt;We&apos;re looking forward to the fix in 2.10.4.&lt;/p&gt;

&lt;p&gt;Cheers,&lt;br/&gt;
--&#160;&lt;br/&gt;
Kilian&lt;/p&gt;</comment>
                            <comment id="230533" author="spiechurski" created="Thu, 19 Jul 2018 11:56:51 +0000"  >&lt;p&gt;Hi All,&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I did not see any reaction to Mahmoud Hanafi&apos;s comment from the 26th October 2017, citing Neil Brown:&lt;/p&gt;

&lt;p&gt;&quot;This is a bug in lustre (it shouldn&apos;t call d_move())&quot;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;We have installed 2.10.4 at a customer&apos;s site which encountered this problem (with RHEL 7 clients), and even though this considerably decreased the number of occurrences, we still see the &quot;small window when dentry is unhashed&quot;, making the job fail.&lt;/p&gt;

&lt;p&gt;Is there something that can be done in ll_splice_alias() to close this race ?&lt;/p&gt;

&lt;p&gt;I understand this is closed by SuSE in their latest kernels, but this is not the case for earlier kernels, nor for RHEL kernels.&lt;/p&gt;

&lt;p&gt;Or should we push Red Hat to apply the same kind of patch SuSE did (does not seem really fair to me) ?&lt;/p&gt;</comment>
                            <comment id="234069" author="m.magrys" created="Thu, 27 Sep 2018 15:09:54 +0000"  >&lt;p&gt;It looks like we hit the same issue on Lustre 2.10.5 client and Centos 7.5 kernel. Should the fix come from Lustre or the kernel? I&apos;m confused by the previous discussion.&lt;/p&gt;</comment>
                            <comment id="234075" author="simmonsja" created="Thu, 27 Sep 2018 15:41:10 +0000"  >&lt;p&gt;I wonder if the fixes from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9868&quot; title=&quot;dcache/namei fixes for lustre&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9868&quot;&gt;LU-9868&lt;/a&gt; would fix this? Note the patch post has a bug in it. I have a fix but haven&apos;t pushed it.&lt;/p&gt;</comment>
                            <comment id="234079" author="simmonsja" created="Thu, 27 Sep 2018 17:13:24 +0000"  >&lt;p&gt;Can you give&#160; &lt;a href=&quot;https://review.whamcloud.com/#/c/28486&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/28486&lt;/a&gt;&#160;a try.&lt;/p&gt;</comment>
                            <comment id="234177" author="simmonsja" created="Mon, 1 Oct 2018 17:25:48 +0000"  >&lt;p&gt;As reported the earlier patch for this bug didn&apos;t completely solve the problem. The work from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9868&quot; title=&quot;dcache/namei fixes for lustre&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9868&quot;&gt;LU-9868&lt;/a&gt; has been reported as solving this problem which is now linked to this ticket.&lt;/p&gt;</comment>
                            <comment id="234358" author="simmonsja" created="Thu, 4 Oct 2018 14:31:17 +0000"  >&lt;p&gt;So it appears that the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9868&quot; title=&quot;dcache/namei fixes for lustre&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9868&quot;&gt;LU-9868&lt;/a&gt; while fixing this bug has exposed another potential bug in lustre. If you run sanity test 233 you see&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;37212.956888&amp;#93;&lt;/span&gt; VFS: Lookup of &apos;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000007:0x1:0x0&amp;#93;&lt;/span&gt;&apos; in lustre lustre would have caused loop&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;37217.817624&amp;#93;&lt;/span&gt; Lustre: DEBUG MARKER: sanity test_233a: @@@@@@ FAIL: cannot access /lustre/lustre using its FID &apos;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000007:0x1:0x0&amp;#93;&lt;/span&gt;&apos;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;37236.855424&amp;#93;&lt;/span&gt; Lustre: DEBUG MARKER: == sanity test 233b: checking that OBF of the FS .lustre succeeds ==================================== 03:34:33 (1538379273)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;37238.362201&amp;#93;&lt;/span&gt; VFS: Lookup of &apos;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000002:0x1:0x0&amp;#93;&lt;/span&gt;&apos; in lustre lustre would have caused loop&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;37243.442480&amp;#93;&lt;/span&gt; Lustre: DEBUG MARKER: sanity test_233b: @@@@@@ FAIL: cannot access /lustre/lustre/.lustre using its FID &apos;&lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000002:0x1:0x0&amp;#93;&lt;/span&gt;&apos;&lt;/p&gt;

&lt;p&gt;Somehow the parent-child relationship got inverted. Will investigate.&lt;/p&gt;</comment>
                            <comment id="251954" author="mhanafi" created="Wed, 24 Jul 2019 17:06:24 +0000"  >&lt;p&gt;We can close this case.&lt;/p&gt;</comment>
                            <comment id="251959" author="pjones" created="Wed, 24 Jul 2019 17:14:47 +0000"  >&lt;p&gt;ok so, given that the initial fix seems to satisfy NASA (the original reporter) we can close the ticket. &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=simmonsja&quot; class=&quot;user-hover&quot; rel=&quot;simmonsja&quot;&gt;simmonsja&lt;/a&gt; can you track any remaining work under a new ticket?&lt;/p&gt;</comment>
                            <comment id="251960" author="simmonsja" created="Wed, 24 Jul 2019 17:36:25 +0000"  >&lt;p&gt;I already have another ticket for this &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="44710">LU-9208</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="47785">LU-9868</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48963">LU-10164</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="27955" name="getcwdHack.c" size="5697" author="ndauchy" created="Mon, 7 Aug 2017 18:43:47 +0000"/>
                            <attachment id="27439" name="miranda.debug.1499341246.gz" size="88220414" author="mhanafi" created="Thu, 6 Jul 2017 12:10:16 +0000"/>
                            <attachment id="27440" name="miranda.dis" size="9634212" author="mhanafi" created="Thu, 6 Jul 2017 12:52:59 +0000"/>
                            <attachment id="27423" name="r481i7n17.dump1.log.gz" size="14534086" author="mhanafi" created="Wed, 5 Jul 2017 06:55:06 +0000"/>
                            <attachment id="27536" name="unoptimize-atomic_open-of-negative-dentry.patch" size="2144" author="bobijam" created="Mon, 10 Jul 2017 03:10:01 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzg2v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>