<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:24:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2380] Hang and eviction scenario when multiple tasks/nodes do ftruncate() on the same file in parallel</title>
                <link>https://jira.whamcloud.com/browse/LU-2380</link>
                <project id="10000" key="LU">Lustre</project>
<description>&lt;p&gt;We have a scenario where some of the Lustre clients hang and are consequently evicted, causing frequent job failures.&lt;/p&gt;

&lt;p&gt;This issue arises when ftruncate() is called on the same file by several tasks/nodes in parallel. Even though this is clearly not good practice, the pattern is heavily used by a middle-tier layer at the customer&lt;br/&gt;
site, and the consequence is that the customer suffers frequent job failures.&lt;/p&gt;

&lt;p&gt;I think Lustre should be more resilient to this kind of issue.&lt;/p&gt;

&lt;p&gt;This bug has been reproduced on Lustre 2.1.3 with 36 nodes running 576 MPI tasks. In this configuration the test failed 50% of the time, but the problem could perhaps also be reproduced with fewer nodes/tasks.&lt;/p&gt;

&lt;p&gt;This is the reproducer:&lt;/p&gt;

&lt;p&gt;% cat truncate.c&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#include &amp;lt;mpi.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;stdlib.h&amp;gt;
#include &amp;lt;string.h&amp;gt;
#include &amp;lt;fcntl.h&amp;gt;
#include &amp;lt;sys/types.h&amp;gt;
#include &amp;lt;sys/stat.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;


int main(int argc, char **argv)
{
        int npes, myrank, i;
        int *buf;
        struct stat *bufstat;
        int fd;

        MPI_Init(&amp;amp;argc, &amp;amp;argv);
        MPI_Comm_size(MPI_COMM_WORLD, &amp;amp;npes);
        MPI_Comm_rank(MPI_COMM_WORLD, &amp;amp;myrank);

        printf(&quot; My rank is %d of %d \n&quot;, myrank, npes);
        buf = (int *) malloc(sizeof(int) * 10000 * npes);
        for (i = 0; i &amp;lt; 10000; i++) buf[i] = myrank * i;

        bufstat = (struct stat *) malloc(sizeof(struct stat));

        /* Every rank repeatedly opens, stats, seeks, truncates and writes
         * the same shared file, so each pass conflicts with the extent
         * locks of every other client. */
        for (i = 0; i &amp;lt; 10; i++) {
                if ((fd = open(&quot;/ptmp/user/mytrunk&quot;, O_CREAT|O_RDWR, 0666)) &amp;lt; 0) {
                        perror(&quot;open()&quot;);
                        exit(-1);
                }
                if (fstat(fd, bufstat) &amp;lt; 0) {
                        perror(&quot;fstat()&quot;);
                        exit(-1);
                }
                if (lseek(fd, 0, SEEK_SET) &amp;lt; 0) {
                        perror(&quot;lseek(SEEK_SET)&quot;);
                        exit(-1);
                }
                if (lseek(fd, 0, SEEK_CUR) &amp;lt; 0) {
                        perror(&quot;lseek(SEEK_CUR)&quot;);
                        exit(-1);
                }
                if (ftruncate(fd, 0) &amp;lt; 0) {
                        perror(&quot;ftruncate()&quot;);
                        exit(-1);
                }
                if (write(fd, buf, 10000 * npes) &amp;lt; 0) {
                        perror(&quot;write()&quot;);
                        exit(-1);
                }
                close(fd);
        }

        MPI_Finalize();
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And this is what happened:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;% mpicc truncate.c -o truncate
% srun -n 576 -N 36 --exclusive --resv-ports ./truncate
srun: job 269744 queued and waiting for resources
srun: job 269744 has been allocated resources
&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt; Job/run stuck for minutes !!... &amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt;
fstat(): Interrupted system call
fstat(): Interrupted system call
fstat(): Interrupted system call
fstat(): Interrupted system call
ftruncate(): Interrupted system call
 My rank is 10 of 16
 My rank is 11 of 16
 My rank is 13 of 16
 My rank is 14 of 16
 My rank is 15 of 16
fstat(): Interrupted system call
 My rank is 8 of 16
ftruncate(): Cannot send after transport endpoint shutdown
 My rank is 9 of 16
srun: error: lascaux2825: tasks 8-11,13-15: Exited with exit code 255
srun: Terminating job step 269744.0
slurmd[lascaux2825]: *** STEP 269744.0 KILLED AT 2012-06-12T16:09:25 WITH SIGNAL 9 ***
slurmd[lascaux2824]: *** STEP 269744.0 KILLED AT 2012-06-12T16:09:25 WITH SIGNAL 9 ***
srun: Job step aborted: Waiting up to 2 seconds for job step to finish.
slurmd[lascaux2825]: *** STEP 269744.0 KILLED AT 2012-06-12T16:09:25 WITH SIGNAL 9 ***
slurmd[lascaux2824]: *** STEP 269744.0 KILLED AT 2012-06-12T16:09:25 WITH SIGNAL 9 ***
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And one of the nodes/clients reported the following messages:&lt;br/&gt;
=============================================================&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1336119249 2012 May  4 10:14:09 cartan1174 kern warning kernel Lustre: ptmp-OST001a-osc-ffff880464a98000: Connection
to service ptmp-OST001a via nid JO.BOO.ZO.LF@o2ib3 was lost; in progress operations using this service will wait for
recovery to complete.
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 45831:0:(ldlm_request.c:1152:ldlm_cli_cancel_req())
Got rc -107 from cancel RPC: canceling anyway
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 45831:0:(ldlm_request.c:1714:ldlm_cli_cancel_list())
ldlm_cli_cancel_list: -107
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 167-0: This client was evicted by ptmp-OST001a;
in progress operations using this service will fail.
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 16559:0:(client.c:1057:ptlrpc_import_delay_req())
@@@ IMP_INVALID  req@ffff880375166000 x1398228708669187/t0(0) o101-&amp;gt;ptmp-OST001a_UUID@JO.BOO.ZO.LF@o2ib3:28/4 lens
296/352 e 0 to 0 dl 0 ref 1 fl Rpc:/ffffffff/ffffffff rc 0/-1
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 16559:0:(client.c:1057:ptlrpc_import_delay_req())
Skipped 31 previous similar messages
1336119249 2012 May  4 10:14:09 cartan1174 kern err kernel LustreError: 16559:0:(client.c:1057:ptlrpc_import_delay_req())
@@@ IMP_INVALID  req@ffff880d6719c800 x1398228708669192/t0(0) o101-&amp;gt;ptmp-OST001a_UUID@JO.BOO.ZO.LF@o2ib3:28/4 lens
296/352 e 0 to 0 dl 0 ref 1 fl Rpc:/ffffffff/ffffffff rc 0/-1
1336119249 2012 May  4 10:14:09 cartan1174 kern info kernel Lustre: ptmp-OST001a-osc-ffff880464a98000: Connection restored
to service ptmp-OST001a using nid JO.BOO.ZO.LF@o2ib3.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;=============================================================&lt;/p&gt;

&lt;p&gt;The OSS handling one of the OSTs hosting the file also reported the following messages/errors on its side:&lt;/p&gt;

&lt;p&gt;====================================================&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1336119248 2012 May  4 10:14:08 cartan204 kern err kernel LustreError: 0:0:(ldlm_lockd.c:345:waiting_locks_callback())
### lock callback timer expired after 100s: evicting client at JO.BOO.ZL.BFP@o2ib3  ns: filter-ptmp-OST001a_UUID lock:
ffff880d1d021b40/0x2f7b35259ae3e490 lrc: 3/0,0 mode: PW/PW res: 20974735/0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615]
(req 0-&amp;gt;18446744073709551615) flags: 0x80010020 remote: 0x307f20f32f0be585 expref: 700 pid: 28661 timeout 4562700589
1336119249 2012 May  4 10:14:09 cartan204 kern err kernel LustreError: 4578:0:(ldlm_lockd.c:1970:ldlm_cancel_handler())
operation 103 from 12345-JO.BOO.ZL.BFP@o2ib3 with bad export cookie 3421386777158859540
1336119249 2012 May  4 10:14:09 cartan204 kern warning kernel Lustre: 28674:0:(ldlm_lib.c:866:target_handle_connect())
ptmp-OST001a: connection from 700d60eb-8a82-cf9e-f029-a5868449ec78@JO.BOO.ZL.BFP@o2ib3 t0 exp (null) cur 1336119249
last 0
1336119249 2012 May  4 10:14:09 cartan204 kern warning kernel Lustre: 28674:0:(filter.c:2813:filter_connect()) ptmp-OST001a:
Received MDS connection (0x2f7b35259ae430da); group 0
1336119249 2012 May  4 10:14:09 cartan204 kern warning kernel Lustre: 28674:0:(filter.c:2813:filter_connect()) Skipped
5 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;====================================================        &lt;/p&gt;</description>
                <environment></environment>
        <key id="16757">LU-2380</key>
            <summary>Hang and eviction scenario when multiple tasks/nodes do ftruncate() on the same file in parallel</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="dmoreno">Diego Moreno</reporter>
                        <labels>
                    </labels>
                <created>Fri, 23 Nov 2012 05:29:35 +0000</created>
                <updated>Thu, 17 Mar 2022 16:07:40 +0000</updated>
                            <resolved>Thu, 17 Mar 2022 16:07:40 +0000</resolved>
                                    <version>Lustre 2.1.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="48309" author="pjones" created="Fri, 23 Nov 2012 10:26:02 +0000"  >&lt;p&gt;Lai could you please look into this one? Thanks Peter&lt;/p&gt;</comment>
                            <comment id="48313" author="adilger" created="Fri, 23 Nov 2012 12:54:13 +0000"  >&lt;p&gt;We had a similar problem in the past with every process truncating the same file. However, in that case I don&apos;t think it was doing the stat and write of the file as well. From an application point of view, it doesn&apos;t make any sense for the application to be truncating then writing the same offset of the same file from all ranks of the program. &lt;/p&gt;

&lt;p&gt;This will cause all of the data to be written over top of itself many times, probably resulting in garbage data, or at best the data from some random rank to be the last one written.&lt;/p&gt;

&lt;p&gt;The major problem is that this write(), along with the stat() forces the lock to be on the client, but then the truncate from the different nodes forces the lock to be revoked. In essence this is a giant engine for causing locking conflicts between all of the clients, and there is no simple way for the operations from the clients to be parallelized at all, since they are always overlapping.&lt;/p&gt;

&lt;p&gt;It is worthwhile to check whether the server is detecting this situation and forcing the clients to use lockless writes and lockless truncates. That would at least minimize or eliminate lock holding on the clients. It is also worth checking that glimpse locks are not granted to the client in the case of high contention where the server has entered lockless operation mode, even though no client is holding the lock.&lt;/p&gt;

&lt;p&gt;On a programming note, open(O_TRUNC) would be more efficient, and the lseek() calls are both redundant and useless, since newly opened files already use offset 0.  Is it a programming bug in the reproducer or in the library that these offsets should be related to the process rank?  A minimal sketch of the open(O_TRUNC) variant follows.&lt;/p&gt;
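&lt;p&gt;(Illustrative only: a minimal, untested sketch of the open(O_TRUNC) pattern described above, reusing the reproducer&apos;s path; the helper name write_rank_buffer is hypothetical.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#include &amp;lt;fcntl.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;
#include &amp;lt;unistd.h&amp;gt;

/* Truncate-at-open: one open() replaces the reproducer&apos;s
 * open() + fstat() + lseek() + lseek() + ftruncate() sequence,
 * saving the extra round trips and their lock traffic. */
int write_rank_buffer(const void *buf, size_t len)
{
        int fd = open(&quot;/ptmp/user/mytrunk&quot;, O_CREAT | O_TRUNC | O_WRONLY, 0666);
        if (fd &amp;lt; 0) {
                perror(&quot;open(O_TRUNC)&quot;);
                return -1;
        }
        /* A newly opened file already has its offset at 0, so no lseek(). */
        if (write(fd, buf, len) &amp;lt; 0) {
                perror(&quot;write()&quot;);
                close(fd);
                return -1;
        }
        return close(fd);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Each rank would then make a single call per iteration instead of the five-call sequence, although the write() locks would of course still conflict.&lt;/p&gt;</comment>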
                            <comment id="48331" author="jay" created="Fri, 23 Nov 2012 17:01:59 +0000"  >&lt;p&gt;Do you have a chance to check if there is any deadlock on the clients node?&lt;/p&gt;</comment>
                            <comment id="48857" author="dmoreno" created="Thu, 6 Dec 2012 09:14:27 +0000"  >&lt;p&gt;So trying to make any progress on this ticket, do you need any information from us?&lt;/p&gt;</comment>
                            <comment id="50676" author="dmoreno" created="Thu, 17 Jan 2013 11:19:44 +0000"  >&lt;p&gt;Coming back to this issue, we checked there isn&apos;t any deadlock on the client node. This issue is being seen quite often even if this can be an illogical behavior. Some users run this kind of pattern when they enable some kind of debugging logs on their programs. This problem is particularly annoying when the program has been coded in Fortran (ftrunk behavior quite different from C).&lt;/p&gt;

&lt;p&gt;So we have this scenario: users run this strange ftruncate pattern on 128 nodes, and it provokes the eviction of about 500 Lustre clients. Even though users are not supposed to do that, I think we could have a DoS issue if we don&apos;t fix it.&lt;/p&gt;

&lt;p&gt;What do you think?&lt;/p&gt;</comment>
                            <comment id="50728" author="adilger" created="Thu, 17 Jan 2013 16:57:19 +0000"  >&lt;p&gt;Diego, the reproducer test - is this just the C equivalent of what is happening in Fortran?  I think user education that their debug logs are useless in this case would also help - having a different log file per MPI rank, or writing to a different offset within the debug file are useful options.&lt;/p&gt;

&lt;p&gt;The open() + stat() + truncate() + write() combination is a difficult one to improve.  If it were open(O_TRUNC) + write(), that would be a bit better, and open() + truncate() + write() would also be better, but in both cases the write() will still cause the client to get a DLM lock, which has to be revoked immediately by the next client.  There is no way to have parallel locking or IO, since the IO range always conflicts with every other client.&lt;/p&gt;

&lt;p&gt;In order to handle this on the OST, we would need to build some &quot;memory&quot; into the OST objects so that the OST can detect that there is high contention on the DLM locking, and simply not grant the DLM lock to the client for the stat() call.  This could already be done by only returning a glimpse for stat(), but then there is still a lock conflict for the writes.  Doing server-side locking for writes might help, as long as the writes are relatively small in size.&lt;/p&gt;
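&lt;p&gt;(Illustrative only: a minimal sketch of the per-rank log file suggestion above; the path /ptmp/user/debug.%d is hypothetical.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#include &amp;lt;mpi.h&amp;gt;
#include &amp;lt;stdio.h&amp;gt;

int main(int argc, char **argv)
{
        int myrank;
        char path[64];
        FILE *log;

        MPI_Init(&amp;amp;argc, &amp;amp;argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &amp;amp;myrank);

        /* One private file per rank: no two clients ever touch the same
         * object, so there are no conflicting extent locks to revoke. */
        snprintf(path, sizeof(path), &quot;/ptmp/user/debug.%d&quot;, myrank);
        log = fopen(path, &quot;w&quot;);
        if (log != NULL) {
                fprintf(log, &quot;rank %d starting\n&quot;, myrank);
                fclose(log);
        }

        MPI_Finalize();
        return 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Run with mpicc + srun as in the reproducer, each rank then writes only its own file and the shared-file lock ping-pong disappears.&lt;/p&gt;</comment>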
                            <comment id="329508" author="paf0186" created="Thu, 17 Mar 2022 16:07:40 +0000"  >&lt;p&gt;Lockless truncate was removed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14838&quot; title=&quot;Remove old lockless code: Truncate &amp;amp; contention based lockless i/o&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14838&quot;&gt;&lt;del&gt;LU-14838&lt;/del&gt;&lt;/a&gt; and there have been many other related fixes.&#160; This is likely resolved.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="24162">LU-4881</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 17 Jan 2013 05:29:35 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvcpr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>5649</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 23 Nov 2012 05:29:35 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>