<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:25:07 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16224] rw_seq_cst_vs_drop_caches dies with SIGBUS</title>
                <link>https://jira.whamcloud.com/browse/LU-16224</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Running the reproducer from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14541&quot; title=&quot;Memory reclaim caused a stale data read&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14541&quot;&gt;&lt;del&gt;LU-14541&lt;/del&gt;&lt;/a&gt; (rw_seq_cst_vs_drop_caches.c) fails about 50% of the time with Lustre 2.15.1 (both client and servers).&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@mutt21:toss-5803-sigbus]# ./run_test /p/olaf{a,b}/faaland1/test/sigbustest
++ ./rw_seq_cst_vs_drop_caches /p/olafa/faaland1/test/sigbustest /p/olafb/faaland1/test/sigbustest
u = 60, v = { 60, 59 }
./run_test: line 11: 120055 Aborted                 (core dumped) ./rw_seq_cst_vs_drop_caches $1 $2
++ status=134
++ signum=6
++ case $signum in
++ echo FAIL with SIGBUS
FAIL with SIGBUS
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Although it&apos;s not yet confirmed to be the same issue, we have two users reporting jobs dying with a bus error intermittently, when using Lustre for I/O, which is what prompted me to run this against Lustre 2.15.1.&lt;/p&gt;</description>
                <environment>lustre-2.15.1_5.llnl&lt;br/&gt;
4.18.0-372.26.1.1toss.t4.x86_64</environment>
        <key id="72713">LU-16224</key>
            <summary>rw_seq_cst_vs_drop_caches dies with SIGBUS</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>hxr</label>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 7 Oct 2022 21:09:33 +0000</created>
                <updated>Fri, 26 May 2023 20:59:54 +0000</updated>
                            <resolved>Sat, 20 May 2023 11:22:38 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="349032" author="ofaaland" created="Fri, 7 Oct 2022 21:10:12 +0000"  >&lt;p&gt;For our reference, our local ticket is TOSS5803&lt;/p&gt;</comment>
                            <comment id="349033" author="pjones" created="Fri, 7 Oct 2022 21:28:11 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="349037" author="bobijam" created="Sat, 8 Oct 2022 00:21:50 +0000"  >&lt;p&gt;Can you apply this patch &lt;a href=&quot;https://review.whamcloud.com/#/c/fs/lustre-release/+/48607/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/fs/lustre-release/+/48607/&lt;/a&gt; and try the reproducer? It contains the fix concerning about the SIGBUS issue.&lt;/p&gt;</comment>
                            <comment id="349038" author="pjones" created="Sat, 8 Oct 2022 00:37:31 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Why don&apos;t you port the patch to b2_15 to make it easier for testing purposes?&#160;&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="349048" author="bobijam" created="Sat, 8 Oct 2022 03:43:01 +0000"  >&lt;p&gt;Here are the ports of the patches for SIGBUS issue.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/48804&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/48804&lt;/a&gt; &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; llite: clear stale page&apos;s uptodate bit&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/48805&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/48805&lt;/a&gt; &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; llite: clear page uptodate bit on cache drop&lt;/p&gt;</comment>
                            <comment id="349400" author="ofaaland" created="Wed, 12 Oct 2022 16:55:43 +0000"  >&lt;p&gt;Hi Bobijam,&lt;/p&gt;

&lt;p&gt;I pulled those changes (48804 and 48805) into our 2.15.1-based patch stack and confirmed that rw_seq_cst_vs_drop_caches runs successfully now.  I gave both changes my +1 to reflect that.&lt;/p&gt;

&lt;p&gt; rw_seq_cst_vs_drop_caches fails on our 2.12.9-based clients as well.  It looks to me like the two &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; patches depend on a third patch (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14541&quot; title=&quot;Memory reclaim caused a stale data read&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14541&quot;&gt;&lt;del&gt;LU-14541&lt;/del&gt;&lt;/a&gt; llite: Check vmpage in releasepage) which Etienne backported to b2_12 in change &lt;a href=&quot;https://review.whamcloud.com/48311&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/48311&lt;/a&gt; but which never got reviews or was landed.&lt;/p&gt;

&lt;p&gt;Are those three patches the right ones for 2.12 to address the issue there?&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="349461" author="bobijam" created="Thu, 13 Oct 2022 05:00:55 +0000"  >&lt;p&gt;thank you for the confirmation. Yes, besides &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; patches, I also think &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16064&quot; title=&quot;RPC from evicted client can corrupt data&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16064&quot;&gt;LU-16064&lt;/a&gt; is another ticket addressing the read inconsistency issue.&lt;/p&gt;</comment>
                            <comment id="350236" author="ofaaland" created="Wed, 19 Oct 2022 21:11:13 +0000"  >&lt;p&gt;Hi Bobijam,&lt;/p&gt;

&lt;p&gt;What is the status of the two backports?  I saw that there was a review question for one, and the other seems to have a build issue.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="350257" author="bobijam" created="Thu, 20 Oct 2022 01:49:20 +0000"  >&lt;p&gt;A revised patch which is trying to address the review question is under review, when it&apos;s passed I&apos;d update the backports.&lt;/p&gt;</comment>
                            <comment id="352819" author="JIRAUSER17900" created="Sat, 12 Nov 2022 04:11:07 +0000"  >&lt;p&gt;2022-11-12: The b2_15 patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; is being updated according to master one. The master patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16064&quot; title=&quot;RPC from evicted client can corrupt data&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16064&quot;&gt;LU-16064&lt;/a&gt; needs to be rebased.&lt;/p&gt;</comment>
                            <comment id="352957" author="ofaaland" created="Mon, 14 Nov 2022 18:49:15 +0000"  >&lt;p&gt;Thank you for the update&lt;/p&gt;</comment>
                            <comment id="355005" author="JIRAUSER17900" created="Sat, 3 Dec 2022 07:45:32 +0000"  >&lt;p&gt;2022-12-03: The b2_15 patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; is being worked on. The master patch of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16064&quot; title=&quot;RPC from evicted client can corrupt data&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16064&quot;&gt;LU-16064&lt;/a&gt; is being reviewed.&lt;/p&gt;</comment>
                            <comment id="356287" author="ofaaland" created="Tue, 13 Dec 2022 19:53:03 +0000"  >&lt;p&gt;Hi, do you have an update for this issue?&#160; It is creating problems for at least two users.&#160; Thanks&lt;/p&gt;</comment>
                            <comment id="357766" author="pjones" created="Tue, 3 Jan 2023 17:08:04 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Do I understand correctly you intend to port &#160;&lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49534&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49534&lt;/a&gt; to b2_15 and then ask LLNL to use that in their reproducer?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="357905" author="bobijam" created="Wed, 4 Jan 2023 15:42:18 +0000"  >&lt;p&gt;yes, I&apos;d port it to b2_15 at&#160;&lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49553&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49553&lt;/a&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="359206" author="pjones" created="Mon, 16 Jan 2023 21:54:41 +0000"  >&lt;p&gt;Olaf&lt;/p&gt;

&lt;p&gt;Do you have a reliable reproducer for this issue? Are able to test the effectiveness of the patch?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="360101" author="ofaaland" created="Mon, 23 Jan 2023 20:15:11 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;My 2 users hitting what I suspect to be the same issue are running against older Lustre versions, 2.12 clients and 2.14 servers.&lt;/p&gt;

&lt;p&gt;When Bobijam&apos;s backport allows rw_seq_cst_vs_drop_caches.c to succeed reliably, and passes the usual automated tests, then I&apos;ll pull it into our 2.15 branch and work on getting my users to run on our 2.15 machines.  Right now the backport has a -1 from Maloo.&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="360105" author="paf0186" created="Mon, 23 Jan 2023 20:41:12 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;Please try &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49647/1&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49647/&lt;/a&gt; and &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49653/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49653/&lt;/a&gt; (both, applied in that order) the first is a similar but simpler fix for this issue, the second is a fix for a possible data inconsistency exposed (though not caused) by the first.&#160; We will probably be taking this in preference to Bobi&apos;s patch.&#160; (The Maloo failures on those are unrelated to the patches, I&apos;m just waiting for review, etc, before retriggering testing.)&lt;/p&gt;

&lt;p&gt;By the way, I&apos;ve been able to reproduce this issue locally using the same sanity test Olaf referenced.&#160; For me it&apos;s only reliably hittable in the presence of memory pressure, but that seems to be timing, since it shouldn&apos;t be a hard requirement for the bug to occur.&#160; (Though might be involved in the real applications hitting the issue.)&lt;/p&gt;</comment>
                            <comment id="368099" author="JIRAUSER17900" created="Sat, 1 Apr 2023 05:00:14 +0000"  >&lt;p&gt;2023-04-01: Two patches provided to LLNL for test, one patch(#49647) landed to master, another one(#49653) is being reviewed.&lt;/p&gt;</comment>
                            <comment id="368872" author="JIRAUSER17900" created="Sat, 8 Apr 2023 07:43:58 +0000"  >&lt;p&gt;2023-04-08: Two patches provided to LLNL for test, one patch(#49647) landed to master, another one(#49653) is ready to land to master(in master-next branch).&lt;/p&gt;</comment>
                            <comment id="371490" author="JIRAUSER17900" created="Mon, 8 May 2023 04:59:26 +0000"  >&lt;p&gt;2023-05-08: Both the two patches provided to LLNL for test landed(#49647 #49653) to master.&lt;/p&gt;</comment>
                            <comment id="372209" author="pjones" created="Sat, 13 May 2023 14:38:32 +0000"  >&lt;p&gt;Both patches ready to land for b2_15.&lt;/p&gt;</comment>
                            <comment id="373037" author="JIRAUSER17900" created="Sat, 20 May 2023 07:42:34 +0000"  >&lt;p&gt;2023-05-20: Both patches landed to b2_15.&lt;/p&gt;</comment>
                            <comment id="373043" author="pjones" created="Sat, 20 May 2023 11:22:38 +0000"  >&lt;p&gt;Fix provided in upcoming 2.15.3 release. Marking as duplicate but will only remove topllnl label when LLNL have confirmed effectiveness of fixes.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="71579">LU-16064</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="72362">LU-16160</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i032db:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>