<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:34:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-17364] osc_page_delete LBUG - trying to delete a page under write</title>
                <link>https://jira.whamcloud.com/browse/LU-17364</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Client crashed on osc_page_delete, and the page is waiting for write&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2281253.531369] LustreError: 81367:0:(osc_cache.c:2558:osc_teardown_async_page()) extent ffff883c74c7b810@

{[28680 -&amp;gt; 28680/32767], [2|0|-|cache|wi|ffff883ba0bbde00], [28672|1|+|-|ffff884a03601b00|4096| (null)]}

trunc at 28680.
[2281253.553678] LustreError: 81367:0:(osc_cache.c:2558:osc_teardown_async_page()) ### extent: ffff883c74c7b810 ns: euscrat-OST0004-osc-ffff887a7da55000 lock: ffff884a03601b00/0xb3e57269e5f70d90 lrc: 12/0,1 mode: PW/PW res: [0x480000402:0x1beede4b:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;18446744073709551615) flags: 0x800020000020000 nid: local remote: 0x93336ee709e18f0b expref: -99 pid: 81367 timeout: 0 lvb_type: 1 l_ast_data: 0000000000000000
[2281253.597632] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) page@ffff8841ea116400[3 ffff883e49200d70 4 1 (null)]

[2281253.613495] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) vvp-page@ffff8841ea116458(1:0) vm@fffff967dd5ace00 2fffff00000835 2:0 ffff8841ea116400 28680 lru

[2281253.613499] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) lov-page@ffff8841ea116498, gen: 0

[2281253.613511] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) osc-page@ffff8841ea1164d0 28680: 1&amp;lt; 0x845fed 2 0 + - &amp;gt; 2&amp;lt; 117473280 0 4096 0x0 0x420 | (null) ffff88622eda49b0 ffff883ba0bbde00 &amp;gt; 3&amp;lt; 0 0 0 &amp;gt; 4&amp;lt; 0 0 16 242233344 - | - - + - &amp;gt; 5&amp;lt; - - + - | 0 - | 3648 - -&amp;gt;

[2281253.613512] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) end page@ffff8841ea116400

[2281253.613514] LustreError: 81367:0:(osc_page.c:191:osc_page_delete()) Trying to teardown failed: -16
[2281253.613515] LustreError: 81367:0:(osc_page.c:192:osc_page_delete()) ASSERTION( 0 ) failed:
[2281253.613516] LustreError: 81367:0:(osc_page.c:192:osc_page_delete()) LBUG
[2281253.613518] Pid: 81367, comm: julia 3.10.0-1160.88.1.el7.x86_64 #1 SMP Tue Mar 7 15:41:52 UTC 2023
[2281253.613518] Call Trace:
[2281253.613549] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x90/0xf0 [libcfs]
[2281253.613560] [&amp;lt;0&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[2281253.613568] [&amp;lt;0&amp;gt;] osc_page_delete+0x47e/0x4b0 [osc]
[2281253.613592] [&amp;lt;0&amp;gt;] cl_page_delete0+0x80/0x220 [obdclass]
[2281253.613602] [&amp;lt;0&amp;gt;] cl_page_delete+0x33/0x110 [obdclass]
[2281253.613618] [&amp;lt;0&amp;gt;] ll_invalidatepage+0x87/0x180 [lustre]
[2281253.613634] [&amp;lt;0&amp;gt;] do_invalidatepage_range+0x7d/0x90
[2281253.613642] [&amp;lt;0&amp;gt;] truncate_inode_page+0x7f/0x90
[2281253.613643] [&amp;lt;0&amp;gt;] generic_error_remove_page+0x2a/0x40
[2281253.613652] [&amp;lt;0&amp;gt;] vvp_page_discard+0x5e/0xd0 [lustre]
[2281253.613663] [&amp;lt;0&amp;gt;] cl_page_discard+0x4b/0x70 [obdclass]
[2281253.613675] [&amp;lt;0&amp;gt;] cl_page_list_discard+0x56/0x160 [obdclass]
[2281253.613682] [&amp;lt;0&amp;gt;] ll_io_read_page+0x3f5/0x890 [lustre]
[2281253.613688] [&amp;lt;0&amp;gt;] ll_readpage+0xe6/0x820 [lustre]
[2281253.613693] [&amp;lt;0&amp;gt;] filemap_fault+0x1f8/0x420
[2281253.613699] [&amp;lt;0&amp;gt;] ll_filemap_fault+0x39/0x70 [lustre]
[2281253.613706] [&amp;lt;0&amp;gt;] vvp_io_fault_start+0x5fa/0xe50 [lustre]
[2281253.613718] [&amp;lt;0&amp;gt;] cl_io_start+0x70/0x140 [obdclass]
[2281253.613729] [&amp;lt;0&amp;gt;] cl_io_loop+0x9f/0x200 [obdclass]
[2281253.613735] [&amp;lt;0&amp;gt;] ll_fault+0x52d/0x8a0 [lustre]
[2281253.613746] [&amp;lt;0&amp;gt;] __do_fault.isra.61+0x8a/0x100
[2281253.613754] [&amp;lt;0&amp;gt;] do_shared_fault.isra.64+0x4c/0x280
[2281253.613758] [&amp;lt;0&amp;gt;] handle_mm_fault+0x459/0x1190
[2281253.613765] [&amp;lt;0&amp;gt;] __do_page_fault+0x213/0x510
[2281253.613766] [&amp;lt;0&amp;gt;] do_page_fault+0x35/0x90
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="79577">LU-17364</key>
            <summary>osc_page_delete LBUG - trying to delete a page under write</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="shadow">Alexey Lyashkov</assignee>
                                    <reporter username="bobijam">Zhenyu Xu</reporter>
                        <labels>
                    </labels>
                <created>Thu, 14 Dec 2023 14:20:44 +0000</created>
                <updated>Fri, 26 Jan 2024 17:15:26 +0000</updated>
                            <resolved>Tue, 23 Jan 2024 13:44:44 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="397152" author="icostelloddn" created="Mon, 18 Dec 2023 02:04:19 +0000"  >&lt;p&gt;This looks identical to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12752&quot; title=&quot;osc_page.c:osc_page_delete() ASSERTION( 0 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12752&quot;&gt;&lt;del&gt;LU-12752&lt;/del&gt;&lt;/a&gt; ?&lt;/p&gt;</comment>
                            <comment id="397875" author="gerrit" created="Fri, 22 Dec 2023 09:15:43 +0000"  >&lt;p&gt;&quot;Zhenyu Xu &amp;lt;bobijam@hotmail.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53533&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53533&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17364&quot; title=&quot;osc_page_delete LBUG - trying to delete a page under write&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17364&quot;&gt;&lt;del&gt;LU-17364&lt;/del&gt;&lt;/a&gt; llite: do not discard dirty pages&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 1b7657791d03f8d548eef53ef335cfad21652b3a&lt;/p&gt;</comment>
                            <comment id="397880" author="shadow" created="Fri, 22 Dec 2023 09:53:54 +0000"  >&lt;p&gt;In fact, this is not a bug in read. And page might don&apos;t have dirty flag once it processed by osc_make_ready and vvp_make_ready have clear this flag, but oap/extent still in rpc state which caused a bug.&lt;br/&gt;
this bug reproduced easy with fsx run with sysctl -w vm.drop_caches=3 in parallel.&lt;br/&gt;
just less than half hour, typically 10min.&lt;br/&gt;
this is race between page reclaim, mkwrite and any read  (read page / readahead / ... etc).&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000080:00800000:1.0:1702501537.785719:0:24553:0:(llite_mmap.c:235:ll_page_mkwrite0()) fsx mkwrite with 0
00000080:00800008:1.0:1702501537.785731:0:24553:0:(llite_mmap.c:406:ll_fault()) [0x200000401:0x10:0x0]: vma=000000005cc36916 start=0x7f24a1f83000 end=0x7f24a1f8b000 vm_flags=0x80000fb idx=12833
00000080:00000001:1.0:1702501537.785734:0:24553:0:(llite_mmap.c:304:ll_fault0()) Process entered
00000080:00200000:1.0:1702501537.785743:0:24553:0:(rw.c:1885:ll_readpage()) fast read pgno: 12833
00000080:00000001:1.0:1702501537.785757:0:24553:0:(llite_mmap.c:337:ll_fault0()) Process leaving via out (rc=0 : 0 : 0x0)
00000080:00800000:1.0:1702501537.785760:0:24553:0:(llite_mmap.c:380:ll_fault0()) fsx fault 512/0
00000080:00000001:1.0:1702501537.785761:0:24553:0:(llite_mmap.c:381:ll_fault0()) Process leaving (rc=512 : 512 : 200)
Page installed into page cache, mmap write started 

00000080:00800008:1.0:1702501537.785764:0:24553:0:(llite_mmap.c:486:ll_page_mkwrite()) [0x200000401:0x10:0x0]: vma=000000005cc36916 start=0x7f24a1f83000 end=0x7f24a1f8b000 vm_flags=0x80000fb idx=12833
00000080:00000001:1.0:1702501537.785769:0:24553:0:(llite_mmap.c:170:ll_page_mkwrite0()) Process entered
00000080:00000001:1.0:1702501537.785774:0:24553:0:(llite_mmap.c:107:ll_fault_io_init()) Process entered
00000080:00800000:1.0:1702501537.785775:0:24553:0:(llite_mmap.c:129:ll_fault_io_init()) [0x200000401:0x10:0x0]: vma=000000005cc36916 start=0x7f24a1f83000 end=0x7f24a1f8b000 vm_flags=0x80000fb idx=12833
00000080:00000001:1.0:1702501537.785805:0:24553:0:(llite_mmap.c:155:ll_fault_io_init()) Process leaving (rc=18446612688480406592 : -131385229145024 : ffff88817e168c40)


00000080:00000001:3.0:1702501537.785972:0:24541:0:(rw26.c:170:do_release_page()) Process entered
00000008:00000040:3.0:1702501537.785985:0:24541:0:(osc_cache.c:2492:osc_teardown_async_page()) teardown oap 000000004bf2105f page 000000008d8d09d3 at index 12833.
00000080:00000001:3.0:1702501537.786004:0:24541:0:(rw26.c:213:do_release_page()) Process leaving (rc=1 : 1 : 1)

page release by drop cache, but vmpage had an extra references and don&apos;t freed.


00000080:00000001:1.0:1702501537.786026:0:24553:0:(vvp_io.c:1483:vvp_io_fault_start()) Process entered
00000020:00008000:1.0:1702501537.786048:0:24553:0:(cl_page.c:359:cl_page_find()) 12833@[0x200000401:0x10:0x0] 000000000a4501fb 0 1
00000008:00000040:1.0:1702501537.786074:0:24553:0:(osc_cache.c:2289:osc_prep_async_page()) oap 00000000ca649d88 vmpage 000000000a4501fb obj off 52563968
Ops. vmpage old, but no cl page attached - so allocate new, vmpage locked, mkwrite starts io

00000008:00000020:1.0:1702501537.786117:0:24553:0:(osc_cache.c:2370:osc_queue_async_io()) obj 00000000d9a2a1d1 ready 0|-|- wr 9|-|- rd 0|- oap 00000000ca649d88 page 000000000a4501fb added for cmd 2
00000008:00000020:1.0:1702501537.786128:0:24553:0:(osc_cache.c:1388:osc_consume_write_grant()) using 4096 grant credits for brw 00000000513e54ab page 000000000a4501fb

dirty flag cleared, vmpage lock released.

00000080:00000001:1.0:1702501537.786248:0:24553:0:(vvp_io.c:1637:vvp_io_fault_start()) Process leaving
00000080:00000001:1.0:1702501537.786489:0:24553:0:(llite_mmap.c:229:ll_page_mkwrite0()) Process leaving
00000080:00800000:1.0:1702501537.786513:0:24553:0:(llite_mmap.c:235:ll_page_mkwrite0()) fsx mkwrite with 0
00000080:00800008:1.0:1702501537.786527:0:24553:0:(llite_mmap.c:406:ll_fault()) [0x200000401:0x10:0x0]: vma=000000005cc36916 start=0x7f24a1f83000 end=0x7f24a1f8b000 vm_flags=0x80000fb idx=12832
00000080:00000001:1.0:1702501537.786530:0:24553:0:(llite_mmap.c:304:ll_fault0()) Process entered
00000080:00000001:1.0:1702501537.786549:0:24553:0:(llite_mmap.c:107:ll_fault_io_init()) Process entered
00000080:00800000:1.0:1702501537.786550:0:24553:0:(llite_mmap.c:129:ll_fault_io_init()) [0x200000401:0x10:0x0]: vma=000000005cc36916 start=0x7f24a1f83000 end=0x7f24a1f8b000 vm_flags=0x80000fb idx=12832
00000080:00000001:1.0:1702501537.786581:0:24553:0:(llite_mmap.c:155:ll_fault_io_init()) Process leaving (rc=18446612688480406592 : -131385229145024 : ffff88817e168c40)

once page uptodate flag cleared by cl_page_delete early, mmap read start to read this page from network... OOOPS

00000080:00000001:1.0:1702501537.786787:0:24553:0:(vvp_io.c:1483:vvp_io_fault_start()) Process entered
00000020:00008000:1.0:1702501537.787727:0:24553:0:(cl_page.c:359:cl_page_find()) 12833@[0x200000401:0x10:0x0] 000000000a4501fb ffff88817fcefa00 1
00000008:00000020:1.0:1702501537.788082:0:24553:0:(osc_io.c:178:osc_io_submit()) Busy oap 00000000ca649d88 page 000000007e1a659d for submit.

Busy OAP -&amp;gt; return -EAGAIN to submit IO and page under delete in the ll_io_read_page. That&apos;s ALL. Panic.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="398007" author="gerrit" created="Mon, 25 Dec 2023 11:57:04 +0000"  >&lt;p&gt;&quot;Alexey Lyashkov &amp;lt;alexey.lyashkov@hpe.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53550&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53550&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17364&quot; title=&quot;osc_page_delete LBUG - trying to delete a page under write&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17364&quot;&gt;&lt;del&gt;LU-17364&lt;/del&gt;&lt;/a&gt; llite: don&apos;t use stale page.&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c056724ef52cf68b7680a20e8e9a6da0848b3ef5&lt;/p&gt;</comment>
                            <comment id="398957" author="JIRAUSER18015" created="Tue, 9 Jan 2024 12:50:58 +0000"  >&lt;p&gt;Hi,&#160;&lt;/p&gt;

&lt;p&gt;We are stuck with the same problem with Lustre based on the b2_15 branch. Are you planning to backport the abovementioned patches in the near future?&lt;/p&gt;

&lt;p&gt;Best,&#160;&lt;/p&gt;

&lt;p&gt;Dominika Wanat&lt;/p&gt;</comment>
                            <comment id="399530" author="paf0186" created="Fri, 12 Jan 2024 17:36:09 +0000"  >&lt;p&gt;Dominika,&lt;/p&gt;

&lt;p&gt;Do you have any details on your setup?&#160; Was there any particular change you made that seemed linked to the problem occurring, for example, a kernel update?&#160; I ask because the underlying bug has been present for a while and we&apos;re trying to track if there&apos;s a reason it&apos;s now happening more often.&lt;/p&gt;</comment>
                            <comment id="399551" author="JIRAUSER18015" created="Fri, 12 Jan 2024 20:01:38 +0000"  >&lt;p&gt;@Patrick, next week, we will prepare a complete history of updates, patches, and bug occurrences from the client and server sides. This bug is also nothing new for us - we have been experiencing it on the client nodes since the beginning of April 2022 (and kernel 4.18.0-348.20.1.el8_5.x86_64). Now, we are using clients with kernel 4.18.0-477.27.1.el8_8.x86_64, and the issues seem to occur more often despite the latest application of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16043&quot; title=&quot;(osc_page.c:183:osc_page_delete()) LBUG&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16043&quot;&gt;&lt;del&gt;LU-16043&lt;/del&gt;&lt;/a&gt;. Interestingly, the presented LBUG is always caused by ams.exe, a part of &lt;a href=&quot;https://www.scm.com/doc/ADF/index.html&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ADF&lt;/a&gt;, but not every job using ADF causes lbug...&lt;/p&gt;</comment>
                            <comment id="399558" author="shadow" created="Fri, 12 Jan 2024 20:53:57 +0000"  >&lt;p&gt;@Patrik,&lt;/p&gt;

&lt;p&gt;this issue exist for any lustre where up2date bit cleared in vvp_page_delete and this is related to the page reclaim.&lt;/p&gt;</comment>
                            <comment id="400724" author="gerrit" created="Tue, 23 Jan 2024 05:40:55 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53550/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53550/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17364&quot; title=&quot;osc_page_delete LBUG - trying to delete a page under write&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17364&quot;&gt;&lt;del&gt;LU-17364&lt;/del&gt;&lt;/a&gt; llite: don&apos;t use stale page.&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: dad3bed7617fba895db169facde91856e89c2b08&lt;/p&gt;</comment>
                            <comment id="400798" author="pjones" created="Tue, 23 Jan 2024 13:44:44 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i044xz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>