<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:27:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16515] sanity test_118c test_118d: No page in writeback, writeback=0</title>
                <link>https://jira.whamcloud.com/browse/LU-16515</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for S Buisson &amp;lt;sbuisson@ddn.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/8136068e-67b8-43b8-9a9a-f8d956af9458&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/8136068e-67b8-43b8-9a9a-f8d956af9458&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_118d failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;No page in writeback, writeback=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Test session details:&lt;br/&gt;
clients: &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/91915&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/91915&lt;/a&gt; - 4.18.0-372.32.1.el8_6.x86_64&lt;br/&gt;
servers: &lt;a href=&quot;https://build.whamcloud.com/job/lustre-reviews/91915&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.whamcloud.com/job/lustre-reviews/91915&lt;/a&gt; - 4.18.0-372.32.1.el8_lustre.x86_64&lt;/p&gt;

&lt;p&gt;Test output is just:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== sanity test 118d: Fsync validation inject a delay of the bulk ==================================================================== 14:43:29 (1674830609)
7+0 records in
7+0 records out
458752 bytes (459 kB, 448 KiB) copied, 0.00275827 s, 166 MB/s
CMD: onyx-117vm3 lctl set_param fail_val=0 fail_loc=0x214
fail_val=0
fail_loc=0x214
 sanity test_118d: @@@@@@ FAIL: No page in writeback, writeback=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;







&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
sanity test_118d - No page in writeback, writeback=0&lt;/p&gt;</description>
                <environment></environment>
        <key id="74226">LU-16515</key>
            <summary>sanity test_118c test_118d: No page in writeback, writeback=0</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ys">Yang Sheng</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>always_except</label>
                    </labels>
                <created>Mon, 30 Jan 2023 07:25:13 +0000</created>
                <updated>Fri, 16 Jun 2023 22:09:50 +0000</updated>
                                            <version>Lustre 2.16.0</version>
                    <version>Lustre 2.15.2</version>
                    <version>Lustre 2.15.3</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="361335" author="degremoa" created="Thu, 2 Feb 2023 10:19:24 +0000"  >&lt;p&gt;+1 &lt;a href=&quot;https://testing.whamcloud.com/test_sessions/953d8bc0-67fe-41ca-a159-e971ce38e1cf&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sessions/953d8bc0-67fe-41ca-a159-e971ce38e1cf&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="361412" author="adilger" created="Thu, 2 Feb 2023 18:42:06 +0000"  >&lt;p&gt;Patrick, could you please take a quick look at this failing subtest to see if this is a concern?&lt;/p&gt;

&lt;p&gt;It started failing on master 2023-01-18 and is now failing 10 times per day, so likely relates to a patch that was landed around that time.  The filtered list (excluding kfilnd and unrelated test changes) looks like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# git log --oneline --after 2023-01-16 --before 2023-01-19 | grep -E -v &quot;kfilnd|test&quot;
cb0aa0285b32 LU-2771 ldlm: remove obsolete LDLM_FL_SERVER_LOCK
0b406c91d175 LU-13642 lnet: modify lnet_inetdev to work with large NIDS
43fe6e51804f LU-16380 osd-ldiskfs: race in OI mapping
ae98c5fdaaf3 LU-16335 mdt: skip target check for rm_entry
a883fec55694 LU-16302 llite: Use alloc_inode_sb() to allocate inodes
7bf0e557a2b3 LU-15163 osd: osd_obj_map_recover() to restart transaction
17a3b5688435 LU-16460 lnet: validate data sent from user land properly
84c9618190f9 LU-16160 revert: &quot;llite: clear stale page&apos;s uptodate bit&quot;
f66b0c3b22bf LU-6142 osc: tidy up osc_init()
d137e9823ca1 LU-10003 lnet: use Netlink to support LNet ping commands
1632ed0340ad LU-6142 ldlm: use list_first_entry in ldlm_lock
c0aa5d97da57 LU-6142 ldlm: tidy list walking in ldlm_flock()
540c293a4d0f LU-16369 ldiskfs: do not check enc context at lookup
a0132a79df9b LU-16444 enc: null-enc names cannot be digested form
6f490275b0e0 LU-16026 llite: always enable remote subdir mount
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;so I would guess the likely candidate is the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16160&quot; title=&quot;take ldlm lock when queue sync pages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16160&quot;&gt;&lt;del&gt;LU-16160&lt;/del&gt;&lt;/a&gt; revert.&lt;/p&gt;</comment>
                            <comment id="361792" author="gerrit" created="Mon, 6 Feb 2023 23:22:38 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49918&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49918&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; clio: Remove cl_page_size()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3b97847db29f85178f3c940332287f5fdcd7bc85&lt;/p&gt;</comment>
                            <comment id="362333" author="gerrit" created="Thu, 9 Feb 2023 20:36:58 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49960&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49960&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; kfilnd: Allow one HELLO in-flight per peer&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 833f554bc9147cf58d8c6213c91f13c0bccb7c06&lt;/p&gt;</comment>
                            <comment id="362340" author="paf0186" created="Thu, 9 Feb 2023 21:03:05 +0000"  >&lt;p&gt;So I&apos;m doing a bisection here, so there will be some odd looking patch uploads here.&#160; I&apos;m going to start tagging them with BISECT.&lt;/p&gt;</comment>
                            <comment id="362343" author="gerrit" created="Thu, 9 Feb 2023 22:09:26 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49961&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49961&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; llite: BISECT SIGBUS is possible on a race with&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 666624c75fd30aed02493d1000c32dceba431f11&lt;/p&gt;</comment>
                            <comment id="362369" author="gerrit" created="Fri, 10 Feb 2023 01:53:56 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49963&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49963&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; sec: BISECT make nodemap root squash&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 4a46141776c641b70521d14ae8b8c99b5f994ecf&lt;/p&gt;</comment>
                            <comment id="362453" author="gerrit" created="Fri, 10 Feb 2023 17:55:15 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49965&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49965&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; sec: BISECT nodemap root squash independ&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 45e874fcbd59044479c2f51fa6d98dc9f25ca28b&lt;/p&gt;</comment>
                            <comment id="362593" author="paf0186" created="Mon, 13 Feb 2023 15:44:15 +0000"  >&lt;p&gt;So, my bisecting showed that the cause of this is:&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49527&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49527&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ys&quot; class=&quot;user-hover&quot; rel=&quot;ys&quot;&gt;ys&lt;/a&gt; , can you take a look?&#160; These tests started failing often after &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49527&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49527&lt;/a&gt; landed to master.&#160; They failed very rarely before.&#160; (I identified this patch by bisecting)&lt;/p&gt;</comment>
                            <comment id="362678" author="gerrit" created="Tue, 14 Feb 2023 06:06:43 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49918/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49918/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; clio: Remove cl_page_size()&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 19c38f6c94ae161b1c52dcb02667b4bf06cadc85&lt;/p&gt;</comment>
                            <comment id="363701" author="ssmirnov" created="Wed, 22 Feb 2023 06:08:56 +0000"  >&lt;p&gt;+1 on master: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/e93b1d78-fd45-4845-a0c2-03c3c13d6661&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/e93b1d78-fd45-4845-a0c2-03c3c13d6661&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="363746" author="hornc" created="Wed, 22 Feb 2023 16:16:41 +0000"  >&lt;p&gt;+1 on master &lt;a href=&quot;https://testing.whamcloud.com/test_sets/f66a30e2-d742-4c8f-acc0-9e66872ab142&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/f66a30e2-d742-4c8f-acc0-9e66872ab142&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="364350" author="arshad512" created="Tue, 28 Feb 2023 12:14:47 +0000"  >&lt;p&gt;+1 on Master (&lt;a href=&quot;https://testing.whamcloud.com/test_sets/3458dfd9-3ce1-4fee-a29d-37a09587b298&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/3458dfd9-3ce1-4fee-a29d-37a09587b298&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="364544" author="gerrit" created="Wed, 1 Mar 2023 17:37:40 +0000"  >&lt;p&gt;&quot;Patrick Farrell &amp;lt;pfarrell@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50169&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50169&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; tests: See if &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt; fixes 118c,118d&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8887fa6d22b6bc3bb18aef7f110d24a038f0e018&lt;/p&gt;</comment>
                            <comment id="364725" author="paf0186" created="Thu, 2 Mar 2023 17:02:37 +0000"  >&lt;p&gt;Per results of that testing, it looks like this issue is fixed by &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50158&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50158&lt;/a&gt; on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt;.&#160; We&apos;ll wait for that to land then close this out.&lt;/p&gt;</comment>
                            <comment id="365600" author="adilger" created="Sat, 11 Mar 2023 01:05:18 +0000"  >&lt;p&gt;Still seeing this error pretty regularly on master patches, even though they have the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt; fix.&#160; For example:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/fs/lustre-release/+/48096/5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/fs/lustre-release/+/48096/5&lt;/a&gt; has v2_15_54-60-g7d05a687ee as the parent, and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt; was landed as v2_15_54-59-g9d79f92076.&lt;/p&gt;</comment>
                            <comment id="365601" author="adilger" created="Sat, 11 Mar 2023 01:23:09 +0000"  >&lt;p&gt;Looking through the test history again, I see test_118c and test_118d started mostly failing on 2023-01-27, after the following series of patch landings (earliest failure is v2_15_53-110-geac03ca978 which is the last patch of the series):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;eac03ca978 LU-16464 osp: fix off-by-one errors in oxe_can_hold()
7e566c6a1f LU-16456 tests: skip conf-sanity test_129/132 in interop
742019b260 LU-16461 kfilnd: Modify peer credits and RX buffers
e500f49c30 LU-16451 kfilnd: Improve CQ error logging
98338572a6 LU-13530 build: Add kernel version to depmod
1335eb1d59 LU-16445 sec: make nodemap root squash independent of map_mode
b65374d96b LU-16285 ldlm: send the cancel RPC asap
f147655c33 LU-16415 quota: enforce project quota for root
96edf50039 LU-16367 misc: remove deprecated code
5efc4c1cb4 LU-16345 ofd: ofd_commitrw_read() with non-existing object
cdcf97e17e LU-16338 readahead: add stats for read-ahead page count
421b8fb4e9 LU-16333 osc: page fault in osc_release_bounce_pages()
01fb7bda97 LU-15495 tests: fixed dbench test
eb6518f7ff LU-16267 lnet: fix missing error check in LUTF
e2812e8773 LU-16228 utils: add lljobstat util
945ab61176 LU-16115 build: Linux 5.17 external module support
3774b6afbe LU-13485 ldiskfs: Parallel configure tests for ldiskfs
a346cf6cf2 LU-13485 kernel: Parallel core configure tests
182fa9be07 LU-13485 libcfs: Parallel configure tests for libcfs
b4da788a81 LU-16160 llite: SIGBUS is possible on a race with page reclaim
829af7b029 LU-16480 lov: fiemap improperly handles fm_extent_count=0
ba0d5ffc1c LU-9680 utils: new llapi_param_display_value().
178988d67a LU-14393 recovery: reply reconstruction for batched RPCs
840274b5c5 LU-14393 protocol: basic batching processing framework
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The only other test_118d failure was on 2023-01-17 for patch &lt;a href=&quot;https://review.whamcloud.com/45317&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45317&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15155&quot; title=&quot;Add lock request to readahead&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15155&quot;&gt;LU-15155&lt;/a&gt; llite: Make readahead request locks&lt;/tt&gt;&quot; which is in the middle of a bunch of other pagecache-modifying patches that haven&apos;t landed yet, so is unlikely to be related (this had confused me previously).&lt;/p&gt;

&lt;p&gt;I see that &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt; ldlm: send the cancel RPC asap&lt;/tt&gt;&quot; is in that batch, but we might need more than &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16285&quot; title=&quot;Prolong the lock BL timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16285&quot;&gt;&lt;del&gt;LU-16285&lt;/del&gt;&lt;/a&gt; to fix it.&lt;/p&gt;</comment>
                            <comment id="366082" author="ys" created="Thu, 16 Mar 2023 06:18:57 +0000"  >&lt;p&gt;This has really confused me. From the log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000080:00200000:0.0:1678874193.081576:0:332595:0:(vvp_io.c:1279:vvp_io_write_start()) f118d.sanity: write [0, 4096)
00000080:00200000:0.0:1678874193.081581:0:332595:0:(rw26.c:725:ll_write_begin()) Writing 0 of 0 to 4096 bytes
00000080:00200000:0.0:1678874193.081593:0:332595:0:(rw26.c:908:ll_write_end()) pos 0, len 4096, copied 4096
00000080:00200000:0.0:1678874193.081597:0:332595:0:(rw26.c:945:ll_write_end()) page@00000000bdbe9cf6[3 0000000097786b56 2 1 0000000095f53b23]

00000080:00200000:0.0:1678874193.081598:0:332595:0:(rw26.c:945:ll_write_end()) vmpage @00000000355f6090 fffffc0002001 3:0 ffff9ff680f81e00 0 lru       
                                                                                                                      
                                                                        =================^
00000080:00200000:0.0:1678874193.081602:0:332595:0:(rw26.c:945:ll_write_end()) osc-page@00000000e81681ab 0: 1&amp;lt; 0 - - &amp;gt; 2&amp;lt; 0 0 0 0x0 0x0 | 0000000000000000 00000000516aba8e 000000004f241537 &amp;gt; 3&amp;lt; 0 0 &amp;gt; 4&amp;lt; 0 0 8 16334848 - | - - - - &amp;gt; 5&amp;lt; - - - - | 0 - | 0 - -&amp;gt;

00000080:00200000:0.0:1678874193.081603:0:332595:0:(rw26.c:945:ll_write_end()) end page@00000000bdbe9cf6

00000080:00200000:0.0:1678874193.081604:0:332595:0:(rw26.c:945:ll_write_end()) queued page: 1.
00000080:00200000:0.0:1678874193.081605:0:332595:0:(vvp_io.c:1169:vvp_io_write_commit()) commit async pages: 1, from 0, to 4096
00000008:00010000:0.0:1678874193.081612:0:332595:0:(osc_cache.c:802:osc_extent_find()) ### extent: 0000000023189926 ns: lustre-OST0006-osc-ffff9ff6a6978800 lock: 00000000c663eb85/0xab5218239206893a lrc: 4/0,1 mode: PW/PW res: [0x52b5:0x0:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) gid 0 flags: 0x800020000000000 nid: local remote: 0x643532634dfd448a expref: -99 pid: 332595 timeout: 0 lvb_type: 1
00000080:00200000:0.0:1678874193.081620:0:332595:0:(vvp_io.c:1091:vvp_set_pagevec_dirty()) mapping 00000000608a50d7, count 1, dirtied 1
00000080:00200000:0.0:1678874193.081627:0:332595:0:(vvp_io.c:1192:vvp_io_write_commit()) Committed 1 pages 4096 bytes, tot: 4096
00000080:00200000:0.0:1678874193.081631:0:332595:0:(file.c:1472:ll_merge_attr()) [0x200001b78:0xd35:0x0] updating i_size 4096
00000080:00200000:0.0:1678874193.081633:0:332595:0:(file.c:4888:ll_fsync()) VFS Op:inode=[0x200001b78:0xd35:0x0](00000000101efe33), start 0, end 4095, datasync 0
00000080:00200000:0.0:1678874193.081637:0:332595:0:(vvp_io.c:1780:vvp_io_init()) [0x200001b78:0xd35:0x0] ignore/verify layout 1/0, layout version 2 restore needed 0
00020000:00200000:0.0:1678874193.081640:0:332595:0:(lov_io.c:819:lov_io_iter_init()) component[0] flags 0x10
00000008:00010000:0.0:1678874193.081645:0:332595:0:(osc_cache.c:1867:try_to_add_extent_for_io()) ### extent: 0000000023189926 ns: lustre-OST0006-osc-ffff9ff6a6978800 lock: 00000000c663eb85/0xab5218239206893a lrc: 4/0,1 mode: PW/PW res: [0x52b5:0x0:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) gid 0 flags: 0x800020000000000 nid: local remote: 0x643532634dfd448a expref: -99 pid: 332595 timeout: 0 lvb_type: 1
00000008:00010000:0.0:1678874193.081650:0:332595:0:(osc_cache.c:1084:osc_extent_make_ready()) ### extent: 0000000023189926 ns: lustre-OST0006-osc-ffff9ff6a6978800 lock: 00000000c663eb85/0xab5218239206893a lrc: 4/0,1 mode: PW/PW res: [0x52b5:0x0:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) gid 0 flags: 0x800020000000000 nid: local remote: 0x643532634dfd448a expref: -99 pid: 332595 timeout: 0 lvb_type: 1
00010000:00010000:0.0:1678874193.081662:0:332595:0:(ldlm_lock.c:1507:ldlm_lock_match_with_skip()) ### matched (0 4095) ns: lustre-OST0006-osc-ffff9ff6a6978800 lock: 00000000c663eb85/0xab5218239206893a lrc: 5/0,1 mode: PW/PW res: [0x52b5:0x0:0x0].0x0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;4095) gid 0 flags: 0x800020000000000 nid: local remote: 0x643532634dfd448a expref: -99 pid: 332595 timeout: 0 lvb_type: 1
00000008:00100000:0.0:1678874193.081675:0:332595:0:(osc_request.c:1948:osc_brw_prep_request()) brw rpc 0000000034743c16 - object 0x0:21173 offset 0&amp;lt;&amp;gt;4096
00000100:00100000:1.0:1678874193.081701:0:2905:0:(client.c:1737:ptlrpc_send_new_req()) Sending RPC req@0000000034743c16 pname:cluuid:pid:xid:nid:opc:job ptlrpcd_00_01:8231eeb6-f8da-45b6-923d-908cec5c2274:2905:1760422647264640:10.240.42.118@tcp:4:multiop.0
00000400:00000080:1.0:1678874194.489851:0:332674:0:(module.c:211:libcfs_ioctl()) libcfs ioctl cmd 3221775648
00000001:02000400:1.0:1678874194.489853:0:332674:0:(debug.c:725:libcfs_debug_mark_buffer()) DEBUG MARKER: /usr/sbin/lctl mark  sanity test_118d: @@@@@@ FAIL: No page in writeback, writeback=0

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The page flag fffffc0002001 indicates that writeback has been set for this page. The OST_WRITE hadn&apos;t received a reply by the time the test case failed, so this flag should not have been cleared in the meantime. So why did the test fail? Maybe I am missing something?&lt;/p&gt;</comment>
                            <comment id="366355" author="adilger" created="Sun, 19 Mar 2023 19:15:02 +0000"  >&lt;p&gt;+73 failures in the past week.  Please increase priority to fix this issue as it slows down all development efforts.&lt;/p&gt;</comment>
                            <comment id="366481" author="paf0186" created="Mon, 20 Mar 2023 17:11:00 +0000"  >&lt;p&gt;Yang Sheng,&lt;/p&gt;

&lt;p&gt;Are you interested in looking in to this further?&#160; I haven&apos;t been able to reproduce it locally, which has been making looking at it kind of tricky.&lt;/p&gt;</comment>
                            <comment id="367110" author="ys" created="Thu, 23 Mar 2023 18:19:28 +0000"  >&lt;p&gt;I wonder why the sanity test Lustre log is collected without the -1 flag?  Is there an easy way to change it? &lt;/p&gt;</comment>
                            <comment id="367114" author="adilger" created="Thu, 23 Mar 2023 18:51:15 +0000"  >&lt;p&gt;You could push a patch to set &lt;tt&gt;debug=all&lt;/tt&gt; for this test. This is the default for sanity, but is lost when some subtests umount and remount the filesystem. &lt;/p&gt;

&lt;p&gt;You may also be able to add this to Test-Parameters:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Test-Parameters: trivial testlist=sanity env=ONLY=118,ONLY_REPEAT=100,PTLDEBUG=all
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="367117" author="paf0186" created="Thu, 23 Mar 2023 19:07:33 +0000"  >&lt;p&gt;It is not the default, I don&apos;t think?&#160; We certainly don&apos;t normally log with +trace enabled?&#160; When we accidentally landed a patch of mine which did debug=-1 in a test, it caused problems for other tests.&lt;/p&gt;

&lt;p&gt;Yang Sheng,&lt;/p&gt;

&lt;p&gt;When I need to do this, I cheat.&#160; You can place &apos;lctl set_param *debug=-1&apos; and &apos;lctl set_param debug_mb=10000&apos; (or some other large number) directly in the test, then run the test.&#160; Unless the test remounts Lustre, which resets to defaults, this level of debug will show up in the debug log for the test failure.&#160; You can even do this in tests you push to Maloo, but I would set a test-parameters to run just this test if you do that.&lt;/p&gt;</comment>
                            <comment id="367440" author="adilger" created="Mon, 27 Mar 2023 18:16:23 +0000"  >&lt;p&gt;Patrick, for sanity.sh the script definitely sets full debug at the start, so that we can debug test failures more easily:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;OLDDEBUG=$(lctl get_param -n debug 2&amp;gt; /dev/null)
lctl set_param debug=-1 2&amp;gt; /dev/null || true
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but this is lost at some point during the tests running.  That said, it looks like this is set only on the client, so possibly the same needs to be done on the servers.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;You can place &apos;&lt;tt&gt;lctl set_param *debug=-1&lt;/tt&gt;&apos; and &apos;&lt;tt&gt;lctl set_param debug_mb=10000&lt;/tt&gt;&apos; (or some other large number) directly in the test, then run the test.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;The VM clients only have 3GB of RAM, so 10000 is too large.  The limit is 80% of RAM, but this will likely impact system operation.  Maybe 1GB is the max for testing.&lt;/p&gt;</comment>
                            <comment id="367441" author="adilger" created="Mon, 27 Mar 2023 18:38:56 +0000"  >&lt;p&gt;Sheng, can you please push a patch to enable full debug for this subtest (e.g. add &lt;tt&gt;start_full_debug_logging&lt;/tt&gt; and &lt;tt&gt;stop_full_debug_logging&lt;/tt&gt; calls into &lt;tt&gt;test_118c&lt;/tt&gt; and &lt;tt&gt;test_118d&lt;/tt&gt;) and then add:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Test-Parameters: trivial testlist=sanity env=ONLY=&quot;118c 118d&quot;,ONLY_REPEAT=100
Test-Parameters: trivial testlist=sanity
Test-Parameters: trivial testlist=sanity
[repeat 20x]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If testing on the patch itself does not reproduce the problem (it is failing about 10% of runs on master this week, but may depend on previous state to fail),  then the debug  patch could be landed so that it will collect debugging from other patch test runs.&lt;/p&gt;</comment>
                            <comment id="367444" author="paf0186" created="Mon, 27 Mar 2023 19:02:02 +0000"  >&lt;p&gt;Interesting about full debug - that&apos;s good to know.&lt;/p&gt;

&lt;p&gt;RE: debug_mb=10000; oh yes, absolutely.&#160; It&apos;s sort of my lazy shorthand for &quot;max&quot;, it&apos;s intended for when I want to get as much info as possible out of a specific test.&#160; It&apos;s not appropriate for full test runs and it does indeed cause problems if you do one with it set like that.&lt;/p&gt;</comment>
                            <comment id="367450" author="gerrit" created="Mon, 27 Mar 2023 19:52:20 +0000"  >&lt;p&gt;&quot;Yang Sheng &amp;lt;ys@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50439&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50439&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; tests: enable -1 log for 118c &amp;amp; 118d&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: ddbe69524ab8ec5d2147fed5674eddabdf366422&lt;/p&gt;</comment>
                            <comment id="367800" author="gerrit" created="Wed, 29 Mar 2023 21:48:04 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50470&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50470&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; tests: disable sanity test_118c/118d&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e3bf4218cb9e1eb68f8ffb9f15161645165b84be&lt;/p&gt;</comment>
                            <comment id="367874" author="gerrit" created="Thu, 30 Mar 2023 14:43:10 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50470/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50470/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16515&quot; title=&quot;sanity test_118c test_118d: No page in writeback, writeback=0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16515&quot;&gt;LU-16515&lt;/a&gt; tests: disable sanity test_118c/118d&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 7c52cbf65218d77c0594f92981173aa7d78f6758&lt;/p&gt;</comment>
                            <comment id="370804" author="ys" created="Thu, 27 Apr 2023 10:49:23 +0000"  >&lt;p&gt;I have captured a vmcore for the failure case:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; ll_inode_info ffff8f7192598000 -x
struct ll_inode_info {
  lli_inode_magic = 0x111d0de5,
....
  lli_fid = {
    f_seq = 0x200001b78,
    f_oid = 0xd3a,
    f_ver = 0x0
  },
.........
    i_op = 0xffffffffc106ffc0 &amp;lt;ll_file_inode_operations&amp;gt;,
    i_sb = 0xffff8f719a8ea800,
    i_mapping = 0xffff8f7192598208,
    i_security = 0xffff8f712ef42d80,
    i_ino = 0x200001b78000d3a,
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Looking into the inode&apos;s i_mapping:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; address_space 0xffff8f7192598208 -o
struct address_space {
  [ffff8f7192598208] struct inode *host;
  [ffff8f7192598210] struct xarray i_pages;
  [ffff8f7192598228] atomic_t i_mmap_writable;
...........
crash&amp;gt; xarray ffff8f7192598210
struct xarray {
  xa_lock = {
    {
      rlock = {
        raw_lock = {
          {
            val = {
              counter = 0
            },
            {
              locked = 0 &apos;\000&apos;,
              pending = 0 &apos;\000&apos;
            },
            {
              locked_pending = 0,
              tail = 0
            }
          }
        }
      }
    }
  },
  xa_flags = 16777249,
  xa_head = 0xffffd8b842578f80,
  xarray_size_rh = 0,
  _rh = {&amp;lt;No data fields&amp;gt;}
}
crash&amp;gt; kmem 0xffffd8b842578f80
      PAGE       PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffd8b842578f80 95e3e000 ffff8f7192598208        0  2 fffffc0005028 uptodate,lru,private,writeback
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So everything looks valid. RHEL8 uses XArray (XA) in place of the radix tree. The only explanation I can see is a race in the XArray code, but I am not sure which code path is the culprit. Further investigation is needed. Any comments are appreciated. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="36584">LU-8101</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="73044">LU-16285</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03bbr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>