<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:25:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16271] replay-single test_89 FAIL: 20480 blocks leaked </title>
                <link>https://jira.whamcloud.com/browse/LU-16271</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Lai Siyao &amp;lt;lai.siyao@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/d1100f6d-6ef7-4e77-9e31-f6de3a6b208e&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/d1100f6d-6ef7-4e77-9e31-f6de3a6b208e&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== replay-single test 89: no disk space leak on late ost connection ========================================================== 19:47:35 
...
Waiting for orphan cleanup...
CMD: onyx-60vm4 /usr/sbin/lctl list_param osp.*osc*.old_sync_processed 2&amp;gt; /dev/null
osp.lustre-OST0000-osc-MDT0000.old_sync_processed
osp.lustre-OST0001-osc-MDT0000.old_sync_processed
wait 40 secs maximumly for onyx-60vm4 mds-ost sync done.
CMD: onyx-60vm4 /usr/sbin/lctl get_param -n osp.*osc*.old_sync_processed
sleep 5 for ZFS zfs
Waiting for MDT destroys to complete
CMD: onyx-60vm4 /usr/sbin/lctl get_param -n osp.*.destroys_in_flight
 replay-single test_89: @@@@@@ FAIL: 20480 blocks leaked 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:6524:error()
  = /usr/lib64/lustre/tests/replay-single.sh:3329:test_89()
  = /usr/lib64/lustre/tests/test-framework.sh:6860:run_one()
  = /usr/lib64/lustre/tests/test-framework.sh:6910:run_one_logged()
  = /usr/lib64/lustre/tests/test-framework.sh:6747:run_test()
  = /usr/lib64/lustre/tests/replay-single.sh:3331:main()
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="72954">LU-16271</key>
            <summary>replay-single test_89 FAIL: 20480 blocks leaked </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="aboyko">Alexander Boyko</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Thu, 27 Oct 2022 03:00:52 +0000</created>
                <updated>Tue, 3 Jan 2023 21:49:10 +0000</updated>
                            <resolved>Tue, 3 Jan 2023 21:49:10 +0000</resolved>
                                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="353776" author="adilger" created="Tue, 22 Nov 2022 00:57:37 +0000"  >&lt;p&gt;I thought the first failure like this was 2022-09-02 (2.15.51.86) and then not another failure until 2022-10-15 (2.15.52.63), but I now found &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14201&quot; title=&quot;replay-single test 89 fails with &amp;#39;3072 blocks leaked&amp;#39;&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14201&quot;&gt;LU-14201&lt;/a&gt; which looks like a duplicate and has been around much longer.&lt;/p&gt;</comment>
                            <comment id="353777" author="adilger" created="Tue, 22 Nov 2022 01:00:14 +0000"  >&lt;p&gt;Duplicate, but seems to be hitting more regularly.  Only 5 failures in 2022-07 and 2022-08, but 16 in the past 4 weeks.&lt;/p&gt;</comment>
                            <comment id="353779" author="adilger" created="Tue, 22 Nov 2022 01:06:41 +0000"  >&lt;p&gt;It looks like this recent wave of failures was caused by the landing of patch &lt;a href=&quot;https://review.whamcloud.com/47928&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/47928&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16002&quot; title=&quot;Ping evictor delayed client eviction for 3 ping interval more than defined&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16002&quot;&gt;LU-16002&lt;/a&gt; ptlrpc: reduce pinger eviction time&lt;/tt&gt;&quot; because this patch caused all 5 failures in this subtest &lt;b&gt;before&lt;/b&gt; it landed, then landed on 2022-10-14 and coincides with the increase in test failures.  &lt;/p&gt;</comment>
                            <comment id="353780" author="adilger" created="Tue, 22 Nov 2022 01:08:35 +0000"  >&lt;p&gt;Alexander, it looks like the landing of your patch caused this test to start failing, please see the following test report:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.whamcloud.com/search?status%5B%5D=FAIL&amp;amp;test_set_script_id=f6a12204-32c3-11e0-a61c-52540025f9ae&amp;amp;sub_test_script_id=fd0b0e02-32c3-11e0-a61c-52540025f9ae&amp;amp;start_date=2022-07-01&amp;amp;end_date=2022-11-21&amp;amp;source=sub_tests#redirect&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/search?status%5B%5D=FAIL&amp;amp;test_set_script_id=f6a12204-32c3-11e0-a61c-52540025f9ae&amp;amp;sub_test_script_id=fd0b0e02-32c3-11e0-a61c-52540025f9ae&amp;amp;start_date=2022-07-01&amp;amp;end_date=2022-11-21&amp;amp;source=sub_tests#redirect&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="354289" author="aboyko" created="Mon, 28 Nov 2022 09:59:24 +0000"  >
&lt;p&gt;I&apos;ve checked  logs and it look a bit strange, MDT0 eviction right after recovery&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00010000:00080000:0.0:1669022027.575599:0:182954:0:(ldlm_lib.c:2820:target_recovery_thread()) @@@ processing final ping from 10.240.26.199@tcp:  req@000000002adb22e8 x1750088378241600/t0(0) o400-&amp;gt;lustre-MDT0002-mdtlov_UUID@10.240.26.199@tcp:161/0 lens 224/0 e 6 to 0 dl 1669022036 ref 1 fl Complete:H/c0/ffffffff rc 0/-1 job:&apos;ldlm_lock_repla.0&apos;
00010000:02000000:0.0:1669022027.575670:0:182954:0:(ldlm_lib.c:1766:target_finish_recovery()) lustre-OST0000: Recovery over after 1:10, of 6 clients 5 recovered and 1 was evicted.
00000100:00080000:1.0:1669022027.578807:0:18353:0:(pinger.c:523:ping_evictor_main()) Last request was at 1669022027
00000020:00080000:1.0:1669022027.578808:0:18353:0:(genops.c:1604:class_fail_export()) disconnecting export 00000000391424d3/lustre-MDT0000-mdtlov_UUID
00000020:00000080:1.0:1669022027.578812:0:18353:0:(genops.c:1445:class_disconnect()) disconnect: cookie 0x8074ece8ff0c069e
00000100:00080000:1.0:1669022027.578814:0:18353:0:(import.c:86:import_set_state_nolock()) 00000000ec9d3565 8&#65533;s&#65533;&#65533;&#65533;&#65533;&#65533;: changing import state from FULL to CLOSED
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;At MDT0 side, it connected, fall through recovery, send OST_CREATE/DESTROY but got -107 (operation on unconnected OST)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000100:00080000:0.0:1669021958.927892:0:348514:0:(import.c:86:import_set_state_nolock()) 000000000d82a1de lustre-OST0000_UUID: changing import state from DISCONN to CONNECTING
00000100:00080000:0.0:1669021958.927894:0:348514:0:(import.c:534:import_select_connection()) lustre-OST0000-osc-MDT0000: connect to NID 10.240.26.198@tcp last attempt 7822
00000100:00080000:0.0:1669021958.927895:0:348514:0:(import.c:581:import_select_connection()) lustre-OST0000-osc-MDT0000: tried all connections, increasing latency to 11s
00000100:00080000:0.0:1669021958.927897:0:348514:0:(import.c:615:import_select_connection()) lustre-OST0000-osc-MDT0000: import 000000000d82a1de using connection 10.240.26.198@tcp/10.240.26.198@tcp
00000100:00100000:1.0:1669021958.929148:0:11061:0:(client.c:2912:ptlrpc_free_committed()) lustre-OST0000-osc-MDT0000: committing for last_committed 55834574850 gen 2
00000100:00080000:1.0:1669021958.929150:0:11061:0:(import.c:1074:ptlrpc_connect_interpret()) lustre-OST0000-osc-MDT0000: connect to target with instance 15
00000100:00080000:1.0:1669021958.929151:0:11061:0:(import.c:934:ptlrpc_connect_set_flags()) lustre-OST0000-osc-MDT0000: Resetting ns_connect_flags to server flags: 0x2040401443000066
00000100:00080000:1.0:1669021958.929153:0:11061:0:(import.c:1159:ptlrpc_connect_interpret()) connected to replayable target: lustre-OST0000_UUID
00000100:00080000:1.0:1669021958.929153:0:11061:0:(import.c:1177:ptlrpc_connect_interpret()) connect to lustre-OST0000_UUID during recovery
00000100:00080000:1.0:1669021958.929154:0:11061:0:(import.c:86:import_set_state_nolock()) 000000000d82a1de lustre-OST0000_UUID: changing import state from CONNECTING to REPLAY_LOCKS
00000100:00080000:1.0:1669021958.929155:0:11061:0:(import.c:86:import_set_state_nolock()) 000000000d82a1de lustre-OST0000_UUID: changing import state from REPLAY_LOCKS to REPLAY_WAIT

00010000:00080000:1.0:1669022027.578997:0:11061:0:(ldlm_request.c:1449:ldlm_cli_update_pool()) @@@ Zero SLV or limit found (SLV=0, limit=16625)  req@000000001bd81261 x1750088378241600/t0(0) o400-&amp;gt;lustre-OST0000-osc-MDT0002@10.240.26.198@tcp:28/4 lens 224/224 e 6 to 0 dl 1669022037 ref 1 fl Rpc:RQU/c0/0 rc 0/0 job:&apos;ldlm_lock_repla.0&apos;
00000100:00080000:1.0:1669022027.579029:0:11061:0:(import.c:86:import_set_state_nolock()) 000000000d82a1de lustre-OST0000_UUID: changing import state from REPLAY_WAIT to RECOVER
00000100:00080000:1.0:1669022027.579030:0:11061:0:(client.c:3040:ptlrpc_resend_req()) @@@ going to resend  req@00000000c8b765cc x1750088378241536/t0(0) o400-&amp;gt;lustre-OST0000-osc-MDT0000@10.240.26.198@tcp:28/4 lens 224/224 e 6 to 0 dl 1669022041 ref 1 fl Interpret:RQU/c0/0 rc 0/0 job:&apos;ptlrpcd_rcv.0&apos;
00000100:00080000:1.0:1669022027.579033:0:11061:0:(client.c:3049:ptlrpc_resend_req()) @@@ it has reply, so skip it  req@00000000c8b765cc x1750088378241536/t0(0) o400-&amp;gt;lustre-OST0000-osc-MDT0000@10.240.26.198@tcp:28/4 lens 224/224 e 6 to 0 dl 1669022041 ref 1 fl Interpret:RQU/c0/0 rc 0/0 job:&apos;ptlrpcd_rcv.0&apos;
00000100:00080000:1.0:1669022027.579039:0:11061:0:(import.c:86:import_set_state_nolock()) 000000000d82a1de lustre-OST0000_UUID: changing import state from RECOVER to FULL
00000004:00080000:1.0:1669022027.579044:0:11061:0:(osp_dev.c:1618:osp_import_event()) got connected
00000100:02000000:1.0:1669022027.579044:0:11061:0:(import.c:1648:ptlrpc_import_recovery_state_machine()) lustre-OST0000-osc-MDT0000: Connection restored to  (at 10.240.26.198@tcp)

00000100:00100000:0.0:1669022027.579228:0:11062:0:(client.c:1743:ptlrpc_send_new_req()) Sending RPC req@000000005e38fb65 pname:cluuid:pid:xid:nid:opc:job ptlrpcd_00_00:lustre-MDT0000-mdtlov_UUID:11062:1750088378275456:10.240.26.198@tcp:6:osp-syn-0-0.0
00000100:02020000:1.0:1669022027.581000:0:441603:0:(client.c:1361:ptlrpc_check_status()) 11-0: lustre-OST0000-osc-MDT0000: operation ost_create to node 10.240.26.198@tcp failed: rc = -107
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looks like a last ping for a recovery finish stayed at OST side. After stale client eviction, OST processed final pings, move imports to FULL, and after that pinger_evictor found that timeout expired (no request from MDT0 from final ping to final ping processing) and evicted it.&lt;br/&gt;
I guess, recovery should update exp_last_request_time to a recovery finish time for all clients.&lt;/p&gt;</comment>
                            <comment id="354317" author="gerrit" created="Mon, 28 Nov 2022 14:32:59 +0000"  >&lt;p&gt;&quot;Alexander &amp;lt;alexander.boyko@hpe.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49257&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49257&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16271&quot; title=&quot;replay-single test_89 FAIL: 20480 blocks leaked &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16271&quot;&gt;&lt;del&gt;LU-16271&lt;/del&gt;&lt;/a&gt; ptlrpc: fix eviction right after recovery&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a3e80d7bcf8d21421a4f18995c245ac84105aa80&lt;/p&gt;</comment>
                            <comment id="354430" author="adilger" created="Tue, 29 Nov 2022 06:31:07 +0000"  >&lt;p&gt;Thanks for the quick patch. Always annoying when there are autotest failures, and I&apos;ve been trying to get the most common ones fixed so that other patch landings are not slowed down. &lt;/p&gt;</comment>
                            <comment id="357797" author="gerrit" created="Tue, 3 Jan 2023 21:32:50 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/49257/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/49257/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16271&quot; title=&quot;replay-single test_89 FAIL: 20480 blocks leaked &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16271&quot;&gt;&lt;del&gt;LU-16271&lt;/del&gt;&lt;/a&gt; ptlrpc: fix eviction right after recovery&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 2a6fda5b369c59a320941848968597244b8f3583&lt;/p&gt;</comment>
                            <comment id="357823" author="pjones" created="Tue, 3 Jan 2023 21:49:10 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="61911">LU-14201</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="71085">LU-16002</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i033u7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>