<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:53:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12572] sanity-pfl test_20b: Delete is not completed in 29 seconds</title>
                <link>https://jira.whamcloud.com/browse/LU-12572</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for jianyu &amp;lt;yujian@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/d66394cc-abc3-11e9-a0be-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/d66394cc-abc3-11e9-a0be-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;test_20b failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: trevis-70vm4 /usr/sbin/lctl set_param -n os[cd]*.*MD*.force_sync 1
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
CMD: trevis-70vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
Delete is not completed in 29 seconds
CMD: trevis-70vm4 /usr/sbin/lctl get_param osc.*MDT*.sync_*
osc.lustre-OST0000-osc-MDT0000.sync_changes=0
osc.lustre-OST0000-osc-MDT0000.sync_in_flight=0
osc.lustre-OST0000-osc-MDT0000.sync_in_progress=1
osc.lustre-OST0000-osc-MDT0002.sync_changes=0
osc.lustre-OST0000-osc-MDT0002.sync_in_flight=0
osc.lustre-OST0000-osc-MDT0002.sync_in_progress=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;






&lt;p&gt;VVVVVVV DO NOT REMOVE LINES BELOW, Added by Maloo for auto-association VVVVVVV&lt;br/&gt;
sanity-pfl test_20b - test_20b returned 1&lt;/p&gt;</description>
                <environment></environment>
        <key id="56461">LU-12572</key>
            <summary>sanity-pfl test_20b: Delete is not completed in 29 seconds</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Mon, 22 Jul 2019 04:32:02 +0000</created>
                <updated>Wed, 28 Oct 2020 21:18:35 +0000</updated>
                                            <version>Lustre 2.13.0</version>
                    <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="253697" author="yujian" created="Tue, 27 Aug 2019 17:02:17 +0000"  >&lt;p&gt;+1 on master branch: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/5e5172b4-c8e8-11e9-90ad-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/5e5172b4-c8e8-11e9-90ad-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="253795" author="jamesanunez" created="Wed, 28 Aug 2019 20:24:39 +0000"  >&lt;p&gt;sanity-pfl test 20b is failing for review-dne-zfs-part-4 at a high rate. We&#8217;ve seen this test fail as early as 12 July 2019 with two failures for in June 2019; &lt;a href=&quot;https://testing.whamcloud.com/test_sets/f5ff99e2-9a1c-11e9-b26a-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/f5ff99e2-9a1c-11e9-b26a-52540065bddc&lt;/a&gt; and &lt;a href=&quot;https://testing.whamcloud.com/test_sets/19cb1252-90b8-11e9-abe3-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/19cb1252-90b8-11e9-abe3-52540065bddc&lt;/a&gt; . &lt;/p&gt;

&lt;p&gt;In some of these failures, for example at &lt;a href=&quot;https://testing.whamcloud.com/test_sets/b04ecaf6-c953-11e9-97d5-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/b04ecaf6-c953-11e9-97d5-52540065bddc&lt;/a&gt;, we see an error in the client test_log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: trevis-44vm4 /usr/sbin/lctl set_param -n os[cd]*.*MD*.force_sync 1
trevis-44vm4: error: set_param: setting /sys/fs/lustre/osc/lustre-OST0000-osc-MDT0000/force_sync=1: Connection timed out
CMD: trevis-44vm4 /usr/sbin/lctl get_param -n osc.*MDT*.sync_*
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In some failures, for example &lt;a href=&quot;https://testing.whamcloud.com/test_sets/2894009c-b56f-11e9-b023-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/2894009c-b56f-11e9-b023-52540065bddc&lt;/a&gt;, we see that some of the osc parameters are not changed to 0:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Delete is not completed in 29 seconds
CMD: trevis-70vm4 /usr/sbin/lctl get_param osc.*MDT*.sync_*
osc.lustre-OST0000-osc-MDT0000.sync_changes=0
osc.lustre-OST0000-osc-MDT0000.sync_in_flight=0
osc.lustre-OST0000-osc-MDT0000.sync_in_progress=1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In this case, we also see the following in the OSS console&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; [14628.488316] Lustre: DEBUG MARKER: dmesg
[14629.089241] LustreError: 5730:0:(pack_generic.c:425:lustre_msg_buf_v2()) msg ffff9e3e5937f600 buffer[0] size 152 too small (required 184, opc=-1)
[14629.091601] LustreError: 5730:0:(pack_generic.c:425:lustre_msg_buf_v2()) Skipped 815 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Not sure if this causes errors, but for some failures, for example &lt;a href=&quot;https://testing.whamcloud.com/test_sets/1996cd30-b2f5-11e9-b753-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/1996cd30-b2f5-11e9-b753-52540065bddc&lt;/a&gt;, we see the following on the OSS console&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[13183.416656] Lustre: ll_ost00_035: service thread pid 6546 was inactive for 40.145 seconds. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="261860" author="yujian" created="Sun, 26 Jan 2020 19:38:11 +0000"  >&lt;p&gt;Still failed on master branch: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/3c217dac-406f-11ea-9543-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/3c217dac-406f-11ea-9543-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="272820" author="emoly.liu" created="Sat, 13 Jun 2020 01:42:55 +0000"  >&lt;p&gt;more on master:&#160;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/53a2b7d2-5056-4261-9ae8-c9cf52f4c47c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/53a2b7d2-5056-4261-9ae8-c9cf52f4c47c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/65adf9fc-24c8-40f2-b969-4a2a4ffde057&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/65adf9fc-24c8-40f2-b969-4a2a4ffde057&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="274760" author="paf0186" created="Wed, 8 Jul 2020 15:16:20 +0000"  >&lt;p&gt;Brief notes from a minimal look:&lt;/p&gt;

&lt;p&gt;This is failing in the cleanup phase related to setting up ENOSPC/low space as part of the test, specifically:&lt;br/&gt;
 stack_trap &quot;ost_watermarks_clear_enospc $tfile $ost_idx1 $wms&quot; EXIT&lt;/p&gt;

&lt;p&gt;Sets up:&lt;/p&gt;

&lt;p&gt;ost_watermarks_clear_enospc&lt;/p&gt;

&lt;p&gt;And the failure is specifically related to&#160;wait_delete_completed_mds, this part:&lt;/p&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;        local WAIT=0
        while [[ $WAIT -ne $max_wait ]]; do
                changes=$(do_nodes $mds2sync \
                        &quot;$LCTL get_param -n osc.*MDT*.sync_*&quot; | calc_sum)
                #echo &quot;$node: $changes changes on all&quot;
                if [[ $changes -eq 0 ]]; then
                        wait_zfs_commit $SINGLEMDS                        # the occupied disk space will be released
                        # only after TXGs are committed
                        wait_zfs_commit ost1
                        return 0
                fi
                sleep 1
                WAIT=$((WAIT + 1))
        done &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;On failure, we dump the sync parameters we&apos;re summing there, and they all show as zero.&lt;/p&gt;

&lt;p&gt;Most likely, this means we were waiting for ZFS commit (Unless there is a race condition between the &quot;while [[ $WAIT .. ]] &quot; timer and this check).&lt;/p&gt;

&lt;p&gt;Since this is just failing (I think?&#160; I am relying on past comments &amp;amp; the most recent failure) in ZFS testing, I strongly suspect it&apos;s waiting for ZFS commit.&lt;/p&gt;

&lt;p&gt;So basically it looks to me like sometimes after failover we&apos;re waiting more than 30 seconds for ZFS commit.&#160; I have not dug in further.&lt;/p&gt;</comment>
                            <comment id="274761" author="paf0186" created="Wed, 8 Jul 2020 15:18:43 +0000"  >&lt;p&gt;Note that in contrast to the earlier failures James highlighted, the most recent failure does &lt;b&gt;not&lt;/b&gt; have any errors in the force_sync set_param.&#160; That seems highly relevant because such an error would explain getting stuck, but it&apos;s not present in this most recent failure.&#160; I suspect in the earlier case we had an issue with the connection coming back up after the failover (so we failed to set sync, so we failed this check, etc), whereas here it looks more like a ZFS/storage issue&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/help_16.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54946">LU-11987</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00jzr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>