<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:42:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11235] sanity: test_230d timeout on ZFS backends</title>
                <link>https://jira.whamcloud.com/browse/LU-11235</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Lai Siyao &amp;lt;lai.siyao@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/b3a1452a-9c82-11e8-8ee3-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/b3a1452a-9c82-11e8-8ee3-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;MDS backtrace &lt;a href=&quot;https://testing.whamcloud.com/test_logs/b50461b8-9c82-11e8-8ee3-52540065bddc/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_logs/b50461b8-9c82-11e8-8ee3-52540065bddc/show_text&lt;/a&gt; shows it&apos;s stuck in txg_wait_synced().&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[10285.072535] ldlm_cn00_002   D ffff9825b67adee0     0 32218      2 0x00000080
[10285.073583] Call Trace:
[10285.073963]  [&amp;lt;ffffffff8b914029&amp;gt;] schedule+0x29/0x70
[10285.074659]  [&amp;lt;ffffffffc05da205&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[10285.075360]  [&amp;lt;ffffffff8b2bc610&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[10285.076201]  [&amp;lt;ffffffffc05da245&amp;gt;] __cv_wait+0x15/0x20 [spl]
[10285.076939]  [&amp;lt;ffffffffc0704bdf&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[10285.077663]  [&amp;lt;ffffffffc1348a43&amp;gt;] osd_sync+0xc3/0x140 [osd_zfs]
[10285.078507]  [&amp;lt;ffffffffc1163a5b&amp;gt;] tgt_sync+0x14b/0x270 [ptlrpc]
[10285.079287]  [&amp;lt;ffffffffc1165f32&amp;gt;] tgt_blocking_ast+0x302/0x630 [ptlrpc]
[10285.080043]  [&amp;lt;ffffffffc10b6e5a&amp;gt;] ldlm_cancel_callback+0x8a/0x330 [ptlrpc]
[10285.080787]  [&amp;lt;ffffffffc10b7156&amp;gt;] ldlm_lock_cancel+0x56/0x1f0 [ptlrpc]
[10285.081492]  [&amp;lt;ffffffffc10dc06b&amp;gt;] ldlm_request_cancel+0x18b/0x730 [ptlrpc]
[10285.082266]  [&amp;lt;ffffffffc10df5ea&amp;gt;] ldlm_handle_cancel+0xba/0x250 [ptlrpc]
[10285.082964]  [&amp;lt;ffffffffc10df8d8&amp;gt;] ldlm_cancel_handler+0x158/0x590 [ptlrpc]
[10285.083688]  [&amp;lt;ffffffffc111040b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[10285.084449]  [&amp;lt;ffffffff8b2c52ab&amp;gt;] ? __wake_up_common+0x5b/0x90
[10285.085095]  [&amp;lt;ffffffffc1113c44&amp;gt;] ptlrpc_main+0xb14/0x1fb0 [ptlrpc]
[10285.085752]  [&amp;lt;ffffffffc1113130&amp;gt;] ? ptlrpc_register_service+0xe90/0xe90 [ptlrpc]
[10285.086464]  [&amp;lt;ffffffff8b2bb621&amp;gt;] kthread+0xd1/0xe0
[10285.087009]  [&amp;lt;ffffffff8b2bb550&amp;gt;] ? insert_kthread_work+0x40/0x40
[10285.087626]  [&amp;lt;ffffffff8b9205f7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[10285.088259]  [&amp;lt;ffffffff8b2bb550&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="52948">LU-11235</key>
            <summary>sanity: test_230d timeout on ZFS backends</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>always_except</label>
                            <label>zfs</label>
                    </labels>
                <created>Mon, 13 Aug 2018 12:33:03 +0000</created>
                <updated>Wed, 9 Dec 2020 22:12:18 +0000</updated>
                <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                    <comments>
                            <comment id="231853" author="laisiyao" created="Mon, 13 Aug 2018 12:34:36 +0000"  >&lt;p&gt;Alex, do you have any clue?&lt;/p&gt;</comment>
                            <comment id="231855" author="bzzz" created="Mon, 13 Aug 2018 13:16:46 +0000"  >&lt;p&gt;Lai, in many other cases that was just a sign of large (and usually slow) I/O&lt;/p&gt;</comment>
                            <comment id="231860" author="laisiyao" created="Mon, 13 Aug 2018 14:16:37 +0000"  >&lt;p&gt;The test log shows it was stuck there over 170mins.&lt;/p&gt;</comment>
                            <comment id="232137" author="adilger" created="Fri, 17 Aug 2018 19:11:34 +0000"  >&lt;p&gt;Given that this test is &quot;&lt;tt&gt;check migrate big directory&lt;/tt&gt;&quot; it seems probable that this relates to the patches themselves.  The &lt;tt&gt;txg_wait_synced()&lt;/tt&gt; call is the equivalent of being stuck in &lt;tt&gt;jbd2_journal_stop()&lt;/tt&gt; - this is waiting for the transaction to commit, but something is keeping the transaction open for a long time.  Normally ZFS will commit a TXG every 1s, so I suspect some kind of reference leak.&lt;/p&gt;</comment>
                            <comment id="232139" author="adilger" created="Fri, 17 Aug 2018 19:30:22 +0000"  >&lt;p&gt;I see some other stuck threads, and in particular &lt;tt&gt;txg_sync&lt;/tt&gt; is waiting on IO to complete, and a second thread is also stuck in the same place (I&apos;m not sure what the &lt;tt&gt;in:imjournal&lt;/tt&gt; thread is for):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[10284.337042] in:imjournal    D ffff9825fafbdee0     0  1304      1 0x00000080
[10284.337732] Call Trace:
[10284.338475]  [&amp;lt;ffffffff8b914029&amp;gt;] schedule+0x29/0x70
[10284.338958]  [&amp;lt;ffffffff8b911999&amp;gt;] schedule_timeout+0x239/0x2c0
[10284.341130]  [&amp;lt;ffffffff8b91353d&amp;gt;] io_schedule_timeout+0xad/0x130
[10284.341696]  [&amp;lt;ffffffff8b9135d8&amp;gt;] io_schedule+0x18/0x20
[10284.342205]  [&amp;lt;ffffffff8b911fc1&amp;gt;] bit_wait_io+0x11/0x50
[10284.342703]  [&amp;lt;ffffffff8b911ae7&amp;gt;] __wait_on_bit+0x67/0x90
[10284.343789]  [&amp;lt;ffffffff8b3936d1&amp;gt;] wait_on_page_bit+0x81/0xa0
[10284.344939]  [&amp;lt;ffffffff8b3a4e7b&amp;gt;] truncate_inode_pages_range+0x42b/0x700
[10284.348657]  [&amp;lt;ffffffff8b3a51bf&amp;gt;] truncate_inode_pages_final+0x4f/0x60
[10284.349288]  [&amp;lt;ffffffffc047610f&amp;gt;] ext4_evict_inode+0x11f/0x4c0 [ext4]
[10284.349899]  [&amp;lt;ffffffff8b4387d4&amp;gt;] evict+0xb4/0x180
[10284.350372]  [&amp;lt;ffffffff8b4390dc&amp;gt;] iput+0xfc/0x190
[10284.350816]  [&amp;lt;ffffffff8b4339a0&amp;gt;] __dentry_kill+0x120/0x180
[10284.351361]  [&amp;lt;ffffffff8b433aa9&amp;gt;] dput+0xa9/0x160
[10284.351811]  [&amp;lt;ffffffff8b42db08&amp;gt;] SYSC_renameat2+0x518/0x5a0
[10284.355455]  [&amp;lt;ffffffff8b42ea0e&amp;gt;] SyS_renameat2+0xe/0x10
[10284.355977]  [&amp;lt;ffffffff8b42ea4e&amp;gt;] SyS_rename+0x1e/0x20

[10285.072535] ldlm_cn00_002   D ffff9825b67adee0     0 32218      2 0x00000080
[10285.073583] Call Trace:
[10285.073963]  [&amp;lt;ffffffff8b914029&amp;gt;] schedule+0x29/0x70
[10285.074659]  [&amp;lt;ffffffffc05da205&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[10285.076201]  [&amp;lt;ffffffffc05da245&amp;gt;] __cv_wait+0x15/0x20 [spl]
[10285.076939]  [&amp;lt;ffffffffc0704bdf&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[10285.077663]  [&amp;lt;ffffffffc1348a43&amp;gt;] osd_sync+0xc3/0x140 [osd_zfs]
[10285.078507]  [&amp;lt;ffffffffc1163a5b&amp;gt;] tgt_sync+0x14b/0x270 [ptlrpc]
[10285.079287]  [&amp;lt;ffffffffc1165f32&amp;gt;] tgt_blocking_ast+0x302/0x630 [ptlrpc]
[10285.080043]  [&amp;lt;ffffffffc10b6e5a&amp;gt;] ldlm_cancel_callback+0x8a/0x330 [ptlrpc]
[10285.080787]  [&amp;lt;ffffffffc10b7156&amp;gt;] ldlm_lock_cancel+0x56/0x1f0 [ptlrpc]
[10285.081492]  [&amp;lt;ffffffffc10dc06b&amp;gt;] ldlm_request_cancel+0x18b/0x730 [ptlrpc]
[10285.082266]  [&amp;lt;ffffffffc10df5ea&amp;gt;] ldlm_handle_cancel+0xba/0x250 [ptlrpc]
[10285.082964]  [&amp;lt;ffffffffc10df8d8&amp;gt;] ldlm_cancel_handler+0x158/0x590 [ptlrpc]
[10285.083688]  [&amp;lt;ffffffffc111040b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[10285.085095]  [&amp;lt;ffffffffc1113c44&amp;gt;] ptlrpc_main+0xb14/0x1fb0 [ptlrpc]
[10285.086464]  [&amp;lt;ffffffff8b2bb621&amp;gt;] kthread+0xd1/0xe0

[10285.745845] txg_sync        D ffff9825d9792f70     0  1592      2 0x00000080
[10285.746553] Call Trace:
[10285.747373]  [&amp;lt;ffffffff8b914029&amp;gt;] schedule+0x29/0x70
[10285.747843]  [&amp;lt;ffffffff8b911999&amp;gt;] schedule_timeout+0x239/0x2c0
[10285.749545]  [&amp;lt;ffffffff8b91353d&amp;gt;] io_schedule_timeout+0xad/0x130
[10285.750737]  [&amp;lt;ffffffff8b9135d8&amp;gt;] io_schedule+0x18/0x20
[10285.751252]  [&amp;lt;ffffffffc05da192&amp;gt;] cv_wait_common+0xb2/0x150 [spl]
[10285.752390]  [&amp;lt;ffffffffc05da268&amp;gt;] __cv_wait_io+0x18/0x20 [spl]
[10285.752994]  [&amp;lt;ffffffffc075d023&amp;gt;] zio_wait+0x113/0x1c0 [zfs]
[10285.753560]  [&amp;lt;ffffffffc0711bc1&amp;gt;] vdev_config_sync+0xf1/0x180 [zfs]
[10285.754199]  [&amp;lt;ffffffffc06f1a9c&amp;gt;] spa_sync+0xa1c/0xd90 [zfs]
[10285.755372]  [&amp;lt;ffffffffc0705c41&amp;gt;] txg_sync_thread+0x301/0x510 [zfs]
[10285.756565]  [&amp;lt;ffffffffc05d5013&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[10285.757786]  [&amp;lt;ffffffff8b2bb621&amp;gt;] kthread+0xd1/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There is a second &lt;tt&gt;txg_sync&lt;/tt&gt; thread that doesn&apos;t appear to be doing anything; I&apos;m not sure whether it belongs to a different MDT zpool:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[10285.004345] txg_sync        S ffff9825e40b9fa0     0 31971      2 0x00000080
[10285.005189] Call Trace:
[10285.005442]  [&amp;lt;ffffffff8b914029&amp;gt;] schedule+0x29/0x70
[10285.005912]  [&amp;lt;ffffffff8b9118d4&amp;gt;] schedule_timeout+0x174/0x2c0
[10285.007064]  [&amp;lt;ffffffffc05da362&amp;gt;] __cv_timedwait_common+0xd2/0x190 [spl]
[10285.008255]  [&amp;lt;ffffffffc05da476&amp;gt;] __cv_timedwait_sig+0x16/0x20 [spl]
[10285.008890]  [&amp;lt;ffffffffc0705add&amp;gt;] txg_sync_thread+0x19d/0x510 [zfs]
[10285.010105]  [&amp;lt;ffffffffc05d5013&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[10285.011345]  [&amp;lt;ffffffff8b2bb621&amp;gt;] kthread+0xd1/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This could conceivably be stuck waiting for IO from the VM host to complete; I&apos;m not sure yet, so I&apos;ve resubmitted this test session to see whether the problem hits again.  I wouldn&apos;t want to just accept a 1/2 failure rate for landing, but it is possible this is a freak failure.  Resubmitting the patch with several lines of &lt;tt&gt;Test-Parameters: mdtbackfstype=zfs ostbackfstype=zfs mdscount=2 mdtcount=2 testlist=sanity,sanity,sanity,sanity&lt;/tt&gt; would tell us whether this was a freak accident or a regular problem.&lt;/p&gt;</comment>
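<!--
For reference, "several lines of Test-Parameters" means repeating the trailer
in the Gerrit commit message, one line per requested test session; an
illustrative commit-message footer (not taken from an actual patch) would look
like:

    Test-Parameters: mdtbackfstype=zfs ostbackfstype=zfs mdscount=2 mdtcount=2 testlist=sanity,sanity,sanity,sanity
    Test-Parameters: mdtbackfstype=zfs ostbackfstype=zfs mdscount=2 mdtcount=2 testlist=sanity,sanity,sanity,sanity
    Test-Parameters: mdtbackfstype=zfs ostbackfstype=zfs mdscount=2 mdtcount=2 testlist=sanity,sanity,sanity,sanity
-->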
                            <comment id="233327" author="laisiyao" created="Tue, 11 Sep 2018 14:18:58 +0000"  >&lt;p&gt;This may be an old issue, because the same symptom appears on 2.10 also, see LDEV-309 and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8601&quot; title=&quot;sanity test_230d: Timeout on ZFS backed MDSs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8601&quot;&gt;&lt;del&gt;LU-8601&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I&apos;ve enabled full debug and checked the logs: each time it&apos;s stuck in osd_sync() --&amp;gt; txg_wait_synced(). The reason this happens more often with the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4684&quot; title=&quot;DNE3: allow migrating DNE striped directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4684&quot;&gt;&lt;del&gt;LU-4684&lt;/del&gt;&lt;/a&gt; patches is that the new directory migration implementation triggers more lock revocation, which triggers Sync-on-Lock-Cancel more often. Per Alex&apos;s suggestion in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8601&quot; title=&quot;sanity test_230d: Timeout on ZFS backed MDSs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8601&quot;&gt;&lt;del&gt;LU-8601&lt;/del&gt;&lt;/a&gt;, I&apos;d suggest disabling this test on ZFS systems; otherwise the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4684&quot; title=&quot;DNE3: allow migrating DNE striped directory&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4684&quot;&gt;&lt;del&gt;LU-4684&lt;/del&gt;&lt;/a&gt; can&apos;t pass this test.&lt;/p&gt;

&lt;p&gt;Andreas, what&apos;s your opinion?&lt;/p&gt;</comment>
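<!--
The Sync-on-Lock-Cancel path described above is exactly the one in the
backtraces: tgt_blocking_ast() calls tgt_sync(), which reaches osd_sync() in
osd-zfs and then txg_wait_synced(). A rough C sketch of that leaf, with names
approximated from lustre/osd-zfs (hedged, not verbatim source):

    /* Every cancelled lock covering uncommitted state forces a full pool
       sync, so a directory migration that revokes many locks issues
       txg_wait_synced() far more often than the usual ~1/s TXG cadence. */
    static int osd_sync(const struct lu_env *env, struct dt_device *d)
    {
            struct osd_device *osd = osd_dt_dev(d);  /* assumed helper */

            txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL);
            return (0);
    }
-->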
                            <comment id="233444" author="adilger" created="Thu, 13 Sep 2018 06:02:25 +0000"  >&lt;p&gt;I guess the important question is not whether the test is failing , but whether the test represents something that will happen in the real world when directory restripe is being used?  If yes, then it doesn&apos;t help us to disable the test, because it just means that users will get similar hangs on their production systems. &lt;/p&gt;</comment>
                            <comment id="233448" author="laisiyao" created="Thu, 13 Sep 2018 06:36:40 +0000"  >&lt;p&gt;I agree, there are two possible issues about ZFS which needs investigation:&lt;br/&gt;
1. transaction in sync mode may not work.&lt;br/&gt;
2. txg_wait_synced() may hung if called frequently.&lt;/p&gt;

&lt;p&gt;I looked though the latest commits in ZFS code, bug didn&apos;t find related fix. BTW which version of ZFS is deployed in autotest?&lt;/p&gt;</comment>
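<!--
A hypothetical reproducer for point 2, assuming a kernel-side test hook (names
invented for illustration): hammer the pool with back-to-back syncs the way
frequent lock cancels would, and watch for a stall.

    /* Hypothetical stress loop; txg_wait_synced() with txg == 0 waits for
       the currently open TXG, so each iteration forces a fresh sync. */
    static void sketch_sync_storm(objset_t *os, int iters)
    {
            int i;

            for (i = 0; i < iters; i++)
                    txg_wait_synced(dmu_objset_pool(os), 0);
    }
-->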
                            <comment id="233449" author="bzzz" created="Thu, 13 Sep 2018 07:07:26 +0000"  >&lt;p&gt;another theoretical possibility would be a missing I/O completion callback or some race in that area.&lt;/p&gt;</comment>
                            <comment id="233459" author="pjones" created="Thu, 13 Sep 2018 12:42:49 +0000"  >&lt;p&gt;&amp;gt;&#160; BTW which version of ZFS is deployed in autotest?&lt;/p&gt;

&lt;p&gt;Will be 0.7.9 IIUC&lt;/p&gt;</comment>
                            <comment id="236861" author="yujian" created="Mon, 12 Nov 2018 19:55:17 +0000"  >&lt;p&gt;Occurred again on Lustre b2_10 branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/ff7e4cd0-e698-11e8-bfe1-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/ff7e4cd0-e698-11e8-bfe1-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="287155" author="adilger" created="Wed, 9 Dec 2020 22:12:18 +0000"  >&lt;p&gt;+1 on master review-dne-selinux-ssk:&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/dd21c09d-9d88-44e4-9e8e-2f7e93045034&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/dd21c09d-9d88-44e4-9e8e-2f7e93045034&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;there are no errors in any logs; it just looks like it is taking a long time?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="39743">LU-8601</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i000l3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>