<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:45:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4716] replay-ost-single test_5: stuck in dbuf_read-&gt;zio_wait</title>
                <link>https://jira.whamcloud.com/browse/LU-4716</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah &amp;lt;sarah@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/42573266-9f17-11e3-934b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/42573266-9f17-11e3-934b-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_5 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test failed to respond and timed out&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Cannot find a useful log; there are some D processes in the client 2 console log, but they are not Lustre processes.&lt;/p&gt;</description>
                <environment>client and server: lustre-master build # 1911 RHEL6 zfs</environment>
        <key id="23474">LU-4716</key>
            <summary>replay-ost-single test_5: stuck in dbuf_read-&gt;zio_wait</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="isaac">Isaac Huang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>zfs</label>
                    </labels>
                <created>Wed, 5 Mar 2014 18:21:30 +0000</created>
                <updated>Wed, 23 Dec 2015 21:08:09 +0000</updated>
                            <resolved>Wed, 23 Dec 2015 21:08:09 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="79263" author="adilger" created="Thu, 13 Mar 2014 17:53:33 +0000"  >&lt;p&gt;There are some ll_ost threads blocked in ZFS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Feb 26 02:27:43 client-16vm4 kernel: ll_ost_io00_0
Feb 26 02:27:43 client-16vm4 kernel: Call Trace:
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0142edd&amp;gt;] cv_wait_common+0xed/0x100 [spl]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0142f45&amp;gt;] __cv_wait+0x15/0x20 [spl]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0241e9b&amp;gt;] txg_wait_open+0x7b/0xa0 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0206a5d&amp;gt;] dmu_tx_wait+0xed/0xf0 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0206aee&amp;gt;] dmu_tx_assign+0x8e/0x4e0 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa11b456c&amp;gt;] osd_trans_start+0x9c/0x410 [osd_zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa15cf54c&amp;gt;] ofd_trans_start+0x7c/0x100 [ofd]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa15d48b3&amp;gt;] ofd_commitrw_write+0x523/0xfd0 [ofd]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa15d588a&amp;gt;] ofd_commitrw+0x52a/0x8c0 [ofd]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12e971d&amp;gt;] obd_commitrw.clone.0+0x11d/0x390 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12f093e&amp;gt;] tgt_brw_write+0xc7e/0x1530 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12ef43c&amp;gt;] tgt_request_handle+0x23c/0xac0 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa129e6ea&amp;gt;] ptlrpc_main+0xd1a/0x1980 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: ll_ost_io00_0
Feb 26 02:27:43 client-16vm4 kernel: Call Trace:
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffff81528823&amp;gt;] io_schedule+0x73/0xc0
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0142e7c&amp;gt;] cv_wait_common+0x8c/0x100 [spl]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa0142f08&amp;gt;] __cv_wait_io+0x18/0x20 [spl]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa02864ab&amp;gt;] zio_wait+0xfb/0x1b0 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa01f2fdd&amp;gt;] dbuf_read+0x3fd/0x750 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa01f4d84&amp;gt;] dmu_buf_will_dirty+0xa4/0x100 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa01fb930&amp;gt;] dmu_write+0x90/0x160 [zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa11c28f7&amp;gt;] osd_write_commit+0x417/0x570 [osd_zfs]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa15d4994&amp;gt;] ofd_commitrw_write+0x604/0xfd0 [ofd]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa15d588a&amp;gt;] ofd_commitrw+0x52a/0x8c0 [ofd]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12e971d&amp;gt;] obd_commitrw.clone.0+0x11d/0x390 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12f093e&amp;gt;] tgt_brw_write+0xc7e/0x1530 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa12ef43c&amp;gt;] tgt_request_handle+0x23c/0xac0 [ptlrpc]
Feb 26 02:27:43 client-16vm4 kernel: [&amp;lt;ffffffffa129e6ea&amp;gt;] ptlrpc_main+0xd1a/0x1980 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79264" author="adilger" created="Thu, 13 Mar 2014 17:59:11 +0000"  >&lt;p&gt;I think it makes sense to upgrade our ZFS code to 0.6.3, or the tip of the master branch if that is not available yet.  There isn&apos;t any value to chase bugs that may already be fixed in the latest ZFS code (which we can hope will be released before Lustre 2.6.0).&lt;/p&gt;</comment>
                            <comment id="79319" author="isaac" created="Fri, 14 Mar 2014 06:24:51 +0000"  >&lt;p&gt;On the OSS, there seemed to be 7 ZFS pools, one for each OST, as there were 7 txg_sync threads. Six of them were sleeping waiting for work to do, but one was busy blocking for IO in spa_sync(), writing out the syncing txg. Four ll_ost threads were blocking in dmu_tx_assign()-&amp;gt;txg_wait_open() - looks like the open txg couldn&apos;t accept new tx due to write throttling, and it couldn&apos;t move to the quiescing state as the syncing txg couldn&apos;t complete.&lt;/p&gt;

&lt;p&gt;I agree with Andreas that there&apos;s little value chasing down problems in 0.6.2, as the ZFS write throttle code has been completely reworked post 0.6.2.&lt;/p&gt;

&lt;p&gt;But there are two more things.&lt;/p&gt;

&lt;p&gt;1. Maybe I&apos;ve missed something, but I couldn&apos;t find any ZFS debugging data on the Maloo test report. It&apos;d be very useful to have a tarball of /proc/spl/, when FSTYPE=zfs. Lots of useful data can be found under that directory, e.g. dmu_tx_assign delay histogram.&lt;/p&gt;

&lt;p&gt;2. Among the 7 ZFS pools, only one was busy writing out data at the time. I wonder why all the load hit a single OST while there were 7.&lt;/p&gt;</comment>
                            <comment id="79380" author="isaac" created="Fri, 14 Mar 2014 21:33:21 +0000"  >&lt;p&gt;Created ticket &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5674&quot; title=&quot;Maloo test report should include zfs debugging data when when FSTYPE=zfs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5674&quot;&gt;&lt;del&gt;TEI-1729&lt;/del&gt;&lt;/a&gt; to ask for ZFS debugging data to be included in test reports.&lt;/p&gt;</comment>
                            <comment id="80256" author="isaac" created="Tue, 25 Mar 2014 21:59:56 +0000"  >&lt;p&gt;Looking further at the logs today, the busy txg_sync thread was blocked because IO couldn&apos;t complete:&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0142e7c&amp;gt;&amp;#93;&lt;/span&gt; cv_wait_common+0x8c/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8109b290&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x0/0x40&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0142f08&amp;gt;&amp;#93;&lt;/span&gt; __cv_wait_io+0x18/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;spl&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa02864ab&amp;gt;&amp;#93;&lt;/span&gt; zio_wait+0xfb/0x1b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa021e3a4&amp;gt;&amp;#93;&lt;/span&gt; dsl_pool_sync+0xf4/0x540 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 26 02:27:43 client-16vm4 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa023725e&amp;gt;&amp;#93;&lt;/span&gt; spa_sync+0x40e/0xa00 &lt;span class=&quot;error&quot;&gt;&amp;#91;zfs&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;This could be the known ZFS IO starvation issue with the Linux cfq scheduler. Although ZFS automatically sets the IO scheduler to noop on whole disks, the host OS could still be using cfq for the disks behind the guest OS disks. It seemed that ZFS pools on OSS were set up:&lt;br/&gt;
zpool import -f -o cachefile=none -d /dev/lvm-Role_OSS lustre-ost3&lt;/p&gt;

&lt;p&gt;It was not clear to me how devices under /dev/lvm-Role_OSS were set up and used. I think it makes sense to make sure that our test system:&lt;br/&gt;
1. Use whole disks for zfs pools on guest VMs.&lt;br/&gt;
2. Use noop IO scheduler for corresponding disks on host OS.&lt;/p&gt;</comment>
                            <comment id="137335" author="adilger" created="Wed, 23 Dec 2015 21:08:09 +0000"  >&lt;p&gt;Closing old issue, since we haven&apos;t seen this in a long time (suspect update to 0.6.3 fixed it).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="25269">LU-5242</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="22795">LU-4507</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwgtj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12962</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>