<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:33:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10250] replay-single test_74:  hang and timed out</title>
                <link>https://jira.whamcloud.com/browse/LU-10250</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Jinshan Xiong &amp;lt;jinshan.xiong@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/96539eb0-cadb-11e7-8027-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/96539eb0-cadb-11e7-8027-52540065bddc&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_74 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Timeout occurred after 128 mins, last suite running was replay-single, restarting cluster to continue tests
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Please provide additional information about the failure here.&lt;/p&gt;

&lt;p&gt;Info required for matching: replay-single 74&lt;/p&gt;

&lt;p&gt;On the client stack, there is a stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4440.087054] INFO: task touch:18639 blocked for more than 120 seconds.
[ 4440.089288] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 4440.091541] touch           D 0000000000000000     0 18639  17432 0x00000080
[ 4440.093752]  ffff880069033a40 0000000000000086 ffff880036a79fa0 ffff880069033fd8
[ 4440.096088]  ffff880069033fd8 ffff880069033fd8 ffff880036a79fa0 ffff880066647650
[ 4440.098281]  7fffffffffffffff ffff880066647648 ffff880036a79fa0 0000000000000000
[ 4440.100517] Call Trace:
[ 4440.102304]  [&amp;lt;ffffffff816a9589&amp;gt;] schedule+0x29/0x70
[ 4440.104209]  [&amp;lt;ffffffff816a7099&amp;gt;] schedule_timeout+0x239/0x2c0
[ 4440.106187]  [&amp;lt;ffffffffc09e3108&amp;gt;] ? ptlrpc_set_add_new_req+0xd8/0x150 [ptlrpc]
[ 4440.108288]  [&amp;lt;ffffffffc0bc2bc0&amp;gt;] ? osc_io_ladvise_end+0x50/0x50 [osc]
[ 4440.110272]  [&amp;lt;ffffffffc0a1324b&amp;gt;] ? ptlrpcd_add_req+0x22b/0x300 [ptlrpc]
[ 4440.112208]  [&amp;lt;ffffffffc09f5e40&amp;gt;] ? lustre_swab_niobuf_remote+0x30/0x30 [ptlrpc]
[ 4440.114221]  [&amp;lt;ffffffff816a993d&amp;gt;] wait_for_completion+0xfd/0x140
[ 4440.116026]  [&amp;lt;ffffffff810c4820&amp;gt;] ? wake_up_state+0x20/0x20
[ 4440.117835]  [&amp;lt;ffffffffc0bc2d84&amp;gt;] osc_io_setattr_end+0xc4/0x180 [osc]
[ 4440.119639]  [&amp;lt;ffffffffc0bc4381&amp;gt;] ? osc_io_setattr_start+0x471/0x6e0 [osc]
[ 4440.121517]  [&amp;lt;ffffffffc0c15450&amp;gt;] ? lov_io_iter_fini_wrapper+0x50/0x50 [lov]
[ 4440.123344]  [&amp;lt;ffffffffc08201ed&amp;gt;] cl_io_end+0x5d/0x150 [obdclass]
[ 4440.125122]  [&amp;lt;ffffffffc0c1552b&amp;gt;] lov_io_end_wrapper+0xdb/0xe0 [lov]
[ 4440.126825]  [&amp;lt;ffffffffc0c15b75&amp;gt;] lov_io_call.isra.5+0x85/0x140 [lov]
[ 4440.128586]  [&amp;lt;ffffffffc0c15c66&amp;gt;] lov_io_end+0x36/0xb0 [lov]
[ 4440.130210]  [&amp;lt;ffffffffc08201ed&amp;gt;] cl_io_end+0x5d/0x150 [obdclass]
[ 4440.131901]  [&amp;lt;ffffffffc082287f&amp;gt;] cl_io_loop+0x13f/0xc70 [obdclass]
[ 4440.133544]  [&amp;lt;ffffffffc0cbb4d0&amp;gt;] cl_setattr_ost+0x240/0x3a0 [lustre]
[ 4440.135229]  [&amp;lt;ffffffffc0c95b05&amp;gt;] ll_setattr_raw+0x1185/0x1290 [lustre]
[ 4440.136821]  [&amp;lt;ffffffffc0c95c7c&amp;gt;] ll_setattr+0x6c/0xd0 [lustre]
[ 4440.138380]  [&amp;lt;ffffffff8121ee71&amp;gt;] notify_change+0x2c1/0x420
[ 4440.139813]  [&amp;lt;ffffffff81233b59&amp;gt;] utimes_common+0xd9/0x1c0
[ 4440.141276]  [&amp;lt;ffffffff81233d7e&amp;gt;] do_utimes+0x13e/0x180
[ 4440.142646]  [&amp;lt;ffffffff81233ec4&amp;gt;] SyS_utimensat+0x64/0xb0
[ 4440.144085]  [&amp;lt;ffffffff816ac5c8&amp;gt;] ? page_fault+0x28/0x30
[ 4440.145450]  [&amp;lt;ffffffff816b5089&amp;gt;] system_call_fastpath+0x16/0x1b
[ 4560.146045] INFO: task touch:18639 blocked for more than 120 seconds.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That shows the client was waiting for a setattr to complete, and on the OFD side:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 4163.936021] LNet: Service thread pid 15315 was inactive for 40.10s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[ 4163.941814] Pid: 15315, comm: ll_ost00_000
[ 4163.944340] 
Call Trace:
[ 4163.948946]  [&amp;lt;ffffffff816a9589&amp;gt;] schedule+0x29/0x70
[ 4163.951469]  [&amp;lt;ffffffffc07084d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[ 4163.954051]  [&amp;lt;ffffffff810b1920&amp;gt;] ? autoremove_wake_function+0x0/0x40
[ 4163.956663]  [&amp;lt;ffffffffc0708515&amp;gt;] __cv_wait+0x15/0x20 [spl]
[ 4163.959223]  [&amp;lt;ffffffffc085781f&amp;gt;] txg_wait_synced+0xef/0x140 [zfs]
[ 4163.961745]  [&amp;lt;ffffffffc080cc65&amp;gt;] dmu_tx_wait+0x275/0x3c0 [zfs]
[ 4163.964242]  [&amp;lt;ffffffffc080ce41&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
[ 4163.966725]  [&amp;lt;ffffffffc0ecbe1a&amp;gt;] ? tgt_txn_start_cb+0x1da/0x3b0 [ptlrpc]
[ 4163.969213]  [&amp;lt;ffffffffc1098f37&amp;gt;] osd_trans_start+0xa7/0x3a0 [osd_zfs]
[ 4163.971617]  [&amp;lt;ffffffffc11c1fab&amp;gt;] ofd_trans_start+0x6b/0xe0 [ofd]
[ 4163.973952]  [&amp;lt;ffffffffc11c4943&amp;gt;] ofd_attr_set+0x433/0xb00 [ofd]
[ 4163.976281]  [&amp;lt;ffffffffc11afff3&amp;gt;] ofd_setattr_hdl+0x303/0x950 [ofd]
[ 4163.978596]  [&amp;lt;ffffffffc0ed7c05&amp;gt;] tgt_request_handle+0x925/0x13b0 [ptlrpc]
[ 4163.980977]  [&amp;lt;ffffffffc0e7be8e&amp;gt;] ptlrpc_server_handle_request+0x24e/0xab0 [ptlrpc]
[ 4163.983391]  [&amp;lt;ffffffff810ba598&amp;gt;] ? __wake_up_common+0x58/0x90
[ 4163.985676]  [&amp;lt;ffffffffc0e7f632&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[ 4163.987983]  [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
[ 4163.990182]  [&amp;lt;ffffffff816a9000&amp;gt;] ? __schedule+0x350/0x8b0
[ 4163.992348]  [&amp;lt;ffffffffc0e7eba0&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
[ 4163.994528]  [&amp;lt;ffffffff810b099f&amp;gt;] kthread+0xcf/0xe0
[ 4163.996574]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? kthread+0x0/0xe0
[ 4163.998609]  [&amp;lt;ffffffff816b4fd8&amp;gt;] ret_from_fork+0x58/0x90
[ 4164.000651]  [&amp;lt;ffffffff810b08d0&amp;gt;] ? kthread+0x0/0xe0

[ 4164.004284] LustreError: dumping log to /tmp/lustre-log.1510820580.15315
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It&apos;s waiting for a sync trans, which handled the setattr from client, and it seems the trans is never completed.&lt;/p&gt;

&lt;p&gt;I guess this is the same issue &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4440&quot; title=&quot;replay-single test_74: timeout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4440&quot;&gt;&lt;del&gt;LU-4440&lt;/del&gt;&lt;/a&gt;. Unfortunately there is no useful description on the ticket.&lt;/p&gt;</description>
                <environment></environment>
        <key id="49351">LU-10250</key>
            <summary>replay-single test_74:  hang and timed out</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Thu, 16 Nov 2017 19:00:40 +0000</created>
                <updated>Fri, 18 Feb 2022 22:23:02 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="214891" author="adilger" created="Tue, 28 Nov 2017 23:53:50 +0000"  >&lt;p&gt;The stack traces dumped on the client are because &lt;tt&gt;osc_io_setattr_end()&lt;/tt&gt; is using &lt;tt&gt;wait_for_completion()&lt;/tt&gt; directly.  It would be better to use &lt;tt&gt;wait_for_completion_interruptible()&lt;/tt&gt; so that the wait is interruptible in case the user wants to kill the process and get on with their day.&lt;/p&gt;

&lt;p&gt;That doesn&apos;t resolve the issue on the server, but the client may as well be fixed at the same time.&lt;/p&gt;</comment>
                            <comment id="232772" author="adilger" created="Wed, 29 Aug 2018 21:43:06 +0000"  >&lt;p&gt;This seems to be hitting occasionally - unexplained and unreported hang in ZFS &lt;tt&gt;dmu_tx_wait()&lt;/tt&gt; (see linked tickets).  I don&apos;t know if this is an artifact of our VM testing, or a problem that also hits on real hardware, but it is something to be aware of and keep an eye open for.&lt;/p&gt;</comment>
                            <comment id="250622" author="simmonsja" created="Wed, 3 Jul 2019 17:23:34 +0000"  >&lt;p&gt;This is a problem hit on real hardware. This problem took down our production file system &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="250625" author="adilger" created="Wed, 3 Jul 2019 18:01:16 +0000"  >&lt;p&gt;James, could you please file a separate ticket for your issue.  It is of course fine to link to this ticket, but that allows tracking the problem with your production system better, and it may be that the problem you are seeing is unrelated (being stuck in &quot;&lt;tt&gt;dmu_tx_wait()&lt;/tt&gt;&quot; is basically the equivalent of being stuck in &quot;&lt;tt&gt;start_this_handle()&lt;/tt&gt;&quot; for ldiskfs, so there are a hundred ways to get there).&lt;/p&gt;

&lt;p&gt;Also, I wasn&apos;t aware that you are running ZFS in production?  I thought you had ldiskfs-based filesystems on Spider2?&lt;/p&gt;</comment>
                            <comment id="250636" author="simmonsja" created="Wed, 3 Jul 2019 18:57:48 +0000"  >&lt;p&gt;This is for our NOAA file system which is also production. It is 2.12 LTS using ZFS as back end.&lt;/p&gt;</comment>
                            <comment id="250661" author="bzzz" created="Thu, 4 Jul 2019 03:53:11 +0000"  >&lt;p&gt;it would be helpful to get a full set of backtraces. dmu_tx_wait() can be waiting for a new TXG while current TXG is pinned by another process.&lt;/p&gt;
</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="48383">LU-10009</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="48574">LU-10065</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49229">LU-10223</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="50420">LU-10572</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="50835">LU-10670</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="56275">LU-12510</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzznyn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>