<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:58:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13072] High OSS load due to possible deadlock w/ ofd_create_hdl and ofd_quotactl backtraces</title>
                <link>https://jira.whamcloud.com/browse/LU-13072</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We found an OSS on Fir (2.12.3_4) heavily loaded and with hung threads this morning. The problem started during the night. We took a crash dump, which I uploaded to the FTP as &lt;tt&gt;vmcore_fir-io8-s1_2019-12-12-07-42-58&lt;/tt&gt;, along with the kernel-debuginfo packages (&lt;tt&gt;kernel-debuginfo-3.10.0-957.27.2.el7_lustre.pl2.x86_64.rpm&lt;/tt&gt; and &lt;tt&gt;kernel-debuginfo-common-x86_64-3.10.0-957.27.2.el7_lustre.pl2.x86_64.rpm&lt;/tt&gt;).&lt;/p&gt;

&lt;p&gt;The last messages on the console before the crash dump were:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[264018.187097] LNet: Service thread pid 33911 was inactive for 1200.94s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[264018.204204] LNet: Skipped 1 previous similar message
[264018.209269] Pid: 33911, comm: ll_ost03_106 3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1 SMP Thu Nov 7 15:26:16 PST 2019
[264018.219804] Call Trace:
[264018.222354]  [&amp;lt;ffffffffc155deb3&amp;gt;] ofd_create_hdl+0xcb3/0x20e0 [ofd]
[264018.228754]  [&amp;lt;ffffffffc0f5036a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[264018.235831]  [&amp;lt;ffffffffc0ef724b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[264018.243637]  [&amp;lt;ffffffffc0efabac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
[264018.250067]  [&amp;lt;ffffffffa7ac2e81&amp;gt;] kthread+0xd1/0xe0
[264018.255066]  [&amp;lt;ffffffffa8177c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[264018.261642]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[264018.266754] LustreError: dumping log to /tmp/lustre-log.1576165038.33911
[264126.221225] LNet: Service thread pid 34617 was inactive for 200.36s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[264126.238268] Pid: 34617, comm: ll_ost03_111 3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1 SMP Thu Nov 7 15:26:16 PST 2019
[264126.248785] Call Trace:
[264126.251360]  [&amp;lt;ffffffffc02c3085&amp;gt;] wait_transaction_locked+0x85/0xd0 [jbd2]
[264126.258376]  [&amp;lt;ffffffffc02c3368&amp;gt;] add_transaction_credits+0x268/0x2f0 [jbd2]
[264126.265544]  [&amp;lt;ffffffffc02c35e1&amp;gt;] start_this_handle+0x1a1/0x430 [jbd2]
[264126.272211]  [&amp;lt;ffffffffc02c3a93&amp;gt;] jbd2__journal_start+0xf3/0x1f0 [jbd2]
[264126.278951]  [&amp;lt;ffffffffc131b309&amp;gt;] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs]
[264126.286491]  [&amp;lt;ffffffffc1312ad3&amp;gt;] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs]
[264126.293573]  [&amp;lt;ffffffffa7cb026a&amp;gt;] dqget+0x3fa/0x450
[264126.298588]  [&amp;lt;ffffffffa7cb1074&amp;gt;] dquot_get_dqblk+0x14/0x1f0
[264126.304371]  [&amp;lt;ffffffffc143c6d5&amp;gt;] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs]
[264126.311997]  [&amp;lt;ffffffffc13a380d&amp;gt;] lquotactl_slv+0x27d/0x9d0 [lquota]
[264126.318488]  [&amp;lt;ffffffffc1556a3c&amp;gt;] ofd_quotactl+0x13c/0x380 [ofd]
[264126.324630]  [&amp;lt;ffffffffc0f5036a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[264126.331681]  [&amp;lt;ffffffffc0ef724b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[264126.339496]  [&amp;lt;ffffffffc0efabac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
[264126.345912]  [&amp;lt;ffffffffa7ac2e81&amp;gt;] kthread+0xd1/0xe0
[264126.350926]  [&amp;lt;ffffffffa8177c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[264126.357490]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[264126.362604] LustreError: dumping log to /tmp/lustre-log.1576165146.34617
[264136.461417] Pid: 34429, comm: ll_ost03_110 3.10.0-957.27.2.el7_lustre.pl2.x86_64 #1 SMP Thu Nov 7 15:26:16 PST 2019
[264136.471940] Call Trace:
[264136.474507]  [&amp;lt;ffffffffc02c3085&amp;gt;] wait_transaction_locked+0x85/0xd0 [jbd2]
[264136.481514]  [&amp;lt;ffffffffc02c3368&amp;gt;] add_transaction_credits+0x268/0x2f0 [jbd2]
[264136.488699]  [&amp;lt;ffffffffc02c35e1&amp;gt;] start_this_handle+0x1a1/0x430 [jbd2]
[264136.495351]  [&amp;lt;ffffffffc02c3a93&amp;gt;] jbd2__journal_start+0xf3/0x1f0 [jbd2]
[264136.502103]  [&amp;lt;ffffffffc131b309&amp;gt;] __ldiskfs_journal_start_sb+0x69/0xe0 [ldiskfs]
[264136.509629]  [&amp;lt;ffffffffc1312ad3&amp;gt;] ldiskfs_acquire_dquot+0x53/0xb0 [ldiskfs]
[264136.516725]  [&amp;lt;ffffffffa7cb026a&amp;gt;] dqget+0x3fa/0x450
[264136.521728]  [&amp;lt;ffffffffa7cb1074&amp;gt;] dquot_get_dqblk+0x14/0x1f0
[264136.527514]  [&amp;lt;ffffffffc143c6d5&amp;gt;] osd_acct_index_lookup+0x235/0x480 [osd_ldiskfs]
[264136.535127]  [&amp;lt;ffffffffc13a380d&amp;gt;] lquotactl_slv+0x27d/0x9d0 [lquota]
[264136.541618]  [&amp;lt;ffffffffc1556a3c&amp;gt;] ofd_quotactl+0x13c/0x380 [ofd]
[264136.547746]  [&amp;lt;ffffffffc0f5036a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
[264136.554806]  [&amp;lt;ffffffffc0ef724b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[264136.562610]  [&amp;lt;ffffffffc0efabac&amp;gt;] ptlrpc_main+0xb2c/0x1460 [ptlrpc]
[264136.569039]  [&amp;lt;ffffffffa7ac2e81&amp;gt;] kthread+0xd1/0xe0
[264136.574042]  [&amp;lt;ffffffffa8177c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[264136.580632]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[264136.585737] LustreError: dumping log to /tmp/lustre-log.1576165156.34429
[264349.590511] SysRq : Trigger a crash
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Attaching:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;vmcore-dmesg.txt as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/34018/34018_vmcore-dmesg_fir-io8-s1_2019-12-12-07-42-58.txt&quot; title=&quot;vmcore-dmesg_fir-io8-s1_2019-12-12-07-42-58.txt attached to LU-13072&quot;&gt;vmcore-dmesg_fir-io8-s1_2019-12-12-07-42-58.txt&lt;/a&gt;&lt;/span&gt;&lt;/li&gt;
	&lt;li&gt;output of &quot;foreach bt&quot; as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/34017/34017_foreach_bt_fir-io8-s1_2019-12-12-07-42-58.txt&quot; title=&quot;foreach_bt_fir-io8-s1_2019-12-12-07-42-58.txt attached to LU-13072&quot;&gt;foreach_bt_fir-io8-s1_2019-12-12-07-42-58.txt&lt;/a&gt;&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;We restarted the OSS and a few OSTs couldn&apos;t complete their recovery (stuck at 0s), but a manual &lt;tt&gt;abort_recovery&lt;/tt&gt; did work.&lt;/p&gt;
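&lt;p&gt;(For reference, a recovery abort on an OST device can be issued with &lt;tt&gt;lctl&lt;/tt&gt;; the device name below is just an example, not the actual target we aborted:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# abort recovery on one OST device (run on the OSS currently serving it)
lctl --device fir-OST0001 abort_recovery
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;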

&lt;p&gt;Thanks,&lt;br/&gt;
Stephane&lt;/p&gt;</description>
                <environment>lustre-2.12.3_4_g142b4d4-1.el7.x86_64 3.10.0-957.27.2.el7_lustre.pl2.x86_64 MOFED 4.7</environment>
        <key id="57625">LU-13072</key>
            <summary>High OSS load due to possible deadlock w/ ofd_create_hdl and ofd_quotactl backtraces</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ys">Yang Sheng</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Thu, 12 Dec 2019 19:11:29 +0000</created>
                <updated>Tue, 26 Jul 2022 04:01:34 +0000</updated>
                            <resolved>Tue, 26 Jul 2022 04:01:34 +0000</resolved>
                                    <version>Lustre 2.12.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="259817" author="pjones" created="Fri, 13 Dec 2019 18:32:15 +0000"  >&lt;p&gt;Yang Sheng&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="259822" author="green" created="Fri, 13 Dec 2019 18:52:30 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 66168  TASK: ffff8912f5c68000  CPU: 0   COMMAND: &quot;jbd2/md2-8&quot;
 #0 [ffff8912f34cfc08] __schedule at ffffffffa816aa72
 #1 [ffff8912f34cfc98] schedule at ffffffffa816af19
 #2 [ffff8912f34cfca8] jbd2_journal_commit_transaction at ffffffffc02c62bc [jbd2]
 #3 [ffff8912f34cfe48] kjournald2 at ffffffffc02cce89 [jbd2]
 #4 [ffff8912f34cfec8] kthread at ffffffffa7ac2e81
 #5 [ffff8912f34cff50] ret_from_fork_nospec_begin at ffffffffa8177c24
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So jbd2 is committing a transaction, and the commit is taking so long that nobody else can start a new transaction. I also see many other threads waiting on IO, including debugfs (which I presume reads from the same device?), but also many OST IO threads. If there&apos;s a lot of IO contention, it&apos;s somewhat understandable that IO is slow, including slow transaction commits, which in turn delays starting new transactions.&lt;/p&gt;</comment>
                            <comment id="259993" author="ys" created="Mon, 16 Dec 2019 17:03:35 +0000"  >&lt;p&gt;Hi, Stephane,&lt;/p&gt;

&lt;p&gt;After investigating, I think this issue is related to a storage problem. All of the ofd_create_hdl threads were blocked in wait_transaction_locked, and the ofd_quotactl threads were blocked in the same place. They are all waiting to start a new transaction. The running transaction was blocked by:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 67875  TASK: ffff891241b65140  CPU: 9   COMMAND: &quot;ll_ost_io01_019&quot;
 #0 [ffff891241e2f8e8] __schedule at ffffffffa816aa72
 #1 [ffff891241e2f978] schedule at ffffffffa816af19
 #2 [ffff891241e2f988] osd_trans_stop at ffffffffc140fe75 [osd_ldiskfs]
 #3 [ffff891241e2fa40] ofd_trans_stop at ffffffffc1569c75 [ofd]
 #4 [ffff891241e2fa50] ofd_commitrw_write at ffffffffc1570c34 [ofd]
 #5 [ffff891241e2fad8] ofd_commitrw at ffffffffc157510c [ofd]
 #6 [ffff891241e2fb58] tgt_brw_write at ffffffffc0f5460b [ptlrpc]
 #7 [ffff891241e2fcd0] tgt_request_handle at ffffffffc0f5036a [ptlrpc]
 #8 [ffff891241e2fd58] ptlrpc_server_handle_request at ffffffffc0ef724b [ptlrpc]
 #9 [ffff891241e2fdf8] ptlrpc_main at ffffffffc0efabac [ptlrpc]
#10 [ffff891241e2fec8] kthread at ffffffffa7ac2e81
#11 [ffff891241e2ff50] ret_from_fork_nospec_begin at ffffffffa8177c24
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That thread was waiting for data to be written out. It is waiting on a few threads like:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 67874  TASK: ffff891241b64100  CPU: 42  COMMAND: &quot;ll_ost_io02_019&quot;
 #0 [ffff891241e2b4a8] __schedule at ffffffffa816aa72
 #1 [ffff891241e2b538] schedule at ffffffffa816af19
 #2 [ffff891241e2b548] bitmap_startwrite at ffffffffa7f9cc65
 #3 [ffff891241e2b5c8] add_stripe_bio at ffffffffc1383471 [raid456]
 #4 [ffff891241e2b628] raid5_make_request at ffffffffc1388db4 [raid456]
 #5 [ffff891241e2b710] md_handle_request at ffffffffa7f8fa30
 #6 [ffff891241e2b780] md_make_request at ffffffffa7f8fb99
 #7 [ffff891241e2b7a8] generic_make_request at ffffffffa7d46e37
 #8 [ffff891241e2b800] submit_bio at ffffffffa7d470e0
 #9 [ffff891241e2b858] osd_submit_bio at ffffffffc14255ac [osd_ldiskfs]
#10 [ffff891241e2b868] osd_do_bio at ffffffffc1427925 [osd_ldiskfs]
#11 [ffff891241e2b9d0] osd_write_commit at ffffffffc142820c [osd_ldiskfs]
#12 [ffff891241e2ba50] ofd_commitrw_write at ffffffffc157121e [ofd]
#13 [ffff891241e2bad8] ofd_commitrw at ffffffffc157510c [ofd]
#14 [ffff891241e2bb58] tgt_brw_write at ffffffffc0f5460b [ptlrpc]
#15 [ffff891241e2bcd0] tgt_request_handle at ffffffffc0f5036a [ptlrpc]
#16 [ffff891241e2bd58] ptlrpc_server_handle_request at ffffffffc0ef724b [ptlrpc]
#17 [ffff891241e2bdf8] ptlrpc_main at ffffffffc0efabac [ptlrpc]
#18 [ffff891241e2bec8] kthread at ffffffffa7ac2e81
#19 [ffff891241e2bf50] ret_from_fork_nospec_begin at ffffffffa8177c24
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These threads were writing to /dev/md2, which looks like a software raid6 device. I suggest investigating the write performance of this device to determine whether it is slower than the others.&lt;/p&gt;
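&lt;p&gt;(For reference, a minimal way to spot-check the array state and per-device latency; the interval and sample count are just examples:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# raid state, degraded members, resync/check activity
cat /proc/mdstat
mdadm --detail /dev/md2

# extended per-device statistics (await, %util) every 5 seconds, 12 samples
iostat -xmt 5 12
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;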

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;

</comment>
                            <comment id="260077" author="sthiell" created="Tue, 17 Dec 2019 23:42:31 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Thank you very much for your time on this, I really appreciate it. After the reboot, all OSTs came back online and everything was fine on all targets, even on /dev/md2. I wonder if this could be some kind of mdraid deadlock, or perhaps a very transient slowdown of the raid6 array.&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="260095" author="ys" created="Wed, 18 Dec 2019 13:07:23 +0000"  >&lt;p&gt;Hi, Stephane,&lt;/p&gt;

&lt;p&gt;I have investigated all of the threads carefully and cannot find any sign of a deadlock. I suggest collecting disk IO info (e.g. with iostat) the next time this issue is hit, and then comparing it with data collected during normal operation, to determine whether it is a storage problem.&lt;/p&gt;
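&lt;p&gt;(A minimal collection sketch; the log paths are just examples:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# run during normal operation to record a baseline
iostat -xmt 5 | tee /tmp/iostat_baseline.log

# run the same command again while the issue is occurring, then compare
iostat -xmt 5 | tee /tmp/iostat_incident.log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;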

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="260132" author="sthiell" created="Thu, 19 Dec 2019 01:00:47 +0000"  >&lt;p&gt;Hi&#160;YangSheng,&lt;/p&gt;

&lt;p&gt;Thanks! Next time I&apos;ll try to collect disk I/O info; that&apos;s a great idea indeed. But so far, this OST has been running fine since this event.&lt;/p&gt;

&lt;p&gt;Best,&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="341529" author="ys" created="Tue, 26 Jul 2022 04:01:34 +0000"  >&lt;p&gt;Please reopen this ticket if the issue is hit again.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="34017" name="foreach_bt_fir-io8-s1_2019-12-12-07-42-58.txt" size="1680034" author="sthiell" created="Thu, 12 Dec 2019 19:09:28 +0000"/>
                            <attachment id="34018" name="vmcore-dmesg_fir-io8-s1_2019-12-12-07-42-58.txt" size="925332" author="sthiell" created="Thu, 12 Dec 2019 19:09:08 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00qxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>