<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:01:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13413] Lustre soft lockups with peer credit exhaustion</title>
                <link>https://jira.whamcloud.com/browse/LU-13413</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Currently our production file system is experiencing peer credit exhaustion, which is leading to soft lockups. The back traces are attached. Instead of soft lockups we should be seeing evictions.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2642432.333239]  [&amp;lt;ffffffff9477544a&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[2642432.333241]  [&amp;lt;ffffffff94783330&amp;gt;] _raw_spin_lock+0x20/0x30
[2642432.333258]  [&amp;lt;ffffffffc146202c&amp;gt;] lock_res_and_lock+0x2c/0x50 [ptlrpc]
[2642432.333273]  [&amp;lt;ffffffffc1469c61&amp;gt;] ldlm_lock_enqueue+0x1b1/0xa20 [ptlrpc]
[2642432.333294]  [&amp;lt;ffffffffc14b9891&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[2642432.333311]  [&amp;lt;ffffffffc1492506&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
[2642432.333332]  [&amp;lt;ffffffffc14bb300&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[2642432.333357]  [&amp;lt;ffffffffc151acf2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[2642432.333381]  [&amp;lt;ffffffffc1521b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[2642432.333404]  [&amp;lt;ffffffffc14fb021&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[2642432.333408]  [&amp;lt;ffffffffc103fbde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[2642432.333428]  [&amp;lt;ffffffffc14c646b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[2642432.333449]  [&amp;lt;ffffffffc14c3285&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[2642432.333451]  [&amp;lt;ffffffff940d3903&amp;gt;] ? __wake_up+0x13/0x20
[2642432.333471]  [&amp;lt;ffffffffc14c9dd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
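
&lt;p&gt;For reference, a rough sketch of how peer credit usage can be checked on the servers (the exact interface depends on the LNet version in use; a negative &lt;tt&gt;min&lt;/tt&gt; credit value indicates messages have queued waiting for credits):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# per-peer credit counters (tx/min go negative when credits are exhausted)
cat /proc/sys/lnet/peers
# or via the DLC interface, if available
lnetctl peer show -v
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;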

</description>
                <environment>RHEL7.7 running Lustre 2.12 LTS</environment>
        <key id="58604">LU-13413</key>
            <summary>Lustre soft lockups with peer credit exhaustion</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="adilger">Andreas Dilger</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>ORNL</label>
                    </labels>
                <created>Fri, 3 Apr 2020 13:53:07 +0000</created>
                <updated>Mon, 11 Jul 2022 23:56:41 +0000</updated>
                            <resolved>Thu, 2 Dec 2021 16:53:45 +0000</resolved>
                                    <version>Lustre 2.12.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="266812" author="adilger" created="Fri, 3 Apr 2020 21:30:08 +0000"  >&lt;p&gt;Hi James, any changes to the system that would have triggered this to start happening (upgraded Lustre, new patch, new clients, new workload, etc.)?&lt;/p&gt;</comment>
                            <comment id="266928" author="simmonsja" created="Mon, 6 Apr 2020 16:48:27 +0000"  >&lt;p&gt;I talked to our admin team and they stated its the same problem since October with the upgrade to 2.12 LTS. With &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12600&quot; title=&quot;Lustre tgt_brw_write() bug&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12600&quot;&gt;&lt;del&gt;LU-12600&lt;/del&gt;&lt;/a&gt; it did reduce the problem but it still exist. This problem shows up when clients are hitting the servers with large numbers of small I/Os.&lt;/p&gt;

&lt;p&gt;Do you think tuning the lru_resize and lru_size might help? The current values are&lt;/p&gt;

&lt;p&gt;lru_size=2000 and lru_resize=3900000&lt;/p&gt;</comment>
                            <comment id="266940" author="adilger" created="Mon, 6 Apr 2020 17:40:31 +0000"  >&lt;p&gt;James, how many locks are actually being held by the clients and are on the servers?  Running &quot;&lt;tt&gt;lctl get_param ldlm.namespaces.&amp;#42;.lock_count&lt;/tt&gt;&quot; on some clients and servers would give a good idea of where this is at.  There were a couple of bugs related to not canceling the DLM locks on the clients, even above LRU size, so it would be good to know if this is the problem you are seeing.&lt;/p&gt;

&lt;p&gt;Also, are there any patches above 2.12.4?&lt;/p&gt;</comment>
                            <comment id="266941" author="simmonsja" created="Mon, 6 Apr 2020 18:15:20 +0000"  >&lt;p&gt;The only patch we are running outside of 2.12 is a port of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13101&quot; title=&quot;Eviction during ll_open_cleanup()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13101&quot;&gt;&lt;del&gt;LU-13101&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;f2-oss1c1: ldlm.namespaces.filter-f2-OST0002_UUID.lock_count=13071&lt;br/&gt;
f2-oss1c1: ldlm.namespaces.filter-f2-OST0026_UUID.lock_count=15371&lt;br/&gt;
f2-oss1f5: ldlm.namespaces.filter-f2-OST001d_UUID.lock_count=9772&lt;br/&gt;
f2-oss1f5: ldlm.namespaces.filter-f2-OST0041_UUID.lock_count=23522&lt;br/&gt;
f2-oss1f4: ldlm.namespaces.filter-f2-OST0017_UUID.lock_count=6410&lt;br/&gt;
f2-oss1f4: ldlm.namespaces.filter-f2-OST003b_UUID.lock_count=10967&lt;br/&gt;
f2-oss1e2: ldlm.namespaces.filter-f2-OST000a_UUID.lock_count=9424&lt;br/&gt;
f2-oss1e2: ldlm.namespaces.filter-f2-OST002e_UUID.lock_count=14095&lt;br/&gt;
f2-oss1e6: ldlm.namespaces.filter-f2-OST0022_UUID.lock_count=9126&lt;br/&gt;
f2-oss1e6: ldlm.namespaces.filter-f2-OST0046_UUID.lock_count=14720&lt;br/&gt;
f2-oss1e1: ldlm.namespaces.filter-f2-OST0004_UUID.lock_count=5522&lt;br/&gt;
f2-oss1e1: ldlm.namespaces.filter-f2-OST0028_UUID.lock_count=13113&lt;br/&gt;
f2-oss1c4: ldlm.namespaces.filter-f2-OST0014_UUID.lock_count=31000&lt;br/&gt;
f2-oss1c4: ldlm.namespaces.filter-f2-OST0038_UUID.lock_count=8096&lt;br/&gt;
f2-oss1a1: ldlm.namespaces.filter-f2-OST0000_UUID.lock_count=9730&lt;br/&gt;
f2-oss1a1: ldlm.namespaces.filter-f2-OST0024_UUID.lock_count=10386&lt;br/&gt;
f2-oss1a2: ldlm.namespaces.filter-f2-OST0006_UUID.lock_count=6744&lt;br/&gt;
f2-oss1a2: ldlm.namespaces.filter-f2-OST002a_UUID.lock_count=17169&lt;br/&gt;
f2-oss1b2: ldlm.namespaces.filter-f2-OST0007_UUID.lock_count=7458&lt;br/&gt;
f2-oss1b2: ldlm.namespaces.filter-f2-OST002b_UUID.lock_count=12837&lt;br/&gt;
f2-oss1e3: ldlm.namespaces.filter-f2-OST0010_UUID.lock_count=17431&lt;br/&gt;
f2-oss1e3: ldlm.namespaces.filter-f2-OST0034_UUID.lock_count=11464&lt;br/&gt;
f2-oss1d4: ldlm.namespaces.filter-f2-OST0015_UUID.lock_count=9836&lt;br/&gt;
f2-oss1d4: ldlm.namespaces.filter-f2-OST0039_UUID.lock_count=14168&lt;br/&gt;
f2-oss1d3: ldlm.namespaces.filter-f2-OST000f_UUID.lock_count=11082&lt;br/&gt;
f2-oss1d3: ldlm.namespaces.filter-f2-OST0033_UUID.lock_count=9801&lt;br/&gt;
f2-oss1f6: ldlm.namespaces.filter-f2-OST0023_UUID.lock_count=7558&lt;br/&gt;
f2-oss1f6: ldlm.namespaces.filter-f2-OST0047_UUID.lock_count=10884&lt;br/&gt;
f2-oss1c6: ldlm.namespaces.filter-f2-OST0020_UUID.lock_count=11754&lt;br/&gt;
f2-oss1c6: ldlm.namespaces.filter-f2-OST0044_UUID.lock_count=21609&lt;br/&gt;
f2-oss1f2: ldlm.namespaces.filter-f2-OST000b_UUID.lock_count=7273&lt;br/&gt;
f2-oss1f2: ldlm.namespaces.filter-f2-OST002f_UUID.lock_count=16546&lt;br/&gt;
f2-oss1d2: ldlm.namespaces.filter-f2-OST0009_UUID.lock_count=5351&lt;br/&gt;
f2-oss1d2: ldlm.namespaces.filter-f2-OST002d_UUID.lock_count=11648&lt;br/&gt;
f2-oss1b4: ldlm.namespaces.filter-f2-OST0013_UUID.lock_count=7091&lt;br/&gt;
f2-oss1b4: ldlm.namespaces.filter-f2-OST0037_UUID.lock_count=17672&lt;br/&gt;
f2-oss1d1: ldlm.namespaces.filter-f2-OST0003_UUID.lock_count=10786&lt;br/&gt;
f2-oss1d1: ldlm.namespaces.filter-f2-OST0027_UUID.lock_count=12821&lt;br/&gt;
f2-oss1a3: ldlm.namespaces.filter-f2-OST000c_UUID.lock_count=8583&lt;br/&gt;
f2-oss1a3: ldlm.namespaces.filter-f2-OST0030_UUID.lock_count=10223&lt;br/&gt;
f2-oss1b1: ldlm.namespaces.filter-f2-OST0001_UUID.lock_count=19223&lt;br/&gt;
f2-oss1b1: ldlm.namespaces.filter-f2-OST0025_UUID.lock_count=12262&lt;br/&gt;
f2-oss1c3: ldlm.namespaces.filter-f2-OST000e_UUID.lock_count=19796&lt;br/&gt;
f2-oss1c3: ldlm.namespaces.filter-f2-OST0032_UUID.lock_count=19187&lt;br/&gt;
f2-oss1c2: ldlm.namespaces.filter-f2-OST0008_UUID.lock_count=22208&lt;br/&gt;
f2-oss1c2: ldlm.namespaces.filter-f2-OST002c_UUID.lock_count=17602&lt;br/&gt;
f2-oss1e5: ldlm.namespaces.filter-f2-OST001c_UUID.lock_count=10177&lt;br/&gt;
f2-oss1e5: ldlm.namespaces.filter-f2-OST0040_UUID.lock_count=18898&lt;br/&gt;
f2-oss1d6: ldlm.namespaces.filter-f2-OST0021_UUID.lock_count=9237&lt;br/&gt;
f2-oss1d6: ldlm.namespaces.filter-f2-OST0045_UUID.lock_count=23232&lt;br/&gt;
f2-oss1b6: ldlm.namespaces.filter-f2-OST001f_UUID.lock_count=8040&lt;br/&gt;
f2-oss1b6: ldlm.namespaces.filter-f2-OST0043_UUID.lock_count=22877&lt;br/&gt;
f2-oss1f1: ldlm.namespaces.filter-f2-OST0005_UUID.lock_count=8117&lt;br/&gt;
f2-oss1f1: ldlm.namespaces.filter-f2-OST0029_UUID.lock_count=9207&lt;br/&gt;
f2-oss1b5: ldlm.namespaces.filter-f2-OST0019_UUID.lock_count=12895&lt;br/&gt;
f2-oss1b5: ldlm.namespaces.filter-f2-OST003d_UUID.lock_count=25789&lt;/p&gt;</comment>
                            <comment id="266958" author="adilger" created="Mon, 6 Apr 2020 19:49:41 +0000"  >&lt;blockquote&gt;
&lt;p&gt;f2-oss1f1: ldlm.namespaces.filter-f2-OST0029_UUID.lock_count=9207&lt;br/&gt;
f2-oss1b5: ldlm.namespaces.filter-f2-OST0019_UUID.lock_count=12895&lt;br/&gt;
f2-oss1b5: ldlm.namespaces.filter-f2-OST003d_UUID.lock_count=25789&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;OK, so this is totally reasonable, even low by my expectations, so it doesn&apos;t seem like the spinlock hold times are driven by having too many DLM locks to process, unless there is a sudden spike of locks at the time the problem occurs?&lt;/p&gt;

&lt;p&gt;According to the stack traces, there are really only a few &quot;interesting&quot; processes; the rest are just blocked on (likely) the same spinlock in either &lt;tt&gt;ldlm_lock_enqueue()&lt;/tt&gt; or &lt;tt&gt;ofd_lvbo_fill()&lt;/tt&gt;.  In both cases, it looks like the &lt;tt&gt;ldlm_cn&lt;/tt&gt; threads are processing a potentially long list of locks on a single resource:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[2642432.309284] CPU: 8 PID: 57948 Comm: ldlm_cn00_044  3.10.0-1062.9.1.el7.x86_64
[2642432.309314] Call Trace:
[2642432.309331]  [&amp;lt;ffffffffc14808e3&amp;gt;] ldlm_process_extent_lock+0x213/0x490 [ptlrpc]
[2642432.309362]  [&amp;lt;ffffffffc146959e&amp;gt;] ldlm_reprocess_queue+0x1be/0x3f0 [ptlrpc]
[2642432.309383]  [&amp;lt;ffffffffc146a5d2&amp;gt;] __ldlm_reprocess_all+0x102/0x360 [ptlrpc]
[2642432.309397]  [&amp;lt;ffffffffc146a843&amp;gt;] ldlm_reprocess_all+0x13/0x20 [ptlrpc]
[2642432.309414]  [&amp;lt;ffffffffc148f4df&amp;gt;] ldlm_request_cancel+0x42f/0x780 [ptlrpc]
[2642432.309431]  [&amp;lt;ffffffffc14937a2&amp;gt;] ldlm_handle_cancel+0x232/0x2b0 [ptlrpc]
[2642432.309447]  [&amp;lt;ffffffffc1493978&amp;gt;] ldlm_cancel_handler+0x158/0x590 [ptlrpc]
[2642432.309467]  [&amp;lt;ffffffffc14c646b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[2642432.309509]  [&amp;lt;ffffffffc14c9dd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[2642432.309530]  [&amp;lt;ffffffff940c61f1&amp;gt;] kthread+0xd1/0xe0

[2644718.775432] CPU: 15 PID: 52157 Comm: ldlm_cn01_021  3.10.0-1062.9.1.el7.x86_64
[2644718.775459] Call Trace:
[2644718.775477]  [&amp;lt;ffffffffc14808e3&amp;gt;] ldlm_process_extent_lock+0x213/0x490 [ptlrpc]
[2644718.775508]  [&amp;lt;ffffffffc146959e&amp;gt;] ldlm_reprocess_queue+0x1be/0x3f0 [ptlrpc]
[2644718.775528]  [&amp;lt;ffffffffc146a5d2&amp;gt;] __ldlm_reprocess_all+0x102/0x360 [ptlrpc]
[2644718.775543]  [&amp;lt;ffffffffc146a843&amp;gt;] ldlm_reprocess_all+0x13/0x20 [ptlrpc]
[2644718.775559]  [&amp;lt;ffffffffc148f4df&amp;gt;] ldlm_request_cancel+0x42f/0x780 [ptlrpc]
[2644718.775576]  [&amp;lt;ffffffffc14937a2&amp;gt;] ldlm_handle_cancel+0x232/0x2b0 [ptlrpc]
[2644718.775593]  [&amp;lt;ffffffffc1493978&amp;gt;] ldlm_cancel_handler+0x158/0x590 [ptlrpc]
[2644718.775613]  [&amp;lt;ffffffffc14c646b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[2644718.775656]  [&amp;lt;ffffffffc14c9dd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[2644718.775677]  [&amp;lt;ffffffff940c61f1&amp;gt;] kthread+0xd1/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I would guess there is some application which is &quot;piling on&quot; a single object (e.g. high-IOPS small/unaligned writes on an HDD OST) and generating a lot of conflicting locks, so the resource is repeatedly reprocessed each time a lock is granted or released?  I was wondering whether patch &lt;a href=&quot;https://review.whamcloud.com/33221&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/33221&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11085&quot; title=&quot;Replace Lustre interval tree with kernel one&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11085&quot;&gt;LU-11085&lt;/a&gt; ldlm: simplify use of interval-tree&lt;/tt&gt;&quot; had already landed and might be causing this, but I see it has not.  It &lt;em&gt;seems&lt;/em&gt; like the interval tree should protect us against scanning a lot of non-overlapping locks, so possibly there are a lot of overlapping lock requests from the application?&lt;/p&gt;</comment>
                            <comment id="269016" author="dustb100" created="Thu, 30 Apr 2020 18:19:18 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;I wanted to follow up with you on this since it has been a while. Generally the workload that triggers this happens during the first week of the month (a specific code is run by a subset of users on a set schedule). Based on your input about conflicting locks, we changed the lru_size configuration on the compute nodes to a fixed limit of 200 instead of the dynamic setting (roughly as sketched below). We were thinking that this would limit the number of metadata transactions a single compute node could issue concurrently to the MDS. After this change we have not seen any more crashes, but it is possible this will crop up again in the next week.&lt;/p&gt;
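&lt;p&gt;For reference, the client-side change was along these lines (a minimal sketch; setting a non-zero lru_size disables the dynamic LRU resizing for that namespace):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# on each compute node: fix the LDLM LRU at 200 locks per namespace
lctl set_param ldlm.namespaces.*.lru_size=200
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;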

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Dustin&lt;/p&gt;</comment>
                            <comment id="292954" author="adilger" created="Thu, 25 Feb 2021 00:33:34 +0000"  >&lt;p&gt;Dustin, James, it looks like this problem has not been hit again since the &lt;tt&gt;lru_size=200&lt;/tt&gt; change was made back in April.  Were you ever able to track down if there was a specific workload that was causing the high lock traffic?   &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="56508">LU-12600</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="62004">LU-14221</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="34594" name="vmcore-dmesg.txt" size="1051452" author="simmonsja" created="Fri, 3 Apr 2020 13:53:53 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00wxb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>