<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:28:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16578] osp prealloc_status stuck at -11 after MDT failover</title>
                <link>https://jira.whamcloud.com/browse/LU-16578</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We are seeing a weird problem on our Oak storage system where MDT0000 doesn&#8217;t seem to be able to allocate new objects on specific OSTs after a MDT0000 failover. Lustre doesn&#8217;t seem to complain about that in the logs (which is already a problem per se), but we can see that something is wrong as &lt;tt&gt;prealloc_status&lt;/tt&gt; is set to -11 (EAGAIN) for those:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST*-osc-MDT0000/prealloc_status | grep -c &quot;=-11$&quot;
76
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;While 388 other OSTs are fine:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST*-osc-MDT0000/prealloc_status | grep -c &quot;=0$&quot;
388
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Note: prealloc_status for MDT0001 and MDT0002 located on the same MDS are fine, which likely indicates that the OSTs are fine:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST*-osc-MDT0001/prealloc_status | grep -c &quot;=0$&quot;
464
[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST*-osc-MDT0002/prealloc_status | grep -c &quot;=0$&quot;
464
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;MDT0 prealloc info for a problematic OST:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST0098-osc-MDT0000/prealloc*
osp.oak-OST0098-osc-MDT0000.prealloc_last_id=1
osp.oak-OST0098-osc-MDT0000.prealloc_last_seq=0x0
osp.oak-OST0098-osc-MDT0000.prealloc_next_id=2
osp.oak-OST0098-osc-MDT0000.prealloc_next_seq=0x0
osp.oak-OST0098-osc-MDT0000.prealloc_reserved=0
osp.oak-OST0098-osc-MDT0000.prealloc_status=-11
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;osp stats for a problematic OST:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST0098-osc-MDT0000/stats
osp.oak-OST0098-osc-MDT0000.stats=
snapshot_time &#160; &#160; &#160; &#160; &#160; &#160; 1676840498.011953710 secs.nsecs
req_waittime &#160; &#160; &#160; &#160; &#160; &#160; &#160;22039 samples [usec] 31 4156496 65699472 70586321859268
req_active &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;22039 samples [reqs] 1 20 23968 43578
ost_setattr &#160; &#160; &#160; &#160; &#160; &#160; &#160; 47 samples [usec] 77 19916 25528 397438254
ost_destroy &#160; &#160; &#160; &#160; &#160; &#160; &#160; 480 samples [usec] 132 4079970 38902621 41255678589979
ost_connect &#160; &#160; &#160; &#160; &#160; &#160; &#160; 1 samples [usec] 99 99 99 9801
obd_ping &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;21511 samples [usec] 31 4156496 26771224 29330245821234
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;vs. a working one for comparison purposes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST0099-osc-MDT0000/stats
osp.oak-OST0099-osc-MDT0000.stats=
snapshot_time &#160; &#160; &#160; &#160; &#160; &#160; 1676840500.538529214 secs.nsecs
req_waittime &#160; &#160; &#160; &#160; &#160; &#160; &#160;133680 samples [usec] 30 5731842 227870174 257553761260390
req_active &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;133680 samples [reqs] 1 21 136003 155255
ost_setattr &#160; &#160; &#160; &#160; &#160; &#160; &#160; 66 samples [usec] 75 12523 61768 584301738
ost_create &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;429 samples [usec] 851 392236 42632140 5687640118670
ost_destroy &#160; &#160; &#160; &#160; &#160; &#160; &#160; 5919 samples [usec] 98 5731842 84323316 91990540215144
ost_connect &#160; &#160; &#160; &#160; &#160; &#160; &#160; 1 samples [usec] 110 110 110 12100
ost_statfs &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;105758 samples [usec] 30 5181321 64867010 151688813146608
obd_ping &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;21507 samples [usec] 31 1552632 35985830 8186183466130
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;State for the same problematic OST:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s1 ~]# lctl get_param osp.oak-OST0098-osc-MDT0000/state
osp.oak-OST0098-osc-MDT0000.state=
current_state: FULL
state_history:
 - [ 1676310563, CONNECTING ]
 - [ 1676310563, FULL ]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So far, we have no user reports of any issue with the filesystem, probably thanks to Lustre which does a good job at NOT using these OSTs for new objects, likely due to a proper check on &lt;tt&gt;prealloc_status&lt;/tt&gt;. We only noticed the problem thanks to a micro-benchmark monitoring script that periodically performs some I/O on every OST of the filesystem by creating files on MDT0000:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Just to make sure here, we&#8217;re using a directory on MDT0000
[root@oak-rbh02 stripes]# lfs getdirstripe .
lmv_stripe_count: 0 lmv_stripe_offset: 0 lmv_hash_type: none

# Allocate a file using plain layout on a single OST with prealloc_status=-11
[root@oak-rbh02 stripes]# time lfs setstripe -c 1 -o 152 test152
lfs setstripe: setstripe error for &apos;test152&apos;: Connection timed out

real &#160; &#160;1m40.006s
user &#160; &#160;0m0.000s
sys &#160; &#160;0m0.005s

[root@oak-rbh02 stripes]# lfs getstripe test152
test152 has no stripe info
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We can see that setstripe does time out after 100s on this, which is Lustre&#8217;s timeout value.&lt;/p&gt;

&lt;p&gt;It looks like to me that MDT0000 is somehow not able to run the preallocation routines for these 76 OSTs and they are stuck with this status. But nothing in the logs at the start of the MDT seems to indicate a problem. Just in case, I am attaching the Lustre logs at the start of both MDT0000 and MDT0003 (due to manual failover to doing a service maintenance on a server) as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48154/48154_oak-md1-s1-lustre-mdt-failover.log&quot; title=&quot;oak-md1-s1-lustre-mdt-failover.log attached to LU-16578&quot;&gt;oak-md1-s1-lustre-mdt-failover.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; along with lustre log file &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48153/48153_lustre-log.1676310742.6098.gz&quot; title=&quot;lustre-log.1676310742.6098.gz attached to LU-16578&quot;&gt;lustre-log.1676310742.6098.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; which was dumped during the start. But it&#8217;s not clear from these logs there was any preallocation problem. Lustre should have a better way of logging that when this happens.&lt;/p&gt;

&lt;p&gt;Some ideas&#8230;&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;I tried things like setting &lt;tt&gt;force_sync=1&lt;/tt&gt;, or changing the value of &lt;tt&gt;max_create_count&lt;/tt&gt; to try to force a refresh of the object preallocation, to no avail.&lt;/li&gt;
&lt;/ul&gt;


&lt;ul&gt;
	&lt;li&gt;Note that we do have the patch (available in 2.12.9):&lt;br/&gt;
&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15009&quot; title=&quot;precreate should cleanup orphans upon error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15009&quot;&gt;&lt;del&gt;LU-15009&lt;/del&gt;&lt;/a&gt; ofd: continue precreate if LAST_ID is less on MDT&lt;/tt&gt;&lt;br/&gt;
and I am wondering if that could be related, as the patch transforms a CERROR into a &lt;tt&gt;LCONSOLE(D_INFO,&#8230;)&lt;/tt&gt; in some cases, which we would have likely missed, as we don&#8217;t run with +info, it&#8217;s way too verbose. It&#8217;s important enough that it should probably be logged as an error instead. But really not sure this is related to our issue..&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;As for the server version we&#8217;re running here, sorry it&#8217;s not ideal but it&#8217;s somewhere between 2.12.8 and 2.12.9 + additional patches not in 2.12.x yet (but only with patches from Gerrit). To see that patches we have since 2.12.8, see&#160; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48152/48152_git-patches.log&quot; title=&quot;git-patches.log attached to LU-16578&quot;&gt;git-patches.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;.&lt;/p&gt;

&lt;p&gt;Any ideas, suggestions of things to try, before trying a stop/start of the impacted OSTs maybe to force reconnect (or is there a way to force a reconnect for a specific OST just for MDT0?).&lt;/p&gt;</description>
                <environment>CentOS 7.9 (3.10.0-1160.6.1.el7_lustre.pl1.x86_64)</environment>
        <key id="74743">LU-16578</key>
            <summary>osp prealloc_status stuck at -11 after MDT failover</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Sun, 19 Feb 2023 21:26:59 +0000</created>
                <updated>Thu, 23 Mar 2023 18:26:44 +0000</updated>
                            <resolved>Thu, 23 Mar 2023 18:26:44 +0000</resolved>
                                    <version>Lustre 2.12.8</version>
                                    <fixVersion>Lustre 2.14.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="365208" author="sthiell" created="Wed, 8 Mar 2023 05:36:03 +0000"  >&lt;p&gt;This issue persists. We restarted MDT0000 several times. Every time, some OSTs report &lt;tt&gt;prealloc_status=-11&lt;/tt&gt;. We got a different number of them every time,&#160; but we couldn&apos;t get all of them to be at prealloc_status=0. Right now, 29 of them are still in that bad state.&lt;/p&gt;

&lt;p&gt;I tried to take some debug log with +info and noticed that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000004:00000040:31.0:1677771272.969914:0:16555:0:(osp_precreate.c:1050:osp_pre_update_msfs()) oak-OST018b-osc-MDT0000: blocks=34861959081 free=14858726726 avail=14507148013 avail_mb=56668546 hwm_mb=265975 files=137329664 ffree=132179391 state=0: rc = -11

00000004:00000040:28.0:1677771272.979391:0:16556:0:(osp_precreate.c:1050:osp_pre_update_msfs()) oak-OST018c-osc-MDT0000: blocks=34861959081 free=13552189747 avail=13200616749 avail_mb=51564909 hwm_mb=265975 files=137329664 ffree=132293767 state=0: rc = -11

00000004:00000040:18.0:1677771273.126947:0:16534:0:(osp_precreate.c:1050:osp_pre_update_msfs()) oak-OST019d-osc-MDT0000: blocks=34861959081 free=13716915320 avail=13365340891 avail_mb=52208362 hwm_mb=265975 files=137329664 ffree=131951371 state=0: rc = -11

00000004:00000040:16.0:1677771273.191533:0:16527:0:(osp_precreate.c:1050:osp_pre_update_msfs()) oak-OST01a5-osc-MDT0000: blocks=34861959081 free=15940628589 avail=15589045678 avail_mb=60894709 hwm_mb=265975 files=137329664 ffree=132008888 state=0: rc = -11
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Attaching the debug logs as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48371/48371_ldebug.log.gz&quot; title=&quot;ldebug.log.gz attached to LU-16578&quot;&gt;ldebug.log.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Now... very likely related to the issue described in this ticket, we are also having occurrences of high load on this MDS, due to many threads being blocked (that explains the high load of the server). I took two dump of &quot;foreach bt&quot; in a live crash to investigate:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48372/48372_LU-16578-oak-md1-s2_foreach_bt_1.gz&quot; title=&quot;LU-16578-oak-md1-s2_foreach_bt_1.gz attached to LU-16578&quot;&gt;LU-16578-oak-md1-s2_foreach_bt_1.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/li&gt;
	&lt;li&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48373/48373_LU-16578-oak-md1-s2_foreach_bt_2.gz&quot; title=&quot;LU-16578-oak-md1-s2_foreach_bt_2.gz attached to LU-16578&quot;&gt;LU-16578-oak-md1-s2_foreach_bt_2.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;What I can see each time that many threads are blocked in &lt;tt&gt;mdt_object_local_lock()&lt;/tt&gt; or &lt;tt&gt;lod_qos_prep_create()&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;But each time, a single thread is blocked in &lt;tt&gt;osp_precreate_reserve()&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 20498  TASK: ffff9d9b38a1d280  CPU: 28  COMMAND: &quot;mdt07_076&quot;
 #0 [ffff9d9b38b1f3b0] __schedule at ffffffffb65b78d8
 #1 [ffff9d9b38b1f418] schedule at ffffffffb65b7ca9
 #2 [ffff9d9b38b1f428] schedule_timeout at ffffffffb65b5778
 #3 [ffff9d9b38b1f4d8] osp_precreate_reserve at ffffffffc17b2fc8 [osp]
 #4 [ffff9d9b38b1f598] osp_declare_create at ffffffffc17a78ec [osp]
 #5 [ffff9d9b38b1f5f8] lod_sub_declare_create at ffffffffc16ee203 [lod]
 #6 [ffff9d9b38b1f658] lod_qos_declare_object_on at ffffffffc16e62ae [lod]
 #7 [ffff9d9b38b1f6a8] lod_alloc_qos.constprop.18 at ffffffffc16e7c57 [lod]
 #8 [ffff9d9b38b1f7c8] lod_qos_prep_create at ffffffffc16ed460 [lod]
 #9 [ffff9d9b38b1f8d0] lod_declare_instantiate_components at ffffffffc16cdf5a [lod]
#10 [ffff9d9b38b1f908] lod_declare_layout_change at ffffffffc16e11e5 [lod]
#11 [ffff9d9b38b1f970] mdd_declare_layout_change at ffffffffc1754d02 [mdd]
#12 [ffff9d9b38b1f998] mdd_layout_change at ffffffffc175dd86 [mdd]
#13 [ffff9d9b38b1fa08] mdt_layout_change at ffffffffc15e926f [mdt]
#14 [ffff9d9b38b1fa48] mdt_intent_layout at ffffffffc15f0ff0 [mdt]
#15 [ffff9d9b38b1fb08] mdt_intent_opc at ffffffffc15e695a [mdt]
#16 [ffff9d9b38b1fb68] mdt_intent_policy at ffffffffc15eecac [mdt]
#17 [ffff9d9b38b1fba8] ldlm_lock_enqueue at ffffffffc114e636 [ptlrpc]
#18 [ffff9d9b38b1fc20] ldlm_handle_enqueue0 at ffffffffc1176166 [ptlrpc]
#19 [ffff9d9b38b1fcb0] tgt_enqueue at ffffffffc1201182 [ptlrpc]
#20 [ffff9d9b38b1fcd0] tgt_request_handle at ffffffffc120803a [ptlrpc]
#21 [ffff9d9b38b1fd58] ptlrpc_server_handle_request at ffffffffc11abe4b [ptlrpc]
#22 [ffff9d9b38b1fdf8] ptlrpc_main at ffffffffc11af7f4 [ptlrpc]
#23 [ffff9d9b38b1fec8] kthread at ffffffffb5ecb511
#24 [ffff9d9b38b1ff50] ret_from_fork_nospec_begin at ffffffffb65c51dd
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is weird to me, as &lt;tt&gt;osp_precreate_reserve()&lt;/tt&gt; is supposed to skip any OST with prealloc_status != 0. So I am wondering if we may have another problem where OSTs would not be in a good state for preallocation, either leading to &lt;tt&gt;prealloc_status=-11&lt;/tt&gt;, or blocking &lt;tt&gt;osp_precreate_reserve()&lt;/tt&gt; for OSTs that have a &lt;tt&gt;prealloc_status=0&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;Eventually, the MDT recovers. Logs look like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 07 16:27:49 oak-md1-s2 kernel: Pid: 20362, comm: mdt04_015 3.10.0-1160.83.1.el7_lustre.pl1.x86_64 #1 SMP Sun Feb 19 18:38:37 PST 2023
Mar 07 16:27:49 oak-md1-s2 kernel: Call Trace:
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffb61ae0d7&amp;gt;] call_rwsem_down_write_failed+0x17/0x30
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc16e6d5d&amp;gt;] lod_alloc_qos.constprop.18+0x20d/0x1870 [lod]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc16ed460&amp;gt;] lod_qos_prep_create+0x1300/0x18c0 [lod]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc16edc3d&amp;gt;] lod_prepare_create+0x21d/0x2e0 [lod]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc16dd9ee&amp;gt;] lod_declare_striped_create+0x1ee/0x980 [lod]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc16e22d4&amp;gt;] lod_declare_create+0x204/0x590 [lod]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc1759c8a&amp;gt;] mdd_declare_create_object_internal+0xea/0x370 [mdd]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc174933c&amp;gt;] mdd_declare_create+0x4c/0xdf0 [mdd]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc174ce97&amp;gt;] mdd_create+0x877/0x14b0 [mdd]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc1615618&amp;gt;] mdt_reint_open+0x2588/0x3970 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc1608303&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc15e35e3&amp;gt;] mdt_reint_internal+0x6f3/0xb00 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc15f0432&amp;gt;] mdt_intent_open+0x82/0x3a0 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc15e695a&amp;gt;] mdt_intent_opc+0x1ba/0xb50 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc15eecac&amp;gt;] mdt_intent_policy+0x1ac/0x370 [mdt]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc114e636&amp;gt;] ldlm_lock_enqueue+0x376/0x9b0 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc1176166&amp;gt;] ldlm_handle_enqueue0+0xa86/0x1620 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc1201182&amp;gt;] tgt_enqueue+0x62/0x220 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc120803a&amp;gt;] tgt_request_handle+0xaea/0x1580 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc11abe4b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffc11af7f4&amp;gt;] ptlrpc_main+0xb44/0x1480 [ptlrpc]
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffb5ecb511&amp;gt;] kthread+0xd1/0xe0
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffb65c51dd&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
Mar 07 16:27:49 oak-md1-s2 kernel: &#160;[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

&amp;lt;skip&amp;gt;

Mar 07 16:49:28 oak-md1-s2 kernel: LNet: Service thread pid 20362 completed after 1499.99s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
Mar 07 16:49:28 oak-md1-s2 kernel: LustreError: 20477:0:(service.c:2132:ptlrpc_server_handle_request()) @@@ Dropping timed-out request from 12345-10.51.13.9@o2ib3: deadline 100:85s ago
                                     req@ffff9da42db43600 x1759758327487744/t0(0) o38-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 520/0 e 0 to 0 dl 1678236483 ref 1 fl Interpret:/0/ffffffff rc 0/-1
Mar 07 16:49:28 oak-md1-s2 kernel: LustreError: 20477:0:(service.c:2132:ptlrpc_server_handle_request()) Skipped 24 previous similar messages
Mar 07 16:49:28 oak-md1-s2 kernel: Lustre: 20477:0:(service.c:2169:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (100:85s); client may timeout.  req@ffff9da42db43600 x1759758327487744/t0(0) o38-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 520/0 e 0 to 
Mar 07 16:49:28 oak-md1-s2 kernel: Lustre: 20477:0:(service.c:2169:ptlrpc_server_handle_request()) Skipped 320 previous similar messages
Mar 07 16:49:28 oak-md1-s2 kernel: LustreError: 20514:0:(tgt_handler.c:651:process_req_last_xid()) @@@ Unexpected xid 63a065158cbc0 vs. last_xid 63a06515a7e7f
                                     req@ffff9d7b32f82400 x1752648669252544/t0(0) o41-&amp;gt;bf148b27-5bbf-4c21-8739-59abeeaf25b2@10.51.2.59@o2ib3:496/0 lens 440/0 e 0 to 0 dl 1678237146 ref 1 fl Interpret:/0/ffffffff rc 0/-1

&amp;lt;back to normal, for now&amp;gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Do you have any suggestion? Thanks!&lt;/p&gt;</comment>
                            <comment id="366241" author="asmadeus" created="Fri, 17 Mar 2023 12:14:08 +0000"  >&lt;p&gt;For the high load/threads blocked first; it&apos;s just a side-effect/other manifestation of the same underlying problem, but basically:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the thread you noticed with &lt;tt&gt;osp_precreate_reserve&lt;/tt&gt; has a write lock on &lt;tt&gt;lod-&amp;gt;lod_qos.lq_rw_sem&lt;/tt&gt; (from &lt;tt&gt;lod_alloc_qos&lt;/tt&gt; a bit higher in the stack), and is waiting for prealloc_status to become ready (&lt;tt&gt;osp_precreate_ready_condition&lt;/tt&gt;) on some ost&lt;/li&gt;
	&lt;li&gt;other threads (both &lt;tt&gt;lod_alloc_qos&lt;/tt&gt; and &lt;tt&gt;lod_qos_statfs_update&lt;/tt&gt; are waiting on the same rw lock (also in write mode)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;since prealloc status never becomes ready, these wait until timeout (obdtimeout, 100s) at which point the lock is freed, but the next lod_alloc_qos will wait again... presumably for all corresponding create (or until all OSTs with bad precreate state are iterated over; didn&apos;t really check that far)&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;lod_alloc_qos&lt;/tt&gt; first calls &lt;tt&gt;osp_precreate_reserve&lt;/tt&gt; with &quot;slow&quot; (in lod_alloc_qos) /&quot;can_block&quot; (in the rest of the stack) set to false, so that won&apos;t block with the lock taken, but if that wasn&apos;t enough and a file was requested with more stripes than precreates are available it&apos;ll be called again in blocking mode.&lt;br/&gt;
Basically you&apos;ll hang a bit the mdt everytime a job tries to create a batch of files with lfs setstripe -1...&lt;/p&gt;

&lt;p&gt;I don&apos;t think much can be done about this here &#8211; we could just tell the client sorry there aren&apos;t enough OSTs available for your stripes, but in an optimal case it shouldn&apos;t be waiting that long in the first place so let&apos;s fix the underlying precreate issue instead.&lt;/p&gt;

&lt;p&gt;I&apos;ll have a look at that next (I was hoping I&apos;d get a hint in there, but no luck)&lt;/p&gt;</comment>
                            <comment id="366326" author="adilger" created="Sat, 18 Mar 2023 00:56:03 +0000"  >&lt;p&gt;Stephane, are you using a PFL layout with &quot;&lt;tt&gt;-c -1&lt;/tt&gt;&quot; that stripes over all OSTs?  It would be interesting here to see if changing this to use some fixed number (fewer than the number of OSTs) (e.g. &quot;&lt;tt&gt;-c 32&lt;/tt&gt;&quot; or whatever) to see if this avoids the allocations from blocking?  In newer Lustre you could instead set &quot;&lt;tt&gt;lod.*.max_stripecount=N&lt;/tt&gt;&quot; on the MDS and it would force all &quot;&lt;tt&gt;-c -1&lt;/tt&gt;&quot; layouts to use &quot;&lt;tt&gt;-c N&lt;/tt&gt;&quot; instead of &quot;all OSTs&quot;.  This was added in patch &lt;a href=&quot;https://review.whamcloud.com/45532&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/45532&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9162&quot; title=&quot;Option to set max stripe count per filesystem&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9162&quot;&gt;&lt;del&gt;LU-9162&lt;/del&gt;&lt;/a&gt; lod: option to set max stripe count per filesystem&lt;/tt&gt;&quot;, but it hasn&apos;t been backported to 2.12.&lt;/p&gt;

&lt;p&gt;Lustre previously also had a heuristic to allow fully-striped files to be content with at least 3/4 of the OSTs (if some OSTs are unavailable), but this was lost during some code reorganization, so stripe_count = -1 files will always try to allocate from all OSTs even if they are having some problems.  I&apos;ve started fixing on this in patch &lt;a href=&quot;https://review.whamcloud.com/50250&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/50250&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16623&quot; title=&quot;lod_statfs_and_check() does not skip unusable OSTs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16623&quot;&gt;&lt;del&gt;LU-16623&lt;/del&gt;&lt;/a&gt; lod: handle object allocation consistently&lt;/tt&gt;&quot; but I haven&apos;t had a chance to finish the patch yet.&lt;/p&gt;

&lt;p&gt;Separately, there was a patch under &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15393&quot; title=&quot;object allocation when OST is lost&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15393&quot;&gt;&lt;del&gt;LU-15393&lt;/del&gt;&lt;/a&gt; to avoid blocking the MDS threads in OST object creation for a long time that may be useful here.&lt;/p&gt;</comment>
                            <comment id="366328" author="asmadeus" created="Sat, 18 Mar 2023 01:16:59 +0000"  >&lt;p&gt;Hi Andreas, thanks for the pointers!&lt;/p&gt;

&lt;p&gt;The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15393&quot; title=&quot;object allocation when OST is lost&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15393&quot;&gt;&lt;del&gt;LU-15393&lt;/del&gt;&lt;/a&gt; patch looks like it could help with the high load observed, thanks!&lt;/p&gt;

&lt;p&gt;In this case though this is just chasing after the symptoms, so I&apos;d like to understand why the prealloc isn&apos;t working. Going back to the &apos;foreach bt&apos; St&#233;phane provided, at the time he had 60 &lt;tt&gt;osp-pre-X-0&lt;/tt&gt; threads stuck at the &quot;need to be connected to OST&quot; wait at the start of &lt;tt&gt;osp_precreate_thread&lt;/tt&gt;:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-c&quot;&gt;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; (osp_precreate_running(d)) {
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; if ((d-&amp;gt;opd_pre == &lt;span class=&quot;code-keyword&quot;&gt;NULL&lt;/span&gt; || d-&amp;gt;opd_pre_recovering) &amp;amp;&amp;amp;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; d-&amp;gt;opd_imp_connected &amp;amp;&amp;amp;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; !d-&amp;gt;opd_got_disconnected)
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; l_wait_event(d-&amp;gt;opd_pre_waitq,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;!osp_precreate_running(d) ||
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;d-&amp;gt;opd_new_connection,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;&amp;amp;lwi);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;(the other threads, and all threads from the mdt3 running on same mds are in the &quot;connected, can handle precreates now&quot; loop as they should be)&lt;/p&gt;

&lt;p&gt;I&apos;ve had a quick look and didn&apos;t see any merged patch that seem to be touching this, but perhaps you&apos;ll have an idea for why threads would be stuck there?&lt;/p&gt;</comment>
                            <comment id="366329" author="asmadeus" created="Sat, 18 Mar 2023 01:45:07 +0000"  >&lt;p&gt;Actually &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12397&quot; title=&quot;osp: race around opd_new_connection&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12397&quot;&gt;&lt;del&gt;LU-12397&lt;/del&gt;&lt;/a&gt; would be a good candidate for that ( &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/35078&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/35078&lt;/a&gt; ) &#8211; the commit message description fits exactly and St&#233;phane doesn&apos;t have it on his branch.&lt;/p&gt;

&lt;p&gt;I&apos;ll have him give it a try.&lt;/p&gt;</comment>
                            <comment id="367111" author="sthiell" created="Thu, 23 Mar 2023 18:24:12 +0000"  >&lt;p&gt;&amp;gt; Stephane, are you using a PFL layout with &quot;-c -1&quot; that stripes over all OSTs?&lt;br/&gt;
Andreas, no, as far as I know, we are not doing that on this system. We have a monitoring script that is doing explicit single-OST plain striping on each OST to monitor performance. Perhaps this contributed to the high load we saw. Our default PFL setting is very simple on this system that is designed for longer-term storage:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lfs getstripe -d /oak
  lcm_layout_gen:    0
  lcm_mirror_count:  1
  lcm_entry_count:   2
    lcme_id:             N/A
    lcme_mirror_id:      N/A
    lcme_flags:          0
    lcme_extent.e_start: 0
    lcme_extent.e_end:   2199023255552
      stripe_count:  1       stripe_size:   1048576       pattern:       raid0       stripe_offset: -1

    lcme_id:             N/A
    lcme_mirror_id:      N/A
    lcme_flags:          0
    lcme_extent.e_start: 2199023255552
    lcme_extent.e_end:   EOF
      stripe_count:  8       stripe_size:   1048576       pattern:       raid0       stripe_offset: -1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Anyway, we applied the patch cherry-picked from &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/35078&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/35078&lt;/a&gt; (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12397&quot; title=&quot;osp: race around opd_new_connection&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12397&quot;&gt;&lt;del&gt;LU-12397&lt;/del&gt;&lt;/a&gt;) and we don&apos;t see any single OST with &lt;tt&gt;prealloc_status=-11&lt;/tt&gt; anymore. This ticket can be closed as duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12397&quot; title=&quot;osp: race around opd_new_connection&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12397&quot;&gt;&lt;del&gt;LU-12397&lt;/del&gt;&lt;/a&gt;. Thanks!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="55870">LU-12397</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="74967">LU-16623</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="48372" name="LU-16578-oak-md1-s2_foreach_bt_1.gz" size="195563" author="sthiell" created="Wed, 8 Mar 2023 05:31:33 +0000"/>
                            <attachment id="48373" name="LU-16578-oak-md1-s2_foreach_bt_2.gz" size="199615" author="sthiell" created="Wed, 8 Mar 2023 05:31:39 +0000"/>
                            <attachment id="48155" name="MDT0_prealloc_status_full.log" size="21573" author="sthiell" created="Sun, 19 Feb 2023 21:39:02 +0000"/>
                            <attachment id="48152" name="git-patches.log" size="1596" author="sthiell" created="Sun, 19 Feb 2023 21:21:24 +0000"/>
                            <attachment id="48371" name="ldebug.log.gz" size="70314631" author="sthiell" created="Wed, 8 Mar 2023 05:19:20 +0000"/>
                            <attachment id="48153" name="lustre-log.1676310742.6098.gz" size="2734555" author="sthiell" created="Sun, 19 Feb 2023 21:20:30 +0000"/>
                            <attachment id="48154" name="oak-md1-s1-lustre-mdt-failover.log" size="11958" author="sthiell" created="Sun, 19 Feb 2023 21:20:01 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03ei7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>