<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:29:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16690] kernel: obd_memory max: 1854996506, obd_memory current: 1854996506</title>
                <link>https://jira.whamcloud.com/browse/LU-16690</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After upgrade to 2.15.2 server hung with errors&lt;/p&gt;

&lt;p&gt;kernel: obd_memory max: 1854996506, obd_memory current: 1854996506&lt;/p&gt;

&lt;p&gt;see attached logs for the full set of logs.&lt;/p&gt;</description>
                <environment></environment>
        <key id="75367">LU-16690</key>
            <summary>kernel: obd_memory max: 1854996506, obd_memory current: 1854996506</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="dongyang">Dongyang Li</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Thu, 30 Mar 2023 21:33:10 +0000</created>
                <updated>Sat, 13 May 2023 14:42:18 +0000</updated>
                            <resolved>Sat, 13 May 2023 14:42:11 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="367951" author="mhanafi" created="Fri, 31 Mar 2023 01:30:06 +0000"  >&lt;p&gt;we now have two servers on two different filesystems hitting the same issue.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="368057" author="mhanafi" created="Fri, 31 Mar 2023 19:04:40 +0000"  >&lt;p&gt;I am attaching stack output during which the load on system was +1000.&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="368061" author="mhanafi" created="Fri, 31 Mar 2023 19:45:12 +0000"  >&lt;p&gt;Not sure if this is related or not but after the upgrade module load takes a long time. Stack trace shows module load is in lnet_wait_router_start for several minutes.&lt;/p&gt;

&lt;p&gt;7611 modprobe&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; lnet_wait_router_start+0xb6/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; LNetNIInit+0x873/0xcf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_ni_init+0x22/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_init_portals+0xb/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_init+0x1e0/0x1000 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; do_one_initcall+0x46/0x1d0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; do_init_module+0x5a/0x230&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; load_module+0x14be/0x17f0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; __do_sys_finit_module+0xb1/0x110&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; do_syscall_64+0x5b/0x1b0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;0&amp;gt;&amp;#93;&lt;/span&gt; entry_SYSCALL_64_after_hwframe+0x61/0xc6&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="368062" author="adilger" created="Fri, 31 Mar 2023 20:14:46 +0000"  >&lt;p&gt;From the stack traces, it looks like a lot of OSS threads are handling clients connecting to the OST, while at the same time a lot are disconnecting, and all of them are stuck starting journal transactions:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;295816 ll_ost03_057
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] osd_trans_start+0x13b/0x500 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] tgt_client_data_update+0x468/0x6c0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_client_new+0x5c2/0x880 [ptlrpc]
[&amp;lt;0&amp;gt;] ofd_obd_connect+0x385/0x4f0 [ofd]
[&amp;lt;0&amp;gt;] target_handle_connect+0x611/0x29a0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0x569/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
[&amp;lt;0&amp;gt;] kthread+0x10a/0x120
[&amp;lt;0&amp;gt;] ret_from_fork+0x1f/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;295822 ll_ost03_063
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] osd_trans_start+0x13b/0x500 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] tgt_server_data_update+0x3db/0x5a0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_client_del+0x368/0x710 [ptlrpc]
[&amp;lt;0&amp;gt;] ofd_obd_disconnect+0x1f8/0x210 [ofd]
[&amp;lt;0&amp;gt;] target_handle_disconnect+0x22f/0x500 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_disconnect+0x4a/0x1a0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;When clients disconnect they need to force a synchronous journal commit, because otherwise if the OSS crashes shortly thereafter same time the clients will not be around to participate in recovery and recovery will take the maximum time.  This might be significantly impacting the filesystem IO rate.  Is this for an HDD or NVMe filesystem?&lt;/p&gt;

&lt;p&gt;There are also a handful of OSS threads are handling file unlinks:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;295743 ll_ost06_061
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] osd_trans_start+0x13b/0x500 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_destroy+0x247/0xb20 [ofd]
[&amp;lt;0&amp;gt;] ofd_destroy_by_fid+0x25e/0x4a0 [ofd]
[&amp;lt;0&amp;gt;] ofd_destroy_hdl+0x263/0xa10 [ofd]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;295763 ll_ost06_063
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] ldiskfs_evict_inode+0x273/0x6b0 [ldiskfs]
[&amp;lt;0&amp;gt;] evict+0xd2/0x1a0
[&amp;lt;0&amp;gt;] osd_object_delete+0x21a/0x320 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] lu_object_free.isra.37+0x90/0x1e0 [obdclass]
[&amp;lt;0&amp;gt;] ofd_destroy_by_fid+0x2d6/0x4a0 [ofd]
[&amp;lt;0&amp;gt;] ofd_destroy_hdl+0x263/0xa10 [ofd]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;some threads are doing writes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;293423 ll_ost_io02_061
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] osd_trans_start+0x13b/0x500 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_write_attr_set+0x11d/0x1070 [ofd]
[&amp;lt;0&amp;gt;] ofd_commitrw_write+0x226/0x1ad0 [ofd]
[&amp;lt;0&amp;gt;] ofd_commitrw+0x5b4/0xd20 [ofd]
[&amp;lt;0&amp;gt;] obd_commitrw+0x1b0/0x380 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_brw_write+0x139f/0x1ce0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A bunch of threads are doing reads (and shouldn&apos;t have journal handles), but appear to be stuck in memory allocations:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;293418 ll_ost_io02_056
[&amp;lt;0&amp;gt;] shrink_lruvec+0x2f1/0x6c0
[&amp;lt;0&amp;gt;] shrink_node+0x22e/0x700
[&amp;lt;0&amp;gt;] do_try_to_free_pages+0xc9/0x3e0
[&amp;lt;0&amp;gt;] try_to_free_pages+0xf3/0x1c0
[&amp;lt;0&amp;gt;] __alloc_pages_slowpath+0x372/0xd10
[&amp;lt;0&amp;gt;] __alloc_pages_nodemask+0x2e2/0x320
[&amp;lt;0&amp;gt;] kmalloc_order+0x28/0x90
[&amp;lt;0&amp;gt;] kmalloc_order_trace+0x1d/0xb0
[&amp;lt;0&amp;gt;] __kmalloc+0x203/0x250
[&amp;lt;0&amp;gt;] bio_integrity_prep+0xee/0x2b0
[&amp;lt;0&amp;gt;] blk_mq_make_request+0xad/0x5b0
[&amp;lt;0&amp;gt;] generic_make_request_no_check+0xe1/0x330
[&amp;lt;0&amp;gt;] submit_bio+0x3c/0x160
[&amp;lt;0&amp;gt;] osd_do_bio.constprop.51+0x510/0xc40 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] osd_read_prep+0x456/0x540 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_preprw_read.isra.27+0x4a0/0x13b0 [ofd]
[&amp;lt;0&amp;gt;] ofd_preprw+0x6ef/0x900 [ofd]
[&amp;lt;0&amp;gt;] tgt_brw_read+0x6e0/0x1fc0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;293127 ll_ost_io01_057
[&amp;lt;0&amp;gt;] __lock_page+0x12d/0x230
[&amp;lt;0&amp;gt;] pagecache_get_page+0x1e5/0x310
[&amp;lt;0&amp;gt;] osd_bufs_get+0x56a/0xb20 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_preprw_read.isra.27+0x3dc/0x13b0 [ofd]
[&amp;lt;0&amp;gt;] ofd_preprw+0x6ef/0x900 [ofd]
[&amp;lt;0&amp;gt;] tgt_brw_read+0x6e0/0x1fc0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;All of the above threads are blocked waiting to access the journal or for memory, so they &lt;em&gt;shouldn&apos;t&lt;/em&gt; actually be doing anything to modify the filesystem yet.  There are a handful of threads that are &lt;b&gt;actually&lt;/b&gt; holding a transaction open:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;293304 ll_ost_io05_051
[&amp;lt;0&amp;gt;] jbd2_log_wait_commit+0xac/0x120 [jbd2]
[&amp;lt;0&amp;gt;] jbd2_journal_stop+0x2d5/0x330 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_stop+0x36/0xb0 [ldiskfs]
[&amp;lt;0&amp;gt;] osd_trans_stop+0x235/0xa60 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_object_punch+0xdbf/0x1330 [ofd]
[&amp;lt;0&amp;gt;] ofd_punch_hdl+0x4b0/0x710 [ofd]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;293411 ll_ost_io04_061
[&amp;lt;0&amp;gt;] range_lock+0x148/0x270 [obdclass]
[&amp;lt;0&amp;gt;] ofd_preprw_write.isra.28+0x720/0x1240 [ofd]
[&amp;lt;0&amp;gt;] ofd_preprw+0x7b2/0x900 [ofd]
[&amp;lt;0&amp;gt;] obd_preprw+0x1a1/0x360 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_brw_write+0x11cf/0x1ce0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;One possible cause of this blockage is a handful of threads that are holding an open transaction, but are blocked on memory allocation:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;227209 ll_ost_io06_004
292970 ll_ost_io00_021
292989 ll_ost_io06_054
227821 ll_ost_io06_006
271578 ll_ost_io03_023
293355 ll_ost_io00_059
[&amp;lt;0&amp;gt;] shrink_lruvec+0x2f1/0x6c0
[&amp;lt;0&amp;gt;] shrink_node+0x22e/0x700
[&amp;lt;0&amp;gt;] do_try_to_free_pages+0xc9/0x3e0
[&amp;lt;0&amp;gt;] try_to_free_pages+0xf3/0x1c0
[&amp;lt;0&amp;gt;] __alloc_pages_slowpath+0x372/0xd10
[&amp;lt;0&amp;gt;] __alloc_pages_nodemask+0x2e2/0x320
[&amp;lt;0&amp;gt;] kmalloc_order+0x28/0x90
[&amp;lt;0&amp;gt;] kmalloc_order_trace+0x1d/0xb0
[&amp;lt;0&amp;gt;] __kmalloc+0x203/0x250
[&amp;lt;0&amp;gt;] bio_integrity_prep+0xee/0x2b0
[&amp;lt;0&amp;gt;] blk_mq_make_request+0xad/0x5b0
[&amp;lt;0&amp;gt;] generic_make_request_no_check+0xe1/0x330
[&amp;lt;0&amp;gt;] submit_bio+0x3c/0x160
[&amp;lt;0&amp;gt;] osd_do_bio.constprop.51+0xb63/0xc40 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] osd_ldiskfs_map_inode_pages+0x873/0x8f0 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] osd_write_commit+0x5e2/0x990 [osd_ldiskfs]
[&amp;lt;0&amp;gt;] ofd_commitrw_write+0x77e/0x1ad0 [ofd]
[&amp;lt;0&amp;gt;] ofd_commitrw+0x5b4/0xd20 [ofd]
[&amp;lt;0&amp;gt;] obd_commitrw+0x1b0/0x380 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_brw_write+0x139f/0x1ce0 [ptlrpc]
[&amp;lt;0&amp;gt;] tgt_request_handle+0xc97/0x1a40 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x323/0xbe0 [ptlrpc]
[&amp;lt;0&amp;gt;] ptlrpc_main+0xc0f/0x1570 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It looks like the underlying storage is using T10-PI at the block level, which is doing an allocation late in the IO submission process, which is bad.  The &lt;tt&gt;kmalloc()&lt;/tt&gt; itself looks like it is using &lt;tt&gt;GFP_NOIO&lt;/tt&gt;, which is good, so it doesn&apos;t recurse into the filesystem again, but it doesn&apos;t appear that there is much memory available for the number of running threads, and this is causing it to block.&lt;/p&gt;

&lt;p&gt;That may also be problematic because &lt;tt&gt;kswapd&lt;/tt&gt; is blocked in the filesystem when trying to free an inode, so it is unable to do any memory reclaim:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;349 kswapd0
[&amp;lt;0&amp;gt;] wait_transaction_locked+0x89/0xd0 [jbd2]
[&amp;lt;0&amp;gt;] add_transaction_credits+0xd4/0x290 [jbd2]
[&amp;lt;0&amp;gt;] start_this_handle+0x10a/0x520 [jbd2]
[&amp;lt;0&amp;gt;] jbd2__journal_start+0xee/0x1f0 [jbd2]
[&amp;lt;0&amp;gt;] __ldiskfs_journal_start_sb+0x6e/0x140 [ldiskfs]
[&amp;lt;0&amp;gt;] ldiskfs_release_dquot+0x60/0xb0 [ldiskfs]
[&amp;lt;0&amp;gt;] dqput.part.19+0x82/0x1e0
[&amp;lt;0&amp;gt;] __dquot_drop+0x69/0x90
[&amp;lt;0&amp;gt;] ldiskfs_clear_inode+0x1e/0x80 [ldiskfs]
[&amp;lt;0&amp;gt;] ldiskfs_evict_inode+0x58/0x6b0 [ldiskfs]
[&amp;lt;0&amp;gt;] evict+0xd2/0x1a0
[&amp;lt;0&amp;gt;] dispose_list+0x48/0x70
[&amp;lt;0&amp;gt;] prune_icache_sb+0x52/0x80
[&amp;lt;0&amp;gt;] super_cache_scan+0x123/0x1b0
[&amp;lt;0&amp;gt;] do_shrink_slab+0x11d/0x330
[&amp;lt;0&amp;gt;] shrink_slab+0xbe/0x2f0
[&amp;lt;0&amp;gt;] shrink_node+0x246/0x700
[&amp;lt;0&amp;gt;] balance_pgdat+0x2d7/0x550
[&amp;lt;0&amp;gt;] kswapd+0x201/0x3c0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I see the same &lt;tt&gt;kswapd0&lt;/tt&gt; process stuck in &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/48611/48611_nbp13_hung&quot; title=&quot;nbp13_hung attached to LU-16690&quot;&gt;nbp13_hung&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; so that seems significant.&lt;/p&gt;

&lt;p&gt;I don&apos;t have the configuration, system, and storage details, but it may just be that there are too many threads running on this system for the amount of RAM that it has?  You could try tuning &lt;tt&gt;oss_max_threads=256&lt;/tt&gt; or similar to reduce the concurrent thread count so the OSS memory isn&apos;t running out.  If large RPCs are in use (e.g. &lt;tt&gt;obdfilter.&amp;#42;.brw_size=16&lt;/tt&gt; or &lt;tt&gt;=32&lt;/tt&gt;) then this could be reduced so the OST threads are not preallocating so much memory for large RPCs.&lt;/p&gt;</comment>
                            <comment id="368064" author="green" created="Fri, 31 Mar 2023 20:33:11 +0000"  >&lt;p&gt;to me this looks like something is wrong on the disk backend, the system is super slow (all the traces trying to get data from storage) and then eventually clients start to reconnect and I imagine whatever dirty data there is is not being flushed super fast either.&lt;/p&gt;

&lt;p&gt;And then the system thinks it&apos;s running out of memory and frantically asks everybody around to free some (and this is the memory message you see is), the lustre only allocated 1.8G of RAM though which is not all that much (how much RAM is there?)&lt;/p&gt;</comment>
                            <comment id="368077" author="mhanafi" created="Fri, 31 Mar 2023 21:51:34 +0000"  >&lt;p&gt;All the servers have 192GB of memory. Memory has never been an issue before the upgrade.&lt;/p&gt;

&lt;p&gt;Filesystems crashing all have small files with a high number of IOPs. We changed the thread count on one filesystem and turned off ib_iser T10pi checking. I saw threads waiting in bio_integrity_prep.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="368090" author="pjones" created="Sat, 1 Apr 2023 02:35:20 +0000"  >&lt;p&gt;Dongyang&lt;/p&gt;

&lt;p&gt;What is your advice here?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="368247" author="mhanafi" created="Mon, 3 Apr 2023 20:23:46 +0000"  >&lt;p&gt;Lowering the number of threads helped but eventually the service hit the issue. But the host that had ib_iser t10pi disabled was fine. So for now we have disabled t10pi and lowered the number of threads.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I will try to see if I can get a reproducer to help debug the issue.&#160;&lt;/p&gt;</comment>
                            <comment id="368270" author="adilger" created="Mon, 3 Apr 2023 22:26:51 +0000"  >&lt;p&gt;Mahmoud, which kernel is running on the servers for this system?&lt;/p&gt;</comment>
                            <comment id="368299" author="dongyang" created="Tue, 4 Apr 2023 01:42:43 +0000"  >&lt;p&gt;Andreas,&lt;br/&gt;
From the logs the kernel is 4.18.0-425.3.1.el8_lustre.x86_64.&lt;/p&gt;

&lt;p&gt;I think we are seeing memory allocation issue is because of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16413&quot; title=&quot;T10PI is broken for CentOS 8.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16413&quot;&gt;&lt;del&gt;LU-16413&lt;/del&gt;&lt;/a&gt;, the patch was landed in master but not in 2.15.2.&lt;br/&gt;
Before the patch the bio-integrity kernel patch is broken for 4.18 kernels, it removed check in bio_integrity_prep() if the integrity payload is already allocated or not, and from osd we are calling bio_integrity_prep() twice, so we are leaking the integrity payload. Both are fixed in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16413&quot; title=&quot;T10PI is broken for CentOS 8.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16413&quot;&gt;&lt;del&gt;LU-16413&lt;/del&gt;&lt;/a&gt;.&lt;br/&gt;
I will port &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16413&quot; title=&quot;T10PI is broken for CentOS 8.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16413&quot;&gt;&lt;del&gt;LU-16413&lt;/del&gt;&lt;/a&gt; to b2_15.&lt;/p&gt;</comment>
                            <comment id="370145" author="pjones" created="Fri, 21 Apr 2023 15:35:59 +0000"  >&lt;p&gt;Mahmoud&lt;/p&gt;

&lt;p&gt;The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16413&quot; title=&quot;T10PI is broken for CentOS 8.x&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16413&quot;&gt;&lt;del&gt;LU-16413&lt;/del&gt;&lt;/a&gt; fix has been merged to b2_15 and will be in the upcoming 2.15.3 release. Have you tested the effectiveness of this release?&lt;/p&gt;

&lt;p&gt;Peter&#160;&lt;/p&gt;</comment>
                            <comment id="370746" author="mhanafi" created="Wed, 26 Apr 2023 17:32:24 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;I haven&apos;t been able to reproduce it on our test filesystem, I think it is too small. We may need to wait on a production filesystem during an extended dedicated time.&#160;&lt;/p&gt;</comment>
                            <comment id="372212" author="pjones" created="Sat, 13 May 2023 14:42:11 +0000"  >&lt;p&gt;Ok so in that case let&apos;s assume that this is fixed unless future evidence comes to light that this is not the case&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="73682">LU-16413</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="48611" name="nbp13_hung" size="71963" author="mhanafi" created="Thu, 30 Mar 2023 21:33:05 +0000"/>
                            <attachment id="48625" name="stack.out" size="81501" author="mhanafi" created="Fri, 31 Mar 2023 19:04:15 +0000"/>
                            <attachment id="48624" name="stack.out2" size="84173" author="mhanafi" created="Fri, 31 Mar 2023 19:04:15 +0000"/>
                            <attachment id="48623" name="stack.out3" size="81946" author="mhanafi" created="Fri, 31 Mar 2023 19:04:15 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03hpj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>