<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:57:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6077] MDS OOM</title>
                <link>https://jira.whamcloud.com/browse/LU-6077</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have had a number of crashes with the MDS OOMing with ldlm_locks slab using most of the memory.  Attached you&apos;ll find console logs and back trace. &lt;/p&gt;

&lt;p&gt;&amp;lt;code&amp;gt;&lt;br/&gt;
crash&amp;gt;  kmem -i&lt;br/&gt;
              PAGES        TOTAL      PERCENTAGE&lt;br/&gt;
 TOTAL MEM  12289376      46.9 GB         ----&lt;br/&gt;
      FREE   348961       1.3 GB    2% of TOTAL MEM&lt;br/&gt;
      USED  11940415      45.5 GB   97% of TOTAL MEM&lt;br/&gt;
    SHARED   251654       983 MB    2% of TOTAL MEM&lt;br/&gt;
   BUFFERS   250789     979.6 MB    2% of TOTAL MEM&lt;br/&gt;
    CACHED      864       3.4 MB    0% of TOTAL MEM&lt;br/&gt;
      SLAB  9196563      35.1 GB   74% of TOTAL MEM&lt;/p&gt;

&lt;p&gt;TOTAL SWAP   500013       1.9 GB         ----&lt;br/&gt;
 SWAP USED     2913      11.4 MB    0% of TOTAL SWAP&lt;br/&gt;
 SWAP FREE   497100       1.9 GB   99% of TOTAL SWAP&lt;/p&gt;

&lt;p&gt;crash&amp;gt; kmem -s&lt;br/&gt;
CACHE            NAME                 OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE&lt;br/&gt;
ffff880ba5641980 osp_obj                  216      38190     85302   4739     4k&lt;br/&gt;
ffff880babc51940 lod_obj                  120      21141     64096   2003     4k&lt;br/&gt;
ffff880bb1951900 mdt_obj                  248      21141     54528   3408     4k&lt;br/&gt;
ffff880bb34a18c0 fsfilt_ldiskfs_fcb        56          0         0      0     4k&lt;br/&gt;
ffff880bb3631880 dynlock_cache            128          0         0      0     4k&lt;br/&gt;
ffff880bb3621840 ldiskfs_inode_cache     1056      22187     33852  11284     4k&lt;br/&gt;
ffff880bb3611800 ldiskfs_xattr             88          0         0      0     4k&lt;br/&gt;
ffff880bb36017c0 ldiskfs_free_data         64          0         0      0     4k&lt;br/&gt;
ffff880bb35f1780 ldiskfs_alloc_context    136          0         0      0     4k&lt;br/&gt;
ffff880bb35e1740 ldiskfs_prealloc_space   112         37       170      5     4k&lt;br/&gt;
ffff880bb35d1700 ldiskfs_system_zone       40          0         0      0     4k&lt;br/&gt;
ffff880bb35516c0 upd_kmem                  96          0         0      0     4k&lt;br/&gt;
ffff880bb3541680 lqe_kmem                 192       3130      3180    159     4k&lt;br/&gt;
ffff880bb3491640 jbd2_journal_handle       48          0         0      0     4k&lt;br/&gt;
ffff880bb3481600 jbd2_journal_head        112          0         0      0     4k&lt;br/&gt;
ffff880bb3b715c0 jbd2_revoke_table         16          4       404      2     4k&lt;br/&gt;
ffff880bb3b81580 jbd2_revoke_record        32          0         0      0     4k&lt;br/&gt;
ffff880bb3461540 mdd_obj                   96      21141     68200   1705     4k&lt;br/&gt;
ffff8805fd5a2040 ccc_req_kmem              40          0         0      0     4k&lt;br/&gt;
ffff8805fd592000 ccc_session_kmem         184        589      1890     90     4k&lt;br/&gt;
ffff8805fd581fc0 ccc_thread_kmem          352         71       176     16     4k&lt;br/&gt;
ffff8805fdfb1f80 ccc_object_kmem          264          0         0      0     4k&lt;br/&gt;
ffff8805fdfa1f40 ccc_lock_kmem             40          0         0      0     4k&lt;br/&gt;
ffff8805fdf91f00 vvp_session_kmem         104        589      2183     59     4k&lt;br/&gt;
ffff8805fdf81ec0 vvp_thread_kmem          488         71       136     17     4k&lt;br/&gt;
ffff8805fde31e80 ll_rmtperm_hash_cache    256          0         0      0     4k&lt;br/&gt;
ffff8805fde21e40 ll_remote_perm_cache      40          0         0      0     4k&lt;br/&gt;
ffff8805fe391e00 ll_file_data             192          0         0      0     4k&lt;br/&gt;
ffff880601741dc0 lustre_inode_cache      1216          0         0      0     4k&lt;br/&gt;
ffff8805fdf71d80 lov_oinfo                128          0         0      0     4k&lt;br/&gt;
ffff8805fdf61d40 lov_lock_link_kmem        32          0         0      0     4k&lt;br/&gt;
ffff8805fdf51d00 lovsub_req_kmem           40          0         0      0     4k&lt;br/&gt;
ffff8805fdf41cc0 lovsub_object_kmem       240          0         0      0     4k&lt;br/&gt;
ffff8805fdf31c80 lovsub_lock_kmem          64          0         0      0     4k&lt;br/&gt;
ffff8805fdf21c40 lov_req_kmem              40          0         0      0     4k&lt;br/&gt;
ffff8805fdd11c00 lov_session_kmem         400        589      1110    111     4k&lt;br/&gt;
ffff8805fdd01bc0 lov_thread_kmem          288         71       195     15     4k&lt;br/&gt;
ffff8805fdf11b80 lov_object_kmem          240          0         0      0     4k&lt;br/&gt;
ffff8805fdcf1b40 lov_lock_kmem            104          0         0      0     4k&lt;br/&gt;
ffff8805fde11b00 osc_quota_kmem            24          0         0      0     4k&lt;br/&gt;
ffff8805fde01ac0 osc_extent_kmem          168          0         0      0     4k&lt;br/&gt;
ffff8805fddf1a80 osc_req_kmem              40          0         0      0     4k&lt;br/&gt;
ffff8805fdde1a40 osc_session_kmem         424        589      1080    120     4k&lt;br/&gt;
ffff8805fddd1a00 osc_thread_kmem          984         71        96     24     4k&lt;br/&gt;
ffff8805fddc19c0 osc_object_kmem          288          0         0      0     4k&lt;br/&gt;
ffff8805fddb1980 osc_lock_kmem            192          0         0      0     4k&lt;br/&gt;
ffff8805fe371940 interval_node            128          0         0      0     4k&lt;br/&gt;
ffff8805fe361900 ldlm_locks               576   49731039  49796635 7113805     4k&lt;br/&gt;
&amp;lt;/code&amp;gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="28032">LU-6077</key>
            <summary>MDS OOM</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 31 Dec 2014 20:34:14 +0000</created>
                <updated>Mon, 1 Jun 2015 07:38:46 +0000</updated>
                            <resolved>Mon, 1 Jun 2015 07:38:46 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="102465" author="jfc" created="Thu, 1 Jan 2015 01:28:16 +0000"  >&lt;p&gt;Niu,&lt;br/&gt;
Could you please advise on this issue.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
~ jfc.&lt;/p&gt;</comment>
                            <comment id="102490" author="niu" created="Sun, 4 Jan 2015 02:41:48 +0000"  >&lt;p&gt;1. I see lots of network errors in the log:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&amp;lt;4&amp;gt;LNet: 2036:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Conn stale 10.151.28.220@o2ib [old ver: 12, &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; ver: 12]
&amp;lt;4&amp;gt;LNet: 2036:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Conn stale 10.151.49.230@o2ib [old ver: 12, &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; ver: 12]
&amp;lt;4&amp;gt;LNet: 2036:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Skipped 1 previous similar message
&amp;lt;4&amp;gt;LNet: 2036:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Conn stale 10.151.49.233@o2ib [old ver: 12, &lt;span class=&quot;code-keyword&quot;&gt;new&lt;/span&gt; ver: 12]
&amp;lt;4&amp;gt;LNet: 2036:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Skipped 2 previous similar messages
&amp;lt;4&amp;gt;Lustre: MGS: haven&lt;span class=&quot;code-quote&quot;&gt;&apos;t heard from client 115bc340-65eb-e4c8-5212-3d07e8fe9c9b (at 10.151.46.238@o2ib) in 227 seconds. I think it&apos;&lt;/span&gt;s dead, and I am evicting it. exp ffff880432122c00, cur 1419472730 expire 1419472580 last 1419472503
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;You probably should check the network is working properly first.&lt;/p&gt;

&lt;p&gt;2. Do you have any special patches applied on 2.4.3?&lt;/p&gt;

&lt;p&gt;3. I&apos;m afraid that the ldlm pools shrink mechanism can&apos;t work well in heavy workload, could you try to disable the lru_resize to see if the OOM can be resolved? (see Lustre manual 32.8 Configuring locking)&lt;/p&gt;</comment>
                            <comment id="102523" author="pjones" created="Mon, 5 Jan 2015 13:17:42 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;The NASA tree is on github - &lt;a href=&quot;https://github.com/jlan/lustre-nas&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/jlan/lustre-nas&lt;/a&gt;. NASA will have to advise as to the exact version in use.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="102559" author="jaylan" created="Mon, 5 Jan 2015 18:47:16 +0000"  >&lt;p&gt;Service160 was running 2.4.3-8nasS. The tag corresponds to &lt;br/&gt;
    &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4019&quot; title=&quot;today&amp;#39;s master stick on shutdown on test == sanity test 132: on lu_object_find_at&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4019&quot;&gt;&lt;del&gt;LU-4019&lt;/del&gt;&lt;/a&gt; ofd: setattr don&apos;t udpate lvbo with object referenced&lt;br/&gt;
in the nas-2.4.3 branch.&lt;/p&gt;</comment>
                            <comment id="102572" author="mhanafi" created="Mon, 5 Jan 2015 20:19:43 +0000"  >&lt;p&gt;The network errors you pointed out are normal. We see those all the time. We have a large number of nodes that are sometimes rebooted after a job.&lt;/p&gt;

&lt;p&gt;The Documentation is not very clear. Do we run this on every client? If we have different clients/#cpus how do we deal with that?&lt;br/&gt;
$ lctl set_param ldlm.namespaces.&lt;b&gt;osc&lt;/b&gt;.lru_size=$((NR_CPU*100))&lt;/p&gt;

&lt;p&gt;What are the side effects of disabling lru_size?&lt;/p&gt;</comment>
                            <comment id="102613" author="niu" created="Tue, 6 Jan 2015 03:13:45 +0000"  >&lt;p&gt;Thank you, Jay. I didn&apos;t see any suspicious commit in the log.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;The Documentation is not very clear. Do we run this on every client? If we have different clients/#cpus how do we deal with that?&lt;br/&gt;
$ lctl set_param ldlm.namespaces.osc.lru_size=$((NR_CPU*100))&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Yes, you have to run this on every client. You can use a script to get the NR_CPU on each client then set the lru_size accordingly, or you can just use an average value for all clients.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;What are the side effects of disabling lru_size?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;When lru_resize enabled, each client has a dynamic ldlm cache size, the number of cached locks for each client depends on the workload and memory on client/server (active client can cache more locks, idle client cache less locks); When lru_resize disabled, each client can at maximum cache only  lru_size (NR_CPU * 100) ldlm locks.&lt;/p&gt;</comment>
                            <comment id="103683" author="pjones" created="Thu, 15 Jan 2015 22:05:24 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could this be related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="103704" author="niu" created="Fri, 16 Jan 2015 02:17:35 +0000"  >&lt;p&gt;I think they are different issues, in this ticket, the ldlm lock cache is getting very huge, it consumed lots of memory, whereas in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5726&quot; title=&quot;MDS buffer not freed when deleting files&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5726&quot;&gt;&lt;del&gt;LU-5726&lt;/del&gt;&lt;/a&gt;, it&apos;s kernel buffers consumed lots of memory.&lt;/p&gt;</comment>
                            <comment id="117006" author="niu" created="Mon, 1 Jun 2015 07:37:54 +0000"  >&lt;p&gt;I think this is the same problem of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6529&quot; title=&quot;Server side lock limits to avoid unnecessary memory exhaustion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6529&quot;&gt;&lt;del&gt;LU-6529&lt;/del&gt;&lt;/a&gt;, will fix it in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6529&quot; title=&quot;Server side lock limits to avoid unnecessary memory exhaustion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6529&quot;&gt;&lt;del&gt;LU-6529&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="117007" author="niu" created="Mon, 1 Jun 2015 07:38:46 +0000"  >&lt;p&gt;dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6529&quot; title=&quot;Server side lock limits to avoid unnecessary memory exhaustion&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6529&quot;&gt;&lt;del&gt;LU-6529&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="16647" name="service160" size="3441609" author="mhanafi" created="Wed, 31 Dec 2014 20:34:14 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx353:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16909</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>