<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:12:20 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-7836] MDSes crashed with oom-killer</title>
                <link>https://jira.whamcloud.com/browse/LU-7836</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Error occurred during soak testing of build &apos;20160302&apos; (b2_8 RC4) (see: &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160302&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160302&lt;/a&gt; also). DNE is enabled. MDTs had been formatted using &lt;em&gt;ldiskfs&lt;/em&gt;, OSTs using &lt;em&gt;zfs&lt;/em&gt;. MDS nodes are configured in an active-active HA failover configuration. (For the test set-up configuration see &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-Configuration&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-Configuration&lt;/a&gt;)&lt;/p&gt;

&lt;p&gt;Note: This might be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7780&quot; title=&quot;MDS crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7780&quot;&gt;&lt;del&gt;LU-7780&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Sequence of events:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;2016-03-01 20:38:40  triggering fault mds_failover (lola-10 --&amp;gt; lola-11)&lt;/li&gt;
	&lt;li&gt;2016-03-01 20:41:58 lola-8   LNet process hang on lola-8
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LNet: Service thread pid 5074 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes to /tmp/lustre-log.1456893718.5074
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;(see attached file &lt;tt&gt;lustre-log.1456893718.5074&lt;/tt&gt;)&lt;/p&gt;&lt;/li&gt;
	&lt;li&gt;Lustre Recovery never completed (till next failover)&lt;/li&gt;
	&lt;li&gt;All Slurm jobs stalled and began to time out after ~ 20:40&lt;br/&gt;
Slabs continuously allocated memory. The top 10 consumers before the next&lt;br/&gt;
failover were:
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;slab-details/size-1048576.dat:20160301 22:09:20 size-1048576 13215 13856931840 13215 13856931840 13215 13856931840 13215 13856931840 67108864 0
slab-details/size-262144.dat:20160301 22:09:20 size-262144 385 100925440 385 100925440 385 100925440 385 100925440 0 0
slab-details/size-192.dat:20160301 22:09:20 size-192 268155 51485760 268320 51517440 13416 54951936 13416 54951936 24576 0
slab-details/size-8192.dat:20160301 22:09:20 size-8192 5508 45121536 5508 45121536 5508 45121536 5508 45121536 0 0
slab-details/size-1024.dat:20160301 22:09:20 size-1024 41857 42861568 41864 42868736 10466 42868736 10466 42868736 110592 0
slab-details/ptlrpc_cache.dat:20160301 22:09:20 ptlrpc_cache 35590 27333120 35605 27344640 7121 29167616 7121 29167616 122880 0
slab-details/size-65536.dat:20160301 22:09:20 size-65536 361 23658496 361 23658496 361 23658496 361 23658496 0 0
slab-details/size-512.dat:20160301 22:09:20 size-512 37639 19271168 38920 19927040 4865 19927040 4865 19927040 45056 0
slab-details/kmem_cache.dat:20160301 22:09:20 kmem_cache 289 9506944 289 9506944 289 18939904 289 18939904 0 0
slab-details/size-4096.dat:20160301 22:09:20 size-4096 2970 12165120 2971 12169216 2970 12165120 2971 12169216 4096 0
slab-details/inode_cache.dat:20160301 22:09:20 inode_cache 15612 9242304 15678 9281376 2613 10702848 2613 10702848 0 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
	&lt;li&gt;2016-03-01 22:09:46,199:fsmgmt.fsmgmt:INFO     triggering fault mds_failover  (lola-10 --&amp;gt; lola-11)&lt;/li&gt;
	&lt;li&gt;2016-03-01 22:16:19,483:fsmgmt.fsmgmt:INFO     mds_failover just completed&lt;/li&gt;
	&lt;li&gt;Recovery of MDTs (mdt-4,5) never completed&lt;/li&gt;
	&lt;li&gt;2016-03-02  00:15-30 Created stack traces and debug log files on &lt;tt&gt;lola-&amp;#91;8,10,11&amp;#93;&lt;/tt&gt;&lt;br/&gt;
2016-03-02 01:30:01  approximately at this time the oom-killer started on MDS &lt;tt&gt;lola-10&lt;/tt&gt;&lt;br/&gt;
Top 10 consumers are:
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;slab-details/size-1048576.dat:20160302 01:32:00 size-1048576 29882 31333548032 29882 31333548032 29882 31333548032 29882 31333548032 1048576 0
slab-details/size-262144.dat:20160302 01:32:00 size-262144 641 168034304 641 168034304 641 168034304 641 168034304 0 0
slab-details/size-1024.dat:20160302 01:32:00 size-1024 84449 86475776 84484 86511616 21121 86511616 21121 86511616 106496 0
slab-details/ptlrpc_cache.dat:20160302 01:32:00 ptlrpc_cache 95492 73337856 95510 73351680 19102 78241792 19102 78241792 114688 0
slab-details/size-192.dat:20160302 01:32:00 size-192 298834 57376128 299260 57457920 14963 61288448 14963 61288448 0 0
slab-details/size-8192.dat:20160302 01:32:00 size-8192 5862 48021504 5862 48021504 5862 48021504 5862 48021504 -8192 0
slab-details/size-512.dat:20160302 01:32:00 size-512 77790 39828480 79096 40497152 9879 40464384 9887 40497152 28672 0
slab-details/size-65536.dat:20160302 01:32:00 size-65536 361 23658496 361 23658496 361 23658496 361 23658496 0 0
slab-details/kmem_cache.dat:20160302 01:32:00 kmem_cache 289 9506944 289 9506944 289 18939904 289 18939904 0 0
slab-details/size-128.dat:20160302 01:32:00 size-128 78707 10074496 100920 12917760 3364 13778944 3364 13778944 0 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;2016-03-02 03:00  MDS &lt;tt&gt;lola-8&lt;/tt&gt; also crashed with oom-killer&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Attached: messages, console and debug logs of nodes &lt;tt&gt;lola-8,10,11&lt;/tt&gt;, &lt;br/&gt;
file &lt;tt&gt;recovery-status-20160302&lt;/tt&gt; showing recovery and process status around 2016-03-02 00:15, and memory and detailed slab counters of &lt;tt&gt;lola-10&lt;/tt&gt;&lt;/p&gt;</description>
                <environment>lola&lt;br/&gt;
build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_8/11/&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_8/11/&lt;/a&gt;</environment>
        <key id="35106">LU-7836</key>
            <summary>MDSes crashed with oom-killer</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="di.wang">Di Wang</assignee>
                                    <reporter username="heckes">Frank Heckes</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Wed, 2 Mar 2016 14:29:25 +0000</created>
                <updated>Fri, 5 Aug 2016 21:08:47 +0000</updated>
                            <resolved>Fri, 5 Aug 2016 21:08:47 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="144387" author="heckes" created="Wed, 2 Mar 2016 14:31:35 +0000"  >&lt;p&gt;slab counters of &lt;tt&gt;lola-8&lt;/tt&gt; can be uploaded on demand.&lt;/p&gt;</comment>
                            <comment id="144588" author="green" created="Fri, 4 Mar 2016 02:37:30 +0000"  >&lt;p&gt;just as I asked in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7780&quot; title=&quot;MDS crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7780&quot;&gt;&lt;del&gt;LU-7780&lt;/del&gt;&lt;/a&gt; - we really need the same debug log from lustre with memory allocation tracing enabled so that we can see who is it that does the allocations, not just their sizes.&lt;/p&gt;

&lt;p&gt;Do you think you can collect something like that?&lt;/p&gt;</comment>
                            <comment id="144589" author="green" created="Fri, 4 Mar 2016 02:43:34 +0000"  >&lt;p&gt;you do this by adding the &quot;malloc&quot; debug mask to the run before the problem starts.&lt;/p&gt;</comment>
                            <comment id="144740" author="heckes" created="Mon, 7 Mar 2016 14:07:24 +0000"  >&lt;p&gt;Debug mask has been extended with &apos;+malloc&apos;&lt;/p&gt;</comment>
                            <comment id="144765" author="heckes" created="Mon, 7 Mar 2016 17:00:44 +0000"  >&lt;p&gt;The error didn&apos;t happened again till now.&lt;/p&gt;</comment>
                            <comment id="145400" author="heckes" created="Mon, 14 Mar 2016 11:22:55 +0000"  >&lt;p&gt;The issue happened again during soak testing of b2_8 RC5 (see &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160309&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160309&lt;/a&gt;). &lt;br/&gt;
Unfortunately the error occurred at the weekend (at Mar 12 01:05:02) on MDS node &lt;tt&gt;lola-11&lt;/tt&gt;.&lt;br/&gt;
Therefore the latest debug log written on the affected node &lt;tt&gt;lola-11&lt;/tt&gt; only contains memory allocation messages from &lt;b&gt;20160311 18:08:06&lt;/b&gt; till &lt;b&gt;20160311 18:34:59&lt;/b&gt;.&lt;/p&gt;

&lt;p&gt;The MDS nodes, especially &lt;tt&gt;lola-11&lt;/tt&gt;, had been restarted (randomly) at the following times:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 17:13:36,111 - 2016-03-11 17:24:02,736    lola-11&lt;/li&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 17:29:11,738 - 2016-03-11 17:35:40,652    lola-11&lt;/li&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 19:10:02,531 - 2016-03-11 19:17:28,133    lola-11&lt;/li&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 19:28:55,134 - 2016-03-11 19:39:27,661    lola-11&lt;/li&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 19:45:13,663 - 2016-03-11 19:58:16,686    lola-9&lt;/li&gt;
	&lt;li&gt;mds_restart      : 2016-03-11 20:24:31,687 - 2016-03-11 20:32:39,547    lola-11&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The oom-killer on &lt;tt&gt;lola-11&lt;/tt&gt; ran at 2016-03-12 01:05, after the last restart of &lt;tt&gt;lola-11&lt;/tt&gt; finished at 2016-03-11 20:32:39 (line 6 in the list above).&lt;br/&gt;
The memory debug information is from the restarts between 17:35 and 19:10 (lines 2 and 3). &lt;br/&gt;
I attached the extracted allocation messages, the original debug file and the &lt;tt&gt;collectl&lt;/tt&gt; memory counters to the&lt;br/&gt;
ticket. I hope they contain a pointer to the code section causing the memory leak, as the slab memory counters show a continuously increasing amount of allocated memory. Anyway, the error might not have been triggered before the restart at 20:24 (line 6).&lt;br/&gt;
Please let me know if any other log files are needed.&lt;/p&gt;
</comment>
                            <comment id="145402" author="heckes" created="Mon, 14 Mar 2016 12:54:25 +0000"  >&lt;p&gt;Uploaded files are &lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;lola-11-lustre-loglog.20160311-183459.bz2                         - full debug log&lt;/li&gt;
	&lt;li&gt;lola-11-mem-counter-20160311_1729-1910.dat.bz2              - collectl memory counters&lt;/li&gt;
	&lt;li&gt;lola-11-memory-debug-messages.20160311-183459.bz2      - memory allocation debug log&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="145422" author="heckes" created="Mon, 14 Mar 2016 15:22:51 +0000"  >&lt;p&gt;The upload of file lola-11-lustre-loglog.20160311-183459.bz2  stalled every time I time half the way before completion.&lt;/p&gt;

&lt;p&gt;The effect of a continuously increasing amount of slabs occurred immediately after the clean-up (remount of MDTs on lola-10) and restart&lt;br/&gt;
of soak, during the first MDS restart of &lt;tt&gt;lola-10&lt;/tt&gt;. &lt;br/&gt;
I uploaded the debug log files to directory &lt;tt&gt;lhn:/scratch/lu-7836&lt;/tt&gt;&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Until 2016-03-14 07:25 slab allocation continued to grow to ~ 9 GB&lt;/li&gt;
	&lt;li&gt;Recovery completed at:
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; lola-10.log:Mar 14 07:24:35 lola-10 kernel: Lustre: soaked-MDT0005: Recovery over after 66:57, of 16 clients 16 recovered and 0 were evicted.
lola-10.log:Mar 14 07:25:38 lola-10 kernel: Lustre: soaked-MDT0004: Recovery over after 67:00, of 16 clients 15 recovered and 1 was evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
	&lt;li&gt;After that, slab memory allocation remained constant at 9 GB&lt;/li&gt;
	&lt;li&gt;During 06:50 - 07:27 the following debug logs were created and saved to &lt;tt&gt;lhn:/scratch/lu-7638&lt;/tt&gt;&lt;br/&gt;
lustre-log.20160314-0650, lustre-log.20160314-0656, lustre-log.20160314-0714, lustre-log.20160314-0727&lt;/li&gt;
	&lt;li&gt;umount / mount the MDTs --&amp;gt; recovery went to state INACTIVE&lt;/li&gt;
	&lt;li&gt;Reboot and mount of the MDTs brought MDS &lt;tt&gt;lola-10&lt;/tt&gt; back into a state with &apos;wild&apos; allocation of slabs, see:&lt;br/&gt;
lustre-log.20160314-0801, lustre-log.20160314-0820&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="145452" author="pjones" created="Mon, 14 Mar 2016 17:56:32 +0000"  >&lt;p&gt;Di&lt;/p&gt;

&lt;p&gt;Could you please look into this?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="145522" author="di.wang" created="Tue, 15 Mar 2016 00:49:46 +0000"  >&lt;p&gt;After some investigation, it looks like the MDT is blocked on update recovery, then queue too much final ping req there. I will try to make a patch.&lt;/p&gt;</comment>
                            <comment id="145543" author="gerrit" created="Tue, 15 Mar 2016 09:28:07 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/18915&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/18915&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7836&quot; title=&quot;MDSes crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7836&quot;&gt;&lt;del&gt;LU-7836&lt;/del&gt;&lt;/a&gt; ptlrpc: No new signal complete request&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_8&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c80d98b7c1d741e239656be30b80ce723f3aaab5&lt;/p&gt;</comment>
                            <comment id="145840" author="pjones" created="Wed, 16 Mar 2016 18:58:44 +0000"  >&lt;p&gt;Moving to 2.9 because it seems that this issue only occurs with multiple MDTs per MDS and does not happen with the more common configuration of a single MDT per MDS. Is this a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7780&quot; title=&quot;MDS crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7780&quot;&gt;&lt;del&gt;LU-7780&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="145943" author="heckes" created="Thu, 17 Mar 2016 14:28:40 +0000"  >&lt;p&gt;Soak has been continued to execute b2_8 RC5 build with reformatted Lustre FS. &lt;br/&gt;
Now there&apos;s only 1 MDT per MDS and 5 OSTs per OSS (unchanged). MDT had&lt;br/&gt;
been formatted with &lt;em&gt;ldiskfs&lt;/em&gt; and OSTs using &lt;em&gt;zfs&lt;/em&gt;.&lt;br/&gt;
Soak session is running for ~ 2.5 days &lt;b&gt;without&lt;/b&gt; appearance of this bug.&lt;br/&gt;
As &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7848&quot; title=&quot;Recovery process on MDS stalled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7848&quot;&gt;&lt;del&gt;LU-7848&lt;/del&gt;&lt;/a&gt; is most likely related, it&apos;s worth mentioning here that all recovery&lt;br/&gt;
times are below 2 mins now.&lt;/p&gt;</comment>
                            <comment id="146082" author="heckes" created="Fri, 18 Mar 2016 12:51:15 +0000"  >&lt;p&gt;The error happened after executions of soak test for approximately ~ 73 hours.&lt;/p&gt;

&lt;p&gt;Sequence of events&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;2016-03-18 00:34:30,557:fsmgmt.fsmgmt:INFO     triggering fault mds_restart&lt;/li&gt;
	&lt;li&gt;2016-03-18 00:40:49,655:fsmgmt.fsmgmt:INFO     lola-8 is up!!&lt;/li&gt;
	&lt;li&gt;2016-03-18 00:42:25,091:fsmgmt.fsmgmt:INFO     ... soaked-MDT0000 mounted successfully on lola-8&lt;/li&gt;
	&lt;li&gt;From then on the recovery process stalled and slab allocation increased continuously.&lt;/li&gt;
	&lt;li&gt;2016-03-18 01:34 - 02:04   created debug logs with frequency ~ 4min&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The distribution of the biggest consumers is similar to the 2 MDTs per MDS configuration listed above:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;   #Date Time SlabName ObjInUse ObjInUseB ObjAll ObjAllB SlabInUse SlabInUseB SlabAll SlabAllB SlabChg SlabPct
size-1048576.dat:20160318 03:27:00 size-1048576 10115 10606346240 10115 10606346240 10115 10606346240 10115 10606346240 0 0
size-262144.dat:20160318 03:27:00 size-262144 449 117702656 449 117702656 449 117702656 449 117702656 0 0
size-8192.dat:20160318 03:27:00 size-8192 4663 38199296 4663 38199296 4663 38199296 4663 38199296 0 0
size-1024.dat:20160318 03:27:00 size-1024 35610 36464640 35628 36483072 8903 36466688 8907 36483072 106496 0
ptlrpc_cache.dat:20160318 03:27:00 ptlrpc_cache 41048 31524864 41080 31549440 8216 33652736 8216 33652736 53248 0
size-65536.dat:20160318 03:27:00 size-65536 360 23592960 360 23592960 360 23592960 360 23592960 0 0
size-512.dat:20160318 03:27:00 size-512 45384 23236608 45472 23281664 5684 23281664 5684 23281664 8192 0
kmem_cache.dat:20160318 03:27:00 kmem_cache 289 9506944 289 9506944 289 18939904 289 18939904 0 0
inode_cache.dat:20160318 03:27:00 inode_cache 15638 9257696 15684 9284928 2614 10706944 2614 10706944 0 0
Acpi-Operand.dat:20160318 03:27:00 Acpi-Operand 133270 9595440 135468 9753696 2556 10469376 2556 10469376 0 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After the occurrence of the error, debug logs with filter &apos;&lt;tt&gt;+malloc +trace&lt;/tt&gt;&apos; were taken for ~ 30 minutes at 4-minute intervals. The buffer size was increased from the initial 128 MB to 1024 MB. &lt;br/&gt;
Uploaded files:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;messages, console and &lt;tt&gt;collectl&lt;/tt&gt; slab performance counters of lola-8&lt;/li&gt;
	&lt;li&gt;I will try to attach the kernel debug logs to the ticket, or will upload them to an Intel cluster for further investigation if I face size limits or transfer problems.&lt;/li&gt;
&lt;/ul&gt;
</comment>
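&lt;p&gt;For reference, a sketch of how a debug configuration like the one described above (larger buffer plus &apos;+malloc +trace&apos;) is usually applied, assuming standard &lt;tt&gt;lctl&lt;/tt&gt; tunables; the dump path is only an example:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# grow the kernel debug buffer from the default to 1024 MB
lctl set_param debug_mb=1024
# add memory allocation and trace debugging to the current mask
lctl set_param debug=+malloc
lctl set_param debug=+trace
# dump the debug buffer periodically, e.g. every few minutes (example path)
lctl dk /tmp/lustre-log.20160318-0134
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;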
                            <comment id="146083" author="heckes" created="Fri, 18 Mar 2016 12:54:57 +0000"  >&lt;p&gt;Also, it is impossible to abort the recovery process (see  &lt;a href=&quot;https://jira.hpdd.intel.com/browse/LU-7848?focusedCommentId=146077&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-146077&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/LU-7848?focusedCommentId=146077&amp;amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-146077&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="146089" author="heckes" created="Fri, 18 Mar 2016 14:13:26 +0000"  >&lt;p&gt;debug files have been uploaded. Oleg: I kept them in the binary state as I was unsure what should be extracted&lt;br/&gt;
since Andreas requested &apos;+trace&apos; also.&lt;/p&gt;</comment>
                            <comment id="149640" author="gerrit" created="Thu, 21 Apr 2016 05:06:52 +0000"  >&lt;p&gt;wangdi (di.wang@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/19693&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19693&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7836&quot; title=&quot;MDSes crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7836&quot;&gt;&lt;del&gt;LU-7836&lt;/del&gt;&lt;/a&gt; ptlrpc: remove duplicate final ping req&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: b6f85ba66403a30f135037ae38fb88b35d9b86aa&lt;/p&gt;</comment>
                            <comment id="151455" author="heckes" created="Mon, 9 May 2016 09:39:07 +0000"  >&lt;p&gt;oom-killer was active for build &apos;20160427&apos; (see &lt;a href=&quot;https://wiki.hpdd.intel.com/pages/viewpage.action?title=Soak+Testing+on+Lola&amp;amp;spaceKey=Releases#SoakTestingonLola-20160427&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/pages/viewpage.action?title=Soak+Testing+on+Lola&amp;amp;spaceKey=Releases#SoakTestingonLola-20160427&lt;/a&gt;) also. The patch above wasn&apos;t applied. Crash dump files are saved in:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lhn.hpdd.intel.com:/var/crashdumps/lu-7836/lola-11/127.0.0.1-2016-05-07-10:48:57
lhn.hpdd.intel.com:/var/crashdumps/lu-7836/lola-11/127.0.0.1-2016-05-07-17:34:05
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="155072" author="heckes" created="Wed, 8 Jun 2016 15:09:57 +0000"  >&lt;p&gt;The error also occurred while soak testing build &apos;20160601&apos; (see: &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160601&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160601&lt;/a&gt;) after the recovery process stalled for a MDT that failed over to the secondary node. Although &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7848&quot; title=&quot;Recovery process on MDS stalled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7848&quot;&gt;&lt;del&gt;LU-7848&lt;/del&gt;&lt;/a&gt; is part of the build, the recovery process is stalls.&lt;br/&gt;
Configuration: DNE enabled; 1 MDT per MDS; nodes lola-10, 11 from an active-active failover cluster. &lt;/p&gt;

&lt;p&gt;1st Event:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;2016-06-03 11:31:10   - failover resource of lola-10 (MDT-2) --&amp;gt; lola-11&lt;/li&gt;
	&lt;li&gt;2016-06-03 11:36:37   -   ... soaked-MDT0002 mounted successfully on lola-11&lt;/li&gt;
	&lt;li&gt;till 2016-06-04-00:44   - soaked-MDT0002 in status &apos;RECOVERING&apos;.&lt;/li&gt;
	&lt;li&gt;2016-06-04-00:44:52  - lola-11 crash with oom-killer&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;2nd Event:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;2016-06-07 08:34:06,621 triggering fault mds_failover lola-10 (MDT-2) --&amp;gt; lola-11&lt;/li&gt;
	&lt;li&gt;2016-06-07 08:38:42  - Mounting soaked-MDT0002 on lola-11&lt;/li&gt;
	&lt;li&gt;since 2016-06-07 08:39:32,155 Wait for recovery to complete&lt;/li&gt;
	&lt;li&gt;memory resources are nearly exhausted:
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-11 ~]# date
Wed Jun  8 07:59:49 PDT 2016
[root@lola-11 ~]# collectl -sm --verbose
waiting for 1 second sample...

# MEMORY SUMMARY
#&amp;lt;-------------------------------Physical Memory--------------------------------------&amp;gt;&amp;lt;-----------Swap------------&amp;gt;&amp;lt;-------Paging------&amp;gt;
#   Total    Used    Free    Buff  Cached    Slab  Mapped    Anon  Commit  Locked Inact Total  Used  Free   In  Out Fault MajFt   In  Out
   32006M  30564M   1441M 127144K 676256K  28701M  16196K  69072K 201740K   5008K  509M   15G     0   15G    0    0    28     0    0    8
   32006M  30565M   1441M 127144K 676256K  28701M  16196K  69072K 201740K   5008K  509M   15G     0   15G    0    0    63     0    0    4
   32006M  30565M   1441M 127144K 676256K  28701M  16196K  69072K 201740K   5008K  509M   15G     0   15G    0    0     1     0    0    0
   32006M  30564M   1441M 127144K 676256K  28701M  16196K  69072K 201740K   5008K  509M   15G     0   15G    0    0    17     0    0    0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Attached files:&lt;br/&gt;
1st event only: Saved crash dump file to &lt;tt&gt;lhn.hpdd.intel.com:/var/crashdumps/lu-7836/lola-11/127.0.0.1-2016-06-04-00:44:52&lt;/tt&gt;&lt;/p&gt;&lt;/li&gt;
	&lt;li&gt;2nd event only: kernel debug log of lola-11&lt;/li&gt;
	&lt;li&gt;Both events: messages, console logs&lt;/li&gt;
&lt;/ul&gt;
</comment>
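&lt;p&gt;A brief sketch of how the MDT recovery state (such as the long-running &apos;RECOVERING&apos; status above) is typically monitored, assuming the standard recovery_status parameter; the target name is only an example:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# show the recovery status of all MDTs mounted on this MDS
lctl get_param mdt.*.recovery_status
# or query a single target (example name)
lctl get_param mdt.soaked-MDT0002.recovery_status
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;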
                            <comment id="155077" author="heckes" created="Wed, 8 Jun 2016 15:17:35 +0000"  >&lt;p&gt;I double checked the server node &lt;tt&gt;lola-11&lt;/tt&gt; and found no HW related errors.&lt;/p&gt;</comment>
                            <comment id="159303" author="heckes" created="Wed, 20 Jul 2016 12:47:00 +0000"  >&lt;p&gt;The error didn&apos;t occurred for soak test of build &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-master/3406&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-master/3406&lt;/a&gt; (see &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160713&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160713&lt;/a&gt;) during a test session that is ongoing and last already for 7 days.&lt;/p&gt;</comment>
                            <comment id="159343" author="gerrit" created="Wed, 20 Jul 2016 17:42:17 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/19693/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19693/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7836&quot; title=&quot;MDSes crashed with oom-killer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7836&quot;&gt;&lt;del&gt;LU-7836&lt;/del&gt;&lt;/a&gt; ptlrpc: remove duplicate final ping req&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 1f2bb415543f8801f3b4a7c55e29b715b09f327b&lt;/p&gt;</comment>
                            <comment id="159969" author="jgmitter" created="Tue, 26 Jul 2016 20:02:39 +0000"  >&lt;p&gt;Is this issue resolved with the landing of the above patch?&lt;/p&gt;</comment>
                            <comment id="160047" author="heckes" created="Wed, 27 Jul 2016 12:17:59 +0000"  >&lt;p&gt;New build (see &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160727&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160727&lt;/a&gt;)  include the fix above has been started 1 hour ago. The previous test session for &apos;20160713&apos; ran till yesterday (Aug, 26th) without the occurrence of this bug.&lt;br/&gt;
If possible I&apos;d like to run build &apos;20160727&apos; till Friday before closing the ticket. Anyway feel free to close the ticket.&lt;/p&gt;</comment>
                            <comment id="160065" author="jgmitter" created="Wed, 27 Jul 2016 13:17:06 +0000"  >&lt;p&gt;Thanks Frank.  We can wait until Friday to validate that the issue is resolved.&lt;/p&gt;</comment>
                            <comment id="160412" author="jgmitter" created="Mon, 1 Aug 2016 12:33:35 +0000"  >&lt;p&gt;Hi Frank,&lt;br/&gt;
Are you comfortable that the issue is resolved in the soak run from the end of last week?&lt;br/&gt;
Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="160989" author="pjones" created="Fri, 5 Aug 2016 21:08:47 +0000"  >&lt;p&gt;As this fix has landed and is intended to fix this issue then let&apos;s mark this as resolved and then reopen if it reoccurs&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="34687">LU-7780</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="36413">LU-8070</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="20622" name="console-lola-10.bz2" size="371810" author="heckes" created="Wed, 2 Mar 2016 15:20:07 +0000"/>
                            <attachment id="20623" name="console-lola-11.bz2" size="351932" author="heckes" created="Wed, 2 Mar 2016 15:20:07 +0000"/>
                            <attachment id="21817" name="console-lola-11.log-20160608.bz2" size="64732" author="heckes" created="Wed, 8 Jun 2016 15:27:32 +0000"/>
                            <attachment id="20621" name="console-lola-8.bz2" size="535340" author="heckes" created="Wed, 2 Mar 2016 15:20:07 +0000"/>
                            <attachment id="20804" name="console-lola-8.log-20160318.bz2" size="506166" author="heckes" created="Fri, 18 Mar 2016 14:10:52 +0000"/>
                            <attachment id="21818" name="dmesg-lola-11-20160609-0811.bz2" size="26974" author="heckes" created="Wed, 8 Jun 2016 15:27:32 +0000"/>
                            <attachment id="20625" name="lola-10-lustre-log.20160302-0008.bz2" size="4728760" author="heckes" created="Wed, 2 Mar 2016 15:20:07 +0000"/>
                            <attachment id="20626" name="lola-10-memory-counter-2214-0130.dat.bz2" size="18513" author="heckes" created="Wed, 2 Mar 2016 15:26:48 +0000"/>
                            <attachment id="20627" name="lola-10-slab-detailed-2214-0130.dat.bz2" size="609133" author="heckes" created="Wed, 2 Mar 2016 15:26:48 +0000"/>
                            <attachment id="20628" name="lola-11-lustre-log.20160302-0008.bz2" size="1396781" author="heckes" created="Wed, 2 Mar 2016 15:26:48 +0000"/>
                            <attachment id="21838" name="lola-11-lustre-log.20160608-0656.bz2" size="261" author="heckes" created="Thu, 9 Jun 2016 07:41:33 +0000"/>
                            <attachment id="20747" name="lola-11-mem-counter-20160311_1729-1910.dat.bz2" size="11483" author="heckes" created="Mon, 14 Mar 2016 11:38:22 +0000"/>
                            <attachment id="20748" name="lola-11-memory-debug-messages.20160311-183459.bz2" size="287" author="heckes" created="Mon, 14 Mar 2016 12:50:30 +0000"/>
                            <attachment id="20796" name="lola-8-lustre-log-20160318-0134.bz2" size="1090875" author="heckes" created="Fri, 18 Mar 2016 13:33:12 +0000"/>
                            <attachment id="20795" name="lola-8-lustre-log-20160318-0145.bz2" size="1415597" author="heckes" created="Fri, 18 Mar 2016 13:31:08 +0000"/>
                            <attachment id="20797" name="lola-8-lustre-log-20160318-0147.bz2" size="906594" author="heckes" created="Fri, 18 Mar 2016 13:34:29 +0000"/>
                            <attachment id="20798" name="lola-8-lustre-log-20160318-0150.bz2" size="960968" author="heckes" created="Fri, 18 Mar 2016 13:41:18 +0000"/>
                            <attachment id="20799" name="lola-8-lustre-log-20160318-0153.bz2" size="955004" author="heckes" created="Fri, 18 Mar 2016 13:42:40 +0000"/>
                            <attachment id="20800" name="lola-8-lustre-log-20160318-0156.bz2" size="1068222" author="heckes" created="Fri, 18 Mar 2016 13:48:40 +0000"/>
                            <attachment id="20801" name="lola-8-lustre-log-20160318-0200.bz2" size="1036290" author="heckes" created="Fri, 18 Mar 2016 13:54:28 +0000"/>
                            <attachment id="20803" name="lola-8-lustre-log-20160318-0204.bz2" size="1050545" author="heckes" created="Fri, 18 Mar 2016 14:04:55 +0000"/>
                            <attachment id="20624" name="lola-8-lustre-log.20160302-0155.bz2" size="4634706" author="heckes" created="Wed, 2 Mar 2016 15:20:07 +0000"/>
                            <attachment id="20794" name="lola-8-slab-detailed-counter-20160318.tar.bz2" size="916079" author="heckes" created="Fri, 18 Mar 2016 13:23:06 +0000"/>
                            <attachment id="20629" name="lustre-log.1456893718.5074.bz2" size="249" author="heckes" created="Wed, 2 Mar 2016 15:26:48 +0000"/>
                            <attachment id="20631" name="messages-lola-10.bz2" size="242414" author="heckes" created="Wed, 2 Mar 2016 15:28:22 +0000"/>
                            <attachment id="20632" name="messages-lola-11.bz2" size="175584" author="heckes" created="Wed, 2 Mar 2016 15:28:22 +0000"/>
                            <attachment id="21819" name="messages-lola-11.log-20160608.bz2" size="318328" author="heckes" created="Wed, 8 Jun 2016 15:27:32 +0000"/>
                            <attachment id="20630" name="messages-lola-8.bz2" size="238022" author="heckes" created="Wed, 2 Mar 2016 15:26:48 +0000"/>
                            <attachment id="20805" name="messages-lola-8.log-20160318.bz2" size="746006" author="heckes" created="Fri, 18 Mar 2016 14:11:42 +0000"/>
                            <attachment id="20633" name="recovery-status-20160302" size="3803" author="heckes" created="Wed, 2 Mar 2016 15:28:22 +0000"/>
                            <attachment id="20634" name="slab-details-2041-2208-one-file-per-slab.tar.bz2" size="215934" author="heckes" created="Wed, 2 Mar 2016 15:29:30 +0000"/>
                            <attachment id="20635" name="slab-details-2214-0130-one-file-per-slab.tar.bz2" size="442953" author="heckes" created="Wed, 2 Mar 2016 15:29:30 +0000"/>
                            <attachment id="20636" name="slab-sorted-alloaction-2041-2208.dat.bz2" size="4266" author="heckes" created="Wed, 2 Mar 2016 15:29:30 +0000"/>
                            <attachment id="20637" name="slab-sorted-alloaction-2214-0130.dat.bz2" size="4345" author="heckes" created="Wed, 2 Mar 2016 15:29:30 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy3db:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>