<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:51:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5483] recovery-mds-scale test failover_mds: oom failure on client</title>
                <link>https://jira.whamcloud.com/browse/LU-5483</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;While testing test script patch &lt;a href=&quot;http://review.whamcloud.com/11425&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11425&lt;/a&gt; on Lustre b2_5 branch, recovery-mds-scale test failover_mds hit oom failure on one of the clients:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;15:04:21:Lustre: DEBUG MARKER: mds1 has failed over 1 times, and counting...
15:04:22:Lustre: 2207:0:(client.c:1918:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1407966652/real 1407966652]  req@ffff88005ddeb400 x1476359923718516/t0(0) o250-&amp;gt;MGC10.2.4.104@tcp@10.2.4.104@tcp:26/25 lens 400/544 e 0 to 1 dl 1407966663 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
15:04:22:Lustre: 2207:0:(client.c:1918:ptlrpc_expire_one_request()) Skipped 1 previous similar message
15:04:22:Lustre: Evicted from MGS (at 10.2.4.108@tcp) after server handle changed from 0x3d7499e48b9b2ab6 to 0x46ffbf3b8b30002
15:04:22:Lustre: MGC10.2.4.104@tcp: Connection restored to MGS (at 10.2.4.108@tcp)
15:04:22:LustreError: 2207:0:(client.c:2795:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff88007a684400 x1476359922647520/t4294967302(4294967302) o101-&amp;gt;lustre-MDT0000-mdc-ffff88007a58c000@10.2.4.108@tcp:12/10 lens 704/544 e 0 to 0 dl 1407966698 ref 2 fl Interpret:RP/4/0 rc 301/301
15:04:22:dd invoked oom-killer: gfp_mask=0x200da, order=0, oom_adj=0, oom_score_adj=0
15:04:22:dd cpuset=/ mems_allowed=0
15:04:22:Pid: 3997, comm: dd Not tainted 2.6.32-431.17.1.el6.x86_64 #1
15:04:22:Call Trace:
15:04:23: [&amp;lt;ffffffff810d0211&amp;gt;] ? cpuset_print_task_mems_allowed+0x91/0xb0
15:04:23: [&amp;lt;ffffffff811225c0&amp;gt;] ? dump_header+0x90/0x1b0
15:04:23: [&amp;lt;ffffffff8122761c&amp;gt;] ? security_real_capable_noaudit+0x3c/0x70
15:04:23: [&amp;lt;ffffffff81122a42&amp;gt;] ? oom_kill_process+0x82/0x2a0
15:04:23: [&amp;lt;ffffffff81122981&amp;gt;] ? select_bad_process+0xe1/0x120
15:04:23: [&amp;lt;ffffffff81122e80&amp;gt;] ? out_of_memory+0x220/0x3c0
15:04:23: [&amp;lt;ffffffff8112f79f&amp;gt;] ? __alloc_pages_nodemask+0x89f/0x8d0
15:04:23: [&amp;lt;ffffffff8116769a&amp;gt;] ? alloc_pages_current+0xaa/0x110
15:04:24: [&amp;lt;ffffffff8111f9b7&amp;gt;] ? __page_cache_alloc+0x87/0x90
15:04:24: [&amp;lt;ffffffff811206ce&amp;gt;] ? grab_cache_page_write_begin+0x8e/0xc0
15:04:24: [&amp;lt;ffffffffa0a05f58&amp;gt;] ? ll_write_begin+0x58/0x1a0 [lustre]
15:04:24: [&amp;lt;ffffffff8111ff33&amp;gt;] ? generic_file_buffered_write+0x123/0x2e0
15:04:24: [&amp;lt;ffffffff81078f37&amp;gt;] ? current_fs_time+0x27/0x30
15:04:24: [&amp;lt;ffffffff81121990&amp;gt;] ? __generic_file_aio_write+0x260/0x490
15:04:24: [&amp;lt;ffffffffa056793c&amp;gt;] ? cl_lock_trace0+0x11c/0x130 [obdclass]
15:04:24: [&amp;lt;ffffffffa056793c&amp;gt;] ? cl_lock_trace0+0x11c/0x130 [obdclass]
15:04:24: [&amp;lt;ffffffff81121c48&amp;gt;] ? generic_file_aio_write+0x88/0x100
15:04:24: [&amp;lt;ffffffffa0a1acc7&amp;gt;] ? vvp_io_write_start+0x137/0x2a0 [lustre]
15:04:25: [&amp;lt;ffffffffa056de3a&amp;gt;] ? cl_io_start+0x6a/0x140 [obdclass]
15:04:25: [&amp;lt;ffffffffa0572544&amp;gt;] ? cl_io_loop+0xb4/0x1b0 [obdclass]
15:04:25: [&amp;lt;ffffffffa09bd4c0&amp;gt;] ? ll_file_io_generic+0x460/0x610 [lustre]
15:04:25: [&amp;lt;ffffffffa09be2c2&amp;gt;] ? ll_file_aio_write+0x142/0x2c0 [lustre]
15:04:25: [&amp;lt;ffffffffa09be5ac&amp;gt;] ? ll_file_write+0x16c/0x2a0 [lustre]
15:04:25: [&amp;lt;ffffffff81188c38&amp;gt;] ? vfs_write+0xb8/0x1a0
15:04:25: [&amp;lt;ffffffff81189531&amp;gt;] ? sys_write+0x51/0x90
15:04:25: [&amp;lt;ffffffff810e1abe&amp;gt;] ? __audit_syscall_exit+0x25e/0x290
15:04:25: [&amp;lt;ffffffff8100b072&amp;gt;] ? system_call_fastpath+0x16/0x1b
15:04:25:Mem-Info:
15:04:26:Node 0 DMA per-cpu:
15:04:26:CPU    0: hi:    0, btch:   1 usd:   0
15:04:26:CPU    1: hi:    0, btch:   1 usd:   0
15:04:26:Node 0 DMA32 per-cpu:
15:04:26:CPU    0: hi:  186, btch:  31 usd: 133
15:04:26:CPU    1: hi:  186, btch:  31 usd:  63
15:04:26:active_anon:1286 inactive_anon:1284 isolated_anon:0
15:04:26: active_file:171721 inactive_file:173031 isolated_file:32
15:04:26: unevictable:0 dirty:0 writeback:38535 unstable:0
15:04:26: free:15144 slab_reclaimable:4187 slab_unreclaimable:99907
15:04:27: mapped:4 shmem:1 pagetables:1115 bounce:0
15:04:27:Node 0 DMA free:8352kB min:332kB low:412kB high:496kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:5248kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15348kB mlocked:0kB dirty:0kB writeback:5328kB mapped:0kB shmem:0kB slab_reclaimable:32kB slab_unreclaimable:2032kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:7904 all_unreclaimable? yes
15:04:27:lowmem_reserve[]: 0 2004 2004 2004
15:04:27:Node 0 DMA32 free:52224kB min:44720kB low:55900kB high:67080kB active_anon:5144kB inactive_anon:5136kB active_file:686828kB inactive_file:687060kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2052308kB mlocked:0kB dirty:0kB writeback:148812kB mapped:16kB shmem:4kB slab_reclaimable:16716kB slab_unreclaimable:397596kB kernel_stack:1408kB pagetables:4460kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:612151 all_unreclaimable? no
15:04:27:lowmem_reserve[]: 0 0 0 0
15:04:27:Node 0 DMA: 11*4kB 1*8kB 3*16kB 7*32kB 6*64kB 4*128kB 2*256kB 3*512kB 1*1024kB 2*2048kB 0*4096kB = 8388kB
15:04:27:Node 0 DMA32: 806*4kB 275*8kB 215*16kB 85*32kB 23*64kB 4*128kB 3*256kB 4*512kB 5*1024kB 1*2048kB 7*4096kB = 52224kB
15:04:27:346103 total pagecache pages
15:04:27:1306 pages in swap cache
15:04:27:Swap cache stats: add 4759, delete 3453, find 0/0
15:04:27:Free swap  = 2706844kB
15:04:27:Total swap = 2725880kB
15:04:28:524284 pages RAM
15:04:28:43693 pages reserved
15:04:28:668392 pages shared
15:04:28:115112 pages non-shared
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/1d84cc0e-2339-11e4-b8ac-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/1d84cc0e-2339-11e4-b8ac-5254006e85c2&lt;/a&gt;&lt;/p&gt;</description>
                <environment>Lustre build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/77/&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/77/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
Test group: failover</environment>
        <key id="26005">LU-5483</key>
            <summary>recovery-mds-scale test failover_mds: oom failure on client</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="4" iconUrl="https://jira.whamcloud.com/images/icons/statuses/reopened.png" description="This issue was once resolved, but the resolution was deemed incorrect. From here issues are either marked assigned or resolved.">Reopened</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                            <label>22pl</label>
                            <label>mq414</label>
                    </labels>
                <created>Wed, 13 Aug 2014 23:18:32 +0000</created>
                <updated>Tue, 19 Jul 2016 00:08:01 +0000</updated>
                                            <version>Lustre 2.7.0</version>
                    <version>Lustre 2.5.3</version>
                    <version>Lustre 2.8.0</version>
                    <version>Lustre 2.5.4</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="91622" author="pjones" created="Thu, 14 Aug 2014 16:52:43 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please look into this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="91659" author="yujian" created="Thu, 14 Aug 2014 20:57:42 +0000"  >&lt;p&gt;The failure occurred again: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/489ef5aa-2374-11e4-84ee-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/489ef5aa-2374-11e4-84ee-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="92070" author="yujian" created="Wed, 20 Aug 2014 19:20:30 +0000"  >&lt;p&gt;This is blocking the whole hard failover test session on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/a0c998fe-2818-11e4-893b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/a0c998fe-2818-11e4-893b-5254006e85c2&lt;/a&gt; (build #83)&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/a3544ab6-2826-11e4-8135-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/a3544ab6-2826-11e4-8135-5254006e85c2&lt;/a&gt; (build #82)&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/73ce3354-24a4-11e4-99bd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/73ce3354-24a4-11e4-99bd-5254006e85c2&lt;/a&gt; (build #80)&lt;/p&gt;

&lt;p&gt;It looks like this is a regression introduced by build #80 because the previous builds did not hit this failure.&lt;/p&gt;

&lt;p&gt;FYI, here is the hard failover test session of build #79:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/2fb00c98-25e8-11e4-8ee8-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/2fb00c98-25e8-11e4-8ee8-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="92196" author="green" created="Fri, 22 Aug 2014 03:16:50 +0000"  >&lt;p&gt;So I imagine we should just revert &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3326&quot; title=&quot;recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3326&quot;&gt;&lt;del&gt;LU-3326&lt;/del&gt;&lt;/a&gt; patch, even though I am not sure how can it trigger this problem here&lt;/p&gt;</comment>
                            <comment id="92199" author="yujian" created="Fri, 22 Aug 2014 04:06:09 +0000"  >&lt;p&gt;I just pushed &lt;a href=&quot;http://review.whamcloud.com/11555&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11555&lt;/a&gt; to perform recovery-mds-scale test with &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3326&quot; title=&quot;recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3326&quot;&gt;&lt;del&gt;LU-3326&lt;/del&gt;&lt;/a&gt; tests: sync after file deleted in run_*.sh&quot; reverted. Let&apos;s see whether the oom failure disappears or not.&lt;/p&gt;</comment>
                            <comment id="92234" author="yujian" created="Fri, 22 Aug 2014 17:10:25 +0000"  >&lt;p&gt;The same failure occurred on master branch with FSTYPE=ldiskfs:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/a73550b8-29ba-11e4-a2a1-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/a73550b8-29ba-11e4-a2a1-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="92266" author="yujian" created="Sat, 23 Aug 2014 01:36:38 +0000"  >&lt;blockquote&gt;&lt;p&gt;I just pushed &lt;a href=&quot;http://review.whamcloud.com/11555&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11555&lt;/a&gt; to perform recovery-mds-scale test with &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3326&quot; title=&quot;recovery-mds-scale test_failover_ost: tar: Cannot open: No space left on device&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3326&quot;&gt;&lt;del&gt;LU-3326&lt;/del&gt;&lt;/a&gt; tests: sync after file deleted in run_*.sh&quot; reverted. Let&apos;s see whether the oom failure disappears or not.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Test result showed that &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5483&quot; title=&quot;recovery-mds-scale test failover_mds: oom failure on client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5483&quot;&gt;LU-5483&lt;/a&gt; still occurred. So, the test script change is not the cause.&lt;/p&gt;</comment>
                            <comment id="92267" author="yujian" created="Sat, 23 Aug 2014 01:51:53 +0000"  >&lt;p&gt;I updated &lt;a href=&quot;http://review.whamcloud.com/#/c/11555&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/11555&lt;/a&gt; to check whether &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5403&quot; title=&quot;Kernel update [RHEL6.5 2.6.32-431.23.3.el6]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5403&quot;&gt;&lt;del&gt;LU-5403&lt;/del&gt;&lt;/a&gt; kernel: kernel update RHEL6.5 [2.6.32-431.23.3.el6]&quot; is a culprit. The oom failure still occurred.&lt;/p&gt;</comment>
                            <comment id="92375" author="yujian" created="Tue, 26 Aug 2014 00:27:45 +0000"  >&lt;p&gt;In the above comments, Lustre b2_5 build #79 performed on 2014-08-15 (UTC) did &lt;em&gt;not&lt;/em&gt; hit the oom failure:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/2fb00c98-25e8-11e4-8ee8-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/2fb00c98-25e8-11e4-8ee8-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The failure started occurring on build #80, which was also performed on 2014-08-15 (UTC):&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/73ce3354-24a4-11e4-99bd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/73ce3354-24a4-11e4-99bd-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;But now, by re-running the test on build #79, the failure occurred consistently on both Onyx and Shadow clusters:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/c98707b6-2c11-11e4-9bfb-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/c98707b6-2c11-11e4-9bfb-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/792cccca-2c97-11e4-9bfb-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/792cccca-2c97-11e4-9bfb-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/adb60468-2cb3-11e4-9bfb-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/adb60468-2cb3-11e4-9bfb-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The memory size on each VM node has been 1.83 GB, which is not changed.&lt;/p&gt;

&lt;p&gt;Hi Minh and Joshua,&lt;/p&gt;

&lt;p&gt;Do you know if there was any test environment change made on 2014-08-15 (UTC) ?&lt;/p&gt;</comment>
                            <comment id="92453" author="joshua" created="Tue, 26 Aug 2014 16:53:25 +0000"  >&lt;p&gt;No changes were made to the test infrastructure recently.  I see there is a mix of running them on shadow and onyx, so that rules out one particular cluster.&lt;/p&gt;</comment>
                            <comment id="92455" author="mdiep" created="Tue, 26 Aug 2014 17:09:37 +0000"  >&lt;p&gt;I am not aware of any autotest changes that could have affected the memory size. The memory size was set during VM setup which was done once.&lt;/p&gt;</comment>
                            <comment id="92750" author="pjones" created="Thu, 28 Aug 2014 21:38:07 +0000"  >&lt;p&gt;Closing ticket for now as this seems to have stopped happening.&lt;/p&gt;</comment>
                            <comment id="92764" author="yujian" created="Thu, 28 Aug 2014 23:35:16 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/85/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/85/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.5/x86_64&lt;br/&gt;
FSTYPE=ldiskfs&lt;br/&gt;
TEST_GROUP=failover&lt;/p&gt;

&lt;p&gt;recovery-mds-scale test failover_mds passed for 24 hours (MDS failed over 72 times):&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/sub_tests/ff043854-2f05-11e4-b34e-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/sub_tests/ff043854-2f05-11e4-b34e-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="93127" author="yujian" created="Wed, 3 Sep 2014 16:40:38 +0000"  >&lt;p&gt;While testing Lustre 2.5.3 RC1 (build #86), the same issue occurred frequently again:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/f1da5e1c-311f-11e4-b503-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/f1da5e1c-311f-11e4-b503-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/f8513690-301a-11e4-9e60-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/f8513690-301a-11e4-9e60-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/aea7b94a-303a-11e4-ad0f-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/aea7b94a-303a-11e4-ad0f-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/4bc75a48-2fad-11e4-957a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/4bc75a48-2fad-11e4-957a-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Only 1 of 5 test run did not hit the oom failure:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/cc5ca82a-3269-11e4-8c3a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/cc5ca82a-3269-11e4-8c3a-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The same issue also occurred on master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/fb45a3ce-3081-11e4-9e60-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/fb45a3ce-3081-11e4-9e60-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;In manual test run, I increased the memory size on each VM from 2GB to 3GB, the oom failure still occurred finally.&lt;/p&gt;</comment>
                            <comment id="99850" author="yujian" created="Sat, 22 Nov 2014 06:17:41 +0000"  >&lt;p&gt;The same failure occurred on master branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ba0b8798-6902-11e4-9d25-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ba0b8798-6902-11e4-9d25-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="101379" author="yujian" created="Fri, 12 Dec 2014 00:42:13 +0000"  >&lt;p&gt;More instance on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/eaf738d2-805b-11e4-a434-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/eaf738d2-805b-11e4-a434-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="104459" author="hongchao.zhang" created="Fri, 23 Jan 2015 02:15:55 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5483&quot; title=&quot;recovery-mds-scale test failover_mds: oom failure on client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5483&quot;&gt;LU-5483&lt;/a&gt; could be caused by the unstable pages&lt;/p&gt;</comment>
                            <comment id="125274" author="sarah" created="Wed, 26 Aug 2015 20:01:55 +0000"  >&lt;p&gt;Hit this on master branch&lt;br/&gt;
lustre-master build # 3141 RHEL6.6&lt;/p&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;11:54:51:Lustre: DEBUG MARKER: ps auxwww | grep -v grep | grep -q run_dd.sh
11:54:51:Lustre: DEBUG MARKER: /usr/sbin/lctl mark mds1 has failed over 2 times, and counting...
11:54:51:Lustre: DEBUG MARKER: mds1 has failed over 2 times, and counting...
11:54:51:Lustre: Evicted from MGS (at 10.2.4.104@tcp) after server handle changed from 0xe04739a2b649323f to 0x5d043b24d153c410
11:54:52:Lustre: MGC10.2.4.104@tcp: Connection restored to MGS (at 10.2.4.104@tcp)
11:54:52:Lustre: Skipped 1 previous similar message
11:54:52:LustreError: 3009:0:(client.c:2819:ptlrpc_replay_interpret()) @@@ request replay timed out.
11:54:52:  req@ffff880057f5c680 x1509841555796340/t8590079738(8590079738) o101-&amp;gt;lustre-MDT0000-mdc-ffff880037e0fc00@10.2.4.104@tcp:12/10 lens 944/744 e 1 to 1 dl 1439898450 ref 2 fl Interpret:EXP/4/ffffffff rc -110/-1
11:54:52:LustreError: 3009:0:(client.c:2874:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff880057f5c680 x1509841555796340/t8590079738(8590079738) o101-&amp;gt;lustre-MDT0000-mdc-ffff880037e0fc00@10.2.4.104@tcp:12/10 lens 944/544 e 1 to 0 dl 1439898461 ref 2 fl Interpret:RP/4/0 rc 301/301
11:54:52:Lustre: lustre-MDT0000-mdc-ffff880037e0fc00: Connection restored to lustre-MDT0000 (at 10.2.4.104@tcp)
11:54:52:sssd invoked oom-killer: gfp_mask=0x200da, order=0, oom_adj=0, oom_score_adj=0
11:54:52:sssd cpuset=/ mems_allowed=0
11:54:52:Pid: 1217, comm: sssd Not tainted 2.6.32-504.30.3.el6.x86_64 #1
11:54:53:Call Trace:
11:54:53: [&amp;lt;ffffffff810d4241&amp;gt;] ? cpuset_print_task_mems_allowed+0x91/0xb0
11:54:53: [&amp;lt;ffffffff81127500&amp;gt;] ? dump_header+0x90/0x1b0
11:54:53: [&amp;lt;ffffffff8112766e&amp;gt;] ? check_panic_on_oom+0x4e/0x80
11:54:53: [&amp;lt;ffffffff81127d5b&amp;gt;] ? out_of_memory+0x1bb/0x3c0
11:54:53: [&amp;lt;ffffffff811346ff&amp;gt;] ? __alloc_pages_nodemask+0x89f/0x8d0
11:54:53: [&amp;lt;ffffffff8116cc8a&amp;gt;] ? alloc_pages_vma+0x9a/0x150
11:54:53: [&amp;lt;ffffffff811606f2&amp;gt;] ? read_swap_cache_async+0xf2/0x160
11:54:54: [&amp;lt;ffffffff81161219&amp;gt;] ? valid_swaphandles+0x69/0x150
11:54:55: [&amp;lt;ffffffff811607e7&amp;gt;] ? swapin_readahead+0x87/0xc0
11:54:55: [&amp;lt;ffffffff8114f95d&amp;gt;] ? handle_pte_fault+0x6dd/0xb00
11:54:55: [&amp;lt;ffffffff81529afe&amp;gt;] ? thread_return+0x4e/0x7d0
11:54:55: [&amp;lt;ffffffff810a3c05&amp;gt;] ? __hrtimer_start_range_ns+0x1a5/0x460
11:54:55: [&amp;lt;ffffffff810a3291&amp;gt;] ? lock_hrtimer_base+0x31/0x60
11:54:56: [&amp;lt;ffffffff81150019&amp;gt;] ? handle_mm_fault+0x299/0x3d0
11:54:56: [&amp;lt;ffffffff8104d096&amp;gt;] ? __do_page_fault+0x146/0x500
11:54:56: [&amp;lt;ffffffff811d7cd4&amp;gt;] ? ep_poll+0x314/0x350
11:54:56: [&amp;lt;ffffffff81064c00&amp;gt;] ? default_wake_function+0x0/0x20
11:54:56: [&amp;lt;ffffffff8153010e&amp;gt;] ? do_page_fault+0x3e/0xa0
11:54:56: [&amp;lt;ffffffff8152d4b5&amp;gt;] ? page_fault+0x25/0x30
11:54:57:Mem-Info:
11:54:57:Node 0 DMA per-cpu:
11:54:57:CPU    0: hi:    0, btch:   1 usd:   0
11:54:57:CPU    1: hi:    0, btch:   1 usd:   0
11:54:57:Node 0 DMA32 per-cpu:
11:54:58:CPU    0: hi:  186, btch:  31 usd:  35
11:54:58:CPU    1: hi:  186, btch:  31 usd:  58
11:54:58:active_anon:0 inactive_anon:10 isolated_anon:0
11:54:58: active_file:204249 inactive_file:204267 isolated_file:160
11:54:58: unevictable:0 dirty:4 writeback:20428 unstable:0
11:54:58: free:13241 slab_reclaimable:3434 slab_unreclaimable:42128
11:54:58: mapped:362 shmem:0 pagetables:1197 bounce:0
11:54:58:Node 0 DMA free:8340kB min:332kB low:412kB high:496kB active_anon:0kB inactive_anon:0kB active_file:3452kB inactive_file:3516kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15348kB mlocked:0kB dirty:16kB writeback:460kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:420kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:10400 all_unreclaimable? yes
11:54:59:lowmem_reserve[]: 0 2004 2004 2004
11:54:59:Node 0 DMA32 free:44624kB min:44720kB low:55900kB high:67080kB active_anon:0kB inactive_anon:40kB active_file:813540kB inactive_file:813552kB unevictable:0kB isolated(anon):0kB isolated(file):640kB present:2052308kB mlocked:0kB dirty:0kB writeback:81252kB mapped:1448kB shmem:0kB slab_reclaimable:13720kB slab_unreclaimable:168092kB kernel_stack:1472kB pagetables:4788kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:2565668 all_unreclaimable? no
11:54:59:lowmem_reserve[]: 0 0 0 0
11:54:59:Node 0 DMA: 25*4kB 42*8kB 36*16kB 21*32kB 10*64kB 3*128kB 2*256kB 2*512kB 2*1024kB 1*2048kB 0*4096kB = 8340kB
11:54:59:Node 0 DMA32: 1090*4kB 481*8kB 238*16kB 123*32kB 70*64kB 37*128kB 18*256kB 7*512kB 7*1024kB 0*2048kB 1*4096kB = 44624kB
11:54:59:222323 total pagecache pages
11:54:59:1 pages in swap cache
11:54:59:Swap cache stats: add 6418, delete 6417, find 76/96
11:55:00:Free swap  = 2700876kB
11:55:00:Total swap = 2725884kB
11:55:00:524284 pages RAM
11:55:00:43706 pages reserved
11:55:00:441661 pages shared
11:55:00:238021 pages non-shared
11:55:01:[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name
11:55:02:[  423]     0   423     2728       57   1     -17         -1000 udevd
11:55:02:[ 1074]     0  1074     2280       56   0       0             0 dhclient
11:55:02:[ 1127]     0  1127     6905       84   1     -17         -1000 auditd
11:55:02:[ 1157]     0  1157    63854       92   0       0             0 rsyslogd
11:55:02:[ 1187]     0  1187     4560       75   1       0             0 irqbalance
11:55:02:[ 1203]    32  1203     4744       75   1       0             0 rpcbind
11:55:02:[ 1217]     0  1217    52783      165   1       0             0 sssd
11:55:02:[ 1218]     0  1218    70941      156   1       0             0 sssd_be
11:55:02:[ 1219]     0  1219    53426      142   1       0             0 sssd_nss
11:55:02:[ 1220]     0  1220    50500      141   1       0             0 sssd_pam
11:55:02:[ 1221]     0  1221    49987      143   1       0             0 sssd_ssh
11:55:02:[ 1222]     0  1222    55080      141   1       0             0 sssd_pac
11:55:02:[ 1242]    29  1242     6357       90   1       0             0 rpc.statd
11:55:02:[ 1359]    81  1359     5878       48   1       0             0 dbus-daemon
11:55:03:[ 1376]     0  1376    47233       90   0       0             0 cupsd
11:55:03:[ 1414]     0  1414     1020       69   1       0             0 acpid
11:55:03:[ 1424]    68  1424    10482      180   1       0             0 hald
11:55:04:[ 1425]     0  1425     5099      126   0       0             0 hald-runner
11:55:04:[ 1457]     0  1457     5629      117   1       0             0 hald-addon-inpu
11:55:04:[ 1464]    68  1464     4501      117   1       0             0 hald-addon-acpi
11:55:04:[ 1487]     0  1487   169287      131   1       0             0 automount
11:55:04:[ 1535]     0  1535    26827       16   0       0             0 rpc.rquotad
11:55:04:[ 1540]     0  1540     5417       54   0       0             0 rpc.mountd
11:55:05:[ 1581]     0  1581     5773       43   1       0             0 rpc.idmapd
11:55:06:[ 1614]   496  1614    56785      101   0       0             0 munged
11:55:06:[ 1632]     0  1632    16553       69   0     -17         -1000 sshd
11:55:07:[ 1641]     0  1641     5429       77   0       0             0 xinetd
11:55:07:[ 1727]     0  1727    20734      107   1       0             0 master
11:55:07:[ 1750]     0  1750    29215       80   1       0             0 crond
11:55:07:[ 1760]    89  1760    20797       90   1       0             0 qmgr
11:55:08:[ 1765]     0  1765     5276       42   0       0             0 atd
11:55:08:[ 1793]     0  1793    16058      103   1       0             0 certmonger
11:55:08:[ 1832]     0  1832     1020       63   1       0             0 agetty
11:55:08:[ 1833]     0  1833     1016       54   1       0             0 mingetty
11:55:08:[ 1835]     0  1835     1016       54   1       0             0 mingetty
11:55:08:[ 1837]     0  1837     1016       54   1       0             0 mingetty
11:55:08:[ 1839]     0  1839     2727       49   1     -17         -1000 udevd
11:55:08:[ 1840]     0  1840     2727       45   0     -17         -1000 udevd
11:55:09:[ 1841]     0  1841     1016       54   1       0             0 mingetty
11:55:09:[ 1843]     0  1843     1016       54   1       0             0 mingetty
11:55:09:[ 1845]     0  1845     1016       54   1       0             0 mingetty
11:55:09:[ 2286]    89  2286    20754      102   1       0             0 pickup
11:55:09:[ 2632]    38  2632     8205      136   0       0             0 ntpd
11:55:09:[ 4852]     0  4852    15806       94   0       0             0 in.mrshd
11:55:09:[ 4853]     0  4853    26515       75   0       0             0 bash
11:55:09:[ 4889]     0  4889    26515       28   1       0             0 bash
11:55:10:[ 4890]     0  4890    26839       72   1       0             0 run_dd.sh
11:55:10:[ 9303]     0  9303    26295       56   1       0             0 dd
11:55:10:Kernel panic - not syncing: Out of memory: system-wide panic_on_oom is enabled
11:55:10:
11:55:10:Pid: 1217, comm: sssd Not tainted 2.6.32-504.30.3.el6.x86_64 #1
11:55:10:Call Trace:
11:55:10: [&amp;lt;ffffffff815293fc&amp;gt;] ? panic+0xa7/0x16f
11:55:11: [&amp;lt;ffffffff81127601&amp;gt;] ? dump_header+0x191/0x1b0
11:55:11: [&amp;lt;ffffffff8112769c&amp;gt;] ? check_panic_on_oom+0x7c/0x80
11:55:11: [&amp;lt;ffffffff81127d5b&amp;gt;] ? out_of_memory+0x1bb/0x3c0
11:55:11: [&amp;lt;ffffffff811346ff&amp;gt;] ? __alloc_pages_nodemask+0x89f/0x8d0
11:55:12: [&amp;lt;ffffffff8116cc8a&amp;gt;] ? alloc_pages_vma+0x9a/0x150
11:55:12: [&amp;lt;ffffffff811606f2&amp;gt;] ? read_swap_cache_async+0xf2/0x160
11:55:12: [&amp;lt;ffffffff81161219&amp;gt;] ? valid_swaphandles+0x69/0x150
11:55:12: [&amp;lt;ffffffff811607e7&amp;gt;] ? swapin_readahead+0x87/0xc0
11:55:13: [&amp;lt;ffffffff8114f95d&amp;gt;] ? handle_pte_fault+0x6dd/0xb00
11:55:13: [&amp;lt;ffffffff81529afe&amp;gt;] ? thread_return+0x4e/0x7d0
11:55:13: [&amp;lt;ffffffff810a3c05&amp;gt;] ? __hrtimer_start_range_ns+0x1a5/0x460
11:55:13: [&amp;lt;ffffffff810a3291&amp;gt;] ? lock_hrtimer_base+0x31/0x60
11:55:13: [&amp;lt;ffffffff81150019&amp;gt;] ? handle_mm_fault+0x299/0x3d0
11:55:13: [&amp;lt;ffffffff8104d096&amp;gt;] ? __do_page_fault+0x146/0x500
11:55:14: [&amp;lt;ffffffff811d7cd4&amp;gt;] ? ep_poll+0x314/0x350
11:55:14: [&amp;lt;ffffffff81064c00&amp;gt;] ? default_wake_function+0x0/0x20
11:55:14: [&amp;lt;ffffffff8153010e&amp;gt;] ? do_page_fault+0x3e/0xa0
11:55:14: [&amp;lt;ffffffff8152d4b5&amp;gt;] ? page_fault+0x25/0x30
11:55:14:Initializing cgroup subsys cpuset
11:55:14:Initializing cgroup subsys cpu
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="127364" author="sarah" created="Tue, 15 Sep 2015 17:31:45 +0000"  >&lt;p&gt;more instance on master, build# 3175:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/97a392b8-53e9-11e5-bfaa-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/97a392b8-53e9-11e5-bfaa-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="26281">LU-5574</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="27683">LU-5944</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="15971">LU-2139</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="28511">LU-6200</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwtpb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15301</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>