<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:33:59 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
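For this issue, the filtered document would be fetched from a URL of the form below
(the path is assumed from the standard JIRA issue-xml view layout, not taken from this feed):
  https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-10319/LU-10319.xml?field=key&field=summary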
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10319] recovery-random-scale, test_fail_client_mds: test_fail_client_mds returned 4</title>
                <link>https://jira.whamcloud.com/browse/LU-10319</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This impacts the SLES client that runs the dd load during failover recovery tests.  &lt;/p&gt;

&lt;p&gt;Note: SLES out-of-memory was first seen with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9601&quot; title=&quot;recovery-mds-scale test_failover_mds: test_failover_mds returned 1&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9601&quot;&gt;LU-9601&lt;/a&gt;.&lt;/p&gt;



&lt;p&gt;recovery-mds-scale: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c95ce2ce-d41a-11e7-9840-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c95ce2ce-d41a-11e7-9840-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Note: LBUG/LASSERT (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10221&quot; title=&quot;recovery-mds-scale test_failover_mds: onyx-40vm1:LBUG/LASSERT detected&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10221&quot;&gt;LU-10221&lt;/a&gt;) was also seen during the first recovery test run in the failover group (recovery-mds-scale).&lt;/p&gt;

&lt;p&gt;From the client console (vm3):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 2075.737415] jbd2/vda1-8 invoked oom-killer: gfp_mask=0x1420848(GFP_NOFS|__GFP_NOFAIL|__GFP_HARDWALL|__GFP_MOVABLE), nodemask=0, order=0, oom_score_adj=0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;followed by a core dump.&lt;/p&gt;



&lt;p&gt;recovery-random-scale: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c9603c80-d41a-11e7-9840-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c9603c80-d41a-11e7-9840-52540065bddc&lt;/a&gt;&lt;br/&gt;
recovery-double-scale: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c963786e-d41a-11e7-9840-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c963786e-d41a-11e7-9840-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The next two recovery tests run in the failover group (recovery-random-scale, recovery-double-scale) have page allocation failures:&lt;/p&gt;

&lt;p&gt;From the client console (vm3):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  960.559009] swapper/0: page allocation failure: order:0, mode:0x1080020(GFP_ATOMIC)
[  960.559012] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G           OE   N  4.4.92-6.18-default #1
[  960.559013] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  960.559016]  0000000000000000 ffffffff813211b0 0000000000000000 ffff88007fc03d00
[  960.559018]  ffffffff81196022 0108002000000030 0000000000000000 0000000000000400
[  960.559019]  ffff88007fc15f00 ffff88007fc03d28 ffff88007fc15fb8 ffff88007fc15f00
[  960.559019] Call Trace:
[  960.559056]  [&amp;lt;ffffffff81019b19&amp;gt;] dump_trace+0x59/0x310
[  960.559059]  [&amp;lt;ffffffff81019eba&amp;gt;] show_stack_log_lvl+0xea/0x170
[  960.559064]  [&amp;lt;ffffffff8101ac41&amp;gt;] show_stack+0x21/0x40
[  960.559075]  [&amp;lt;ffffffff813211b0&amp;gt;] dump_stack+0x5c/0x7c
[  960.559087]  [&amp;lt;ffffffff81196022&amp;gt;] warn_alloc_failed+0xe2/0x150
[  960.559091]  [&amp;lt;ffffffff81196497&amp;gt;] __alloc_pages_nodemask+0x407/0xb80
[  960.559093]  [&amp;lt;ffffffff81196d4a&amp;gt;] __alloc_page_frag+0x10a/0x120
[  960.559104]  [&amp;lt;ffffffff81502e82&amp;gt;] __napi_alloc_skb+0x82/0xd0
[  960.559110]  [&amp;lt;ffffffffa02b6334&amp;gt;] cp_rx_poll+0x1b4/0x540 [8139cp]
[  960.559122]  [&amp;lt;ffffffff81511ae7&amp;gt;] net_rx_action+0x157/0x360
[  960.559133]  [&amp;lt;ffffffff810826d2&amp;gt;] __do_softirq+0xe2/0x2e0
[  960.559136]  [&amp;lt;ffffffff81082b8a&amp;gt;] irq_exit+0xfa/0x110
[  960.559149]  [&amp;lt;ffffffff8160ce71&amp;gt;] do_IRQ+0x51/0xd0
[  960.559152]  [&amp;lt;ffffffff8160ad0c&amp;gt;] common_interrupt+0x8c/0x8c
[  960.560535] DWARF2 unwinder stuck at ret_from_intr+0x0/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  and &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  138.384058] Leftover inexact backtrace:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  (many instances of this follow the page allocation traces)&lt;/p&gt;

&lt;p&gt;followed by core dumps.&lt;/p&gt;</description>
                <environment>onyx, failover&lt;br/&gt;
servers: sles12sp3, ldiskfs, branch b2_10, v2.10.2.RC1, b50&lt;br/&gt;
clients: sles12sp3, branch b2_10, v2.10.2.RC1, b50&lt;br/&gt;
</environment>
        <key id="49535">LU-10319</key>
            <summary>recovery-random-scale, test_fail_client_mds: test_fail_client_mds returned 4</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="jcasper">James Casper</reporter>
                        <labels>
                    </labels>
                <created>Mon, 4 Dec 2017 15:59:59 +0000</created>
                <updated>Fri, 12 Aug 2022 21:51:47 +0000</updated>
                            <resolved>Fri, 12 Aug 2022 21:51:47 +0000</resolved>
                                    <version>Lustre 2.11.0</version>
                    <version>Lustre 2.10.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="215256" author="pjones" created="Mon, 4 Dec 2017 18:53:20 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Is this a distinct issue from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10221&quot; title=&quot;recovery-mds-scale test_failover_mds: onyx-40vm1:LBUG/LASSERT detected&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10221&quot;&gt;LU-10221&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="215294" author="hongchao.zhang" created="Tue, 5 Dec 2017 08:22:32 +0000"  >&lt;p&gt;this could be a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10221&quot; title=&quot;recovery-mds-scale test_failover_mds: onyx-40vm1:LBUG/LASSERT detected&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10221&quot;&gt;LU-10221&lt;/a&gt;, the symptom is similar.&lt;/p&gt;</comment>
                            <comment id="215496" author="casperjx" created="Wed, 6 Dec 2017 21:48:44 +0000"  >&lt;p&gt;Just looked at a system running recovery-mds-scale:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;trevis-37vm3:/mnt/lustre/d0.dd-trevis-37vm3 # ls -al
total 2756636
drwxr-xr-x 2 root root       4096 Dec  6 13:41 .
drwxr-xr-x 5 root root       4096 Dec  6 13:42 ..
-rw-r--r-- 1 root root 3032481792 Dec  6 13:42 dd-file
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So the dd client is working with a single large file.  But memory may be freed after each 4K transfer.&lt;/p&gt;</comment>
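<!--
A plausible reconstruction of the dd load described in the comment above; the block
size, count, and path are assumptions inferred from the directory listing (a single
~3 GB dd-file written in 4K transfers), not taken from the test source:

  # write one large file in 4K chunks; count matches the observed 3032481792 bytes
  dd if=/dev/zero of=/mnt/lustre/d0.dd-trevis-37vm3/dd-file bs=4k count=740352
-->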
                            <comment id="215497" author="casperjx" created="Wed, 6 Dec 2017 21:57:39 +0000"  >&lt;p&gt;I confirmed with top that dd is using less than 1% of memory (but lots of CPU cycles).&lt;/p&gt;</comment>
                            <comment id="215812" author="casperjx" created="Fri, 8 Dec 2017 18:04:19 +0000"  >&lt;p&gt;Per Oleg: &lt;span class=&quot;error&quot;&gt;&amp;#91;Our SUSE contact&amp;#93;&lt;/span&gt; spoke with an mm guy who suggested to &lt;br/&gt;
tweak some proc parameters, namely vm.min_free_kbytes.&lt;/p&gt;</comment>
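<!--
A minimal sketch of the tuning suggested above, assuming the standard sysctl interface
on the SLES12 client; the value shown is illustrative, not a recommendation from this ticket:

  # raise the reserve the kernel keeps free for atomic allocations
  sysctl -w vm.min_free_kbytes=65536
  # equivalent via procfs
  echo 65536 > /proc/sys/vm/min_free_kbytes
-->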
                            <comment id="216191" author="green" created="Wed, 13 Dec 2017 19:55:19 +0000"  >&lt;p&gt;for vmcore to be useful we also need a pointer at the kernel-debuginfo rpm and also lustre build pointer to get files with symbols and be able to load the file in the crash tool&lt;/p&gt;</comment>
                            <comment id="216212" author="adilger" created="Wed, 13 Dec 2017 22:14:39 +0000"  >&lt;p&gt;I think we need to get some information about what is consuming the memory here.  Either from the crash dump, or by running &quot;slabtop&quot; and &quot;watch cat /proc/meminfo&quot; to see where all the memory is going.  I suspect something is wrong with CLIO memory management if it can&apos;t handle a write to a single large file (e.g. is a single DLM lock for the whole file pinning all of the pages, and they won&apos;t be freed until the lock is cancelled?).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="49219">LU-10221</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="55159">LU-12067</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="46523">LU-9601</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28834" name="vmcore_onyx-44vm3_recovery-double-scale" size="76805966" author="jcasper" created="Mon, 4 Dec 2017 18:19:30 +0000"/>
                            <attachment id="28836" name="vmcore_onyx-44vm3_recovery-mds-scale" size="78822513" author="jcasper" created="Mon, 4 Dec 2017 18:19:32 +0000"/>
                            <attachment id="28835" name="vmcore_onyx-44vm3_recovery-random-scale" size="76562034" author="jcasper" created="Mon, 4 Dec 2017 18:19:31 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzon3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>