<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:19:56 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8715] Regression from LU-8057 causes loading of fld.ko hung in 2.7.2</title>
                <link>https://jira.whamcloud.com/browse/LU-8715</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Since our nas-2.7.2-2nas was rebased to b2_7_fe (becoming nas-2.7.2-3nas), we found that loading the lustre module fld.ko hung. Modprobe took 100% cpu time and could not be killed.&lt;/p&gt;

&lt;p&gt;I identified the culprit of the problem using git bisect:&lt;br/&gt;
commit f23e22da88f07e95071ec76807aaa42ecd39e8ca&lt;br/&gt;
Author: Amitoj Kaur Chawla &amp;lt;amitoj1606@gmail.com&amp;gt;&lt;br/&gt;
Date:   Thu Jun 16 23:12:03 2016 +0800&lt;/p&gt;

&lt;p&gt;    &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; ko2iblnd: Replace sg++ with sg = sg_next(sg)&lt;/p&gt;

&lt;p&gt;It was a b2_7_fe back port from the following one:&lt;br/&gt;
    Lustre-commit: d226464acaacccd240da43dcc22372fbf8cb04a6&lt;br/&gt;
    Lustre-change: &lt;a href=&quot;http://review.whamcloud.com/19342&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19342&lt;/a&gt;&lt;/p&gt;</description>
                <environment>lustre server nas-2.7.2-3nasS running in centos 6.7.</environment>
        <key id="40752">LU-8715</key>
            <summary>Regression from LU-8057 causes loading of fld.ko hung in 2.7.2</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="jaylan">Jay Lan</reporter>
                        <labels>
                    </labels>
                <created>Tue, 18 Oct 2016 01:34:35 +0000</created>
                <updated>Wed, 18 Apr 2018 18:07:37 +0000</updated>
                            <resolved>Wed, 18 Apr 2018 18:07:37 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="170057" author="bfaccini" created="Tue, 18 Oct 2016 06:56:46 +0000"  >&lt;p&gt;Well, both the failure and suspected cause look surprising.&lt;br/&gt;
Do you mean that the fld.ko module load simply hangs on a fresh system when running &quot;modprobe lustre&quot;?&lt;/p&gt;</comment>
                            <comment id="170123" author="mhanafi" created="Tue, 18 Oct 2016 16:33:25 +0000"  >&lt;p&gt;Module load time before  was about 2-5mins, because we have large ntx values.&lt;br/&gt;
(options ko2iblnd ntx=125536 credits=62768 fmr_pool_size=31385)&lt;br/&gt;
But after the patch it takes &amp;gt;20mins&lt;/p&gt;
</comment>
                            <comment id="170125" author="simmonsja" created="Tue, 18 Oct 2016 16:38:51 +0000"  >&lt;p&gt;The fix is correct and it fixes a real bug. What this change did was expose another problem in the ko2iblnd driver. I have to ask: is your system really consuming all those credits? I don&apos;t think the IB driver queue pair depth is big enough to handle all those credits.&lt;/p&gt;</comment>
                            <comment id="170195" author="jgmitter" created="Tue, 18 Oct 2016 17:14:31 +0000"  >&lt;p&gt;Hi Doug,&lt;/p&gt;

&lt;p&gt;Can you please have a look into the issue since it relates to the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; change?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="170203" author="jaylan" created="Tue, 18 Oct 2016 18:23:43 +0000"  >&lt;p&gt;@Bruno Faccini: Yes, I can reproduce the problem on our freshly rebooted lustre servers by doing &apos;modprobe fld.&apos; &lt;/p&gt;</comment>
                            <comment id="170205" author="mhanafi" created="Tue, 18 Oct 2016 18:27:07 +0000"  >&lt;p&gt;we have &amp;gt;12,000 clients. We do see some servers consume all the credits.&lt;/p&gt;
</comment>
                            <comment id="170219" author="mhanafi" created="Tue, 18 Oct 2016 19:17:43 +0000"  >&lt;p&gt;perf top showed during module load all the time is spent in __vmalloc_node.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Samples: 748K of event &lt;span class=&quot;code-quote&quot;&gt;&apos;cycles&apos;&lt;/span&gt;, Event count (approx.): 53812402443
Overhead  Shared &lt;span class=&quot;code-object&quot;&gt;Object&lt;/span&gt;            Symbol
  96.21%  [kernel]                 [k] __vmalloc_node
   0.91%  [kernel]                 [k] read_hpet
   0.28%  [kernel]                 [k] get_vmalloc_info
   0.26%  [kernel]                 [k] __write_lock_failed
   0.25%  [kernel]                 [k] __read_lock_failed
   0.05%  [kernel]                 [k] apic_timer_interrupt
   0.05%  [kernel]                 [k] _spin_lock
   0.04%  perf                     [.] dso__find_symbol
   0.03%  [kernel]                 [k] find_busiest_group
   0.03%  [kernel]                 [k] clear_page_c
   0.03%  [kernel]                 [k] page_fault
   0.03%  [kernel]                 [k] memset
   0.02%  [kernel]                 [k] rcu_process_gp_end
   0.02%  perf                     [.] perf_evsel__parse_sample
   0.02%  [kernel]                 [k] sha_transform
   0.02%  [kernel]                 [k] native_write_msr_safe
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="170223" author="simmonsja" created="Tue, 18 Oct 2016 19:31:23 +0000"  >&lt;p&gt;I know exactly what your problem is. We saw this problem in the lustre core some time ago and changed the OBD_ALLOC macros. The libcfs/LNet layer uses its own LIBCFS_ALLOC macros, which means when the allocations are more than 2 pages in size they hit the vmalloc spinlock serialization issue. We need a fix for libcfs much like lustre had.&lt;/p&gt;</comment>
                            <comment id="170244" author="doug" created="Tue, 18 Oct 2016 21:09:04 +0000"  >&lt;p&gt;James, can we do that fix under this ticket?&lt;/p&gt;</comment>
                            <comment id="170247" author="simmonsja" created="Tue, 18 Oct 2016 21:49:30 +0000"  >&lt;p&gt;Why not. The problem is the LIBCFS_ALLOC and FREE macros. Looking at the macros gave me a headache so no patch from me. I need to get into the right mental state to tackle it &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="170682" author="mhanafi" created="Sun, 23 Oct 2016 03:15:30 +0000"  >&lt;p&gt;Any updates?&lt;/p&gt;</comment>
                            <comment id="170798" author="doug" created="Mon, 24 Oct 2016 18:27:00 +0000"  >&lt;p&gt;As best we can figure out, the change in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; has caused a little more memory to be used per connection.  That is pushing your system over the edge.&lt;/p&gt;

&lt;p&gt;As James has indicated, a proper fix will be to change how to allocate memory in LNet.  That is going to take some time to get right as the potential to break all of LNet is pretty good.&lt;/p&gt;

&lt;p&gt;I don&apos;t believe the fix for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; is critical for your setup.  If this needs to be fixed quickly, then removing &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8085&quot; title=&quot;Inode Iteration should increase the cursor to skip unused inodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8085&quot;&gt;&lt;del&gt;LU-8085&lt;/del&gt;&lt;/a&gt; from your build will be the best approach.  As ORNL needs &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8085&quot; title=&quot;Inode Iteration should increase the cursor to skip unused inodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8085&quot;&gt;&lt;del&gt;LU-8085&lt;/del&gt;&lt;/a&gt;, we cannot remove it from master.&lt;/p&gt;</comment>
                            <comment id="171017" author="jaylan" created="Tue, 25 Oct 2016 19:04:55 +0000"  >&lt;p&gt;Doug,&lt;/p&gt;

&lt;p&gt;You wrote in your previous comment:&lt;br/&gt;
&quot; If this needs to be fixed quickly, then removing &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8085&quot; title=&quot;Inode Iteration should increase the cursor to skip unused inodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8085&quot;&gt;&lt;del&gt;LU-8085&lt;/del&gt;&lt;/a&gt; from your build will be the best approach. As ORNL needs &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8085&quot; title=&quot;Inode Iteration should increase the cursor to skip unused inodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8085&quot;&gt;&lt;del&gt;LU-8085&lt;/del&gt;&lt;/a&gt;, we cannot remove it from master.&quot;&lt;/p&gt;

&lt;p&gt;Did you actually mean to write &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt;&quot;? We do not have &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; in our git repo...&lt;/p&gt;</comment>
                            <comment id="171048" author="doug" created="Tue, 25 Oct 2016 22:21:34 +0000"  >&lt;p&gt;Yes, sorry about that.  &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8057&quot; title=&quot;o2iblnd driver is causing memory corruption due to improper handling of scatter list.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8057&quot;&gt;&lt;del&gt;LU-8057&lt;/del&gt;&lt;/a&gt; is what I was referring to.&lt;/p&gt;</comment>
                            <comment id="226260" author="simmonsja" created="Wed, 18 Apr 2018 17:20:23 +0000"  >&lt;p&gt;Is this still a problem?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="226269" author="jaylan" created="Wed, 18 Apr 2018 18:06:19 +0000"  >&lt;p&gt;This case can be closed. Thanks.&lt;/p&gt;</comment>
                            <comment id="226270" author="pjones" created="Wed, 18 Apr 2018 18:07:37 +0000"  >&lt;p&gt;ok thanks!&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="36320">LU-8057</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzys8n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>