<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:51:10 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5400] inode structure corruption leading to OSS crash</title>
                <link>https://jira.whamcloud.com/browse/LU-5400</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;One of our customers had a kernel NULL pointer dereference in __iget.&lt;br/&gt;
The backtrace is as follows:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 29825  TASK: ffff88044c49e7b0  CPU: 3   COMMAND: &quot;ll_ost_583&quot;
[...]
    [exception RIP: __iget+45]
    RIP: ffffffff81180cfd  RSP: ffff88044c517ac0  RFLAGS: 00010246
    RAX: ffff880040aa5550  RBX: ffff880040aa5540  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffff88040c7bd3a9  RDI: ffff880040aa5540
    RBP: ffff88044c517ac0   R8: 00000000fffffff3   R9: 00000000fffffff6
    R10: 0000000000000008  R11: 0000000000000096  R12: ffff8800b59f0a80
    R13: ffff88040c7bd300  R14: ffff8804243b22f8  R15: 000000000000000b
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff88044c517ac8] igrab at ffffffff81180fd8
#10 [ffff88044c517ae8] filter_lvbo_init at ffffffffa0bdc795 [obdfilter]
#11 [ffff88044c517b18] ldlm_resource_get at ffffffffa07c33a4 [ptlrpc]
#12 [ffff88044c517b88] ldlm_lock_create at ffffffffa07bcb85 [ptlrpc]
#13 [ffff88044c517bd8] ldlm_handle_enqueue0 at ffffffffa07e40a4 [ptlrpc]
#14 [ffff88044c517c48] ldlm_handle_enqueue at ffffffffa07e4ef6 [ptlrpc]
#15 [ffff88044c517c88] ost_handle at ffffffffa0964e83 [ost]
#16 [ffff88044c517da8] ptlrpc_main at ffffffffa08134e6 [ptlrpc]
#17 [ffff88044c517f48] kernel_thread at ffffffff8100412a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The crash occurs here:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; dis __iget
0xffffffff81180cd0 &amp;lt;__iget&amp;gt;:    push   %rbp
0xffffffff81180cd1 &amp;lt;__iget+1&amp;gt;:  mov    %rsp,%rbp
0xffffffff81180cd4 &amp;lt;__iget+4&amp;gt;:  nopl   0x0(%rax,%rax,1)
0xffffffff81180cd9 &amp;lt;__iget+9&amp;gt;:  mov    0x48(%rdi),%eax
0xffffffff81180cdc &amp;lt;__iget+12&amp;gt;: test   %eax,%eax
0xffffffff81180cde &amp;lt;__iget+14&amp;gt;: jne    0xffffffff81180d30 &amp;lt;__iget+96&amp;gt;
0xffffffff81180ce0 &amp;lt;__iget+16&amp;gt;: lock incl 0x48(%rdi)
0xffffffff81180ce4 &amp;lt;__iget+20&amp;gt;: testq  $0x107,0x218(%rdi)
0xffffffff81180cef &amp;lt;__iget+31&amp;gt;: jne    0xffffffff81180d22 &amp;lt;__iget+82&amp;gt;
0xffffffff81180cf1 &amp;lt;__iget+33&amp;gt;: mov    0x18(%rdi),%rdx
0xffffffff81180cf5 &amp;lt;__iget+37&amp;gt;: mov    0x10(%rdi),%rcx
0xffffffff81180cf9 &amp;lt;__iget+41&amp;gt;: lea    0x10(%rdi),%rax
0xffffffff81180cfd &amp;lt;__iget+45&amp;gt;: mov    %rdx,0x8(%rcx)    &amp;lt;=== HERE
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;which corresponds to :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;	if (!(inode-&amp;gt;i_state &amp;amp; (I_DIRTY|I_SYNC)))
		list_move(&amp;amp;inode-&amp;gt;i_list, &amp;amp;inode_in_use);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The %rcx is supposed to hold &amp;amp;inode-&amp;gt;i_list, but is NULL.&lt;br/&gt;
Looking at the inode structure, all first fields contain zeros:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;struct inode {
  i_hash = {
    next = 0x0, 
    pprev = 0x0
  }, 
  i_list = {
    next = 0x0, 
    prev = 0x0
  }, 
  i_sb_list = {
    next = 0x0, 
    prev = 0x0
  }, 
  i_dentry = {
    next = 0x0, 
    prev = 0x0
  }, 
  i_ino = 0, 
  i_count = {
    counter = 1
  }, 
  i_nlink = 0, 
....
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looking at the dentry structure from which the inode address comes, it looks to be ok:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; struct dentry ffff88040c7bd300
struct dentry {
  d_count = {
    counter = 1
  }, 
  d_flags = 8, 
  d_lock = {
    raw_lock = {
      slock = 2555943
    }
  }, 
  d_mounted = -559087616, 
  d_inode = 0xffff880040aa5540, 
  d_hash = {
    next = 0xffff88039a004f18, 
    pprev = 0xffff8803b4f8c558
  }, 
  d_parent = 0xffff8804235ea9c0, 
  d_name = {
    hash = 72921089, 
    len = 9, 
    name = 0xffff88040c7bd3a0 &quot;120408088&quot;
  }, 
  d_lru = {
    next = 0xffff88040c7bd400, 
    prev = 0xffff88040c7bd280
  }, 
  d_u = {
    d_child = {
      next = 0xffff88040c31dc10, 
      prev = 0xffff88054d37b950
    }, 
    d_rcu = {
      next = 0xffff88040c31dc10, 
      func = 0xffff88054d37b950
    }
  }, 
  d_subdirs = {
    next = 0xffff88040c7bd360, 
    prev = 0xffff88040c7bd360
  }, 
  d_alias = {
    next = 0xffff880040aa5570, 
    prev = 0xffff880040aa5570
  }, 
  d_time = 0, 
  d_op = 0x0, 
  d_sb = 0xffff880bc74dd400, 
  d_fsdata = 0x0, 
  d_iname = &quot;120408088\000\000\000\000\000\000\000\000\000\b\000\000\000\000\000\000\000\000\000\000\000\000&quot;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and is consistent with its parent directory:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; struct dentry.d_name ffff8804235ea9c0
  d_name = {
    hash = 2243934, 
    len = 3, 
    name = 0xffff8804235eaa60 &quot;d24&quot;
  }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Can you find out how this corruption happened?&lt;/p&gt;</description>
                <environment>Bull environment</environment>
        <key id="25714">LU-5400</key>
            <summary>inode structure corruption leading to OSS crash</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="spiechurski">Sebastien Piechurski</reporter>
                        <labels>
                    </labels>
                <created>Wed, 23 Jul 2014 22:55:06 +0000</created>
                <updated>Wed, 7 Jun 2017 12:00:52 +0000</updated>
                            <resolved>Wed, 7 Jun 2017 12:00:52 +0000</resolved>
                                    <version>Lustre 2.1.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="89915" author="bfaccini" created="Thu, 24 Jul 2014 00:06:40 +0000"  >&lt;p&gt;Hello Seb,&lt;br/&gt;
I think you meant that &quot;%rcx is supposed to hold inode-&amp;gt;i_list.next&quot;, right ?&lt;/p&gt;

&lt;p&gt;You say the beginning of the inode at address 0xffff880040aa5540 has been zeroed, but what about the later fields, do they look ok? I already see that i_count&apos;s value is 1 !&lt;/p&gt;

&lt;p&gt;Also, can you check memory content just before this inode structure and to which slab kmem_cache it belongs to?&lt;/p&gt;</comment>
                            <comment id="89933" author="spiechurski" created="Thu, 24 Jul 2014 08:42:04 +0000"  >&lt;p&gt;Hi Bruno &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;I meant %rcx == inode-&amp;gt;i_list, not &amp;amp;inode-&amp;gt;i_list, with the crash occurring when trying to access i_list.prev in the inlined call to list_move.&lt;/p&gt;

&lt;p&gt;I looked at the 400 bytes preceding the inode address, and everything is set to zeros.&lt;br/&gt;
The i_count value was incremented just a few instructions before the crash, which is why it is the only value != 0.&lt;br/&gt;
However, not all fields in the inode struct are zeros, only the beginning, but all others are not consistent. &lt;br/&gt;
I attach the complete inode struct dump as well as the hex dump.&lt;br/&gt;
You will notice in the hex dump, that there seems to be an incrementing pattern every 32 bytes like:&lt;/p&gt;

&lt;p&gt;ffff880040aa6220:  00 &lt;b&gt;300011&lt;/b&gt; 00300001 00000be300300221 &lt;br/&gt;
ffff880040aa6230:  0000000000040252 65a7000000000000 &lt;br/&gt;
ffff880040aa6240:  00 &lt;b&gt;300012&lt;/b&gt; 00300002 0000092b00300422 &lt;br/&gt;
ffff880040aa6250:  00000000000401ec 921e000000000000 &lt;br/&gt;
ffff880040aa6260:  00 &lt;b&gt;300013&lt;/b&gt; 00300003 00000f1800300623 &lt;/p&gt;

&lt;p&gt;Finally, it looks like we have a ldiskfs_inode slab corruption:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; kmem -s ffff880040aa5540
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa72c0  bad next pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa72c0  bad prev pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa72c0  bad inuse counter: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa72c0  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: full list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: full list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: free list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: free list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: partial list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: full list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: full list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: free list: slab: ffff880040aa5000  bad next pointer: 0
kmem: ldiskfs_inode_cache: free list: slab: ffff880040aa5000  bad s_mem pointer: 0
kmem: ldiskfs_inode_cache: address not found in cache: ffff880040aa5540
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;To be linked to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5284&quot; title=&quot;GPF in radix_tree_lookup_slot on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5284&quot;&gt;&lt;del&gt;LU-5284&lt;/del&gt;&lt;/a&gt; ? The crash comes from the same customer ...&lt;/p&gt;</comment>
                            <comment id="89936" author="bfaccini" created="Thu, 24 Jul 2014 09:20:41 +0000"  >&lt;p&gt;Humm, I am late on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5284&quot; title=&quot;GPF in radix_tree_lookup_slot on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5284&quot;&gt;&lt;del&gt;LU-5284&lt;/del&gt;&lt;/a&gt; crash-dump analysis, but this can be suspected sure!&lt;br/&gt;
Let me check further and I will get back with some more ideas ...&lt;/p&gt;</comment>
                            <comment id="90038" author="bfaccini" created="Fri, 25 Jul 2014 13:49:18 +0000"  >&lt;p&gt;After some work on the crash-dump for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5284&quot; title=&quot;GPF in radix_tree_lookup_slot on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5284&quot;&gt;&lt;del&gt;LU-5284&lt;/del&gt;&lt;/a&gt;, I can already confirm that the corruption looks really similar (series of 4 quad-words with similarities/increments).&lt;/p&gt;</comment>
                            <comment id="90065" author="bfaccini" created="Fri, 25 Jul 2014 16:40:15 +0000"  >&lt;p&gt;Humm, and the corruption address range is very close too !! Could this be the same node that failed both times?&lt;/p&gt;</comment>
                            <comment id="93443" author="spiechurski" created="Mon, 8 Sep 2014 16:51:15 +0000"  >&lt;p&gt;No the occurrence here and in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5284&quot; title=&quot;GPF in radix_tree_lookup_slot on OSS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5284&quot;&gt;&lt;del&gt;LU-5284&lt;/del&gt;&lt;/a&gt; are not on the same node, even though on the same site.&lt;/p&gt;</comment>
                            <comment id="94133" author="bfaccini" created="Tue, 16 Sep 2014 09:15:40 +0000"  >&lt;p&gt;Humm, this is very strange. I know it is not an easy question to answer, but is there something specific (HW/SW configs, work-load, ...) about these 2 nodes vs the others?&lt;/p&gt;

&lt;p&gt;Did you encounter new crashes ?&lt;/p&gt;</comment>
                            <comment id="198414" author="spiechurski" created="Wed, 7 Jun 2017 08:21:04 +0000"  >&lt;p&gt;Same thing. Old ticket and problem disappeared ...&lt;/p&gt;

&lt;p&gt;Please close.&lt;/p&gt;</comment>
                            <comment id="198428" author="pjones" created="Wed, 7 Jun 2017 12:00:52 +0000"  >&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Wed, 17 Sep 2014 22:55:06 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzws4v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15033</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Wed, 23 Jul 2014 22:55:06 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>