<?xml version="1.0" encoding="UTF-8"?>
<!--
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:14:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8043] MDS running lustre 2.5.5+ OOM when running with Lustre 2.8 GA clients</title>
                <link>https://jira.whamcloud.com/browse/LU-8043</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Today we performed a test shot on our smaller Cray Aries cluster (700 nodes) with a non-patched lustre 2.8 GA client specially build for this system. The test were run against our atlas file system which is running a RHEL6.7 distro with the lustre version 2.5.5 with patches. During our test shot while running an IOR single shared file test across all nodes with the stripe count of 1008 the MDS server ran out of memory. I attached the dmesg output to this ticket.&lt;/p&gt;</description>
                <environment>Cray clients running unpatched lustre 2.8 GA clients. Server side running Lustre 2.5.5 with a patch set in a RHEL6.7 environment.</environment>
        <key id="36256">LU-8043</key>
            <summary>MDS running lustre 2.5.5+ OOM when running with Lustre 2.8 GA clients</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Tue, 19 Apr 2016 17:22:48 +0000</created>
                <updated>Sat, 3 Sep 2016 08:14:02 +0000</updated>
                            <resolved>Fri, 20 May 2016 01:15:04 +0000</resolved>
                                    <version>Lustre 2.5.5</version>
                                    <fixVersion>Lustre 2.5.5</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="149579" author="green" created="Wed, 20 Apr 2016 20:24:03 +0000"  >&lt;p&gt;From James:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;BTW I ran kmem -S on the vmcore.
CACHE            NAME                 OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE
ffff883f77c23c40 osp_obj                  200   35894144  35894895 1889205     4k
ffff883f811f3c00 mdd_obj                   80   12196492  14344704 298848     4k
ffff883f810f3bc0 lod_obj                  112   12196492  13903654 408931     4k
ffff883f940a3b80 mdt_cdt_agent_req        168          0         0      0     4k
ffff883f94653b40 mdt_cdt_restore_handle   112          0         0      0     4k
ffff883f94033b00 mdt_obj                  256   12196492  13096710 873114     4k
ffff883fa7b83ac0 dynlock_cache             72          0       265      5     4k
ffff883fa7ab3a80 upd_kmem                  96          0         0      0     4k
ffff883fa7aa3a40 lqe_kmem                 200         18        19      1     4k
ffff883fa7a23a00 ldiskfs_inode_cache     1040   24405903  24405930 8135310     4k
ffff883fa7a139c0 ldiskfs_xattr             88          0         0      0     4k
ffff883fa7a03980 ldiskfs_free_data         64          1        59      1     4k
ffff883fa79f3940 ldiskfs_alloc_context    136          0        84      3     4k
ffff883fa79e3900 ldiskfs_prealloc_space   112        121       238      7     4k
ffff883fa79d38c0 ldiskfs_system_zone       40          0         0      0     4k
ffff883fa78d3880 jbd2_journal_handle       48          0       385      5     4k
ffff883fa7863840 jbd2_journal_head        112       3985      8126    239     4k
ffff883fa7853800 jbd2_revoke_table         16          2       202      1     4k
ffff883fa7f737c0 jbd2_revoke_record        32          0         0      0     4k
ffff883fb05c3780 xattr_kmem                40          0         0      0     4k
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="149716" author="simmonsja" created="Thu, 21 Apr 2016 17:49:51 +0000"  >&lt;p&gt;The memory exhaustion problem exposed &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5367&quot; title=&quot;(mdt_handler.c:2782:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_pdo_lh) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5367&quot;&gt;&lt;del&gt;LU-5367&lt;/del&gt;&lt;/a&gt; which is also present in the b2_5_fe release&lt;/p&gt;</comment>
                            <comment id="149720" author="yujian" created="Thu, 21 Apr 2016 18:00:37 +0000"  >&lt;p&gt;Hi James,&lt;br/&gt;
The patch &lt;a href=&quot;http://review.whamcloud.com/11219&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11219&lt;/a&gt; for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5367&quot; title=&quot;(mdt_handler.c:2782:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_pdo_lh) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5367&quot;&gt;&lt;del&gt;LU-5367&lt;/del&gt;&lt;/a&gt; has already included in that release.&lt;/p&gt;</comment>
                            <comment id="149723" author="bzzz" created="Thu, 21 Apr 2016 18:07:53 +0000"  >&lt;p&gt;Can you please confirm that you did same 1008 stripes with the old clients and it was doing fine?&lt;/p&gt;</comment>
                            <comment id="149725" author="simmonsja" created="Thu, 21 Apr 2016 18:21:39 +0000"  >&lt;p&gt;Yes we tested 2.5 client on the same platform and the MDS did not OOM.&lt;/p&gt;</comment>
                            <comment id="149726" author="simmonsja" created="Thu, 21 Apr 2016 18:23:19 +0000"  >&lt;p&gt;Oh I do see it now. We still encountered that same issues even with the patch &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/sad.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; I posted the assert here:&lt;/p&gt;

&lt;p&gt;0&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570311&amp;#93;&lt;/span&gt; LustreError: 15316:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_reg_lh) ) failed: &lt;br/&gt;
&amp;lt;0&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570478&amp;#93;&lt;/span&gt; LustreError: 15659:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_reg_lh) ) failed: &lt;br/&gt;
&amp;lt;0&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570480&amp;#93;&lt;/span&gt; LustreError: 15659:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) LBUG&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570481&amp;#93;&lt;/span&gt; Pid: 15659, comm: mdt00_441&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570481&amp;#93;&lt;/span&gt; &lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570482&amp;#93;&lt;/span&gt; Call Trace:&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570495&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0420895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570499&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0420e97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570511&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d9e64b&amp;gt;&amp;#93;&lt;/span&gt; mdt_lock_handle_fini+0x4b/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570517&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0da4cc0&amp;gt;&amp;#93;&lt;/span&gt; mdt_thread_info_fini+0xe0/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570523&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0daa473&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x653/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570533&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0de98c5&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570563&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa076d07e&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xf8e/0x1af0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570581&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa076c0f0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1af0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570584&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810a0fce&amp;gt;&amp;#93;&lt;/span&gt; kthread+0x9e/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570587&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c28a&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570588&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810a0f30&amp;gt;&amp;#93;&lt;/span&gt; ? kthread+0x0/0xc0&lt;br/&gt;
&amp;lt;4&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;4805210.570589&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c280&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;</comment>
                            <comment id="149727" author="ezell" created="Thu, 21 Apr 2016 18:27:27 +0000"  >&lt;p&gt;Our assertion was for a regular lock, not a pdo lock.&lt;/p&gt;</comment>
                            <comment id="149731" author="ezell" created="Thu, 21 Apr 2016 18:59:57 +0000"  >&lt;p&gt;I dumped the debug log from the crash dump, but I didn&apos;t see anything interesting.  The most recent data is&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000004:00080000:2.0:1461078294.488964:0:15455:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002d12a5:0x47ab:0x0] Change mfd mode 00 -&amp;gt; 0302.
00000004:00080000:2.0:1461078294.488967:0:15455:0:(mdt_handler.c:2853:mdt_save_lock()) request = ffff8834683bd400 reply state = ffff881fa5957000 transno = 448115532568
00000004:00080000:2.0:1461078294.503955:0:15455:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002ce5a3:0x5f:0x0] Change mfd mode 00 -&amp;gt; 01.
00000004:00080000:2.0:1461078294.504528:0:15455:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002d12a5:0x47ac:0x0] Change mfd mode 00 -&amp;gt; 0302.
00000004:00080000:2.0:1461078294.504532:0:15455:0:(mdt_handler.c:2853:mdt_save_lock()) request = ffff88048acf6000 reply state = ffff8816e7196000 transno = 448115532572
00000004:00080000:3.0:1461078294.522700:0:15239:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002ccbf6:0x55:0x0] Change mfd mode 00 -&amp;gt; 01.
00000004:00080000:3.0:1461078294.523266:0:15239:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002d12a5:0x47ad:0x0] Change mfd mode 00 -&amp;gt; 0302.
00000004:00080000:3.0:1461078294.523269:0:15239:0:(mdt_handler.c:2853:mdt_save_lock()) request = ffff880339962400 reply state = ffff8817f1155000 transno = 448115532576
00000004:00080000:1.0:1461078294.538529:0:15601:0:(mdt_open.c:661:mdt_mfd_set_mode()) [0x2002d111a:0x1f4b0:0x0] Change mfd mode 00 -&amp;gt; 01.
00000004:00080000:2.0:1461078294.538598:0:15371:0:(mdt_handler.c:2853:mdt_save_lock()) request = ffff880873ac7000 reply state = ffff8834db7b5000 transno = 448115532578
00000004:00040000:0.0:1461078294.538823:0:15316:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_reg_lh) ) failed: 
00000004:00080000:2.0:1461078294.538953:0:15371:0:(mdt_handler.c:2853:mdt_save_lock()) request = ffff880873ac7000 reply state = ffff8834db7b5000 transno = 448115532578
00000004:00040000:2.0:1461078294.538990:0:15659:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) ASSERTION( !lustre_handle_is_used(&amp;amp;lh-&amp;gt;mlh_reg_lh) ) failed: 
00000004:00040000:2.0:1461078294.538993:0:15659:0:(mdt_handler.c:3224:mdt_lock_handle_fini()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="149736" author="gerrit" created="Thu, 21 Apr 2016 19:21:11 +0000"  >&lt;p&gt;Alex Zhuravlev (alexey.zhuravlev@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/19717&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/19717&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8043&quot; title=&quot;MDS running lustre 2.5.5+ OOM when running with Lustre 2.8 GA clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8043&quot;&gt;&lt;del&gt;LU-8043&lt;/del&gt;&lt;/a&gt; obdclass: use atomic allocations interrupt context only&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 07b0663db5dc84926e55cc89f0d97df9df24377f&lt;/p&gt;</comment>
                            <comment id="149737" author="bzzz" created="Thu, 21 Apr 2016 19:21:17 +0000"  >&lt;p&gt;in all the cases it&apos;s stats:&lt;/p&gt;

&lt;p&gt;&amp;lt;3&amp;gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;2806622.047940&amp;#93;&lt;/span&gt; LustreError: 14895:0:(lvfs_lib.c:157:lprocfs_stats_alloc_one()) LNET: out of memory at /data/buildsystem/jsimmons-atlas/rpmbuild/BUILD/lustre-2.5.5/lustre/lvfs/lvfs_lib.c:157 (tried to alloc &apos;(stats-&amp;gt;ls_percpu&lt;span class=&quot;error&quot;&gt;&amp;#91;cpuid&amp;#93;&lt;/span&gt;)&apos; = 4224)&lt;/p&gt;

&lt;p&gt;lprocfs_stats_alloc_one() is trying to allocate per-cpu slot:&lt;br/&gt;
	LIBCFS_ALLOC_ATOMIC(stats-&amp;gt;ls_percpu&lt;span class=&quot;error&quot;&gt;&amp;#91;cpuid&amp;#93;&lt;/span&gt;, percpusize);&lt;/p&gt;</comment>
                            <comment id="149738" author="bzzz" created="Thu, 21 Apr 2016 19:22:08 +0000"  >&lt;p&gt;the patch above hasn&apos;t been tested yet.. uploaded to start testing in Maloo.&lt;/p&gt;</comment>
                            <comment id="149783" author="ezell" created="Thu, 21 Apr 2016 23:49:04 +0000"  >&lt;p&gt;I took a look in the crash dump and found that &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mdt_thread_info.mti_lh[MDT_LH_OLD]&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt; is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;{
      mlh_type = MDT_NUL_LOCK, 
      mlh_reg_lh = {
        cookie = 16144317094821187230
      }, 
      mlh_reg_mode = LCK_CR, 
      mlh_pdo_lh = {
        cookie = 0
      }, 
      mlh_pdo_mode = LCK_MINMODE, 
      mlh_pdo_hash = 0, 
      mlh_rreg_lh = {
        cookie = 0
      }, 
      mlh_rreg_mode = LCK_MINMODE
    }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also looking at the mdt_thread_info, I see:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;  mti_rr = {
    rr_opcode = REINT_OPEN, 
    rr_handle = 0xffff883f36707350, 
    rr_fid1 = 0xffff883f36707330, 
    rr_fid2 = 0xffff883f36707340, 
    rr_name = 0xffff883f36707480 &quot;testfile.out&quot;, 
    rr_namelen = 12, 
    rr_tgt = 0x0, 
    rr_tgtlen = 0, 
    rr_eadata = 0xffff883f36707490, 
    rr_eadatalen = 4096, 
    rr_logcookielen = 0, 
    rr_logcookies = 0x0, 
    rr_flags = 0
  }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Why does a create reint have a lock handle for &quot;old&quot;.  Isn&apos;t that just used for renames and layout swaps?&lt;/p&gt;</comment>
                            <comment id="150111" author="simmonsja" created="Mon, 25 Apr 2016 21:54:25 +0000"  >&lt;p&gt;After a discussion internally we really like to see the assertion that is present in the dmesg log to be fixed as well. It is possible that the assertion in the log is independent of the memory exhaustion problems.&lt;/p&gt;</comment>
                            <comment id="150119" author="yujian" created="Mon, 25 Apr 2016 23:06:00 +0000"  >&lt;p&gt;Hi John,&lt;br/&gt;
Do you have any suggestions on the assertion failure above? Thank you.&lt;/p&gt;</comment>
                            <comment id="150215" author="jhammond" created="Tue, 26 Apr 2016 15:05:05 +0000"  >&lt;p&gt;Hi Matt, could you post the debug log? Or if it&apos;s too big, could you post the lines from tasks 15316 and 15659?&lt;/p&gt;</comment>
                            <comment id="150234" author="jhammond" created="Tue, 26 Apr 2016 15:53:41 +0000"  >&lt;p&gt;&amp;gt; Also looking at the mdt_thread_info, I see&lt;/p&gt;

&lt;p&gt;The values in &lt;tt&gt;mti_rr&lt;/tt&gt; may be from a previous request. I think &lt;tt&gt;mdt_intent_layout()&lt;/tt&gt; is more likely here. Note that &lt;tt&gt;MDT_LH_LAYOUT&lt;/tt&gt; has the same value as &lt;tt&gt;MDT_LH_OLD&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="150244" author="ezell" created="Tue, 26 Apr 2016 16:32:37 +0000"  >&lt;p&gt;I just attached mylog.dk.gz.  Unfortunately it was only running with &quot;normal&quot; debug levels.&lt;/p&gt;

&lt;p&gt;James Simmons was running IOR when it crashed.  He said it was a single-shared-file striped across 1008 OSTs from ~700 clients.  &quot;testfile.out&quot; likely is the name of the file he was using, but I understand &lt;em&gt;mti_rr&lt;/em&gt; might be from a previous request.&lt;/p&gt;

&lt;p&gt;Let me know if there&apos;s anything you want to see from the crash.  We can&apos;t upload it to Intel for you to look at, but I can run commands for you and provide the output.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#5
        mod libcfs, name lbug_with_loc, RIP 0xffffffffa0420eeb
        frame start 0xffff883f8d5a3ce8, end 0xffff883f8d5a3d08, *base 0xffff883f8d5a3d10
        XBT_RBX = ffffffffa0e15620
        msgdata = ffffffffa0e15620

#6
        mod mdt, name mdt_lock_handle_fini, RIP 0xffffffffa0d9e64b
        frame start 0xffff883f8d5a3d08, end 0xffff883f8d5a3d18, *base 0xffff883f8d5a3d40
        XBT_RBX = ffff883f8d4ff000

#7
        mod mdt, name mdt_thread_info_fini, RIP 0xffffffffa0da4cc0
        frame start 0xffff883f8d5a3d18, end 0xffff883f8d5a3d48, *base 0xffff883f8d5a3d90
        XBT_RBX = ffff883f8d4ff000
        info = ffff883f8d4ff000

#8
        mod mdt, name mdt_handle_common, RIP 0xffffffffa0daa473
        frame start 0xffff883f8d5a3d48, end 0xffff883f8d5a3d98, *base 0xffff883f8d5a3da0
        XBT_RBX = ffff8837a0061000
        XBT_R12 = ffff883f8d4ff000
        XBT_R13 = ffffffffa0e23ee0
        req = ffff8837a0061000
        &amp;amp;supported = ffff883f8d5a3d58
        supported = ffffffff00000002 ...
        info = ffff883f8d4ff000
        &amp;amp;supported = ffff883f8d5a3d58
        supported = ffffffff00000002 ...
        info = ffff883f8d4ff000
        req = ffff8837a0061000
        set = 1
        id = 123
        quiet = 0
        subsystem = 4
        mask = 1
#9
        mod mdt, name mds_regular_handle, RIP 0xffffffffa0de98c5
        frame start 0xffff883f8d5a3d98, end 0xffff883f8d5a3da8, *base 0xffff883f8d5a3ee0
        XBT_RBX = ffff883f8df0d140
        XBT_R12 = ffff883fa7a96800
        XBT_R13 = ffff8837a0061000
        XBT_R14 = 42
        XBT_R15 = ffff883f8d418940

#10
        mod ptlrpc, name ptlrpc_main, RIP 0xffffffffa076d07e
        frame start 0xffff883f8d5a3da8, end 0xffff883f8d5a3ee8, *base 0xffff883f8d5a3f40
        XBT_RBX = ffff883f8df0d140
        XBT_R12 = ffff883fa7a96800
        XBT_R13 = ffff8837a0061000
        XBT_R14 = 42
        XBT_R15 = ffff883f8d418940
        arg = ffff883f8df0d140
        thread = ffff883f8df0d140
        svcpt = ffff883fa7a96800
        &amp;amp;svc = ffff883f8d5a3e50
        svc = ffff883fa7f96080 ...
        &amp;amp;rs = ffff883f8d5a3e68
        rs = ffff883fa7a96868 ...
        &amp;amp;env = ffff883f8d5a3e80
        env = ffff883fa7f96080 ...
        counter = 42
        &amp;amp;rc = ffff883f8d5a3e64
        rc = 0 ...
        flags = 8050
        size = 38
        &amp;amp;ret = ffff883f8d5a3e80
        ret = ffff883fa7f96080 ...
        i = 1
        id = c00
        quiet = 0
        set = 0
        value = 0
        subsystem = 100
        mask = 10
        flags = 8250
        thread = ffff883f8df0d140
        flags = 4
        thread = ffff883f8df0d140
        flags = 8
        thread = ffff883f8df0d140
        &amp;amp;lock = ffff883f8d5a3e88
        lock = ffff883fa7a96830 ...
        svcpt = ffff883fa7a96800
        head = ffff883fa7a96a18
        new = ffff883fa7a96878
        subsystem = 100
        mask = 200
        svcpt = ffff883fa7a96800
        thread = ffff883f8df0d140
        svcpt = ffff883fa7a96800
        &amp;amp;svc = ffff883f8d5a3e80
        svc = ffff883fa7f96080 ...
        request = ffff8837a0061000
        &amp;amp;work_start = ffff883f8d5a3ea0
        work_start = 57164916 ...
        &amp;amp;work_end = ffff883f8d5a3e90
        work_end = 57164916 ...
        small = ffff8837a00611e0
        large = ffff883f8d5a3ea0
        &amp;amp;r = ffff883f8d5a3e40
        r = 4 ...
        result = 0
        id = 50e
        quiet = 0
        set = 0
        value = 0
        req = ffff8837a0061000
        id = 512
        quiet = 0
        set = 0
        value = 0
        subsystem = 100
        mask = 1
        subsystem = 100
        mask = 100000
        subsystem = 100
        mask = 200
        req = ffff8837a0061000
        thread = ffff883f8df0d140
        svcpt = ffff883fa7a96800
        svcpt = ffff883fa7a96800
        svcpt = ffff883fa7a96800
        force = 0
        svcpt = ffff883fa7a96800
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="150245" author="ezell" created="Tue, 26 Apr 2016 16:38:04 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; struct ptlrpc_request.rq_pill ffff8837a0061000
  rq_pill = {
    rc_req = 0xffff8837a0061000, 
    rc_fmt = 0xffffffffa0819240 &amp;lt;RQF_LDLM_INTENT_LAYOUT&amp;gt;, 
    rc_loc = RCL_SERVER, 
    rc_area = {{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, {4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}}
  }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="150831" author="simmonsja" created="Tue, 3 May 2016 13:49:42 +0000"  >&lt;p&gt;Any progress on fixing the assertion?&lt;/p&gt;</comment>
                            <comment id="150841" author="jhammond" created="Tue, 3 May 2016 15:23:53 +0000"  >&lt;p&gt;I understood that ORNL was going to reproduce with a stronger debug mask. Has that been done?&lt;/p&gt;</comment>
                            <comment id="150842" author="ezell" created="Tue, 3 May 2016 15:26:31 +0000"  >&lt;p&gt;We have been unable to reproduce on our testbed systems, and we haven&apos;t had an opportunity to reproduce on the production systems.&lt;/p&gt;</comment>
                            <comment id="150847" author="jhammond" created="Tue, 3 May 2016 15:58:38 +0000"  >&lt;p&gt;It&apos;s hard to say for sure without more information but the failed assertion may be addresses by &lt;a href=&quot;http://review.whamcloud.com/#/c/18060/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/18060/&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="152865" author="ezell" created="Thu, 19 May 2016 17:29:19 +0000"  >&lt;p&gt;We attempted to reproduce this assertion on Tuesday using the same conditions as last time, but it never crashed.  After that, we moved to a server with 19717 and 18060 to hopefully prevent it in the future.  I think we can close this ticket and reopen if we see it again.  Thanks.&lt;/p&gt;</comment>
                            <comment id="152945" author="yujian" created="Fri, 20 May 2016 01:15:04 +0000"  >&lt;p&gt;Thank you, Matt.&lt;/p&gt;</comment>
                            <comment id="159227" author="gerrit" created="Tue, 19 Jul 2016 15:38:58 +0000"  >&lt;p&gt;Comment deleted (wrong LU in commit message).&lt;/p&gt;</comment>
                            <comment id="159228" author="gerrit" created="Tue, 19 Jul 2016 15:38:59 +0000"  >&lt;p&gt;Comment deleted (wrong LU in commit message).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="33545">LU-7535</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="25648">LU-5367</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="21276" name="mylog.dk.gz" size="4719130" author="ezell" created="Tue, 26 Apr 2016 16:25:19 +0000"/>
                            <attachment id="21205" name="vmcore-dmesg.txt" size="464527" author="simmonsja" created="Tue, 19 Apr 2016 17:22:48 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzy8mf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>