<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:26:43 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2615] group of OSS crashed at umount</title>
                <link>https://jira.whamcloud.com/browse/LU-2615</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We have got 4 OSSes that crash at the same time, at umount, with the following bt :&lt;/p&gt;

&lt;p&gt;PID: 18173 TASK: ffff8803376dc040 CPU: 4 COMMAND: &quot;umount&quot;&lt;br/&gt;
 #0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115f8d0&amp;#93;&lt;/span&gt; machine_kexec at ffffffff8102895b&lt;br/&gt;
 0000001 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115f930&amp;#93;&lt;/span&gt; crash_kexec at ffffffff810a4622&lt;br/&gt;
 0000002 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fa00&amp;#93;&lt;/span&gt; panic at ffffffff81484657&lt;br/&gt;
 0000003 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fa80&amp;#93;&lt;/span&gt; lbug_with_loc at ffffffffa04ade5b &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000004 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115faa0&amp;#93;&lt;/span&gt; llog_recov_thread_stop at ffffffffa072e55b &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000005 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fad0&amp;#93;&lt;/span&gt; llog_recov_thread_fini at ffffffffa072e593 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000006 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115faf0&amp;#93;&lt;/span&gt; filter_llog_finish at ffffffffa0c7d3dd &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000007 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fb20&amp;#93;&lt;/span&gt; obd_llog_finish at ffffffffa057c2f8 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000008 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fb40&amp;#93;&lt;/span&gt; filter_precleanup at ffffffffa0c7cdaf &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt;&lt;br/&gt;
 0000009 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fba0&amp;#93;&lt;/span&gt; class_cleanup at ffffffffa05a3ca7 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
0000010 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fc20&amp;#93;&lt;/span&gt; class_process_config at ffffffffa05a5feb &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
0000011 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fcb0&amp;#93;&lt;/span&gt; class_manual_cleanup at ffffffffa05a6d29 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
0000012 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fd70&amp;#93;&lt;/span&gt; server_put_super at ffffffffa05b2c0c &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
0000013 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fe40&amp;#93;&lt;/span&gt; generic_shutdown_super at ffffffff8116542b&lt;br/&gt;
0000014 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fe60&amp;#93;&lt;/span&gt; kill_anon_super at ffffffff81165546&lt;br/&gt;
0000015 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fe80&amp;#93;&lt;/span&gt; lustre_kill_super at ffffffffa05a8966 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
0000016 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fea0&amp;#93;&lt;/span&gt; deactivate_super at ffffffff811664e0&lt;br/&gt;
0000017 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fec0&amp;#93;&lt;/span&gt; mntput_no_expire at ffffffff811826bf&lt;br/&gt;
0000018 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115fef0&amp;#93;&lt;/span&gt; sys_umount at ffffffff81183188&lt;br/&gt;
0000019 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff8802b115ff80&amp;#93;&lt;/span&gt; system_call_fastpath at ffffffff810030f2&lt;br/&gt;
    RIP: 00007f62ddfbdd67 RSP: 00007fffab738308 RFLAGS: 00010202&lt;br/&gt;
    RAX: 00000000000000a6 RBX: ffffffff810030f2 RCX: 0000000000000010&lt;br/&gt;
    RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007f62deeb3bb0&lt;br/&gt;
    RBP: 00007f62deeb3b80 R8: 00007f62deeb3bd0 R9: 0000000000000000&lt;br/&gt;
    R10: 00007fffab738130 R11: 0000000000000246 R12: 0000000000000000&lt;br/&gt;
    R13: 0000000000000000 R14: 0000000000000000 R15: 00007f62deeb3c10&lt;br/&gt;
    ORIG_RAX: 00000000000000a6 CS: 0033 SS: 002b&lt;/p&gt;

&lt;p&gt;This bt is identical to the one shown &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1194&quot; title=&quot;llog_recov_thread_stop+0x1ae/0x1b0 asserting&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1194&quot;&gt;&lt;del&gt;LU-1194&lt;/del&gt;&lt;/a&gt; which is supposed to be fixed in 2.1.3.&lt;/p&gt;

&lt;p&gt;Site is classified so I can&apos;t upload the binary crash but I can export the content of some structures upon request.&lt;/p&gt;</description>
                <environment></environment>
        <key id="17163">LU-2615</key>
            <summary>group of OSS crashed at umount</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="10000">Done</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="louveta">Alexandre Louvet</reporter>
                        <labels>
                            <label>ptr</label>
                    </labels>
                <created>Mon, 14 Jan 2013 09:16:20 +0000</created>
                <updated>Thu, 17 Dec 2015 23:57:24 +0000</updated>
                            <resolved>Thu, 17 Dec 2015 23:57:24 +0000</resolved>
                                    <version>Lustre 2.1.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="50441" author="pjones" created="Mon, 14 Jan 2013 16:46:28 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="50862" author="hongchao.zhang" created="Sun, 20 Jan 2013 08:35:01 +0000"  >&lt;p&gt;is the debug log before the crash available? it will print the content of the remaining llcd and the address of llog_ctxt and&lt;br/&gt;
llog_commit_master will be printed also,&lt;/p&gt;

&lt;p&gt;   CDEBUG(D_RPCTRACE, &quot;Llcd (%p) at %s:%d:\n&quot;, llcd, func, line);&lt;br/&gt;
   CDEBUG(D_RPCTRACE, &quot;  size: %d\n&quot;, llcd-&amp;gt;llcd_size);&lt;br/&gt;
   CDEBUG(D_RPCTRACE, &quot;  ctxt: %p\n&quot;, llcd-&amp;gt;llcd_ctxt);&lt;br/&gt;
   CDEBUG(D_RPCTRACE, &quot;  lcm : %p\n&quot;, llcd-&amp;gt;llcd_lcm);&lt;br/&gt;
   CDEBUG(D_RPCTRACE, &quot;  cookiebytes : %d\n&quot;, llcd-&amp;gt;llcd_cookiebytes);&lt;/p&gt;

&lt;p&gt;could you please print the content of the two structures? Thanks!&lt;/p&gt;</comment>
                            <comment id="50884" author="hongchao.zhang" created="Mon, 21 Jan 2013 09:18:45 +0000"  >&lt;p&gt;the other possible reason of this issue is the ptlrpc_request created in llog_send is not completed normally, and its rq_interpret_reply (llcd_interpret)&lt;br/&gt;
is not called to free the llcd.&lt;/p&gt;

&lt;p&gt;Hi, what patches are you using with the 2.1.3?&lt;/p&gt;</comment>
                            <comment id="51093" author="louveta" created="Thu, 24 Jan 2013 08:33:21 +0000"  >&lt;p&gt;Here is the list of patches we have on our production machine.&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1039&quot; title=&quot;data corruption in check_set&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1039&quot;&gt;&lt;del&gt;LU-1039&lt;/del&gt;&lt;/a&gt; handle bulk IO errors correctly&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1650&quot; title=&quot;crash of lustre clients in osc_req_attr_set() routine&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1650&quot;&gt;&lt;del&gt;LU-1650&lt;/del&gt;&lt;/a&gt; find the lock by index of subpage: update with patch set 2&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2170&quot; title=&quot;osc_extent_merge()) ASSERTION( cur-&amp;gt;oe_osclock == victim-&amp;gt;oe_osclock) while running racer&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2170&quot;&gt;&lt;del&gt;LU-2170&lt;/del&gt;&lt;/a&gt; osc: set osc_lock attribute only once&lt;/li&gt;
	&lt;li&gt;ORNL-22 general ptlrpcd threads pool support&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt; implement a NUMA aware ptlrpcd binding policy&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1110&quot; title=&quot;MDS Oops in osd_xattr_get() during file open by FID&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1110&quot;&gt;&lt;del&gt;LU-1110&lt;/del&gt;&lt;/a&gt; MDS Oops in osd_xattr_get() during file open by FID&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1363&quot; title=&quot;SELinux and stateahead hang&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1363&quot;&gt;&lt;del&gt;LU-1363&lt;/del&gt;&lt;/a&gt; llite: Not held lock when calling security_d_instantiate&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-645&quot; title=&quot;getcwd fails&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-645&quot;&gt;&lt;del&gt;LU-645&lt;/del&gt;&lt;/a&gt;/BZ23978 getcwd failure&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1299&quot; title=&quot;running truncated executable causes spewing of lock debug messages&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1299&quot;&gt;&lt;del&gt;LU-1299&lt;/del&gt;&lt;/a&gt; (patch set 11) loading large enough binary from lustre trigger OOM killer&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-549&quot; title=&quot;Add xattr list/value cache on client&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-549&quot;&gt;&lt;del&gt;LU-549&lt;/del&gt;&lt;/a&gt; Improve statfs performance if selinux is disabled (complement)&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1749&quot; title=&quot;llog_lvfs_create()) error looking up logfile&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1749&quot;&gt;&lt;del&gt;LU-1749&lt;/del&gt;&lt;/a&gt; mdt failing to open llog result in invalidating ost&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1331&quot; title=&quot;changelogs: RNMTO record not always after RNMFRM&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1331&quot;&gt;&lt;del&gt;LU-1331&lt;/del&gt;&lt;/a&gt; changelogs: RNMTO record not always after RNMFRM&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1592&quot; title=&quot;ASSERTION(cfs_atomic_read(&amp;amp;imp-&amp;gt;imp_refcount) == 0) failed: value: -1&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1592&quot;&gt;&lt;del&gt;LU-1592&lt;/del&gt;&lt;/a&gt; ldlm: protect obd_export:exp_imp_reverse&apos;s change&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt; update patch to add ptlrpc option in modprobe.conf file&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1448&quot; title=&quot;Disabled OSC can cause NULL pointer dereference when reading import&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1448&quot;&gt;&lt;del&gt;LU-1448&lt;/del&gt;&lt;/a&gt; prevent NULL pointer dereference on disabled OSC&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1057&quot; title=&quot;low performance maybe related to quota&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1057&quot;&gt;&lt;del&gt;LU-1057&lt;/del&gt;&lt;/a&gt; quota speed up lookup in osc_quota_chkdq.patch&lt;/li&gt;
	&lt;li&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1714&quot; title=&quot;crash upon loading libcfs module&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1714&quot;&gt;&lt;del&gt;LU-1714&lt;/del&gt;&lt;/a&gt; properly initialyse sg_magic value.patch&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;For CDEBUG log it will take a little more time. By default our production machine run with a debug filter set to 0 (none), so I do not have those trace in the dmesg log. I have to spend some time to extract them from the crash.&lt;/p&gt;</comment>
                            <comment id="51100" author="hongchao.zhang" created="Thu, 24 Jan 2013 09:34:45 +0000"  >&lt;p&gt;what is the content of the two structures &quot;llog_ctxt&quot; and &quot;llog_commit_master&quot; of the llcd?&lt;/p&gt;</comment>
                            <comment id="52112" author="louveta" created="Mon, 11 Feb 2013 07:56:52 +0000"  >&lt;p&gt;Here is the content of llog_ctxt &amp;amp; llog_commit_master extracted from the crash dump :&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
crash&amp;gt; llog_ctxt 0xffff8802327f8900
struct llog_ctxt {
  loc_idx = 3,
  loc_gen = {
    mnt_cnt = 0,
    conn_cnt = 0
  },
  loc_obd = 0xffff88032adbc038,
  loc_olg = 0xffff88032adbc2d0,
  loc_exp = 0xffff8802ed751400,
  loc_imp = 0x0,
  loc_logops = 0xffffffffa0cb99e0,
  loc_handle = 0x0,
  loc_lcm = 0xffff88033614b400,
  loc_llcd = 0x0,
  loc_sem = {
    lock = {
      raw_lock = {
        slock = 458759
      }
    },
    count = 0,
    wait_list = {
      next = 0xffff8802327f8960,
      prev = 0xffff8802327f8960
    }
  },
  loc_refcount = {
    counter = 2
  },
  llog_proc_cb = 0xffffffffa0c95220,
  loc_flags = 2
}

crash&amp;gt; llog_commit_master 0xffff88033614b400
struct llog_commit_master {
  lcm_flags = 4,
  lcm_count = {
    counter = 1
  },
  lcm_refcount = {
    counter = 3
  },
  lcm_pc = {
    pc_flags = 0,
    pc_lock = {
      raw_lock = {
        slock = 65537
      }
    },
    pc_starting = {
      done = 0,
      wait = {
        lock = {
          raw_lock = {
            slock = 196611
          }
        },
        task_list = {
          next = 0xffff88033614b430,
          prev = 0xffff88033614b430
        }
      }
    },
    pc_finishing = {
      done = 0,
      wait = {
        lock = {
          raw_lock = {
            slock = 196611
          }
        },
        task_list = {
          next = 0xffff88033614b450,
          prev = 0xffff88033614b450
        }
      }
    },
    pc_set = 0x0,
    pc_name = &quot;lcm_xxxxx1-OST0&quot;,
    pc_env = {
      le_ctx = {
        lc_tags = 2415919112,
        lc_thread = 0x0,
        lc_value = 0x0,
        lc_state = LCS_FINALIZED,
        lc_remember = {
          next = 0xffff88033614b498,
          prev = 0xffff88033614b498
        },
        lc_version = 15,
        lc_cookie = 0
      },
      le_ses = 0x0
    },
    pc_index = -1,
    pc_npartners = 0,
    pc_partners = 0x0,
    pc_cursor = 0
  },
  lcm_lock = {
    raw_lock = {
      slock = 155453764
    }
  },
  lcm_llcds = {
    next = 0xffff88021b2c2050,
    prev = 0xffff88021b2c2050
  },
  lcm_name = &quot;lcm_xxxxx1-OST0002\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000&quot;
} 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="53588" author="hongchao.zhang" created="Fri, 8 Mar 2013 06:21:57 +0000"  >&lt;p&gt;the lcm-&amp;gt;lcm_llcds list is empty&lt;br/&gt;
  ...&lt;br/&gt;
  lcm_llcds = &lt;/p&gt;
{
    next = 0xffff88021b2c2050,
    prev = 0xffff88021b2c2050
  }
&lt;p&gt;,&lt;br/&gt;
  ...&lt;/p&gt;

&lt;p&gt;but the lcm-&amp;gt;lcm_count is &quot;1&quot;, which is very strange!&lt;br/&gt;
could you please attach the &quot;lustre/ptlrpc/ptlrpcd.c&quot; and &quot;lustre/ptlrpc/recov_thread.c&quot; in your source tree, thanks very much!&lt;/p&gt;</comment>
                            <comment id="53691" author="sebastien.buisson" created="Mon, 11 Mar 2013 10:56:19 +0000"  >&lt;p&gt;Hi, here are the source files requested by Hongchao.&lt;/p&gt;</comment>
                            <comment id="53785" author="hongchao.zhang" created="Tue, 12 Mar 2013 07:35:03 +0000"  >&lt;p&gt;the list &quot;lcm_llcds&quot; is corrupted for its value of &quot;next&quot; and &quot;prev&quot; is wrong (it&apos;s not in the address region of &quot;struct llog_commit_master&quot;).&lt;br/&gt;
Could it be memory corrupt? there is no trace of the bug yet, sorry!&lt;/p&gt;

&lt;p&gt;does the issue occur again recently?&lt;/p&gt;</comment>
                            <comment id="56089" author="louveta" created="Thu, 11 Apr 2013 13:35:07 +0000"  >&lt;p&gt;&amp;gt; Could it be memory corrupt?&lt;br/&gt;
It&apos;s unexpected. The server is fully ECC protected and, as it&apos;s an OSS, almost only linux &amp;amp; lustre are running on this node.&lt;/p&gt;

&lt;p&gt;&amp;gt; does the issue occur again recently?&lt;br/&gt;
It did occur the last 4 times we stopped lustre on the node.&lt;/p&gt;</comment>
                            <comment id="57103" author="hongchao.zhang" created="Fri, 26 Apr 2013 10:51:42 +0000"  >&lt;p&gt;Hi, &lt;br/&gt;
Does the kernel dump referred in comment at 11/Feb/13 3:56 PM exist? if so, could you please print the content at 0xffff88021b2c2050&lt;br/&gt;
as &quot;struct llog_canceld_ctxt&quot;? besides, can the console output(just the part related to Lustre) be attached here? Thanks a lot!&lt;/p&gt;</comment>
                            <comment id="57501" author="hongchao.zhang" created="Thu, 2 May 2013 10:16:41 +0000"  >&lt;p&gt;the remaining &quot;llcd&quot; should have been sent over ptlrpc_request for llog_ctxt-&amp;gt;loc_llcd == NULL, and the request could not finish, then &quot;llcd_interpret&quot; wasn&apos;t&lt;br/&gt;
called to free the &quot;llcd&quot;, there are 2 patches (ORNL-22 general ptlrpcd threads pool support; &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1144&quot; title=&quot;implement a NUMA aware ptlrpcd binding policy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1144&quot;&gt;&lt;del&gt;LU-1144&lt;/del&gt;&lt;/a&gt; implement a NUMA aware ptlrpcd binding policy) among&lt;br/&gt;
the patches applied currently is related to it, could you please help to revert the 2 patches and test it, Thanks!&lt;/p&gt;</comment>
                            <comment id="58656" author="sebastien.buisson" created="Thu, 16 May 2013 15:31:51 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;It might be difficult to have the opportunity to install packages with those 2 patches reverted at customer site.&lt;br/&gt;
Instead, could we just set ptlrpcd_bind_policy=1 and max_ptlrpcds=2 as options for the ptlrpc kernel module, so that it behaves like if patch from ORNL-22 was not applied?&lt;br/&gt;
Is it still a relevant test for you?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Sebastien.&lt;/p&gt;</comment>
                            <comment id="60171" author="hongchao.zhang" created="Fri, 7 Jun 2013 15:00:44 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;Yes, it will disable the ptlrpcd thread pools (although not shaking off the patch completely) and it should be still a relevant test.&lt;/p&gt;

&lt;p&gt;Thanks &lt;/p&gt;</comment>
                            <comment id="60798" author="hongchao.zhang" created="Tue, 18 Jun 2013 09:45:26 +0000"  >&lt;p&gt;Hi, what is the output of the test? Thanks&lt;/p&gt;</comment>
                            <comment id="60846" author="sebastien.buisson" created="Wed, 19 Jun 2013 06:57:09 +0000"  >&lt;p&gt;Hi,&lt;/p&gt;

&lt;p&gt;I have asked people on site for the results of the tests.&lt;/p&gt;

&lt;p&gt;Cheers,&lt;br/&gt;
Sebastien.&lt;/p&gt;</comment>
                            <comment id="63520" author="lixi" created="Fri, 2 Aug 2013 02:50:38 +0000"  >&lt;p&gt;We hit the same problem on lustre-2.1.6 too.&lt;/p&gt;

&lt;p&gt;After reading a few codes, I am wondering whether it is possible for following race problem to happen. Please correct me if I am wrong.&lt;/p&gt;

&lt;p&gt;filter_llog_finish&lt;br/&gt;
--llog_recov_thread_fini&lt;br/&gt;
----llog_sync&lt;br/&gt;
------llog_obd_repl_sync&lt;br/&gt;
--------llog_cancel&lt;br/&gt;
----------llog_obd_repl_cancel&lt;br/&gt;
------------llcd_push&lt;br/&gt;
--------------llcd_send&lt;br/&gt;
----------------Sending async&lt;br/&gt;
----llog_recov_thread_stop&lt;br/&gt;
------LBUG&#65292;because llcd_send is sending a llcd and llcd_interpret() is not called since no reply has been got now.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="63522" author="hongchao.zhang" created="Fri, 2 Aug 2013 04:14:21 +0000"  >&lt;p&gt;what are the two threads involved in the race?&lt;br/&gt;
normally, the llog_recov_thread_stop is only called by llog_recov_thread_fini, and &quot;llog_recov_thread_stop&quot; is called in two places,&lt;br/&gt;
one is the cleanup for the failed llog_recov_thread_init call, the other is the normal cleanup phase during device cleanup &lt;br/&gt;
(called in filter_llog_finish). they can&apos;t be called simultaneously&lt;/p&gt;

&lt;p&gt;could you please attach some more info about this issue, and can it be reproduced on your site?&lt;/p&gt;</comment>
                            <comment id="63535" author="lixi" created="Fri, 2 Aug 2013 07:22:11 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;Sorry, may be &apos;race&apos; is not the right word to express my thought.&lt;/p&gt;

&lt;p&gt;At the time llcd_send() returns, the completion handler llcd_interpret() might not be called yet, right? When the llcd is still under use by the RPC on flight, llog_recov_thread_stop() will hit a LBUG. I can&apos;t find any codes in filter_llog_finish() which waits for the RPC finishes, so I guess it is possible that when llog_recov_thread_stop() is called, the RPC is still on flight. Am I right?&lt;/p&gt;

&lt;p&gt;Thanks&lt;br/&gt;
Li Xi&lt;/p&gt;</comment>
                            <comment id="63625" author="hongchao.zhang" created="Mon, 5 Aug 2013 06:03:03 +0000"  >&lt;p&gt;in ptlrpcd_stop, cfs_wait_for_completion(&amp;amp;pc-&amp;gt;pc_finishing) will be called to wait the pending RPCs to complete!&lt;/p&gt;

&lt;p&gt;can the issue be reproduced in your site?&lt;/p&gt;</comment>
                            <comment id="63630" author="lixi" created="Mon, 5 Aug 2013 06:45:47 +0000"  >&lt;p&gt;Ah, I see. Thank you very much!&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="12287" name="ptlrpcd.c" size="32455" author="sebastien.buisson" created="Mon, 11 Mar 2013 10:56:19 +0000"/>
                            <attachment id="12288" name="recov_thread.c" size="24078" author="sebastien.buisson" created="Mon, 11 Mar 2013 10:56:19 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvf9j:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6118</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>