<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:26:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-9504] LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-9504</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Loaded latest master build, started soak.  Only fault induced was router drop. &lt;br/&gt;
Soak-9 is the second MDS (MDT0001) &lt;br/&gt;
soak-9 has a hard crash in normal operation:&lt;br/&gt;
Soak is started, mount completes&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[  893.779340] LustreError: 11-0: soaked-OST000c-osc-MDT0001: operation ost_connect to node 192.168.1.102@o2ib10 failed: rc = -16^M
[  893.801270] LustreError: Skipped 139 previous similar messages^M
[  894.095801] Lustre: soaked-MDT0003-osp-MDT0001: Connection restored to 192.168.1.111@o2ib10 (at 192.168.1.111@o2ib10)^M
[  894.110919] Lustre: Skipped 7 previous similar messages^M
[  894.253639] Lustre: soaked-MDT0001: recovery is timed out, evict stale exports^M
[  894.265333] Lustre: soaked-MDT0001: disconnecting 28 stale clients^M
[  894.278063] Lustre: soaked-MDT0001: Recovery over after 5:01, of 31 clients 3 recovered and 28 were evicted.^M
[ 1465.548946] Lustre: soaked-MDT0001: Client 8e539072-a775-2171-7825-433ade3d0c39 (at 192.168.1.132@o2ib100) reconnecting^M
[ 1465.563698] Lustre: soaked-MDT0001: Connection restored to 8e539072-a775-2171-7825-433ade3d0c39 (at 192.168.1.132@o2ib100)^M
[ 1465.579478] Lustre: Skipped 30 previous similar messages^M
[ 1757.516066] Lustre: soaked-MDT0001: Client 6fb4512c-e89d-8233-88ad-b696d11c9821 (at 192.168.1.138@o2ib100) reconnecting^M
[ 1757.531619] Lustre: soaked-MDT0001: Connection restored to 6fb4512c-e89d-8233-88ad-b696d11c9821 (at 192.168.1.138@o2ib100)^M
[ 1904.507160] Lustre: soaked-MDT0001: Client 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100) reconnecting^M
[ 1942.000624] Lustre: soaked-MDT0001: Client 0a5dee9d-f606-8dd1-e9d4-d42a75c735e1 (at 192.168.1.129@o2ib100) reconnecting^M
[ 2436.463475] Lustre: soaked-MDT0001: Client 2d2ea61b-cf5f-add6-6e06-30458f85a726 (at 192.168.1.139@o2ib100) reconnecting^M
[ 2436.478139] Lustre: soaked-MDT0001: Connection restored to 2d2ea61b-cf5f-add6-6e06-30458f85a726 (at 192.168.1.139@o2ib100)^M
[ 2436.493810] Lustre: Skipped 2 previous similar messages^M
[ 2733.438247] Lustre: soaked-MDT0001: Client cb715667-fb6c-b895-e632-274b232c5bc9 (at 192.168.1.119@o2ib100) reconnecting^M
[ 3117.401693] Lustre: soaked-MDT0001: Client 0a5dee9d-f606-8dd1-e9d4-d42a75c735e1 (at 192.168.1.129@o2ib100) reconnecting^M
[ 3117.416360] Lustre: soaked-MDT0001: Connection restored to 0a5dee9d-f606-8dd1-e9d4-d42a75c735e1 (at 192.168.1.129@o2ib100)^M
[ 3117.430856] Lustre: Skipped 1 previous similar message^M
[ 3359.388740] Lustre: soaked-MDT0001: Client 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100) reconnecting^M
[ 3497.321905] Lustre: soaked-MDT0001: Client 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100) reconnecting^M
[ 3951.338201] Lustre: soaked-MDT0001: Client 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100) reconnecting^M
[ 3951.353148] Lustre: soaked-MDT0001: Connection restored to 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100)^M
[ 3951.368647] Lustre: Skipped 2 previous similar messages^M
[ 4415.324113] Lustre: soaked-MDT0001: Client 6ae0a03c-6567-9e19-d483-8743882e83e1 (at 192.168.1.116@o2ib100) reconnecting^M
[ 4501.233379] Lustre: 3876:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1494873743/real 1494873743]  req@ffff8803b3329b00 x1567484370368192/t0(0) o104-&amp;gt;soaked-MDT0001@192.168.1.139@o2ib100:15/16 lens 296/224 e 0 to 1 dl 1494873750 ref 1 fl Rpc:X/0/ffffffff rc 0/-1^M
[ 4501.273401] Lustre: 3876:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 7 previous similar messages^M
[ 4632.525998] LustreError: 3049:0:(service.c:2229:ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed: ^M
[ 4632.539820] LustreError: 3049:0:(service.c:2229:ptlrpc_handle_rs()) LBUG^M
[ 4632.550400] Pid: 3049, comm: ptlrpc_hr01_004^M
[ 4632.557537] ^M
[ 4632.557537] Call Trace:^M
[ 4632.566532]  [&amp;lt;ffffffffa080e7ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]^M
[ 4632.575923]  [&amp;lt;ffffffffa080e87c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]^M
[ 4632.585006]  [&amp;lt;ffffffffa0ba2bed&amp;gt;] ptlrpc_hr_main+0x83d/0x8f0 [ptlrpc]^M
[ 4632.594107]  [&amp;lt;ffffffff810c8345&amp;gt;] ? sched_clock_cpu+0x85/0xc0^M
[ 4632.602416]  [&amp;lt;ffffffff810c54c0&amp;gt;] ? default_wake_function+0x0/0x20^M
[ 4632.611086]  [&amp;lt;ffffffffa0ba23b0&amp;gt;] ? ptlrpc_hr_main+0x0/0x8f0 [ptlrpc]^M
[ 4632.620009]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0^M
[ 4632.627027]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0^M
[ 4632.634106]  [&amp;lt;ffffffff81697318&amp;gt;] ret_from_fork+0x58/0x90^M
[ 4632.641571]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0^M
[ 4632.648510] ^M
[ 4632.651470] Kernel panic - not syncing: LBUG^M
[ 4632.657497] CPU: 11 PID: 3049 Comm: ptlrpc_hr01_004 Tainted: P           OE  ------------   3.10.0-514.16.1.el7_lustre.x86_64 #1^M
[ 4632.673033] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013^M
[ 4632.686895]  ffffffffa082cdac 00000000329e47aa ffff8808212b7d30 ffffffff81686d1f^M
[ 4632.696571]  ffff8808212b7db0 ffffffff8168014a ffffffff00000008 ffff8808212b7dc0^M
[ 4632.706233]  ffff8808212b7d60 00000000329e47aa 00000000329e47aa ffff88082d8cf838^M
[ 4632.715875] Call Trace:^M
[ 4632.715875] Call Trace:^M
[ 4632.719901]  [&amp;lt;ffffffff81686d1f&amp;gt;] dump_stack+0x19/0x1b^M
[ 4632.726949]  [&amp;lt;ffffffff8168014a&amp;gt;] panic+0xe3/0x1f2^M
[ 4632.733611]  [&amp;lt;ffffffffa080e894&amp;gt;] lbug_with_loc+0x64/0xb0 [libcfs]^M
[ 4632.741856]  [&amp;lt;ffffffffa0ba2bed&amp;gt;] ptlrpc_hr_main+0x83d/0x8f0 [ptlrpc]^M
[ 4632.750326]  [&amp;lt;ffffffff810c8345&amp;gt;] ? sched_clock_cpu+0x85/0xc0^M
[ 4632.757970]  [&amp;lt;ffffffff810c54c0&amp;gt;] ? wake_up_state+0x20/0x20^M
[ 4632.765508]  [&amp;lt;ffffffffa0ba23b0&amp;gt;] ? ptlrpc_svcpt_stop_threads+0x590/0x590 [ptlrpc]^M
[ 4632.775290]  [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0^M
[ 4632.781973]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140^M
[ 4632.790485]  [&amp;lt;ffffffff81697318&amp;gt;] ret_from_fork+0x58/0x90^M
[ 4632.797662]  [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread_create_on_node+0x140/0x140^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;System then crashed, crash dump is available on the node. &lt;br/&gt;
vmcore-dmesg attached. &lt;/p&gt;</description>
                <environment>Soak stress cluster</environment>
        <key id="46094">LU-9504</key>
            <summary>LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Mon, 15 May 2017 19:34:13 +0000</created>
                <updated>Mon, 19 Jun 2017 16:32:08 +0000</updated>
                            <resolved>Mon, 19 Jun 2017 16:32:08 +0000</resolved>
                                    <version>Lustre 2.10.0</version>
                                    <fixVersion>Lustre 2.10.0</fixVersion>
                                        <due></due>
                            <votes>1</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="196050" author="jhammond" created="Tue, 16 May 2017 17:34:23 +0000"  >&lt;p&gt;This assertion was added by &lt;a href=&quot;https://review.whamcloud.com/#/c/22807/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/22807/&lt;/a&gt; for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8650&quot; title=&quot;DNE disabled REP-ACK&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8650&quot;&gt;&lt;del&gt;LU-8650&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="196051" author="pjones" created="Tue, 16 May 2017 17:35:06 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="196165" author="cliffw" created="Wed, 17 May 2017 14:45:22 +0000"  >&lt;p&gt;Hit again on build 3581 - soak fails within one hour due to this bug.&lt;/p&gt;</comment>
                            <comment id="196228" author="cliffw" created="Wed, 17 May 2017 18:35:15 +0000"  >&lt;p&gt;This issue was not seen on builds 3577 or 3578, the issue is likely in build 3579&lt;/p&gt;</comment>
                            <comment id="196229" author="pjones" created="Wed, 17 May 2017 18:37:19 +0000"  >&lt;p&gt;To translate that into git commits&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8943&quot; title=&quot;Enable Multiple IB/OPA Endpoints Between Nodes&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8943&quot;&gt;&lt;del&gt;LU-8943&lt;/del&gt;&lt;/a&gt; lnd: Enable Multiple OPA Endpoints between Nodes &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5788&quot; title=&quot;recovery-double-scale test_pairwise_fail: no enough free disk space&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5788&quot;&gt;&lt;del&gt;LU-5788&lt;/del&gt;&lt;/a&gt; test: fix the cmd to getfree space &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9324&quot; title=&quot;sanity-pfl test 10 needs to reset the file system default layout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9324&quot;&gt;&lt;del&gt;LU-9324&lt;/del&gt;&lt;/a&gt; lfs: output stripe info in YAML format &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9357&quot; title=&quot;PFL llapi_layout_comp_add() should inherit pool from previous component layout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9357&quot;&gt;&lt;del&gt;LU-9357&lt;/del&gt;&lt;/a&gt; pfl: should inherit pool from previous layout comp &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9372&quot; title=&quot;OOM happens on OSS during Lustre recovery for more than 5000 clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9372&quot;&gt;&lt;del&gt;LU-9372&lt;/del&gt;&lt;/a&gt; ptlrpc: drain &quot;ptlrpc_request_buffer_desc&quot; objects &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9324&quot; title=&quot;sanity-pfl test 10 needs to reset the file system default layout&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9324&quot;&gt;&lt;del&gt;LU-9324&lt;/del&gt;&lt;/a&gt; lnet: move cyaml.h under lnet/include/ &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7108&quot; title=&quot;Remove sanityn tests 14b, 19, 29, and 35 from the ALWAYS_EXCEPT list&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7108&quot;&gt;&lt;del&gt;LU-7108&lt;/del&gt;&lt;/a&gt; test: Remove sanityn tests from ALWAYS_EXCEPT list &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7567&quot; title=&quot;lfs_changelog() misprints timestamps&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7567&quot;&gt;&lt;del&gt;LU-7567&lt;/del&gt;&lt;/a&gt; utils: fix timestamp printing in lfs_changelog() &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8346&quot; title=&quot;conf-sanity test_93: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8346&quot;&gt;&lt;del&gt;LU-8346&lt;/del&gt;&lt;/a&gt; obdclass: guarantee all keys filled &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9439&quot; title=&quot;Introduce an lnet systemd service&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9439&quot;&gt;&lt;del&gt;LU-9439&lt;/del&gt;&lt;/a&gt; scripts: Change behavior of lustre_rmmod &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9456&quot; title=&quot;Change socklnd calls from sock_create() to sock_create_kern()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9456&quot;&gt;&lt;del&gt;LU-9456&lt;/del&gt;&lt;/a&gt; lnd: Change sock_create() to sock_create_kern() &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8294&quot; title=&quot;Noisy gss_svc_upcall_handle_init&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8294&quot;&gt;&lt;del&gt;LU-8294&lt;/del&gt;&lt;/a&gt; gss: quiet cache_check return ENOENT warning &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9458&quot; title=&quot;LustreError: 12764:0:(sec_bulk.c:188:enc_pools_release_free_pages()) ASSERTION( npages &amp;lt;= page_pools.epp_free_pages ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9458&quot;&gt;&lt;del&gt;LU-9458&lt;/del&gt;&lt;/a&gt; ptlrpc: handle case of epp_free_pages &amp;lt;= PTLRPC_MAX_BRW_PAGES &#8212; oleg.drokin / gitweb&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9096&quot; title=&quot;sanity test_253: File creation failed after rm&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9096&quot;&gt;LU-9096&lt;/a&gt; test: add sanity 253 to ALWAYS_EXCEPT &#8212; oleg.drokin / gitweb&lt;/p&gt;</comment>
                            <comment id="196297" author="laisiyao" created="Thu, 18 May 2017 04:27:09 +0000"  >&lt;p&gt;This assert is from &lt;a href=&quot;https://review.whamcloud.com/22807&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/22807&lt;/a&gt; for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8650&quot; title=&quot;DNE disabled REP-ACK&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8650&quot;&gt;&lt;del&gt;LU-8650&lt;/del&gt;&lt;/a&gt;, which assumes all the handling of a reply will be in one specific hr thread, so it doesn&apos;t need to consider race, now it looks to be not true, I&apos;ll review the code to verify it.&lt;/p&gt;</comment>
                            <comment id="196422" author="gerrit" created="Fri, 19 May 2017 02:15:46 +0000"  >&lt;p&gt;Lai Siyao (lai.siyao@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/27207&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/27207&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt; ptlrpc: REP-ACK hr may race with trans commit&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: fcd7bdd08ec2c23d1e7557788a42e04b97590e02&lt;/p&gt;</comment>
                            <comment id="196518" author="cliffw" created="Fri, 19 May 2017 21:50:20 +0000"  >&lt;p&gt;Patch has survived for three hours, one mds_restart one oss_restart and one router_delay. We&apos;ll see how it does over the weekend, but so far good.&lt;/p&gt;</comment>
                            <comment id="196627" author="cliffw" created="Mon, 22 May 2017 16:08:34 +0000"  >&lt;p&gt;Patch ran &amp;gt; 48 hours (weekend) No hard crashes, survived multiple failovers&lt;br/&gt;
(14 OSS, 13 MDS) without a hard crash. Unfortunately detailed examination of results &lt;br/&gt;
indicated a problem with the soak load engine, the failovers occurred by the clients were idle.&lt;/p&gt;

&lt;p&gt;After fixing the load issue, restarted soak, Had one client lockup, then hard server LBUG after second OST failover. Bug is &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9547&quot; title=&quot;LBUG osp_dev.c:755:osp_statfs()) ASSERTION( sfs-&amp;gt;os_fprecreated &amp;lt;= OST_MAX_PRECREATE * 2 ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9547&quot;&gt;&lt;del&gt;LU-9547&lt;/del&gt;&lt;/a&gt; - may not be related to this issue, we successfully performed two mds failovers. &lt;/p&gt;</comment>
                            <comment id="196956" author="cliffw" created="Wed, 24 May 2017 19:12:49 +0000"  >&lt;p&gt;Hit this on soak with latest master. Patch should land to master.&lt;/p&gt;</comment>
                            <comment id="197715" author="cliffw" created="Wed, 31 May 2017 17:51:42 +0000"  >&lt;p&gt;Tested with latest version of patch&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/#/c/27207/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/27207/&lt;/a&gt;&lt;br/&gt;
Hit LBUG immediately&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ay 31 17:11:18 soak-9 kernel: LustreError: 4177:0:(ldlm_lock.c:2548:ldlm_lock_downgrade()) ASSERTION( lock-&amp;gt;l_granted_mode &amp;amp; (LCK_PW | LCK_EX) ) failed:
May 31 17:11:18 soak-9 kernel: LustreError: 4177:0:(ldlm_lock.c:2548:ldlm_lock_downgrade()) LBUG
May 31 17:11:18 soak-9 kernel: Pid: 4177, comm: ptlrpc_hr01_003
May 31 17:11:18 soak-9 kernel: #012Call Trace:
May 31 17:11:18 soak-9 kernel: [&amp;lt;ffffffffa08247ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffffa082487c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffffa0b6283b&amp;gt;] ldlm_lock_downgrade+0x19b/0x1d0 [ptlrpc]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffffa0bb9620&amp;gt;] ptlrpc_handle_rs+0x3f0/0x640 [ptlrpc]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffffa0bb9955&amp;gt;] ptlrpc_hr_main+0xe5/0x2c0 [ptlrpc]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffff810c54c0&amp;gt;] ? default_wake_function+0x0/0x20
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffffa0bb9870&amp;gt;] ? ptlrpc_hr_main+0x0/0x2c0 [ptlrpc]
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffff81697318&amp;gt;] ret_from_fork+0x58/0x90
May 31 17:11:19 soak-9 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
May 31 17:11:19 soak-9 kernel:
May 31 17:11:19 soak-9 kernel: Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="197914" author="laisiyao" created="Fri, 2 Jun 2017 16:23:22 +0000"  >&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/27207/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/27207/&lt;/a&gt; is updated, and some debug messages are added, Cliff, will you test again?&lt;/p&gt;</comment>
                            <comment id="198140" author="cliffw" created="Mon, 5 Jun 2017 16:11:15 +0000"  >&lt;p&gt;I have loaded soak, will start tests today&lt;/p&gt;</comment>
                            <comment id="198152" author="cliffw" created="Mon, 5 Jun 2017 16:59:55 +0000"  >&lt;p&gt;Hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9563&quot; title=&quot;LBUG ldlm_lock_downgrade()) ASSERTION( lock-&amp;gt;l_granted_mode &amp;amp; (LCK_PW | LCK_EX) ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9563&quot;&gt;&lt;del&gt;LU-9563&lt;/del&gt;&lt;/a&gt; after first failover&lt;/p&gt;</comment>
                            <comment id="198236" author="laisiyao" created="Tue, 6 Jun 2017 02:27:28 +0000"  >&lt;p&gt;Cliff, could you post more messages of crash? I added some debug messages, which should be printed upon crash.&lt;/p&gt;</comment>
                            <comment id="198372" author="cliffw" created="Tue, 6 Jun 2017 21:48:45 +0000"  >&lt;p&gt;I am still waiting for the failure to repeat. &lt;/p&gt;</comment>
                            <comment id="198493" author="cliffw" created="Wed, 7 Jun 2017 18:08:10 +0000"  >&lt;p&gt;Here is the log from prior to the LBUG.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jun  5 16:44:31 soak-11 kernel: Lustre: Skipped 1 previous similar message
Jun  5 16:44:40 soak-11 kernel: Lustre: soaked-MDT0002: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 192.168.1.124@o2ib100 (stopping)
Jun  5 16:44:40 soak-11 kernel: Lustre: Skipped 5 previous similar messages
Jun  5 16:44:44 soak-11 kernel: LustreError: 0-0: Forced cleanup waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; mdt-soaked-MDT0002_UUID namespace with 2 resources in use, (rc=-110)
Jun  5 16:45:02 soak-11 kernel: Lustre: soaked-MDT0002: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 192.168.1.136@o2ib100 (stopping)
Jun  5 16:45:02 soak-11 kernel: Lustre: Skipped 20 previous similar messages
Jun  5 16:45:07 soak-11 kernel: Lustre: 4229:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; sent delay: [sent 1496681101/real 0]  req@ffff8807e7265700 x1569380820308784/t0(0) o38-&amp;gt;soaked-MDT0002-osp-MDT0003@192.168.1.110@o2ib10:24/4 lens 520/544 e 0 to 1 dl 1496681107 ref 2 fl Rpc:XN/0/ffffffff rc 0/-1
Jun  5 16:45:09 soak-11 kernel: LustreError: 0-0: Forced cleanup waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; mdt-soaked-MDT0002_UUID namespace with 2 resources in use, (rc=-110)
Jun  5 16:45:34 soak-11 kernel: LustreError: 0-0: Forced cleanup waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; mdt-soaked-MDT0002_UUID namespace with 2 resources in use, (rc=-110)
Jun  5 16:45:41 soak-11 kernel: Lustre: soaked-MDT0002: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 192.168.1.102@o2ib10 (stopping)
Jun  5 16:45:41 soak-11 kernel: Lustre: Skipped 11 previous similar messages
Jun  5 16:45:51 soak-11 kernel: LustreError: 5001:0:(lod_qos.c:208:lod_statfs_and_check()) soaked-MDT0002-mdtlov: statfs: rc = -108
Jun  5 16:45:51 soak-11 kernel: Lustre: 5001:0:(service.c:2117:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (31:69s); client may timeout.  req@ffff8803f1b0ef00 x1569380804405664/t0(0) o101-&amp;gt;bd5dff2c-b3c7-e6f9-ab0e-ec6b3d83954a@192.168.1.117@o2ib100:37/0 lens 1728/544 e 1 to 0 dl 1496681082 ref 1 fl Complete:/0/0 rc -19/-19
Jun  5 16:45:54 soak-11 kernel: Lustre: server umount soaked-MDT0002 complete
Jun  5 16:45:54 soak-11 sshd[5502]: Received disconnect from 10.10.1.135: 11: disconnected by user
Jun  5 16:45:54 soak-11 sshd[5502]: pam_unix(sshd:session): session closed &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; user root
Jun  5 16:45:54 soak-11 systemd-logind: Removed session 15.
Jun  5 16:45:54 soak-11 systemd: Removed slice user-0.slice.
Jun  5 16:45:54 soak-11 systemd: Stopping user-0.slice.
Jun  5 16:45:56 soak-11 kernel: Lustre: soaked-MDT0003: Connection restored to 192.168.1.110@o2ib10 (at 192.168.1.110@o2ib10)
Jun  5 16:45:56 soak-11 kernel: Lustre: Skipped 26 previous similar messages
Jun  5 16:51:12 soak-11 kernel: LustreError: 4204:0:(ldlm_lock.c:2548:ldlm_lock_downgrade()) ASSERTION( lock-&amp;gt;l_granted_mode &amp;amp; (LCK_PW | LCK_EX) ) failed:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="198516" author="cliffw" created="Wed, 7 Jun 2017 20:08:12 +0000"  >&lt;p&gt;Hit LBUG again, syslog and lustre-logs from dead server attached&lt;/p&gt;</comment>
                            <comment id="198587" author="laisiyao" created="Thu, 8 Jun 2017 01:24:38 +0000"  >&lt;p&gt;It&apos;s strange I don&apos;t see debug messages before LBUG(), Cliff, which update from &lt;a href=&quot;https://review.whamcloud.com/#/c/27207/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/27207/&lt;/a&gt; did you use in this test? The debug messages were added since Jun 3 (update 5). And if possible can you retest with the update 7? which added more  debug message right before LBUG.&lt;/p&gt;</comment>
                            <comment id="198625" author="cliffw" created="Thu, 8 Jun 2017 14:34:16 +0000"  >&lt;p&gt;I used build 48048, which should have had the debug patches. &lt;br/&gt;
I will retry with the latest&lt;/p&gt;</comment>
                            <comment id="198627" author="cliffw" created="Thu, 8 Jun 2017 14:35:45 +0000"  >&lt;p&gt;I see a patch set 6, build 48063, I see no newer builds&lt;/p&gt;</comment>
                            <comment id="198632" author="laisiyao" created="Thu, 8 Jun 2017 14:48:08 +0000"  >&lt;p&gt;strange, can you access &lt;a href=&quot;https://review.whamcloud.com/#/c/27207/7&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/27207/7&lt;/a&gt; ? which is patch set 7, and the  build is &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-reviews/48061/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-reviews/48061/&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I just checked build 48063, which is for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9049&quot; title=&quot;DNE MDT Never completes recovery&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9049&quot;&gt;&lt;del&gt;LU-9049&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="198633" author="cliffw" created="Thu, 8 Jun 2017 15:01:33 +0000"  >&lt;p&gt;Okay, loading that build now. &lt;/p&gt;</comment>
                            <comment id="198898" author="cliffw" created="Mon, 12 Jun 2017 14:19:53 +0000"  >&lt;p&gt;Failed again:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Jun 11 07:25:40 soak-11 kernel: perf: interrupt took too &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt; (2510 &amp;gt; 2500), lowering kernel.perf_event_max_sample_rate to 79000
Jun 11 07:25:43 soak-11 kernel: LustreError: 4581:0:(ldlm_lock.c:2549:ldlm_lock_downgrade()) ### weird lock mode ns: mdt-soaked-MDT0003_UUID lock: ffff8806b57f9800/0x3e1187b8839d4a55 lrc: 1/0,0 mode: --/PW res: [0x2c006071b:0x169d:0x0].0x0 bits 0x2 rrc: 2 type: IBT flags: 0x44a19401000000 nid: local remote: 0x0 expref: -99 pid: 7178 timeout: 0 lvb_type: 0
Jun 11 07:25:43 soak-11 kernel: LustreError: 4581:0:(ldlm_lock.c:2550:ldlm_lock_downgrade()) ASSERTION( lock-&amp;gt;l_granted_mode &amp;amp; (LCK_PW | LCK_EX) ) failed:
Jun 11 07:25:43 soak-11 kernel: LustreError: 4581:0:(ldlm_lock.c:2550:ldlm_lock_downgrade()) LBUG
Jun 11 07:25:43 soak-11 kernel: Pid: 4581, comm: ptlrpc_hr01_007
Jun 11 07:25:43 soak-11 kernel: #012Call Trace:
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0bca7ee&amp;gt;] libcfs_call_trace+0x4e/0x60 [libcfs]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0bca87c&amp;gt;] lbug_with_loc+0x4c/0xb0 [libcfs]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0eed7b1&amp;gt;] ldlm_lock_downgrade+0x111/0x210 [ptlrpc]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0f44818&amp;gt;] ptlrpc_handle_rs+0x5c8/0x700 [ptlrpc]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0f44a35&amp;gt;] ptlrpc_hr_main+0xe5/0x2c0 [ptlrpc]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffff810c54c0&amp;gt;] ? default_wake_function+0x0/0x20  
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffffa0f44950&amp;gt;] ? ptlrpc_hr_main+0x0/0x2c0 [ptlrpc]
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffff810b0a4f&amp;gt;] kthread+0xcf/0xe0
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffff81697318&amp;gt;] ret_from_fork+0x58/0x90
Jun 11 07:25:43 soak-11 kernel: [&amp;lt;ffffffff810b0980&amp;gt;] ? kthread+0x0/0xe0
Jun 11 07:25:43 soak-11 kernel:
Jun 11 07:25:43 soak-11 kernel: LustreError: dumping log to /tmp/lustre-log.1497165943.4581 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="198901" author="cliffw" created="Mon, 12 Jun 2017 14:30:25 +0000"  >&lt;p&gt;Attached three lustre-log, one from immediately before the LBUG, one after&lt;/p&gt;</comment>
                            <comment id="198979" author="laisiyao" created="Tue, 13 Jun 2017 01:55:49 +0000"  >&lt;p&gt;Cliff, thanks! This helps a lot, so what happened is as below:&lt;br/&gt;
1. ptlrpc_handle_rs() handle REPACK rs, and is about to convert PW lock A to COS mode.&lt;br/&gt;
2. transaction committed, ptlrpc_handle_rs() is called again which decref lock A.&lt;br/&gt;
3. there comes a conflicting lock B, since there is no reader or writer for lock A, it will cancel lock A, which will set its l_granted_mode to 0.&lt;br/&gt;
4. step 1 continues, ASSERTION( lock-&amp;gt;l_granted_mode &amp;amp; (LCK_PW | LCK_EX) ) is triggered.&lt;/p&gt;

&lt;p&gt;I&apos;ll update the patch later.&lt;/p&gt;</comment>
                            <comment id="199465" author="cliffw" created="Fri, 16 Jun 2017 15:47:21 +0000"  >&lt;p&gt;I have run the latest patch for 48 hours on soak. No LBUGS. LFSCK does not complete, but does abort successfully&lt;/p&gt;</comment>
                            <comment id="199607" author="gerrit" created="Mon, 19 Jun 2017 16:27:43 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/27207/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/27207/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9504&quot; title=&quot;LBUG ptlrpc_handle_rs()) ASSERTION( lock != ((void *)0) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9504&quot;&gt;&lt;del&gt;LU-9504&lt;/del&gt;&lt;/a&gt; ptlrpc: REP-ACK hr may race with trans commit&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: ca4659c9d1f010c1046b634cf5a592a620ac3935&lt;/p&gt;</comment>
                            <comment id="199614" author="pjones" created="Mon, 19 Jun 2017 16:32:08 +0000"  >&lt;p&gt;Landed for 2.10&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="46315">LU-9563</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="40177">LU-8650</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="46265">LU-9547</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="26936" name="lu-9504-6-07.soak-11.log.txt" size="55851" author="cliffw" created="Wed, 7 Jun 2017 20:07:14 +0000"/>
                            <attachment id="26937" name="lustre-log.1496836992.5097.txt.gz" size="963966" author="cliffw" created="Wed, 7 Jun 2017 20:07:28 +0000"/>
                            <attachment id="26938" name="lustre-log.1496843934.5221.txt.gz" size="11707635" author="cliffw" created="Wed, 7 Jun 2017 20:07:52 +0000"/>
                            <attachment id="26939" name="lustre-log.1496861499.5292.txt.gz" size="13220671" author="cliffw" created="Wed, 7 Jun 2017 20:08:38 +0000"/>
                            <attachment id="26940" name="lustre-log.1496861511.5224.txt.gz" size="394" author="cliffw" created="Wed, 7 Jun 2017 20:08:49 +0000"/>
                            <attachment id="26941" name="lustre-log.1496861518.5296.txt.gz" size="362982" author="cliffw" created="Wed, 7 Jun 2017 20:09:05 +0000"/>
                            <attachment id="26968" name="lustre-log.1497161984.5008.preLBUG.txt.gz" size="5045677" author="cliffw" created="Mon, 12 Jun 2017 14:28:23 +0000"/>
                            <attachment id="26969" name="lustre-log.1497165943.4581.txt.gz" size="4658706" author="cliffw" created="Mon, 12 Jun 2017 14:28:23 +0000"/>
                            <attachment id="26970" name="lustre-log.1497166143.4807.postLBUG.txt.gz" size="3384176" author="cliffw" created="Mon, 12 Jun 2017 14:28:21 +0000"/>
                            <attachment id="26689" name="vmcore-dmesg.txt" size="139193" author="cliffw" created="Mon, 15 May 2017 19:35:34 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzcs7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>