<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:57:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12953] LNet timeouts with restarted Lustre production file system</title>
                <link>https://jira.whamcloud.com/browse/LU-12953</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;When restarting our production Lustre file system we encountered this bug:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.498637&amp;#93;&lt;/span&gt; LNetError: 72335:0:(o2iblnd_cb.c:3335:kiblnd_check_txs_locked()) Timed out tx: active_txs, 0 seconds&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.509681&amp;#93;&lt;/span&gt; LNetError: 72335:0:(o2iblnd_cb.c:3410:kiblnd_check_conns()) Timed out RDMA with 10.10.32.102@o2ib2 (5): c: 3, oc: 0, rc: 7&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.526089&amp;#93;&lt;/span&gt; LustreError: 72335:0:(events.c:450:server_bulk_callback()) event type 5, status -103, desc ffff8ca33db8a800&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.537667&amp;#93;&lt;/span&gt; LustreError: 72335:0:(events.c:450:server_bulk_callback()) event type 3, status -103, desc ffff8ca33db8a800&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.549244&amp;#93;&lt;/span&gt; LustreError: 167072:0:(ldlm_lib.c:3259:target_bulk_io()) @@@ network error on bulk WRITE req@ffff8cabcfcba850 x1648066684855104/t0(0) o4-&amp;gt;8d9c48a5-020d-9844-4aa4-57225c35d4e2@10.10.32.102@o2ib2:135/0 lens 608/448 e 0 to 0 dl 1573227610 ref 1 fl Interpret:/0/0 rc 0/0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;407608.576219&amp;#93;&lt;/span&gt; Lustre: f2-OST001d: Bulk IO write error with 8d9c48a5-020d-9844-4aa4-57225c35d4e2 (at 10.10.32.102@o2ib2), client will retry: rc = -110&lt;/p&gt;

&lt;p&gt;Eventually we ended up seeing:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676012&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff98d5d28b&amp;gt;&amp;#93;&lt;/span&gt; queued_spin_lock_slowpath+0xb/0xf&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676017&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff98d6b760&amp;gt;&amp;#93;&lt;/span&gt; _raw_spin_lock+0x20/0x30&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676026&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc19ddf39&amp;gt;&amp;#93;&lt;/span&gt; ofd_intent_policy+0x1d9/0x920 &lt;span class=&quot;error&quot;&gt;&amp;#91;ofd&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676070&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc161dd26&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x366/0xa60 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676080&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc12f4033&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_hash_bd_add_locked+0x63/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676085&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc12f77be&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_hash_add+0xbe/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676107&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc1646587&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0xa47/0x15a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;423015.676130&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc166e6d0&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_swab_ldlm_lock_desc+0x30/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;This looks similar to the issues reported by NASA but just to make sure.&lt;/p&gt;</description>
                <environment>Lustre OSS server running ZFS.</environment>
        <key id="57348">LU-12953</key>
            <summary>LNet timeouts with restarted Lustre production file system</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                            <label>ORNL</label>
                    </labels>
                <created>Fri, 8 Nov 2019 21:55:54 +0000</created>
                <updated>Tue, 1 Feb 2022 19:42:21 +0000</updated>
                            <resolved>Tue, 1 Feb 2022 19:42:21 +0000</resolved>
                                    <version>Lustre 2.12.3</version>
                                    <fixVersion>Lustre 2.12.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="258047" author="pjones" created="Sat, 9 Nov 2019 13:07:11 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please advise&lt;/p&gt;

&lt;p&gt;James&lt;/p&gt;

&lt;p&gt;Do you have any patches applied or are you running vanilla 2.12.3?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="258176" author="ashehata" created="Tue, 12 Nov 2019 19:35:42 +0000"  >&lt;p&gt;Did you try:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 lctl set_param osc.*.short_io_bytes=0&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Seems like this resolved the issue on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12856&quot; title=&quot;LustreError: 82937:0:(ldlm_lib.c:3268:target_bulk_io()) @@@ truncated bulk READ 0(270336)  &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12856&quot;&gt;&lt;del&gt;LU-12856&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="262537" author="simmonsja" created="Tue, 4 Feb 2020 13:48:24 +0000"  >&lt;p&gt;We are going to update to a newer 2.12 LTS that has the&#160;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12856&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-12856&lt;/a&gt;&#160;fix. We will see if it resolves our problems. A specific users job can reproduce this problem easily.&lt;/p&gt;</comment>
                            <comment id="264404" author="hanleyja" created="Mon, 2 Mar 2020 15:07:33 +0000"  >&lt;p&gt;We moved to the newer release a couple weeks ago and haven&apos;t seen a panic related to the &lt;tt&gt;short_io&lt;/tt&gt; error, and the system has been pretty stable.&lt;/p&gt;

&lt;p&gt;Last night, we encountered a crash with a similar stack trace:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[1272493.521802] Call Trace:
[1272493.521807]  [&amp;lt;ffffffff8e17544a&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[1272493.521810]  [&amp;lt;ffffffff8e183330&amp;gt;] _raw_spin_lock+0x20/0x30
[1272493.521821]  [&amp;lt;ffffffffc17ce5a9&amp;gt;] ofd_intent_policy+0x1d9/0x920 [ofd]
[1272493.521865]  [&amp;lt;ffffffffc1402e7c&amp;gt;] ? ldlm_extent_alloc_lock+0x6c/0x280 [ptlrpc]
[1272493.521880]  [&amp;lt;ffffffffc13ebe06&amp;gt;] ldlm_lock_enqueue+0x356/0xa20 [ptlrpc]
[1272493.521889]  [&amp;lt;ffffffffc1072033&amp;gt;] ? cfs_hash_bd_add_locked+0x63/0x80 [libcfs]
[1272493.521894]  [&amp;lt;ffffffffc10757be&amp;gt;] ? cfs_hash_add+0xbe/0x1a0 [libcfs]
[1272493.521912]  [&amp;lt;ffffffffc1414506&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
[1272493.521934]  [&amp;lt;ffffffffc143d300&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[1272493.521965]  [&amp;lt;ffffffffc149ccf2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[1272493.521989]  [&amp;lt;ffffffffc14a3b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[1272493.522011]  [&amp;lt;ffffffffc147d021&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[1272493.522017]  [&amp;lt;ffffffffc1066bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[1272493.522037]  [&amp;lt;ffffffffc144846b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[1272493.522057]  [&amp;lt;ffffffffc1445285&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[1272493.522059]  [&amp;lt;ffffffff8dad3903&amp;gt;] ? __wake_up+0x13/0x20
[1272493.522079]  [&amp;lt;ffffffffc144bdd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[1272493.522099]  [&amp;lt;ffffffffc144b2a0&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[1272493.522101]  [&amp;lt;ffffffff8dac61f1&amp;gt;] kthread+0xd1/0xe0
[1272493.522103]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
[1272493.522105]  [&amp;lt;ffffffff8e18dd1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[1272493.522107]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;The other 2 stack traces I&apos;m seeing repeated are:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[1313981.701739] Call Trace:
[1313981.701741]  &amp;lt;IRQ&amp;gt;  [&amp;lt;ffffffff8dada648&amp;gt;] sched_show_task+0xa8/0x110
[1313981.701743]  [&amp;lt;ffffffff8dade3e9&amp;gt;] dump_cpu_task+0x39/0x70
[1313981.701744]  [&amp;lt;ffffffff8db57fa0&amp;gt;] rcu_dump_cpu_stacks+0x90/0xd0
[1313981.701746]  [&amp;lt;ffffffff8db5b662&amp;gt;] rcu_check_callbacks+0x442/0x730
[1313981.701748]  [&amp;lt;ffffffff8db0ff40&amp;gt;] ? tick_sched_do_timer+0x50/0x50
[1313981.701750]  [&amp;lt;ffffffff8daaf536&amp;gt;] update_process_times+0x46/0x80
[1313981.701752]  [&amp;lt;ffffffff8db0fcb0&amp;gt;] tick_sched_handle+0x30/0x70
[1313981.701753]  [&amp;lt;ffffffff8db0ff79&amp;gt;] tick_sched_timer+0x39/0x80
[1313981.701755]  [&amp;lt;ffffffff8daca5ee&amp;gt;] __hrtimer_run_queues+0x10e/0x270
[1313981.701757]  [&amp;lt;ffffffff8dacab4f&amp;gt;] hrtimer_interrupt+0xaf/0x1d0
[1313981.701759]  [&amp;lt;ffffffff8da5c60b&amp;gt;] local_apic_timer_interrupt+0x3b/0x60
[1313981.701761]  [&amp;lt;ffffffff8e1929d3&amp;gt;] smp_apic_timer_interrupt+0x43/0x60
[1313981.701763]  [&amp;lt;ffffffff8e18eefa&amp;gt;] apic_timer_interrupt+0x16a/0x170
[1313981.701765]  &amp;lt;EOI&amp;gt;  [&amp;lt;ffffffff8db17432&amp;gt;] ? native_queued_spin_lock_slowpath+0x122/0x200
[1313981.701766]  [&amp;lt;ffffffff8e17544a&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[1313981.701767]  [&amp;lt;ffffffff8e183330&amp;gt;] _raw_spin_lock+0x20/0x30
[1313981.701781]  [&amp;lt;ffffffffc13e402c&amp;gt;] lock_res_and_lock+0x2c/0x50 [ptlrpc]
[1313981.701795]  [&amp;lt;ffffffffc13ebc61&amp;gt;] ldlm_lock_enqueue+0x1b1/0xa20 [ptlrpc]
[1313981.701815]  [&amp;lt;ffffffffc143b891&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
[1313981.701832]  [&amp;lt;ffffffffc1414506&amp;gt;] ldlm_handle_enqueue0+0xa56/0x15f0 [ptlrpc]
[1313981.701851]  [&amp;lt;ffffffffc143d300&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[1313981.701875]  [&amp;lt;ffffffffc149ccf2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[1313981.701899]  [&amp;lt;ffffffffc14a3b0a&amp;gt;] tgt_request_handle+0xada/0x1570 [ptlrpc]
[1313981.701921]  [&amp;lt;ffffffffc147d021&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[1313981.701924]  [&amp;lt;ffffffffc1066bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[1313981.701944]  [&amp;lt;ffffffffc144846b&amp;gt;] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[1313981.701963]  [&amp;lt;ffffffffc1445285&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[1313981.701983]  [&amp;lt;ffffffffc144bdd4&amp;gt;] ptlrpc_main+0xb34/0x1470 [ptlrpc]
[1313981.702002]  [&amp;lt;ffffffffc144b2a0&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[1313981.702003]  [&amp;lt;ffffffff8dac61f1&amp;gt;] kthread+0xd1/0xe0
[1313981.702005]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
[1313981.702006]  [&amp;lt;ffffffff8e18dd1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[1313981.702008]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[1313981.702010] Call Trace:
[1313981.702014]  [&amp;lt;ffffffffc0bc1797&amp;gt;] ? kiblnd_send+0x357/0xa20 [ko2iblnd]
[1313981.702028]  [&amp;lt;ffffffffc118e869&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[1313981.702031]  [&amp;lt;ffffffffc1066bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[1313981.702049]  [&amp;lt;ffffffffc141f2ce&amp;gt;] ? ldlm_pool_recalc+0x12e/0x1f0 [ptlrpc]
[1313981.702060]  [&amp;lt;ffffffffc118e869&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[1313981.702078]  [&amp;lt;ffffffffc1420a40&amp;gt;] ? ldlm_pool_add+0x80/0xc0 [ptlrpc]
[1313981.702092]  [&amp;lt;ffffffffc13e9cce&amp;gt;] ? ldlm_grant_lock+0xfe/0x270 [ptlrpc]
[1313981.702107]  [&amp;lt;ffffffffc13ee841&amp;gt;] ? ldlm_resource_unlink_lock+0x41/0x70 [ptlrpc]
[1313981.702122]  [&amp;lt;ffffffffc14029be&amp;gt;] ? ldlm_process_extent_lock+0x2ee/0x490 [ptlrpc]
[1313981.702124]  [&amp;lt;ffffffff8db17436&amp;gt;] ? native_queued_spin_lock_slowpath+0x126/0x200
[1313981.702126]  [&amp;lt;ffffffff8e17544a&amp;gt;] ? queued_spin_lock_slowpath+0xb/0xf
[1313981.702127]  [&amp;lt;ffffffff8e183330&amp;gt;] ? _raw_spin_lock+0x20/0x30
[1313981.702140]  [&amp;lt;ffffffffc13e402c&amp;gt;] ? lock_res_and_lock+0x2c/0x50 [ptlrpc]
[1313981.702157]  [&amp;lt;ffffffffc1414870&amp;gt;] ? ldlm_handle_enqueue0+0xdc0/0x15f0 [ptlrpc]
[1313981.702175]  [&amp;lt;ffffffffc143d300&amp;gt;] ? lustre_swab_ldlm_lock_desc+0x30/0x30 [ptlrpc]
[1313981.702199]  [&amp;lt;ffffffffc149ccf2&amp;gt;] ? tgt_enqueue+0x62/0x210 [ptlrpc]
[1313981.702222]  [&amp;lt;ffffffffc14a3b0a&amp;gt;] ? tgt_request_handle+0xada/0x1570 [ptlrpc]
[1313981.702244]  [&amp;lt;ffffffffc147d021&amp;gt;] ? ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[1313981.702248]  [&amp;lt;ffffffffc1066bde&amp;gt;] ? ktime_get_real_seconds+0xe/0x10 [libcfs]
[1313981.702268]  [&amp;lt;ffffffffc144846b&amp;gt;] ? ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
[1313981.702288]  [&amp;lt;ffffffffc1445285&amp;gt;] ? ptlrpc_wait_event+0xa5/0x360 [ptlrpc]
[1313981.702289]  [&amp;lt;ffffffff8e17544a&amp;gt;] ? queued_spin_lock_slowpath+0xb/0xf
[1313981.702291]  [&amp;lt;ffffffff8e183330&amp;gt;] ? _raw_spin_lock+0x20/0x30
[1313981.702310]  [&amp;lt;ffffffffc144bdd4&amp;gt;] ? ptlrpc_main+0xb34/0x1470 [ptlrpc]
[1313981.702329]  [&amp;lt;ffffffffc144b2a0&amp;gt;] ? ptlrpc_register_service+0xf80/0xf80 [ptlrpc]
[1313981.702330]  [&amp;lt;ffffffff8dac61f1&amp;gt;] ? kthread+0xd1/0xe0
[1313981.702332]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
[1313981.702333]  [&amp;lt;ffffffff8e18dd1d&amp;gt;] ? ret_from_fork_nospec_begin+0x7/0x21
[1313981.702334]  [&amp;lt;ffffffff8dac6120&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;p&gt;When we brought the server back up, the server immediately panic&apos;d when recovery completed:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[ 1176.846234] Call Trace:
[ 1176.846250]  [&amp;lt;ffffffffc13fde41&amp;gt;] lu_context_refill+0x41/0x50 [obdclass]
[ 1176.846263]  [&amp;lt;ffffffffc13fded4&amp;gt;] lu_env_refill+0x24/0x30 [obdclass]
[ 1176.846271]  [&amp;lt;ffffffffc1a1cf61&amp;gt;] ofd_lvbo_init+0x2a1/0x7f0 [ofd]
[ 1176.846324]  [&amp;lt;ffffffffc16ae0af&amp;gt;] ? req_capsule_shrink+0xff/0x240 [ptlrpc]
[ 1176.846343]  [&amp;lt;ffffffffc166072d&amp;gt;] ldlm_server_completion_ast+0x5fd/0x980 [ptlrpc]
[ 1176.846361]  [&amp;lt;ffffffffc1660130&amp;gt;] ? ldlm_server_blocking_ast+0xa40/0xa40 [ptlrpc]
[ 1176.846374]  [&amp;lt;ffffffffc1632d08&amp;gt;] ldlm_work_cp_ast_lock+0xa8/0x1d0 [ptlrpc]
[ 1176.846394]  [&amp;lt;ffffffffc167ab42&amp;gt;] ptlrpc_set_wait+0x72/0x790 [ptlrpc]
[ 1176.846397]  [&amp;lt;ffffffff93a2630d&amp;gt;] ? kmem_cache_alloc_node_trace+0x11d/0x210
[ 1176.846409]  [&amp;lt;ffffffffc13db869&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 1176.846423]  [&amp;lt;ffffffffc1632c60&amp;gt;] ? ldlm_work_gl_ast_lock+0x3a0/0x3a0 [ptlrpc]
[ 1176.846441]  [&amp;lt;ffffffffc16713d2&amp;gt;] ? ptlrpc_prep_set+0xd2/0x280 [ptlrpc]
[ 1176.846455]  [&amp;lt;ffffffffc1638115&amp;gt;] ldlm_run_ast_work+0xd5/0x3a0 [ptlrpc]
[ 1176.846469]  [&amp;lt;ffffffffc16395ef&amp;gt;] __ldlm_reprocess_all+0x11f/0x360 [ptlrpc]
[ 1176.846484]  [&amp;lt;ffffffffc1639ba8&amp;gt;] ldlm_reprocess_res+0x28/0x30 [ptlrpc]
[ 1176.846492]  [&amp;lt;ffffffffc12bffb0&amp;gt;] cfs_hash_for_each_relax+0x250/0x450 [libcfs]
[ 1176.846507]  [&amp;lt;ffffffffc1639b80&amp;gt;] ? ldlm_lock_mode_downgrade+0x330/0x330 [ptlrpc]
[ 1176.846521]  [&amp;lt;ffffffffc1639b80&amp;gt;] ? ldlm_lock_mode_downgrade+0x330/0x330 [ptlrpc]
[ 1176.846526]  [&amp;lt;ffffffffc12c3345&amp;gt;] cfs_hash_for_each_nolock+0x75/0x1c0 [libcfs]
[ 1176.846540]  [&amp;lt;ffffffffc1639bec&amp;gt;] ldlm_reprocess_recovery_done+0x3c/0x110 [ptlrpc]
[ 1176.846556]  [&amp;lt;ffffffffc164c751&amp;gt;] target_recovery_thread+0xcd1/0x1160 [ptlrpc]
[ 1176.846571]  [&amp;lt;ffffffffc164ba80&amp;gt;] ? replay_request_or_update.isra.24+0x8c0/0x8c0 [ptlrpc]
[ 1176.846574]  [&amp;lt;ffffffff938c61f1&amp;gt;] kthread+0xd1/0xe0
[ 1176.846576]  [&amp;lt;ffffffff938c6120&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 1176.846578]  [&amp;lt;ffffffff93f8dd1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[ 1176.846580]  [&amp;lt;ffffffff938c6120&amp;gt;] ? insert_kthread_work+0x40/0x40
[ 1176.846594] Code: 0b 74 06 00 0f 1f 00 31 db eb 15 0f 1f 40 00 48 83 c3 08 48 81 fb 40 01 00 00 0f 84 9f 00 00 00 49 8b 45 10 4c 8b a3 e0 80 52 c1 &amp;lt;48&amp;gt; 83 3c 18 00 75 dd 4d 85 e4 74 d8 41 8b 04 24 41 8b 55 00 85
[ 1176.846606] RIP  [&amp;lt;ffffffffc13f8d5c&amp;gt;] keys_fill+0x5c/0x180 [obdclass]
[ 1176.846607]  RSP &amp;lt;ffff9c868fea3ac0&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;Upon second reboot, the same issue happened.  We then aborted recovery during the last few nodes.&lt;/p&gt;

</comment>
                            <comment id="264410" author="simmonsja" created="Mon, 2 Mar 2020 17:56:26 +0000"  >&lt;p&gt;The last trace looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12853&quot; title=&quot;general protection fault: 0000 RIP: keys_fill&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12853&quot;&gt;&lt;del&gt;LU-12853&lt;/del&gt;&lt;/a&gt;. Let me port the patch.&lt;/p&gt;</comment>
                            <comment id="324772" author="simmonsja" created="Tue, 1 Feb 2022 19:42:21 +0000"  >&lt;p&gt;Patch landed that resolved this issue.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00p8f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>