<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:04:29 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6927] recovery_small test_66 failure</title>
                <link>https://jira.whamcloud.com/browse/LU-6927</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This seems to be hanging for me relatively frequently:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== recovery-small test 66: lock enqueue re-send vs client eviction == 21:24:18 (1438219458)
fail_loc=0x8000030c
  File: `/mnt/lustre/f66.recovery-small&apos;
  Size: 0         	Blocks: 0          IO Block: 4194304 regular empty file
Device: 2c54f966h/743766374d	Inode: 144115809252278274  Links: 1
Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2015-07-29 21:24:19.000000000 -0400
Modify: 2015-07-29 21:24:19.000000000 -0400
Change: 2015-07-29 21:24:19.000000000 -0400
fail_loc=0
fail_loc=0x80000136
mdc.lustre-MDT0000-mdc-ffff8800815b17f0.import=connection=192.168.10.211@tcp
./../tests/test-framework.sh: line 2399: /mnt/lustre/recon: Input/output error
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In dmesg we get:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[195769.388166] Lustre: DEBUG MARKER: == recovery-small test 66: lock enqueue re-send vs client eviction == 21:24:18 (1438219458)
[195773.859644] Lustre: 17831:0:(genops.c:1491:obd_export_evict_by_uuid()) lustre-MDT0000: evicting f797195c-b579-5409-6da6-f79d52d8f961 at adminstrative request
[195773.860464] Lustre: 17831:0:(genops.c:1491:obd_export_evict_by_uuid()) Skipped 1 previous similar message
[195773.906764] LustreError: 17610:0:(fail.c:133:__cfs_fail_timeout_set()) cfs_fail_timeout id 136 sleeping for 40000ms
[195809.905048] Lustre: lustre-OST0001: haven&apos;t heard from client 9007d4b0-ffa9-1973-fae0-b8ec6df9deb5 (at 0@lo) in 48 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88003bea07f0, cur 1438219499 expire 1438219469 last 1438219451
[195813.904060] LustreError: 17610:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id 136 awake
[195813.904062] LNet: Service thread pid 17610 was inactive for 40.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[195813.904064] Pid: 17610, comm: mdt01_003
[195813.904065] 
[195813.904065] Call Trace:
[195813.904075]  [&amp;lt;ffffffff81522574&amp;gt;] ? _spin_lock_irqsave+0x24/0x30
[195813.904079]  [&amp;lt;ffffffff810866a2&amp;gt;] ? del_timer_sync+0x22/0x30
[195813.904081]  [&amp;lt;ffffffff8151f279&amp;gt;] ? schedule_timeout+0x199/0x2e0
[195813.904091]  [&amp;lt;ffffffffa0909c41&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
[195813.904102]  [&amp;lt;ffffffffa0906c4c&amp;gt;] ? __cfs_fail_timeout_set+0x13c/0x160 [libcfs]
[195813.904116]  [&amp;lt;ffffffffa0c113a5&amp;gt;] ? mdt_getattr_name_lock+0x475/0x1910 [mdt]
[195813.904125]  [&amp;lt;ffffffffa0c16cd3&amp;gt;] ? old_init_ucred+0x1e3/0x390 [mdt]
[195813.904133]  [&amp;lt;ffffffffa0c12d62&amp;gt;] ? mdt_intent_getattr+0x292/0x470 [mdt]
[195813.904141]  [&amp;lt;ffffffffa0c02674&amp;gt;] ? mdt_intent_policy+0x494/0xc40 [mdt]
[195813.904169]  [&amp;lt;ffffffffa053ef6f&amp;gt;] ? ldlm_lock_enqueue+0x12f/0x860 [ptlrpc]
[195813.904194]  [&amp;lt;ffffffffa056ab97&amp;gt;] ? ldlm_handle_enqueue0+0x7e7/0x1520 [ptlrpc]
[195813.904227]  [&amp;lt;ffffffffa05f1111&amp;gt;] ? tgt_enqueue+0x61/0x230 [ptlrpc]
[195813.904256]  [&amp;lt;ffffffffa05f1d22&amp;gt;] ? tgt_request_handle+0xa42/0x1230 [ptlrpc]
[195813.904282]  [&amp;lt;ffffffffa059d5e4&amp;gt;] ? ptlrpc_main+0xd74/0x1850 [ptlrpc]
[195813.904284]  [&amp;lt;ffffffff8152240e&amp;gt;] ? _spin_unlock_irq+0xe/0x20
[195813.904287]  [&amp;lt;ffffffff81061829&amp;gt;] ? finish_task_switch+0x59/0xe0
[195813.904289]  [&amp;lt;ffffffff8151e3ee&amp;gt;] ? thread_return+0x4e/0x7e0
[195813.904314]  [&amp;lt;ffffffffa059c870&amp;gt;] ? ptlrpc_main+0x0/0x1850 [ptlrpc]
[195813.904317]  [&amp;lt;ffffffff8109ce4e&amp;gt;] ? kthread+0x9e/0xc0
[195813.904319]  [&amp;lt;ffffffff8100c24a&amp;gt;] ? child_rip+0xa/0x20
[195813.904321]  [&amp;lt;ffffffff8109cdb0&amp;gt;] ? kthread+0x0/0xc0
[195813.904322]  [&amp;lt;ffffffff8100c240&amp;gt;] ? child_rip+0x0/0x20
[195813.904323] 
[195813.904325] LustreError: dumping log to /tmp/lustre-log.1438219503.17610
[195813.913595] LNet: Service thread pid 17610 completed after 40.00s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="31281">LU-6927</key>
            <summary>recovery_small test_66 failure</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Thu, 30 Jul 2015 13:46:35 +0000</created>
                <updated>Sun, 10 Oct 2021 22:34:26 +0000</updated>
                            <resolved>Sun, 10 Oct 2021 22:34:26 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="122677" author="green" created="Thu, 30 Jul 2015 13:54:23 +0000"  >&lt;p&gt;On the client we can see this in the process list:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;17837 pts/0    S+     0:00 sh -c df /mnt/lustre; uname -n &amp;gt;&amp;gt; /mnt/lustre/recon
17838 pts/0    S+     0:01 df /mnt/lustre
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is inside test-framework&apos;s client_reconnect_try() call.&lt;/p&gt;</comment>
                            <comment id="122685" author="green" created="Thu, 30 Jul 2015 15:11:36 +0000"  >&lt;p&gt;I tried to do touch /mnt/lustre/file and that ended up in an IO error too, and I got this extra dmesg stuff:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[240838.112047] Lustre: 17905:0:(client.c:2020:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1438264520/real 1438264520]  req@ffff8800999fcce8 x1508080398450504/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 872/1208 e 0 to 1 dl 1438264527 ref 2 fl Rpc:XP/0/ffffffff rc 0/-1
[240838.113580] Lustre: 17905:0:(client.c:2020:ptlrpc_expire_one_request()) Skipped 89 previous similar messages
[240838.114146] Lustre: lustre-MDT0000-mdc-ffff8800815b17f0: Connection to lustre-MDT0000 (at 0@lo) was lost; in progress operations using this service will wait for recovery to complete
[240838.115003] Lustre: Skipped 10 previous similar messages
[240838.115661] Lustre: lustre-MDT0000: Client f797195c-b579-5409-6da6-f79d52d8f961 (at 0@lo) reconnecting
[240838.116431] Lustre: Skipped 13 previous similar messages
[240838.116910] Lustre: lustre-MDT0000-mdc-ffff8800815b17f0: Connection restored to lustre-MDT0000 (at 0@lo)
[240838.117511] Lustre: Skipped 12 previous similar messages
[240871.112044] LNet: Service thread pid 17527 was inactive for 40.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[240871.112880] Pid: 17527, comm: mdt00_000
[240871.113119] 
[240871.113119] Call Trace:
[240871.113612]  [&amp;lt;ffffffffa05396d0&amp;gt;] ? _ldlm_lock_debug+0x300/0x690 [ptlrpc]
[240871.113940]  [&amp;lt;ffffffff8151f271&amp;gt;] schedule_timeout+0x191/0x2e0
[240871.114250]  [&amp;lt;ffffffff81085b40&amp;gt;] ? process_timeout+0x0/0x10
[240871.114570]  [&amp;lt;ffffffffa055a8c0&amp;gt;] ? ldlm_expired_completion_wait+0x0/0x370 [ptlrpc]
[240871.115063]  [&amp;lt;ffffffffa055f509&amp;gt;] ldlm_completion_ast+0x609/0x9b0 [ptlrpc]
[240871.115362]  [&amp;lt;ffffffff81061630&amp;gt;] ? default_wake_function+0x0/0x20
[240871.115661]  [&amp;lt;ffffffffa055e86e&amp;gt;] ldlm_cli_enqueue_local+0x21e/0x8b0 [ptlrpc]
[240871.116143]  [&amp;lt;ffffffffa055ef00&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
[240871.116452]  [&amp;lt;ffffffffa0bfe0d0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
[240871.116841]  [&amp;lt;ffffffffa0c0a814&amp;gt;] mdt_object_local_lock+0x3c4/0xa80 [mdt]
[240871.117140]  [&amp;lt;ffffffffa0bfe0d0&amp;gt;] ? mdt_blocking_ast+0x0/0x2a0 [mdt]
[240871.117451]  [&amp;lt;ffffffffa055ef00&amp;gt;] ? ldlm_completion_ast+0x0/0x9b0 [ptlrpc]
[240871.117752]  [&amp;lt;ffffffffa0c0b20b&amp;gt;] mdt_object_lock_internal+0x5b/0x290 [mdt]
[240871.118061]  [&amp;lt;ffffffffa0c0b501&amp;gt;] mdt_object_lock+0x11/0x20 [mdt]
[240871.118370]  [&amp;lt;ffffffffa0c2f5f2&amp;gt;] mdt_reint_open+0x772/0x2d80 [mdt]
[240871.118656]  [&amp;lt;ffffffffa0c1a6ed&amp;gt;] mdt_reint_rec+0x5d/0x200 [mdt]
[240871.118939]  [&amp;lt;ffffffffa0c03bfb&amp;gt;] mdt_reint_internal+0x62b/0xa40 [mdt]
[240871.119341]  [&amp;lt;ffffffffa0c04206&amp;gt;] mdt_intent_reint+0x1f6/0x430 [mdt]
[240871.119631]  [&amp;lt;ffffffffa0c02674&amp;gt;] mdt_intent_policy+0x494/0xc40 [mdt]
[240871.119934]  [&amp;lt;ffffffffa053ef6f&amp;gt;] ldlm_lock_enqueue+0x12f/0x860 [ptlrpc]
[240871.123439]  [&amp;lt;ffffffffa056ab97&amp;gt;] ldlm_handle_enqueue0+0x7e7/0x1520 [ptlrpc]
[240871.123802]  [&amp;lt;ffffffffa05f1111&amp;gt;] tgt_enqueue+0x61/0x230 [ptlrpc]
[240871.124125]  [&amp;lt;ffffffffa05f1d22&amp;gt;] tgt_request_handle+0xa42/0x1230 [ptlrpc]
[240871.124479]  [&amp;lt;ffffffffa059d5e4&amp;gt;] ptlrpc_main+0xd74/0x1850 [ptlrpc]
[240871.124852]  [&amp;lt;ffffffffa059c870&amp;gt;] ? ptlrpc_main+0x0/0x1850 [ptlrpc]
[240871.125169]  [&amp;lt;ffffffff8109ce4e&amp;gt;] kthread+0x9e/0xc0
[240871.125430]  [&amp;lt;ffffffff8100c24a&amp;gt;] child_rip+0xa/0x20
[240871.125684]  [&amp;lt;ffffffff8109cdb0&amp;gt;] ? kthread+0x0/0xc0
[240871.125939]  [&amp;lt;ffffffff8100c240&amp;gt;] ? child_rip+0x0/0x20
[240871.126200] 
[240871.126445] LustreError: dumping log to /tmp/lustre-log.1438264560.17527
[240931.816060] LustreError: 0:0:(ldlm_lockd.c:350:waiting_locks_callback()) ### lock callback timer expired after 101s: evicting client at 0@lo  ns: mdt-lustre-MDT0000_UUID lock: ffff88002deffdb8/0x91588c5d002f9ff4 lrc: 3/0,0 mode: PR/PR res: [0x200000007:0x1:0x0].0 bits 0x13 rrc: 3 type: IBT flags: 0x60200000000020 nid: 0@lo remote: 0x91588c5d002f9fd1 expref: 7 pid: 17610 timeout: 4355125074 lvb_type: 0
[240931.817853] LustreError: 0:0:(ldlm_lockd.c:350:waiting_locks_callback()) Skipped 1 previous similar message
[240931.819475] LustreError: 17527:0:(ldlm_lockd.c:1390:ldlm_handle_enqueue0()) ### lock on destroyed export ffff8800768887f0 ns: mdt-lustre-MDT0000_UUID lock: ffff8800b4683db8/0x91588c5d002fa03a lrc: 3/0,0 mode: CR/CR res: [0x2000090a2:0x1:0x0].0 bits 0x9 rrc: 1 type: IBT flags: 0x50200000000000 nid: 0@lo remote: 0x91588c5d002fa010 expref: 4 pid: 17527 timeout: 0 lvb_type: 0
[240931.828351] LNet: Service thread pid 17527 completed after 100.70s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[240931.829116] LustreError: 11-0: lustre-MDT0000-mdc-ffff8800815b17f0: operation ldlm_enqueue to node 0@lo failed: rc = -107
[240931.829119] LustreError: Skipped 16 previous similar messages
[240931.829125] Lustre: lustre-MDT0000-mdc-ffff8800815b17f0: Connection to lustre-MDT0000 (at 0@lo) was lost; in progress operations using this service will wait for recovery to complete
[240931.829127] Lustre: Skipped 1 previous similar message
[240931.830325] LustreError: 167-0: lustre-MDT0000-mdc-ffff8800815b17f0: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.
[240931.830328] LustreError: Skipped 16 previous similar messages
[240931.831571] LustreError: 17905:0:(file.c:3104:ll_inode_revalidate_fini()) lustre: revalidate FID [0x200000007:0x1:0x0] error: rc = -108
[240931.831574] LustreError: 17905:0:(file.c:3104:ll_inode_revalidate_fini()) Skipped 1 previous similar message
[240951.828046] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[240951.828643] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[240951.829800] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[240971.828046] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[240971.828657] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[240971.829654] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[240985.116518] Lustre: lustre-MDT0000: haven&apos;t heard from client f797195c-b579-5409-6da6-f79d52d8f961 (at 0@lo) in 53 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88008c3027f0, cur 1438264674 expire 1438264644 last 1438264621
[240991.828054] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[240991.828618] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[240991.829520] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241011.828067] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[241011.828670] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[241011.829831] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241031.828076] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[241031.828680] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[241031.829941] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241051.828068] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[241051.828672] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[241051.829840] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241071.828052] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[241071.828553] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[241071.829411] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241111.828063] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[241111.828578] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) Skipped 1 previous similar message
[241111.829146] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[241111.830368] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) Skipped 1 previous similar message
[241111.830925] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting them to error out.
[241111.831846] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) Skipped 1 previous similar message
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="122734" author="pjones" created="Thu, 30 Jul 2015 17:28:28 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Oleg is wondering whether this is an LNET issue. He has a node wedged in this manner on his test cluster that you could look at if that helps (limited time offer - maybe for the next day or so &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; )&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="122735" author="green" created="Thu, 30 Jul 2015 17:29:18 +0000"  >&lt;p&gt;Contact me on skype for login credentials&lt;/p&gt;</comment>
                            <comment id="127263" author="green" created="Mon, 14 Sep 2015 18:15:25 +0000"  >&lt;p&gt;Adding a bit of a discussion we had with Amir for back history in case this gets reassigned:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[9/9/15 1:20:39 PM] Amir Shehata: Hey Oleg, I&apos;m looking at LU-6927... Wondering why you think this is an LNet issue? the stack trace doesn&apos;t show LNet functions. what do you suspect is causing the thread being slow?
[9/9/15 1:21:59 PM] Oleg Drokin: [240951.828046] LustreError: 17909:0:(import.c:332:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: rc = -110 waiting for callback (1 != 0)
[240951.828643] LustreError: 17909:0:(import.c:358:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff88006c1fece8 x1508080398126124/t0(0) o101-&amp;gt;lustre-MDT0000-mdc-ffff8800815b17f0@0@lo:12/10 lens 696/1080 e 1 to 0 dl 1438219484 ref 2 fl Unregistering:ES/2/ffffffff rc -5/-1
[240951.829800] LustreError: 17909:0:(import.c:373:ptlrpc_invalidate_import()) lustre-MDT0000_UUID: RPCs in &quot;Unregistering&quot; phase found (1). Network is sluggish? Waiting th
[9/9/15 1:22:08 PM] Oleg Drokin: so we are trying to unregister a message from network and cannot
[9/9/15 1:37:49 PM] Amir Shehata: hmm... I&apos;m not familiar with that Unregistering concept.  As far as I understand LNet is message based.  It send a message and waits for a response.  It has its timeouts and ptlrpc has its timeout
[9/9/15 1:38:41 PM] Oleg Drokin: yes. so in this instance ptlrpc registered a buffer sending or receiving. now we want to stop the service so are trying to unregister the buffer
[9/9/15 1:39:01 PM] Oleg Drokin: and we cannot - we hit our timeout waiting for acknowledgement from lnet that thebuffer has been unregistered
[9/9/15 1:39:38 PM] Amir Shehata: hmm.. Ok I&apos;ll take a look at ptlrpc_invalidate_import() to see what I can gather
[9/9/15 2:02:54 PM] Amir Shehata: so what I see is that there is a list: imp_sending_list, which requests get added to when sending, but I don&apos;t see where the request is removed from that list when it&apos;s completed (or aborted)
[9/9/15 3:40:51 PM] Oleg Drokin:                 /* Request already may be not on sending or delaying list. This
                 * may happen in the case of marking it erroneous for the case
                 * ptlrpc_import_delay_req(req, status) find it impossible to
                 * allow sending this rpc and returns *status != 0. */
                if (!list_empty(&amp;amp;req-&amp;gt;rq_list)) {
                        list_del_init(&amp;amp;req-&amp;gt;rq_list);
                        atomic_dec(&amp;amp;imp-&amp;gt;imp_inflight);
[9/9/15 3:40:56 PM] Oleg Drokin: here it&apos;s removed once it was sent
[9/9/15 3:41:06 PM] Oleg Drokin: ther are a couple more places where it is removed if there was a n error sending
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxjc7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>