<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:28 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10035] Many threads hanging on OST, lustre-log dumps</title>
                <link>https://jira.whamcloud.com/browse/LU-10035</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Soak OSS are dumping multiple Lustre logs, many threads hanging. &lt;br/&gt;
OSS&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 26 16:13:09 soak-3 kernel: LNet: Service thread pid 55178 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 200.74s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Sep 26 16:13:09 soak-3 kernel: Pid: 55178, comm: ll_ost00_037
Sep 26 16:13:09 soak-3 kernel: #012Call Trace:
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc09404d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810b1910&amp;gt;] ? autoremove_wake_function+0x0/0x40
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0940515&amp;gt;] __cv_wait+0x15/0x20 [spl]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc2464a0b&amp;gt;] dmu_tx_wait+0x20b/0x3c0 [zfs]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc2464c51&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0ba7efa&amp;gt;] osd_trans_start+0xaa/0x3c0 [osd_zfs]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f96c8d&amp;gt;] tgt_client_data_update+0x2ed/0x5d0 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f9738b&amp;gt;] tgt_client_new+0x41b/0x610 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc111c553&amp;gt;] ofd_obd_connect+0x3a3/0x4c0 [ofd]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0efeaec&amp;gt;] target_handle_connect+0x12bc/0x3200 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810d0408&amp;gt;] ? enqueue_task_fair+0x208/0x6c0
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810c76f5&amp;gt;] ? sched_clock_cpu+0x85/0xc0
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0fa1b62&amp;gt;] tgt_request_handle+0x402/0x1370 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f4aec6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f474f8&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810c4822&amp;gt;] ? default_wake_function+0x12/0x20
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f4e602&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffffc0f4db70&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Sep 26 16:13:09 soak-3 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 16:13:09 soak-3 kernel:
Sep 26 16:13:09 soak-3 kernel: LustreError: dumping log to /tmp/lustre-log.1506442389.55178
Sep 26 16:13:11 soak-3 kernel: LNet: Service thread pid 64795 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 201.48s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Sep 26 16:13:11 soak-3 kernel: Pid: 64795, comm: ll_ost00_043
Sep 26 16:13:11 soak-3 kernel: #012Call Trace:
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc09404d5&amp;gt;] cv_wait_common+0x125/0x150 [spl]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810b1910&amp;gt;] ? autoremove_wake_function+0x0/0x40
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0940515&amp;gt;] __cv_wait+0x15/0x20 [spl]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc2464a0b&amp;gt;] dmu_tx_wait+0x20b/0x3c0 [zfs]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc2464c51&amp;gt;] dmu_tx_assign+0x91/0x490 [zfs]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0ba7efa&amp;gt;] osd_trans_start+0xaa/0x3c0 [osd_zfs]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f96c8d&amp;gt;] tgt_client_data_update+0x2ed/0x5d0 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f9738b&amp;gt;] tgt_client_new+0x41b/0x610 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc111c553&amp;gt;] ofd_obd_connect+0x3a3/0x4c0 [ofd]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0efeaec&amp;gt;] target_handle_connect+0x12bc/0x3200 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810d0408&amp;gt;] ? enqueue_task_fair+0x208/0x6c0
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810c76f5&amp;gt;] ? sched_clock_cpu+0x85/0xc0
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0fa1b62&amp;gt;] tgt_request_handle+0x402/0x1370 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f4aec6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f474f8&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810c4822&amp;gt;] ? default_wake_function+0x12/0x20
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f4e602&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff816a8f00&amp;gt;] ? __schedule+0x2f0/0x8b0
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffffc0f4db70&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Sep 26 16:13:11 soak-3 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 16:13:11 soak-3 kernel:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The MDS reports disconnection from the OST at this time:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 26 16:15:23 soak-8 kernel: Lustre: 2119:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1506441922/real 1506441922]  req@ffff8807bab2e300 x1579578644250848/t0(0) o1000-&amp;gt;soaked-OST000d-osc-MDT0000@192.168.1.103@o2ib:24/4 lens 368/4320 e 23 to 1 dl 1506442523 ref 1 fl Rpc:X/2/ffffffff rc -11/-1
Sep 26 16:15:23 soak-8 kernel: Lustre: soaked-OST0001-osc-MDT0000: Connection to soaked-OST0001 (at 192.168.1.103@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 26 16:15:23 soak-8 kernel: Lustre: Skipped 4 previous similar messages
Sep 26 16:15:23 soak-8 kernel: Lustre: soaked-OST0001-osc-MDT0000: Connection restored to 192.168.1.103@o2ib (at 192.168.1.103@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We see connections dropping and being restored quite a lot. Example covering one minute:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;soak-10.log:Sep 26 16:10:00 soak-10 kernel: Lustre: soaked-MDT0002: Connection restored to 6bb03e1f-2a66-5ad2-7153-3a2729fcc702 (at 192.168.1.143@o2ib)
soak-10.log:Sep 26 16:10:19 soak-10 kernel: Lustre: soaked-MDT0002: Connection restored to f6aa261f-cc7b-f1d5-147f-cc08e579672d (at 192.168.1.124@o2ib)
soak-11.log:Sep 26 16:12:39 soak-11 kernel: Lustre: soaked-OST000d-osc-MDT0003: Connection restored to 192.168.1.103@o2ib (at 192.168.1.103@o2ib)
soak-2.log:Sep 26 16:10:04 soak-2 kernel: Lustre: soaked-OST0006: Connection restored to  (at 192.168.1.117@o2ib)
soak-2.log:Sep 26 16:10:20 soak-2 kernel: Lustre: soaked-OST0000: Connection restored to  (at 192.168.1.125@o2ib)
soak-3.log:Sep 26 16:11:48 soak-3 kernel: Lustre: soaked-OST0001: Connection restored to  (at 192.168.1.111@o2ib)
soak-4.log:Sep 26 16:10:31 soak-4 kernel: Lustre: soaked-OST0002: Connection restored to  (at 192.168.1.133@o2ib)
soak-5.log:Sep 26 16:10:00 soak-5 kernel: Lustre: soaked-OST0009: Connection restored to  (at 192.168.1.143@o2ib)
soak-5.log:Sep 26 16:10:19 soak-5 kernel: Lustre: soaked-OST0015: Connection restored to  (at 192.168.1.124@o2ib)
soak-6.log:Sep 26 16:10:00 soak-6 kernel: Lustre: soaked-OST0010: Connection restored to  (at 192.168.1.143@o2ib)
soak-6.log:Sep 26 16:10:00 soak-6 kernel: Lustre: soaked-OST0016: Connection restored to  (at 192.168.1.143@o2ib)
soak-6.log:Sep 26 16:10:00 soak-6 kernel: Lustre: soaked-OST000a: Connection restored to  (at 192.168.1.143@o2ib)
soak-6.log:Sep 26 16:10:18 soak-6 kernel: Lustre: soaked-OST0016: Connection restored to  (at 192.168.1.124@o2ib)
soak-6.log:Sep 26 16:10:18 soak-6 kernel: Lustre: soaked-OST000a: Connection restored to  (at 192.168.1.124@o2ib)
soak-7.log:Sep 26 16:15:23 soak-7 kernel: Lustre: soaked-OST0005: Connection restored to  (at 192.168.1.108@o2ib)
soak-8.log:Sep 26 16:15:23 soak-8 kernel: Lustre: soaked-OST0001-osc-MDT0000: Connection restored to 192.168.1.103@o2ib (at 192.168.1.103@o2ib)
soak-9.log:Sep 26 16:10:00 soak-9 kernel: Lustre: soaked-MDT0001: Connection restored to 6bb03e1f-2a66-5ad2-7153-3a2729fcc702 (at 192.168.1.143@o2ib)
soak-9.log:Sep 26 16:10:19 soak-9 kernel: Lustre: soaked-MDT0001: Connection restored to f6aa261f-cc7b-f1d5-147f-cc08e579672d (at 192.168.1.124@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>Soak cluster, b2.10.1 + wangdi diag patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9983&quot; title=&quot;LBUG llog_osd.c:327:llog_osd_declare_write_rec() - all DNE MDS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9983&quot;&gt;&lt;strike&gt;LU-9983&lt;/strike&gt;&lt;/a&gt;</environment>
        <key id="48482">LU-10035</key>
            <summary>Many threads hanging on OST, lustre-log dumps</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Tue, 26 Sep 2017 17:26:47 +0000</created>
                <updated>Thu, 20 Dec 2018 18:01:08 +0000</updated>
                                            <version>Lustre 2.10.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="209611" author="jgmitter" created="Tue, 26 Sep 2017 17:59:55 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Can you please have a look into this?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="209612" author="adilger" created="Tue, 26 Sep 2017 18:03:42 +0000"  >&lt;p&gt;In the &lt;a href=&quot;https://jira.hpdd.intel.com/secure/attachment/28360/soak-3.lustre-log.1506442389.55178&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;soak-3.lustre-log.1506442389.55178&lt;/a&gt; log there is a continuous stream of LNet messages that imply some sort of connection problem:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000800:00000100:5.0:1506439845.140835:0:5558:0:(o2iblnd_cb.c:490:kiblnd_rx_complete()) Rx from 192.168.1.115@o2ib failed: 5
00000800:00000100:5.0:1506439845.140838:0:5558:0:(o2iblnd_cb.c:490:kiblnd_rx_complete()) Rx from 192.168.1.115@o2ib failed: 5
00000800:00000100:1.0:1506439845.140839:0:79688:0:(o2iblnd_cb.c:2677:kiblnd_rejected()) 192.168.1.115@o2ib rejected: no listener at 987
00000800:00000100:5.0:1506439845.140840:0:5558:0:(o2iblnd_cb.c:490:kiblnd_rx_complete()) Rx from 192.168.1.115@o2ib failed: 5
00000800:00000100:1.0:1506439845.148485:0:79688:0:(o2iblnd_cb.c:2651:kiblnd_check_reconnect()) 192.168.1.115@o2ib: reconnect (invalid service id), 12, 12, msg_size: 4096, queue_depth: 8/-1, max_frags: 256/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It isn&apos;t clear if there is a network configuration problem, or if LNet multi-rail/DD are doing the wrong thing and flooding the network with extra requests?&lt;/p&gt;</comment>
                            <comment id="209615" author="ashehata" created="Tue, 26 Sep 2017 18:06:02 +0000"  >&lt;p&gt;Is this using OPA? Are there multiple configured interfaces on some nodes?&lt;/p&gt;</comment>
                            <comment id="209650" author="cliffw" created="Tue, 26 Sep 2017 19:56:04 +0000"  >&lt;p&gt;The 192.168.1.114 and 192.168.1.115 nodes are LNET-&amp;gt;OPA routers. At the time this test was run, those nodes were powered off due to missing OPA bits (since fixed), so I would ignore any messages referencing those nodes; they are just servers trying to ping dead routers. &lt;/p&gt;</comment>
                            <comment id="209652" author="ashehata" created="Tue, 26 Sep 2017 20:03:48 +0000"  >&lt;p&gt;So this might not be related to LNet; could it be an issue with the patch applied? Can we roll the patch back and retry?&lt;/p&gt;</comment>
                            <comment id="209674" author="cliffw" created="Tue, 26 Sep 2017 22:32:16 +0000"  >&lt;p&gt;Rolling back the patch leaves us with a system that LBUGs after 15 seconds of operation, so I&apos;d say no.&lt;/p&gt;</comment>
                            <comment id="209681" author="cliffw" created="Wed, 27 Sep 2017 00:03:58 +0000"  >&lt;p&gt;Running with the debug patch, seeing a lot of reconnects, in bursts:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;/scratch/logs/syslog/soak-41.log:Sep 26 23:57:58 soak-41 kernel: Lustre: soaked-OST000f-osc-ffff880173a8c000: Connection to soaked-OST000f (at 192.168.1.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-41.log:Sep 26 23:57:58 soak-41 kernel: Lustre: soaked-OST000f-osc-ffff880173a8c000: Connection restored to 192.168.1.105@o2ib (at 192.168.1.105@o2ib)
/scratch/logs/syslog/soak-41.log:Sep 26 23:59:47 soak-41 kernel: Lustre: soaked-MDT0000-mdc-ffff880173a8c000: Connection to soaked-MDT0000 (at 192.168.1.108@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-41.log:Sep 26 23:59:47 soak-41 kernel: Lustre: soaked-MDT0000-mdc-ffff880173a8c000: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)
/scratch/logs/syslog/soak-42.log:Sep 26 23:57:59 soak-42 kernel: Lustre: soaked-OST000f-osc-ffff880173f87000: Connection to soaked-OST000f (at 192.168.1.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-42.log:Sep 26 23:57:59 soak-42 kernel: Lustre: soaked-OST000f-osc-ffff880173f87000: Connection restored to 192.168.1.105@o2ib (at 192.168.1.105@o2ib)
/scratch/logs/syslog/soak-42.log:Sep 26 23:59:46 soak-42 kernel: Lustre: soaked-MDT0000-mdc-ffff880173f87000: Connection to soaked-MDT0000 (at 192.168.1.108@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-42.log:Sep 26 23:59:46 soak-42 kernel: Lustre: soaked-MDT0000-mdc-ffff880173f87000: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)
/scratch/logs/syslog/soak-43.log:Sep 26 23:57:59 soak-43 kernel: Lustre: soaked-OST000f-osc-ffff880173ae4000: Connection to soaked-OST000f (at 192.168.1.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-43.log:Sep 26 23:57:59 soak-43 kernel: Lustre: soaked-OST000f-osc-ffff880173ae4000: Connection restored to 192.168.1.105@o2ib (at 192.168.1.105@o2ib)
/scratch/logs/syslog/soak-43.log:Sep 26 23:59:48 soak-43 kernel: Lustre: soaked-MDT0000-mdc-ffff880173ae4000: Connection to soaked-MDT0000 (at 192.168.1.108@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-43.log:Sep 26 23:59:48 soak-43 kernel: Lustre: soaked-MDT0000-mdc-ffff880173ae4000: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)
/scratch/logs/syslog/soak-44.log:Sep 26 23:57:58 soak-44 kernel: Lustre: soaked-OST000f-osc-ffff88083e79f800: Connection to soaked-OST000f (at 192.168.1.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-44.log:Sep 26 23:57:59 soak-44 kernel: Lustre: soaked-OST000f-osc-ffff88083e79f800: Connection restored to 192.168.1.105@o2ib (at 192.168.1.105@o2ib)
/scratch/logs/syslog/soak-44.log:Sep 26 23:59:48 soak-44 kernel: Lustre: soaked-MDT0000-mdc-ffff88083e79f800: Connection to soaked-MDT0000 (at 192.168.1.108@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-44.log:Sep 26 23:59:48 soak-44 kernel: Lustre: soaked-MDT0000-mdc-ffff88083e79f800: Connection restored to 192.168.1.108@o2ib (at 192.168.1.108@o2ib)
/scratch/logs/syslog/soak-5.log:Sep 26 23:57:58 soak-5 kernel: Lustre: soaked-OST000f: Connection restored to  (at 192.168.1.141@o2ib)
/scratch/logs/syslog/soak-8.log:Sep 26 23:51:32 soak-8 kernel: Lustre: soaked-OST000f-osc-MDT0000: Connection to soaked-OST000f (at 192.168.1.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
/scratch/logs/syslog/soak-8.log:Sep 26 23:58:07 soak-8 kernel: Lustre: soaked-MDT0000: Connection restored to 22754519-5c3c-7d4f-2c9f-eacb7fdfd149 (at 192.168.1.140@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Server-side is dumping logs, for example:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Sep 26 23:56:08 soak-5 systemd: Stopping User Slice of root.
Sep 26 23:56:41 soak-5 kernel: Lustre: 43683:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply#012  req@ffff8808145b9450 x1579643797320240/t0(0) o5-&amp;gt;soaked-MDT0000-mdtlov_UUID@192.168.1.108@o2ib:586/0 lens 432/432 e 1 to 0 dl 1506470206 ref 2 fl Interpret:/0/0 rc 0/0
Sep 26 23:56:41 soak-5 kernel: Lustre: 43683:0:(service.c:1346:ptlrpc_at_send_early_reply()) Skipped 10 previous similar messages
Sep 26 23:56:47 soak-5 kernel: LNet: Service thread pid 149701 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 1203.59s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Sep 26 23:56:47 soak-5 kernel: LNet: Skipped 3 previous similar messages
Sep 26 23:56:47 soak-5 kernel: Pid: 149701, comm: ll_ost01_037
Sep 26 23:56:47 soak-5 kernel: #012Call Trace:
Sep 26 23:56:47 soak-5 kernel: [&amp;lt;ffffffff816aa409&amp;gt;] schedule_preempt_disabled+0x29/0x70
Sep 26 23:56:47 soak-5 kernel: [&amp;lt;ffffffff816a8337&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
Sep 26 23:56:47 soak-5 kernel: [&amp;lt;ffffffff816a774f&amp;gt;] mutex_lock+0x1f/0x2f
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc127ccfb&amp;gt;] ofd_create_hdl+0xdcb/0x2090 [ofd]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0ed1de7&amp;gt;] ? lustre_msg_add_version+0x27/0xa0 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0ed213f&amp;gt;] ? lustre_pack_reply_v2+0x14f/0x280 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0ed2461&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0f35085&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0eddec6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0eda4f8&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffff810c4822&amp;gt;] ? default_wake_function+0x12/0x20
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffff810ba588&amp;gt;] ? __wake_up_common+0x58/0x90
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0ee1602&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffffc0ee0b70&amp;gt;] ? ptlrpc_main+0x0/0x1e40 [ptlrpc]
Sep 26 23:56:48 soak-5 kernel: [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Sep 26 23:56:49 soak-5 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 23:56:49 soak-5 kernel: [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Sep 26 23:56:49 soak-5 kernel: [&amp;lt;ffffffff810b08c0&amp;gt;] ? kthread+0x0/0xe0
Sep 26 23:56:49 soak-5 kernel:
Sep 26 23:56:49 soak-5 kernel: LustreError: dumping log to /tmp/lustre-log.1506470209.149701
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We will run lnet_selftest tomorrow to validate the hardware after the upgrade.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28360" name="soak-3.lustre-log.1506442389.55178" size="3185732" author="cliffw" created="Tue, 26 Sep 2017 17:27:04 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzkun:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>