<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:05:06 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13892] LNetError kiblnd_check_conns() Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8</title>
                <link>https://jira.whamcloud.com/browse/LU-13892</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;lctl and lnetctl commands hang.&lt;/p&gt;

&lt;p&gt;console log messages on router:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Aug&#160; 6 13:45:29 2020] LNet: 21326:0:(o2iblnd_cb.c:3397:kiblnd_check_conns()) Timed out tx for 172.19.2.24@o2ib100: 121 seconds
[Thu Aug&#160; 6 13:47:13 2020] LNetError: 21326:0:(o2iblnd_cb.c:3351:kiblnd_check_txs_locked()) Timed out tx: active_txs, 0 seconds
[Thu Aug&#160; 6 13:47:13 2020] LNetError: 21326:0:(o2iblnd_cb.c:3426:kiblnd_check_conns()) Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8
[Thu Aug&#160; 6 13:47:14 2020] LNetError: 21326:0:(o2iblnd_cb.c:3351:kiblnd_check_txs_locked()) Timed out tx: active_txs, 1 seconds
[Thu Aug&#160; 6 13:47:14 2020] LNetError: 21326:0:(o2iblnd_cb.c:3426:kiblnd_check_conns()) Timed out RDMA with 172.19.1.196@o2ib100 (108): c: 8, oc: 0, rc: 8 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Watchdog complains about hung kernel tasks:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Aug  6 13:47:28 2020] INFO: task kworker/43:1:584 blocked for more than 120 seconds.
[Thu Aug  6 13:47:28 2020] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[Thu Aug  6 13:47:28 2020] kworker/43:1    D ffff98cc7df2e2a0     0   584      2 0x00000080
[Thu Aug  6 13:47:28 2020] Workqueue: ib_cm cm_work_handler [ib_cm]
[Thu Aug  6 13:47:28 2020] Call Trace:
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa9bb4cb9&amp;gt;] schedule_preempt_disabled+0x39/0x90
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa9bb27ef&amp;gt;] __mutex_lock_slowpath+0x10f/0x250
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa9bb1a82&amp;gt;] mutex_lock+0x32/0x42
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc14d4aa1&amp;gt;] lnet_nid2peerni_locked+0x71/0x150 [lnet]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc14c1591&amp;gt;] lnet_parse+0x791/0x1200 [lnet]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0ad23a3&amp;gt;] kiblnd_handle_rx+0x223/0x6f0 [ko2iblnd]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa94e26ea&amp;gt;] ? try_to_wake_up+0x19a/0x3c0
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0ad28ef&amp;gt;] kiblnd_handle_early_rxs+0x7f/0x120 [ko2iblnd]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0ad385e&amp;gt;] kiblnd_connreq_done+0x29e/0x6e0 [ko2iblnd]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0ad6617&amp;gt;] kiblnd_cm_callback+0x11e7/0x23b0 [ko2iblnd]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa94c2eda&amp;gt;] ? __queue_delayed_work+0xaa/0x1a0
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc11dffbf&amp;gt;] cma_ib_handler+0x13f/0x2b0 [rdma_cm]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0bdafab&amp;gt;] cm_process_work+0x2b/0x140 [ib_cm]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffc0bdd3f3&amp;gt;] cm_work_handler+0xb53/0x158a [ib_cm]
[Thu Aug  6 13:47:28 2020]  [&amp;lt;ffffffffa94c197e&amp;gt;] ? move_linked_works+0x5e/0x90
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94c20a3&amp;gt;] ? pwq_activate_delayed_work+0x43/0xe0
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94c43cf&amp;gt;] process_one_work+0x18f/0x4a0
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94c51b6&amp;gt;] worker_thread+0x126/0x3e0
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94c5090&amp;gt;] ? rescuer_thread+0x430/0x430
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94cca01&amp;gt;] kthread+0xd1/0xe0
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94cc930&amp;gt;] ? insert_kthread_work+0x40/0x40
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa9bc0f64&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[Thu Aug  6 13:47:29 2020]  [&amp;lt;ffffffffa94cc930&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;utility commands hang on router with these stacks:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@corona86:pass2]# for x in 21845 25481 18055; do ps -fp $x; cat /proc/${x}/stack; echo; done       
UID         PID   PPID  C STIME TTY          TIME CMD
root      21845      1  0 13:49 ?        00:00:00 lctl list_nids
[&amp;lt;ffffffffc14a0bf6&amp;gt;] LNetNIInit+0x46/0xc20 [lnet]
[&amp;lt;ffffffffc14c29ae&amp;gt;] lnet_ioctl+0x4e/0x270 [lnet]
[&amp;lt;ffffffffa9bbbcbf&amp;gt;] notifier_call_chain+0x4f/0x70
[&amp;lt;ffffffffa94d34ed&amp;gt;] __blocking_notifier_call_chain+0x4d/0x70
[&amp;lt;ffffffffa94d3526&amp;gt;] blocking_notifier_call_chain+0x16/0x20
[&amp;lt;ffffffffc0a5cae3&amp;gt;] libcfs_ioctl+0x2a3/0x510 [libcfs]
[&amp;lt;ffffffffc0a58887&amp;gt;] libcfs_psdev_ioctl+0x67/0xf0 [libcfs]
[&amp;lt;ffffffffa9675800&amp;gt;] do_vfs_ioctl+0x420/0x6d0
[&amp;lt;ffffffffa9675b51&amp;gt;] SyS_ioctl+0xa1/0xc0
[&amp;lt;ffffffffa9bc1112&amp;gt;] system_call_fastpath+0x25/0x2a
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

UID         PID   PPID  C STIME TTY          TIME CMD
root      25481      1  0 14:07 ?        00:00:00 lctl ping 192.168.128.4@o2ib36
[&amp;lt;ffffffffc14a0bf6&amp;gt;] LNetNIInit+0x46/0xc20 [lnet]
[&amp;lt;ffffffffc14c29ae&amp;gt;] lnet_ioctl+0x4e/0x270 [lnet]
[&amp;lt;ffffffffa9bbbcbf&amp;gt;] notifier_call_chain+0x4f/0x70
[&amp;lt;ffffffffa94d34ed&amp;gt;] __blocking_notifier_call_chain+0x4d/0x70
[&amp;lt;ffffffffa94d3526&amp;gt;] blocking_notifier_call_chain+0x16/0x20
[&amp;lt;ffffffffc0a5cae3&amp;gt;] libcfs_ioctl+0x2a3/0x510 [libcfs]
[&amp;lt;ffffffffc0a58887&amp;gt;] libcfs_psdev_ioctl+0x67/0xf0 [libcfs]
[&amp;lt;ffffffffa9675800&amp;gt;] do_vfs_ioctl+0x420/0x6d0
[&amp;lt;ffffffffa9675b51&amp;gt;] SyS_ioctl+0xa1/0xc0
[&amp;lt;ffffffffa9bc1112&amp;gt;] system_call_fastpath+0x25/0x2a
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff

UID         PID   PPID  C STIME TTY          TIME CMD
root      18055  25762  0 14:28 pts/3    00:00:00 lnetctl lnet configure
[&amp;lt;ffffffffc14c28f6&amp;gt;] lnet_configure+0x16/0x80 [lnet]
[&amp;lt;ffffffffc14c2bb5&amp;gt;] lnet_ioctl+0x255/0x270 [lnet]
[&amp;lt;ffffffffa9bbbcbf&amp;gt;] notifier_call_chain+0x4f/0x70
[&amp;lt;ffffffffa94d34ed&amp;gt;] __blocking_notifier_call_chain+0x4d/0x70
[&amp;lt;ffffffffa94d3526&amp;gt;] blocking_notifier_call_chain+0x16/0x20
[&amp;lt;ffffffffc0a5cae3&amp;gt;] libcfs_ioctl+0x2a3/0x510 [libcfs]
[&amp;lt;ffffffffc0a58887&amp;gt;] libcfs_psdev_ioctl+0x67/0xf0 [libcfs]
[&amp;lt;ffffffffa9675800&amp;gt;] do_vfs_ioctl+0x420/0x6d0
[&amp;lt;ffffffffa9675b51&amp;gt;] SyS_ioctl+0xa1/0xc0
[&amp;lt;ffffffffa9bc1112&amp;gt;] system_call_fastpath+0x25/0x2a
[&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The router node is a dual-socket AMD EPYC 7401 24-core system. The internal fabric is managed by a Mellanox SM, and Lustre was built against MOFED.&lt;/p&gt;</description>
                <environment>router node&lt;br/&gt;
lustre-2.12.5_4.llnl-1.1mofed.ch6.x86_64&lt;br/&gt;
3.10.0-1127.18.2.1chaos.ch6.x86_64</environment>
        <key id="60336">LU-13892</key>
            <summary>LNetError kiblnd_check_conns() Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ssmirnov">Serguei Smirnov</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Fri, 7 Aug 2020 16:06:55 +0000</created>
                <updated>Fri, 16 Jul 2021 18:24:15 +0000</updated>
                            <resolved>Thu, 22 Oct 2020 13:57:55 +0000</resolved>
                                    <version>Lustre 2.12.5</version>
                                    <fixVersion>Lustre 2.12.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="276950" author="ofaaland" created="Fri, 7 Aug 2020 16:07:22 +0000"  >&lt;p&gt;For my records, my locla ticket is TOSS4861&lt;/p&gt;</comment>
                            <comment id="276953" author="ofaaland" created="Fri, 7 Aug 2020 16:37:35 +0000"  >&lt;p&gt;As far as I can tell both the internal fabric and the SAN are fine.  The subnet managers aren&apos;t reporting issues, &lt;del&gt;and I can ping (/bin/ping) nodes on both the SAN and the internal fabric successfully.&lt;/del&gt;&lt;/p&gt;

&lt;p&gt;edit: ICMP pings are not working now.  Last night I could ping fine, this morning I cannot.   The LNet symptoms were present even when pings were succeeding 100% of the time.&lt;/p&gt;</comment>
                            <comment id="276967" author="pjones" created="Fri, 7 Aug 2020 17:40:08 +0000"  >&lt;p&gt;Serguei&lt;/p&gt;

&lt;p&gt;Can you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="276968" author="ofaaland" created="Fri, 7 Aug 2020 17:40:10 +0000"  >&lt;p&gt;Load average on the router is 52, but sysrq &quot;l&quot; shows no processes on-core aside from the one handling the sysrq.  I assume that means that some group of processes, perhaps lnet, are repeatedly waking up all at the same time, and then immediately going to sleep again.&lt;/p&gt;</comment>
                            <comment id="277046" author="ofaaland" created="Mon, 10 Aug 2020 06:37:41 +0000"  >&lt;p&gt;Serguei,&lt;br/&gt;
Please let me know if there&apos;s other information you would like me to get from the node.&lt;br/&gt;
thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="277134" author="ssmirnov" created="Mon, 10 Aug 2020 19:47:43 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;Did you just upgrade your routers to version 2.12.5 and then start seeing this problem? If so, what version were you using before?&lt;/p&gt;

&lt;p&gt;It looks like &quot;lnetctl lnet configure&quot; is getting stuck. Do you see it complete if you just load the modules and run it?&lt;/p&gt;
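&lt;p&gt;(A minimal sketch of what I mean, assuming the modules are not loaded yet:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# load only the LNet modules, then bring LNet up by itself
modprobe lnet
lnetctl lnet configure
# if this returns promptly, the hang is specific to the full startup path
lctl list_nids&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;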

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277138" author="ofaaland" created="Mon, 10 Aug 2020 19:58:46 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Yes, this symptom appeared after booting the cluster for the first time as 2.12.5. The cluster was at 2.12.4_6.chaos before the upgrade.&lt;/p&gt;

&lt;p&gt;At the time I ran lnetctl lnet configure, the modules were already loaded.&lt;/p&gt;

&lt;p&gt;The module parameters look like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options lnet forwarding=&quot;enabled&quot; \
&#160; &#160; &#160; &#160; networks=&quot;o2ib100(san0),o2ib36(hsi0)&quot; \
&#160; &#160; &#160; &#160; routes=&quot;o2ib600 172.19.2.[22-25]@o2ib100&quot; 

options libcfs libcfs_panic_on_lbug=1
options libcfs libcfs_debug=0x3060580
options ptlrpc at_min=45
options ptlrpc at_max=600
options ksocklnd keepalive_count=100
options ksocklnd keepalive_idle=30
options lnet check_routers_before_use=1&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We use systemd to start lnet. The unit file looks like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Unit]
Description=lnet management

Requires=network-online.target
After=network-online.target

Before=pacemaker_remote.service
After=rc-local.service

ConditionPathExists=!/proc/sys/lnet/
ConditionPathExists=!/etc/badnode

[Service]
Type=oneshot
RemainAfterExit=true
ExecStart=/sbin/modprobe lnet
ExecStart=/usr/sbin/lnetctl lnet configure --all
ExecStart=/usr/sbin/lnetctl import /etc/lnet.conf
ExecStart=/usr/sbin/lnetctl set discovery 0
ExecStop=/usr/sbin/lustre_rmmod ptlrpc
ExecStop=/usr/sbin/lnetctl lnet unconfigure
ExecStop=/usr/sbin/lustre_rmmod libcfs ldiskfs

[Install]
WantedBy=multi-user.target
RequiredBy=pacemaker_remote.service&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and /etc/lnet.conf is all comments:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@corona86:~]# grep -v &apos;^#&apos; /etc/lnet.conf
[root@corona86:~]# &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;thanks,&lt;br/&gt;
 Olaf&lt;/p&gt;</comment>
                            <comment id="277140" author="ofaaland" created="Mon, 10 Aug 2020 20:09:29 +0000"  >&lt;p&gt;The issue is intermittent.&lt;/p&gt;

&lt;p&gt;The cluster has two router nodes.&#160; Both had the same symptoms initially.&#160; One was rebooted and came up without these errors and without hanging lctl/lnetctl commands.&#160; This node was rebooted and still had the issue.&lt;/p&gt;</comment>
                            <comment id="277143" author="ssmirnov" created="Mon, 10 Aug 2020 20:43:26 +0000"  >&lt;p&gt;Could you please try changing &quot;check_routers_before_use&quot; to be 0? This could help to narrow down where the initialization is getting stuck. Also, could you please make sure lnet logging is enabled (lctl set_param debug=+net)?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277148" author="ofaaland" created="Mon, 10 Aug 2020 22:18:17 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;I&apos;ve set check_routers_before_use=0 and since then haven&apos;t seen the issue (4 reboots).&#160; I&apos;m repeating the experiment now.&#160; I&apos;ll post the results in about an hour.&lt;/p&gt;

&lt;p&gt;thanks,&lt;/p&gt;

&lt;p&gt;Olaf&lt;/p&gt;</comment>
                            <comment id="277152" author="ssmirnov" created="Mon, 10 Aug 2020 23:13:13 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;I&apos;m wondering about the state of the routers that you have configured in lnet options. Can they be &quot;lnetctl ping&quot;ed at the time the router under test is being rebooted?&lt;/p&gt;
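&lt;p&gt;(i.e. something like the following, using one of the gateway NIDs from your routes setting:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# from a node that can reach o2ib100, ping each gateway at the LNet level
lnetctl ping 172.19.2.22@o2ib100&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;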

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277155" author="ofaaland" created="Mon, 10 Aug 2020 23:29:32 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;lnetctl ping of those routers was successful from corona85, while corona86 was booting and then entered the problem state.&lt;/p&gt;

&lt;p&gt;I had about 15 reboots of corona86 in a row without the issue when check_routers_before_use=0, and as soon as I set it to 1 and rebooted, corona86 entered the problem state.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="277225" author="ssmirnov" created="Tue, 11 Aug 2020 19:26:01 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;With &quot;check_routers_before_use&quot; on, the issue appears to be that corona86 stalls initialization because it believes that not all of the routers on its list are &quot;alive&quot;, even though they seem to be pingable from another node. When you provide the lnet debug log we should be able to see more. Please capture one with&#160; &quot;check_routers_before_use&quot; turned off, too (for this one, allow ~2 minutes).&#160;&lt;/p&gt;

&lt;p&gt;I checked with Amir on this. Could you please also provide the output of &quot;lnetctl global show&quot; for corona86? If the health feature is on, you can try disabling it and see if there&apos;s any effect.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277240" author="ofaaland" created="Tue, 11 Aug 2020 22:02:58 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
I uploaded two files:&lt;br/&gt;
pass5.check_1.tgz - check_routers_before_use is set to 1 and we see the issue&lt;br/&gt;
pass6.check_0.tgz - check_routers_before_use is set to 0 and we do not see the issue&lt;/p&gt;

&lt;p&gt;Both have modprobe.conf files, dmesg, and debug logs.  The one where check_routers_before_use is 0 has the output of &lt;tt&gt;lnetctl export&lt;/tt&gt;, which includes the global settings.  That output isn&apos;t present for &lt;tt&gt;check_routers_before_use=1&lt;/tt&gt; because the lnetctl command hangs, as reported originally.&lt;/p&gt;

&lt;p&gt;thanks&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="277246" author="ofaaland" created="Tue, 11 Aug 2020 23:15:45 +0000"  >&lt;p&gt;Disabling lnet health also seems to work around the issue.&#160; 3 reboots without incident, with &lt;tt&gt;check_routers_before_use=1&lt;/tt&gt; and &lt;tt&gt;lnet_health_sensitivity=0&lt;/tt&gt;&lt;/p&gt;

&lt;p&gt;I&apos;ve uploaded logs in pass9.check_1.health_0.tgz including debug logs, dmesg, modprobe.d files, and lnetctl export output.&lt;/p&gt;</comment>
                            <comment id="277252" author="ofaaland" created="Wed, 12 Aug 2020 00:38:10 +0000"  >&lt;p&gt;In case it helps, the relevant lnet topology is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[corona compute nodes]--o2ib36/coronaIBfabric---[corona85,86]--o2ib100/IB_SAN_B451--[orelic2,3,4,5]--10gigE--[zrelic2,3,4,5]--o2ib600/IB_SAN_B654--[two lustre clusters]&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So the gateways specified in the &lt;tt&gt;routes&lt;/tt&gt; lnet module parameter are the orelic nodes in that diagram.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# corona86 NID on the IB_SAN_B451 net is  172.19.2.27@o2ib100
[root@orelic2:~]# lnetctl export | grep -A14 &quot;primary.*172.19.2.27@o2ib100&quot;
    - primary nid: 172.19.2.27@o2ib100
      Multi-Rail: False
      peer ni:
        - nid: 172.19.2.27@o2ib100
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: -23
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: -74
          send_count: 1486791834
          recv_count: 1548015297
          drop_count: 6805
          refcount: 4
[root@orelic2:~]# rpm -q lustre
lustre-2.10.8_9.chaos-1.ch6.x86_64&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I wonder if our buffer or credit configuration is causing messages to be dropped on the orelic nodes, which in turn causes ping failures on corona86.&lt;br/&gt;
 See &lt;tt&gt;min_tx_credits&lt;/tt&gt; on the routes between a pair of orelic and zrelic nodes:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@zrelic2:~]# lctl list_nids
172.16.70.62@tcp
172.19.3.62@o2ib600

[root@zrelic2:~]# lnetctl export | grep -A10 &apos;primary.*172.16.66.22@tcp&apos;
    - primary nid: 172.16.66.22@tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.16.66.22@tcp
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: -233
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: -4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@orelic2:~]# lctl list_nids
172.16.66.22@tcp
172.19.2.22@o2ib100

[root@orelic2:~]# lnetctl export | grep -A10 &apos;primary.*172.16.70.62@tcp&apos;
    - primary nid: 172.16.70.62@tcp
      Multi-Rail: False
      peer ni:
        - nid: 172.16.70.62@tcp
          state: up
          max_ni_tx_credits: 8
          available_tx_credits: 8
          min_tx_credits: -397
          tx_q_num_of_buf: 0
          available_rtr_credits: 8
          min_rtr_credits: -5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="277358" author="ssmirnov" created="Wed, 12 Aug 2020 19:01:19 +0000"  >&lt;p&gt;Hi Olaf,&#160;&lt;/p&gt;

&lt;p&gt;It may be a good idea to check the credit configuration, but I don&apos;t think it is related to the issue with the start-up we&apos;re trying to debug.&lt;/p&gt;

&lt;p&gt;Here are a few more experiments that would be nice to try:&lt;/p&gt;

&lt;p&gt;1) With &lt;tt&gt;check_routers_before_use=1&lt;/tt&gt; and &lt;tt&gt;lnet_health_sensitivity=100&lt;/tt&gt;, try setting &lt;tt&gt;retry_count=1&lt;/tt&gt; and &lt;tt&gt;transaction_timeout=100&lt;/tt&gt; (see the sketch below).&lt;/p&gt;

&lt;p&gt;2) This is unlikely but still would be good to check: in addition to the linux routing setup described &lt;a href=&quot;http://wiki.lustre.org/LNet_Router_Config_Guide#ARP_flux_issue_for_MR_node&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;here&lt;/a&gt;, the accept_local flag should be set to 1.&lt;/p&gt;
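&lt;p&gt;(For experiment 1, a sketch of the corresponding module options; parameter names as in 2.12, please double-check against your build:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options lnet check_routers_before_use=1
options lnet lnet_health_sensitivity=100
options lnet lnet_retry_count=1
options lnet lnet_transaction_timeout=100&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;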

&lt;p&gt;Amir and I were trying to track the first attempt to send to the routers, but it looks like this moment is not captured in the logs. Would it be possible to start lnet logging sooner? You should be able to reproduce without rebooting the node, just reloading the modules should work. It may give us more clues as to why the connection times out in one scenario but not the other.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei&#160;&lt;/p&gt;</comment>
                            <comment id="277364" author="ofaaland" created="Wed, 12 Aug 2020 19:53:09 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
OK.  I&apos;ll try experiments 1 and 2.&lt;br/&gt;
The reboots are because it&apos;s often slower to wait for lnet to unload than it is to reboot the node.&lt;/p&gt;

&lt;p&gt;It&apos;s odd that the initial send attempt was not captured.  I&apos;m setting&lt;br/&gt;
&lt;tt&gt;options libcfs libcfs_debug=-1&lt;/tt&gt;&lt;br/&gt;
via a modprobe.d config file, so I don&apos;t know how I could have missed it.  Which &quot;pass&quot; contains the debug log dump you are referring to? I&apos;d like to look and see if I can understand what happened.&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="277370" author="ssmirnov" created="Wed, 12 Aug 2020 20:21:05 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;For example, if I search for &quot;172.19.2.22&quot; in the debug log of&#160;pass9.check_1.health_0, I see that the first occurrence is the callback that signals that the connection is established. If I understand correctly, there should be messages related to queuing the tx prior to this, leading to initiating and establishing the connection.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277387" author="ofaaland" created="Wed, 12 Aug 2020 22:12:51 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
 I uploaded logs for the test with transaction_timeout=100 and a re-run of the test with sensitivity=0. See logs with names pass10* and pass11*.&lt;br/&gt;
thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="277388" author="ssmirnov" created="Wed, 12 Aug 2020 22:47:20 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;Just to clarify, did the system hang in pass10 or pass11?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei&lt;/p&gt;</comment>
                            <comment id="277390" author="ofaaland" created="Wed, 12 Aug 2020 22:54:12 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
 The system did not hang on startup in either pass10 or pass11.&lt;br/&gt;
 For pass10, I was able to stop lnet and unload the modules.&lt;br/&gt;
 For pass11, the system hung when I stopped lnet, with console log errors like this:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-08-12 14:06:38 [79017.582133] LNetError: 80234:0:(lib-move.c:4391:lnet_parse()) 172.19.1.184@o2ib100, src 172.19.1.184@o2ib100: Dropping GET (error -108 looking up sender)
2020-08-12 14:06:38 [79017.595904] LNetError: 80234:0:(lib-move.c:4391:lnet_parse()) Skipped 83936 previous similar messages
2020-08-12 14:13:39 [79438.224874] LNet: 80439:0:(api-ni.c:2010:lnet_clear_zombies_nis_locked()) Waiting for zombie LNI 172.19.2.27@o2ib100
2020-08-12 14:16:39 [79617.606824] LNetError: 80204:0:(lib-move.c:4391:lnet_parse()) 172.19.2.22@o2ib100, src 172.19.3.102@o2ib600: Dropping PUT (error -108 looking up sender)
2020-08-12 14:16:42 [79617.620470] LNetError: 80204:0:(lib-move.c:4391:lnet_parse()) Skipped 88073 previous similar messages
2020-08-12 14:26:40 [80217.667311] LNetError: 80224:0:(lib-move.c:4391:lnet_parse()) 172.19.1.229@o2ib100, src 172.19.1.229@o2ib100: Dropping GET (error -108 looking up sender)
2020-08-12 14:26:40 [80217.681198] LNetError: 80224:0:(lib-move.c:4391:lnet_parse()) Skipped 89526 previous similar messages
2020-08-12 14:36:40 [80817.759190] LNetError: 80224:0:(lib-move.c:4391:lnet_parse()) 172.19.1.229@o2ib100, src 172.19.1.229@o2ib100: Dropping GET (error -108 looking up sender)
2020-08-12 14:36:40 [80817.772927] LNetError: 80224:0:(lib-move.c:4391:lnet_parse()) Skipped 83044 previous similar messages&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and I ended up rebooting the node.&lt;/p&gt;

&lt;p&gt;edit: I pasted in the wrong log entries initially&lt;/p&gt;</comment>
                            <comment id="277494" author="ofaaland" created="Thu, 13 Aug 2020 23:53:57 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;I&apos;m considering continuing my rollout with settings&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options lnet check_routers_before_use=1
options lnet lnet_health_sensitivity=0 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Do you have any clearer idea where the issue is yet?  Do you have any idea whether the issue is really worked around this way, or just made less obvious?&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="277495" author="ssmirnov" created="Fri, 14 Aug 2020 00:11:57 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;My current understanding is that the issue is the connection taking a long time to establish, exceeding the lnd timeout. Something then goes wrong and the tx is retried too soon, which makes things even worse. I believe this is why there&apos;s no more hanging when the transaction timeout, from which the lnd timeout is derived, is increased one way or another. That said, I&apos;m not sure why this behaviour was not seen with the release you used previously, as I haven&apos;t been able to locate the change in 2.12.5 that could affect this.&lt;/p&gt;

&lt;p&gt;So in my opinion, everything that you&apos;ve tried successfully so far is a workaround: the issue still exists, but is hidden because a long enough lnd timeout is in effect.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="277528" author="ofaaland" created="Fri, 14 Aug 2020 16:54:55 +0000"  >&lt;p&gt;Hi Serguei,  Thanks&lt;/p&gt;</comment>
                            <comment id="278089" author="ofaaland" created="Wed, 26 Aug 2020 00:00:04 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Have you learned anything further?&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="278144" author="ssmirnov" created="Wed, 26 Aug 2020 20:12:02 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;I&apos;m trying to simulate the issue using eth as I currently have no access to IB. I haven&apos;t been able to figure it out yet.&lt;/p&gt;

&lt;p&gt;Did you have a chance to check on the linux routing setup on your machines? I&apos;m wondering about the value of accept_local flag. This can affect local interface recovery. There&apos;s more detail to be found here:&#160;&lt;a href=&quot;http://wiki.lustre.org/LNet_Router_Config_Guide#ARP_flux_issue_for_MR_node&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://wiki.lustre.org/LNet_Router_Config_Guide#ARP_flux_issue_for_MR_node&lt;/a&gt;&#160;and here:&#160;&lt;a href=&quot;https://wiki.whamcloud.com/display/LNet/MR+Cluster+Setup&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.whamcloud.com/display/LNet/MR+Cluster+Setup&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Please try&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 sysctl -w net.ipv4.conf.all.accept_local=1&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;if you see that accept_local is 0 on your system.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="278147" author="ofaaland" created="Wed, 26 Aug 2020 20:25:43 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Sorry, I didn&apos;t look into that because I misread the situation where that applies.  Those routers do have accept_local set to 0.  I&apos;ll try it.&lt;/p&gt;

&lt;p&gt;thanks,&lt;br/&gt;
Olaf&lt;/p&gt;</comment>
                            <comment id="279266" author="charr" created="Thu, 10 Sep 2020 17:51:28 +0000"  >&lt;p&gt;As a data point, setting &lt;tt&gt;net.ipv4.conf.all.accept_local=1&lt;/tt&gt; doesn&apos;t appear to have an effect on lnet shutdown. I still get the following and indefinite hangs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2020-09-10 10:48:14 [ 3433.237844] LNetError: 20855:0:(lib-move.c:4391:lnet_parse()) 192.168.128.123@o2ib35, src 192.168.128.123@o2ib35: Dropping GET (error -108 looking up sender)
2020-09-10 10:48:14 [ 3433.253593] LNetError: 20855:0:(lib-move.c:4391:lnet_parse()) Skipped 2 previous similar messages
2020-09-10 10:48:15 [ 3433.773088] LNetError: 20861:0:(lib-move.c:4391:lnet_parse()) 172.19.3.136@o2ib600, src 172.19.3.136@o2ib600: Dropping GET (error -108 looking up sender)
2020-09-10 10:48:15 [ 3433.788436] LNetError: 20861:0:(lib-move.c:4391:lnet_parse()) Skipped 272 previous similar messages
2020-09-10 10:48:15 [ 3434.225432] LNet: 34467:0:(api-ni.c:2010:lnet_clear_zombies_nis_locked()) Waiting for zombie LNI 172.19.3.66@o2ib600
... &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="279541" author="ssmirnov" created="Mon, 14 Sep 2020 17:42:11 +0000"  >&lt;p&gt;Hi Cameron,&lt;/p&gt;

&lt;p&gt;When you ran the tests with &lt;tt&gt;net.ipv4.conf.all.accept_local=1&lt;/tt&gt;, did you see hangs on shutdown only?&lt;/p&gt;

&lt;p&gt;What were&#160;check_routers_before_use and lnet_health_sensitivity set to?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="279544" author="charr" created="Mon, 14 Sep 2020 17:59:54 +0000"  >&lt;p&gt;Serguei,&lt;/p&gt;

&lt;p&gt;I can&apos;t say for certain, but judging by the settings on the 3 sibling routers, those settings were check_routers...=1 and lnet_health_sensitivity=0, and I&apos;m fairly sure I didn&apos;t change those.&lt;/p&gt;</comment>
                            <comment id="280550" author="ofaaland" created="Thu, 24 Sep 2020 23:13:07 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Reproduced even with the recommended ARP settings.&#160; The node was booted and, as you can see, &lt;b&gt;lnetctl&lt;/b&gt; was hung.&#160; I forgot to set debug=-1, so I can do that and reproduce if it&apos;s helpful. I did capture SysRq w (trigger shown below); if you want that, let me know.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@corona81:~]# pdsh -w e85 systemctl status lnet
e85: * lnet.service - lnet management
e85:&#160; &#160; Loaded: loaded (/etc/systemd/system/lnet.service; enabled; vendor preset: disabled)
e85:&#160; &#160; Active: activating (start) since Thu 2020-09-24 15:57:18 PDT; 3min 14s ago
e85: &#160; Process: 21710 ExecStart=/bin/grep --with-filename . /sys/module/lnet/parameters/lnet_health_sensitivity (code=exited, status=0/SUCCESS)
e85: &#160; Process: 21697 ExecStart=/sbin/modprobe lnet (code=exited, status=0/SUCCESS)
e85: &#160; Process: 21684 ExecStart=/tftpboot/dumps/set-arp-lnet set (code=exited, status=0/SUCCESS)
e85: &#160; Process: 21676 ExecStart=/bin/bash -c echo lnet for Lustre: Build Version: $(rpm -q lustre) | /usr/bin/logger -t lnet.service -p local7.info (code=exited, status=0/SUCCESS)
e85:&#160; Main PID: 21712 (lnetctl)
e85:&#160; &#160; CGroup: /system.slice/lnet.service
e85:&#160; &#160; &#160; &#160; &#160; &#160; `-21712 /usr/sbin/lnetctl lnet configure --all
e85: 
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.san0.arp_filter = 0
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.san0.arp_announce = 2
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.san0.rp_filter = 0
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.hsi0.arp_ignore = 1
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.hsi0.arp_filter = 0
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.hsi0.arp_announce = 2
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.hsi0.rp_filter = 0
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.all.rp_filter = 0
e85: Sep 24 15:57:18 corona85 set-arp-lnet[21684]: net.ipv4.conf.all.accept_local = 1
e85: Sep 24 15:57:18 corona85 grep[21710]: /sys/module/lnet/parameters/lnet_health_sensitivity:100&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
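&lt;p&gt;(For reference, the SysRq w dump was triggered the usual way:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# dump stack traces of all blocked (uninterruptible) tasks to dmesg
echo w &amp;gt; /proc/sysrq-trigger&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;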
&lt;p&gt;What next? Another pass with debug=-1? Another pass with debug from both this router node and the peers described in its routes? A debug patch?&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="280551" author="ssmirnov" created="Thu, 24 Sep 2020 23:22:36 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;Good to know this is not because of linux routing settings. Sysrq w output would be helpful. Regarding next steps, if you&apos;d be ok with a debug patch, that would be great, as I still haven&apos;t seen this happen anywhere else.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="280556" author="ofaaland" created="Fri, 25 Sep 2020 00:11:29 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Yes, a debug patch would be fine.&#160; Please just push it to gerrit so it goes through testing to show it doesn&apos;t itself blow things up, and so I can keep a record of where it came from.&lt;/p&gt;

&lt;p&gt;I&apos;ve attached dmesg output including SysRq w content.&#160; It&apos;s called&#160;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/36121/36121_pass12.dmesg.corona85.1600988107.out&quot; title=&quot;pass12.dmesg.corona85.1600988107.out attached to LU-13892&quot;&gt;pass12.dmesg.corona85.1600988107.out&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Thank you.&lt;/p&gt;</comment>
                            <comment id="280717" author="gerrit" created="Sat, 26 Sep 2020 01:40:55 +0000"  >&lt;p&gt;Serguei Smirnov (ssmirnov@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40059&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40059&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13892&quot; title=&quot;LNetError kiblnd_check_conns() Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13892&quot;&gt;&lt;del&gt;LU-13892&lt;/del&gt;&lt;/a&gt; lnet: lock-up during router check&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 74097295693793ca3566dccfe96abf3a7e56a659&lt;/p&gt;</comment>
                            <comment id="280914" author="ofaaland" created="Tue, 29 Sep 2020 00:30:55 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;Looking at a crash dump, I found that the &quot;lnetctl configure --all&quot; from the lnet.service file was the process holding&#160;the_lnet.ln_api_mutex, and it was in&#160;lnet_wait_known_routerstate() (see the sketch below).&#160;Based on your patch and what you&apos;ve said earlier, I think you figured this out a long time ago; I just mention it in case you had any doubt.&lt;/p&gt;
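&lt;p&gt;(A sketch of the crash session; exact pids and addresses will differ:)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; ps | grep lnetctl      # find the stuck lnetctl task
crash&amp;gt; bt &amp;lt;pid&amp;gt;               # backtrace ends in lnet_wait_known_routerstate()
crash&amp;gt; sym the_lnet           # locate the_lnet to inspect ln_api_mutex&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;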

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="280920" author="ssmirnov" created="Tue, 29 Sep 2020 03:10:00 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;You&apos;re correct about the idea behind this patch. I was waiting for it to get reviewed first, but if you already looked at the change and are comfortable with it, maybe you can give it a try? If the issue can still be reproduced, please capture the lnet debug log.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="281600" author="ofaaland" created="Tue, 6 Oct 2020 21:42:53 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;The patch looks to be working properly.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="281707" author="gerrit" created="Wed, 7 Oct 2020 23:24:36 +0000"  >&lt;p&gt;Serguei Smirnov (ssmirnov@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/40172&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40172&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13892&quot; title=&quot;LNetError kiblnd_check_conns() Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13892&quot;&gt;&lt;del&gt;LU-13892&lt;/del&gt;&lt;/a&gt; lnet: lock-up during router check&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: db1905826983024a1bb4808c5ae93d6d32446b12&lt;/p&gt;</comment>
                            <comment id="281795" author="ofaaland" created="Thu, 8 Oct 2020 21:25:36 +0000"  >&lt;p&gt;Serguei,&lt;/p&gt;

&lt;p&gt;Why is the &quot;lock-up during router check&quot; patch against b2_12 and not master?&lt;/p&gt;

&lt;p&gt;thanks&lt;/p&gt;</comment>
                            <comment id="281819" author="ssmirnov" created="Thu, 8 Oct 2020 22:53:55 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;It didn&apos;t look to me like master needed this fix. There were lots of changes on master with the introduction of MRR; I guess one of these changes moved &quot;wait_router_start&quot; out from under the api_mutex.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="281820" author="ofaaland" created="Thu, 8 Oct 2020 22:56:17 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
OK, thanks.&lt;/p&gt;</comment>
                            <comment id="282956" author="gerrit" created="Thu, 22 Oct 2020 06:19:07 +0000"  >&lt;p&gt;Oleg Drokin (green@whamcloud.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/40172/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/40172/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-13892&quot; title=&quot;LNetError kiblnd_check_conns() Timed out RDMA with 172.19.1.217@o2ib100 (107): c: 8, oc: 0, rc: 8&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-13892&quot;&gt;&lt;del&gt;LU-13892&lt;/del&gt;&lt;/a&gt; lnet: lock-up during router check&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_12&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 877d95b582db3d182d13dac4947c1f43b0e851dc&lt;/p&gt;</comment>
                            <comment id="282989" author="pjones" created="Thu, 22 Oct 2020 13:57:55 +0000"  >&lt;p&gt;Landed for 2.12.6&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="35540" name="dk.corona86.1596747314.txt" size="240319" author="ofaaland" created="Fri, 7 Aug 2020 16:30:11 +0000"/>
                            <attachment id="35544" name="dk.corona86.1596747437.txt" size="1380582" author="ofaaland" created="Fri, 7 Aug 2020 16:30:14 +0000"/>
                            <attachment id="35543" name="dk.corona86.1596748295.txt" size="6345426" author="ofaaland" created="Fri, 7 Aug 2020 16:30:25 +0000"/>
                            <attachment id="35541" name="dmesg.corona86.1596747297.txt" size="277657" author="ofaaland" created="Fri, 7 Aug 2020 16:30:12 +0000"/>
                            <attachment id="35545" name="dmesg.corona86.1596748292.txt" size="278733" author="ofaaland" created="Fri, 7 Aug 2020 16:30:12 +0000"/>
                            <attachment id="35542" name="dmesg.corona86.1596750724.txt" size="282456" author="ofaaland" created="Fri, 7 Aug 2020 16:30:12 +0000"/>
                            <attachment id="35589" name="pass10.sens100.retry1.check1.timeout100.tgz" size="19539225" author="ofaaland" created="Wed, 12 Aug 2020 20:57:18 +0000"/>
                            <attachment id="35588" name="pass11.sens0.retry0.check1.timeout50.tgz" size="27887764" author="ofaaland" created="Wed, 12 Aug 2020 20:57:23 +0000"/>
                            <attachment id="36121" name="pass12.dmesg.corona85.1600988107.out" size="857803" author="ofaaland" created="Fri, 25 Sep 2020 00:10:29 +0000"/>
                            <attachment id="35574" name="pass5.check_1.tgz" size="15887897" author="ofaaland" created="Tue, 11 Aug 2020 21:58:59 +0000"/>
                            <attachment id="35575" name="pass6.check_0.tgz" size="20378980" author="ofaaland" created="Tue, 11 Aug 2020 21:59:00 +0000"/>
                            <attachment id="35576" name="pass9.check_1.health_0.tgz" size="21315109" author="ofaaland" created="Tue, 11 Aug 2020 23:22:38 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i017fj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>