<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:58:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-13067] lnet router crashes with Thread overran stack, or stack corrupted</title>
                <link>https://jira.whamcloud.com/browse/LU-13067</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Lustre router crashed with following in vmcore-dmesg.txt:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[  963.601219] LNet: 29007:0:(o2iblnd_cb.c:3396:kiblnd_check_conns()) Timed out tx for 172.19.1.165@o2ib100: 50 seconds
[  963.611771] LNetError: 29007:0:(lib-msg.c:485:lnet_handle_local_failure()) ni 172.19.2.26@o2ib100 added to recovery queue. Health = 900
[  963.623984] LNetError: 29007:0:(peer.c:3451:lnet_peer_ni_add_to_recoveryq_locked()) lpni 172.19.1.165@o2ib100 added to recovery queue. Health = 900
[  963.637202] LNetError: 29007:0:(peer.c:3451:lnet_peer_ni_add_to_recoveryq_locked()) Skipped 3 previous similar messages
[  963.648165] BUG: unable to handle kernel paging request at 00000000c10cb305
[  963.655155] IP: [&amp;lt;00000000c10cb305&amp;gt;] 0xc10cb305
[  963.659715] PGD 0
[  963.661750] Thread overran stack, or stack corrupted
[  963.666716] Oops: 0010 [#1] SMP
[  963.669994] Modules linked in: ko2iblnd(OE) lnet(OE) libcfs(OE) rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx4_en(OE) mlx4_ib(OE) mlx4_core(OE) mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) nf_conntrack_ipv4 nf_defrag_ipv4 xt_owner xt_conntrack mlx5_core(OE) amd64_edac_mod nf_conntrack edac_mce_amd joydev kvm_amd libcrc32c mlx_compat(OE) kvm mlxfw(OE) ses devlink enclosure iptable_filter irqbypass sg pcspkr ipmi_si ipmi_devintf ipmi_msghandler i2c_designware_platform pcc_cpufreq pinctrl_amd i2c_designware_core i2c_piix4 k10temp acpi_cpufreq sch_fq_codel binfmt_misc msr_safe(OE) ip_tables nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache overlay(T) ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic be2iscsi bnx2i cnic uio cxgb4i
[  963.741681]  cxgb4 cxgb3i cxgb3 mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs dm_multipath ast drm_kms_helper crct10dif_pclmul syscopyarea crct10dif_common sysfillrect crc32_pclmul sysimgblt 8021q crc32c_intel fb_sys_fops ghash_clmulni_intel garp ttm mrp aesni_intel stp lrw llc gf128mul glue_helper igb mpt3sas ablk_helper dca raid_class cryptd drm ptp scsi_transport_sas ccp pps_core drm_panel_orientation_quirks i2c_algo_bit nfit libnvdimm sunrpc dm_mirror dm_region_hash dm_log dm_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi
[  963.788993] CPU: 5 PID: 29007 Comm: kiblnd_connd Kdump: loaded Tainted: G           OE  ------------ T 3.10.0-1062.7.1.1chaos.ch6.x86_64 #1
[  963.801495] Hardware name: Penguin Computing Altus XE2112/MZ91-FS0-ZB, BIOS F08a 12/19/2018
[  963.809833] task: ffff9df47705c1c0 ti: ffff9df47bf80000 task.ti: ffff9df47bf80000
[  963.817304] RIP: 0010:[&amp;lt;00000000c10cb305&amp;gt;]  [&amp;lt;00000000c10cb305&amp;gt;] 0xc10cb305
[  963.824280] RSP: 0018:ffff9df47bf80020  EFLAGS: 00010246
[  963.829585] RAX: 0000000000000000 RBX: ffff9e147019c280 RCX: ffffffffc11a1430
[  963.836716] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff9e147019c280
[  963.843842] RBP: ffff9df47bf80030 R08: 000000000000ffff R09: 000000000000ffff
[  963.850972] R10: 0000000000000280 R11: ffff9df47bf8006e R12: 0000000000000000
[  963.858096] R13: ffff9e147019c280 R14: ffff9de479d3b840 R15: ffff9de46fe4a200
[  963.865223] FS:  00007fffddfd0700(0000) GS:ffff9ddc7ef40000(0000) knlGS:0000000000000000
[  963.873307] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  963.879052] CR2: 00000000c10cb305 CR3: 0000003fd2410000 CR4: 00000000003407e0
[  963.886176] Call Trace:
[  963.888636]  [&amp;lt;ffffffffc10d15d5&amp;gt;] libcfs_debug_vmsg2+0xe5/0xbb0 [libcfs]
[  963.895336]  [&amp;lt;ffffffff977a8e03&amp;gt;] ? number.isra.2+0x323/0x360
[  963.901074]  [&amp;lt;ffffffff977a8f7b&amp;gt;] ? string.isra.7+0x3b/0xf0
[  963.906652]  [&amp;lt;ffffffffc10d20f7&amp;gt;] libcfs_debug_msg+0x57/0x80 [libcfs]
[  963.913106]  [&amp;lt;ffffffffc114e9df&amp;gt;] lnet_post_send_locked+0x40f/0xa40 [lnet]
[  963.919987]  [&amp;lt;ffffffffc1150ca8&amp;gt;] lnet_return_tx_credits_locked+0x238/0x4a0 [lnet]
[  963.927558]  [&amp;lt;ffffffffc1144511&amp;gt;] lnet_health_check+0x6a1/0x8b0 [lnet]
[  963.934084]  [&amp;lt;ffffffffc114488f&amp;gt;] lnet_finalize+0x16f/0x9a0 [lnet]
[  963.940262]  [&amp;lt;ffffffffc10d20f7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[  963.946876]  [&amp;lt;ffffffffc114e9fa&amp;gt;] lnet_post_send_locked+0x42a/0xa40 [lnet]
[  963.953749]  [&amp;lt;ffffffffc1150ca8&amp;gt;] lnet_return_tx_credits_locked+0x238/0x4a0 [lnet]
[  963.961321]  [&amp;lt;ffffffffc1144511&amp;gt;] lnet_health_check+0x6a1/0x8b0 [lnet]
[  963.967855]  [&amp;lt;ffffffffc114488f&amp;gt;] lnet_finalize+0x16f/0x9a0 [lnet]
[  963.974033]  [&amp;lt;ffffffffc10d20f7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[  963.980648]  [&amp;lt;ffffffffc114e9fa&amp;gt;] lnet_post_send_locked+0x42a/0xa40 [lnet]
[  963.987520]  [&amp;lt;ffffffffc1150ca8&amp;gt;] lnet_return_tx_credits_locked+0x238/0x4a0 [lnet]
[  963.995085]  [&amp;lt;ffffffffc1144511&amp;gt;] lnet_health_check+0x6a1/0x8b0 [lnet]
[  964.001611]  [&amp;lt;ffffffffc114488f&amp;gt;] lnet_finalize+0x16f/0x9a0 [lnet]
... &amp;lt;more of the same cycle&amp;gt;...
[  965.398450]  [&amp;lt;ffffffffc1012d42&amp;gt;] ? kiblnd_pool_free_node+0x82/0x180 [ko2iblnd]
[  965.405761]  [&amp;lt;ffffffffc101c79d&amp;gt;] kiblnd_tx_done+0x10d/0x3e0 [ko2iblnd]
[  965.412372]  [&amp;lt;ffffffffc101cabb&amp;gt;] kiblnd_txlist_done+0x4b/0x60 [ko2iblnd]
[  965.419159]  [&amp;lt;ffffffffc1021dd3&amp;gt;] kiblnd_check_conns+0x573/0x8c0 [ko2iblnd]
[  965.426129]  [&amp;lt;ffffffffc1026eeb&amp;gt;] kiblnd_connd+0x83b/0xa00 [ko2iblnd]
[  965.432567]  [&amp;lt;ffffffff97bac120&amp;gt;] ? __schedule+0x430/0xa00
[  965.438053]  [&amp;lt;ffffffff974e1890&amp;gt;] ? wake_up_state+0x20/0x20
[  965.443624]  [&amp;lt;ffffffffc10266b0&amp;gt;] ? kiblnd_cm_callback+0x23b0/0x23b0 [ko2iblnd]
[  965.450928]  [&amp;lt;ffffffff974cb451&amp;gt;] kthread+0xd1/0xe0
[  965.455806]  [&amp;lt;ffffffff974cb380&amp;gt;] ? insert_kthread_work+0x40/0x40
[  965.461899]  [&amp;lt;ffffffff97bb9f64&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[  965.468337]  [&amp;lt;ffffffff974cb380&amp;gt;] ? insert_kthread_work+0x40/0x40
[  965.474429] Code:  Bad RIP value.
[  965.477784] RIP  [&amp;lt;00000000c10cb305&amp;gt;] 0xc10cb305
[  965.482429]  RSP &amp;lt;ffff9df47bf80020&amp;gt;
[  965.485922] CR2: 00000000c10cb305
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-2.12.3_2.chaos-1.4mofed.ch6.x86_64&lt;br/&gt;
clients they connect to run the same lustre 2.12 version&lt;br/&gt;
servers and other routers they connect to run lustre-2.10.8_5.chaos-1.ch6.x86_64&lt;br/&gt;
RHEL 7.7 derivative&lt;br/&gt;
linux 3.10.0-1062.7.1.1chaos.ch6.x86_64&lt;br/&gt;
mlx5_ib: Mellanox Connect-IB Infiniband driver v4.7-1.0.0&lt;br/&gt;
See &lt;a href=&quot;https://github.com/LLNL/lustre/&quot;&gt;https://github.com/LLNL/lustre/&lt;/a&gt; for these patch stacks.&lt;br/&gt;
One file system was undergoing an OS update at the time, so the servers were likely going up or down.&lt;br/&gt;
</environment>
        <key id="57615">LU-13067</key>
            <summary>lnet router crashes with Thread overran stack, or stack corrupted</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 12 Dec 2019 00:37:39 +0000</created>
                <updated>Thu, 30 Jan 2020 20:52:22 +0000</updated>
                            <resolved>Thu, 30 Jan 2020 20:52:22 +0000</resolved>
                                    <version>Lustre 2.12.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="259661" author="ofaaland" created="Thu, 12 Dec 2019 00:39:08 +0000"  >&lt;p&gt;Our local bug ID: TOSS4698&lt;/p&gt;</comment>
                            <comment id="259663" author="ofaaland" created="Thu, 12 Dec 2019 00:46:02 +0000"  >&lt;p&gt;I have the core dumps, so I can obtain information from them.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Reading symbols from /usr/lib/debug/usr/lib/modules/3.10.0-1062.4.1.1chaos.ch6.x86_64/extra/lustre/net/lnet.ko.debug...done.                      
(gdb) l *(lnet_finalize+0x16f)                                                                                                                    
0x128bf is in lnet_finalize (/usr/src/debug/lustre-2.12.3_2.chaos/lnet/lnet/lib-msg.c:914).                                                       
909                      * if the message send is success, timed out or failed in the                                                             
910                      * health check for any reason then we&apos;ll just finalize the                                                               
911                      * message. Otherwise just return since the message has been                                                              
912                      * put on the resend queue.                                                                                               
913                      */                                                                                                                       
914                     if (!lnet_health_check(msg))                                                                                              
915                             /* Message is queued for resend */                                                                                
916                             return;                                                                                                           
917             }                                                                                                                                 
918                                                                                                                                               
(gdb) l *(lnet_health_check+0x6a1)                                                                                                                
0x12541 is in lnet_health_check (/usr/src/debug/lustre-2.12.3_2.chaos/lnet/lnet/lib-msg.c:750).                                                   
745              */                                                                                                                               
746             msg-&amp;gt;msg_target.nid = msg-&amp;gt;msg_hdr.dest_nid;                                                                                      
747             lnet_msg_decommit_tx(msg, -EAGAIN);                                                                                               
748             msg-&amp;gt;msg_sending = 0;                                                                                                             
749             msg-&amp;gt;msg_receiving = 0;                                                                                                           
750             msg-&amp;gt;msg_target_is_router = 0;
751
752             CDEBUG(D_NET, &quot;%s-&amp;gt;%s:%s:%s - queuing for resend\n&quot;,
753                    libcfs_nid2str(msg-&amp;gt;msg_hdr.src_nid),
754                    libcfs_nid2str(msg-&amp;gt;msg_hdr.dest_nid),
(gdb) l *(lnet_return_tx_credits_locked+0x238)
0x1ecd8 is in lnet_return_tx_credits_locked (/usr/src/debug/lustre-2.12.3_2.chaos/lnet/lnet/lib-move.c:1212).
1207                            if (msg2_cpt != msg-&amp;gt;msg_tx_cpt) {
1208                                    lnet_net_unlock(msg-&amp;gt;msg_tx_cpt);
1209                                    lnet_net_lock(msg2_cpt);
1210                            }
1211                            (void) lnet_post_send_locked(msg2, 1);
1212                            if (msg2_cpt != msg-&amp;gt;msg_tx_cpt) {
1213                                    lnet_net_unlock(msg2_cpt);
1214                                    lnet_net_lock(msg-&amp;gt;msg_tx_cpt);
1215                            }
1216                    } else {
(gdb) l *(lnet_post_send_locked+0x40f)
0x1ca0f is in lnet_post_send_locked (/usr/src/debug/lustre-2.12.3_2.chaos/lnet/lnet/lib-move.c:963).
958                                             LNET_STATS_TYPE_DROP);
959
960                     CNETERR(&quot;Dropping message for %s: peer not alive\n&quot;,
961                             libcfs_id2str(msg-&amp;gt;msg_target));
962                     msg-&amp;gt;msg_health_status = LNET_MSG_STATUS_REMOTE_DROPPED;
963                     if (do_send)
964                             lnet_finalize(msg, -EHOSTUNREACH);
965
966                     lnet_net_lock(cpt);
967                     return -EHOSTUNREACH;
(gdb) quit
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="259695" author="pjones" created="Thu, 12 Dec 2019 15:51:45 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="259713" author="ashehata" created="Thu, 12 Dec 2019 18:16:45 +0000"  >&lt;p&gt;Can you please share the patch list you have? This looks like a crash which has already been fixed by these two patches:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
3df41bb8515d5012d7e2f19b2d7019e3e1b64a71 LU-12568 lnet: Defer rspt cleanup when MD queued &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; unlink
c095fbda55ca632cff2696550f22a13a19ee4514 LU-12441 lnet: Detach rspt when md_threshold is infinite
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Do you have them?&lt;/p&gt;

&lt;p&gt;Looks similar to this: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12907&quot; title=&quot;LNet routers: LNetError: 14141:0:(lib-msg.c:894:lnet_finalize()) ASSERTION( !(((current_thread_info()-&amp;gt;preempt_count) &amp;amp; ((((1UL &amp;lt;&amp;lt; (10))-1) &amp;lt;&amp;lt; ((0 + 8) + 8)) | (((1UL &amp;lt;&amp;lt; (8))-1) &amp;lt;&amp;lt; (0 + 8)) | (((1UL &amp;lt;&amp;lt; (1))-1) &amp;lt;&amp;lt; (((0 + 8) + 8) + 10)))))&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12907&quot;&gt;&lt;del&gt;LU-12907&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="259737" author="ofaaland" created="Thu, 12 Dec 2019 19:47:27 +0000"  >&lt;p&gt;Hi Amir,&lt;br/&gt;
No, we do not have those two patches.   See &lt;a href=&quot;https://github.com/LLNL/lustre/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/LLNL/lustre/&lt;/a&gt; for our patch stacks.  Thanks.&lt;/p&gt;</comment>
                            <comment id="262228" author="ofaaland" created="Thu, 30 Jan 2020 20:49:25 +0000"  >&lt;p&gt;We are updating our 2.12 machines to 2.12.4-RC1 in the next week or two and will reopen the ticket if necessary.&lt;/p&gt;</comment>
                            <comment id="262229" author="ofaaland" created="Thu, 30 Jan 2020 20:52:22 +0000"  >&lt;p&gt;Amir believes this is a dupe of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12907&quot; title=&quot;LNet routers: LNetError: 14141:0:(lib-msg.c:894:lnet_finalize()) ASSERTION( !(((current_thread_info()-&amp;gt;preempt_count) &amp;amp; ((((1UL &amp;lt;&amp;lt; (10))-1) &amp;lt;&amp;lt; ((0 + 8) + 8)) | (((1UL &amp;lt;&amp;lt; (8))-1) &amp;lt;&amp;lt; (0 + 8)) | (((1UL &amp;lt;&amp;lt; (1))-1) &amp;lt;&amp;lt; (((0 + 8) + 8) + 10)))))&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12907&quot;&gt;&lt;del&gt;LU-12907&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00qvr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>