<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:08:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14282] WARNING: CPU: 22 PID: 5953 at kernel/softirq.c:151 __local_bh_enable_ip+0x82/0xb0</title>
                <link>https://jira.whamcloud.com/browse/LU-14282</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Lustre router produces the following stack dump when LNet is started:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LNet: Added LNI 192.168.129.134@o2ib27 [8/1024/0/180]
LNet: Using FastReg for registration
------------[ cut here ]------------
WARNING: CPU: 22 PID: 5953 at kernel/softirq.c:151 __local_bh_enable_ip+0x82/0xb0
Modules linked in: ko2iblnd(OE) lnet(OE) libcfs(OE) opa_vnic rpcrdma ib_iser mlx5_ib iTCO_wdt iTCO_vendor_support sb_edac intel_powerclamp coretemp intel_rapl iosf_mbi hfi1 kvm sg rdmavt irqbypass joydev pcspkr mlx5_core lpc_ich mlxfw devlink i2c_i801 ioatdma ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_cpufreq sch_fq_codel ib_ipoib rdma_ucm ib_uverbs binfmt_misc msr_safe(OE) ib_umad iw_cxgb4 rdma_cm iw_cm ib_cm iw_cxgb3 ib_core ip_tables nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache overlay(T) ext4 mbcache jbd2 dm_service_time sd_mod crc_t10dif[&#160; 234.953405] LNet: Added LNI 172.19.3.75@o2ib600 [8/1024/0/180]
crct10dif_generic be2iscsi bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 mdio libcxgbi 8021q libcxgb garp mrp stp qla4xxx llc iscsi_boot_sysfs mgag200 drm_kms_helper syscopyarea crct10dif_pclmul sysfillrect crct10dif_common sysimgblt crc32_pclmul fb_sys_fops crc32c_intel ttm ghash_clmulni_intel igb mxm_wmi drm ahci aesni_intel lrw libahci gf128mul dca glue_helper ablk_helper ptp libata cryptd drm_panel_orientation_quirks pps_core i2c_algo_bit dm_multipath wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi fuse
CPU: 22 PID: 5953 Comm: kworker/22:2H Kdump: loaded Tainted: G &#160; &#160; &#160; &#160; &#160; OE&#160; ------------ T 3.10.0-1160.4.1.1chaos.ch6.x86_64 #1
Hardware name: Penguin Computing Relion 2900e/S2600WT2R, BIOS SE5C610.86B.01.01.0027.071020182329 07/10/2018
Workqueue: rdmavt_cq send_complete [rdmavt]
Call Trace:
 [&amp;lt;ffffffff865ae498&amp;gt;] dump_stack+0x19/0x1b
 [&amp;lt;ffffffff85e9e0e8&amp;gt;] __warn+0xd8/0x100
 [&amp;lt;ffffffff85e9e22d&amp;gt;] warn_slowpath_null+0x1d/0x20
 [&amp;lt;ffffffff85ea7652&amp;gt;] __local_bh_enable_ip+0x82/0xb0
 [&amp;lt;ffffffff865b737e&amp;gt;] _raw_spin_unlock_bh+0x1e/0x20
 [&amp;lt;ffffffffc09673c5&amp;gt;] cfs_trace_unlock_tcd+0x65/0xb0 [libcfs]
 [&amp;lt;ffffffffc096de18&amp;gt;] libcfs_debug_vmsg2+0x728/0xbb0 [libcfs]
 [&amp;lt;ffffffff85eed858&amp;gt;] ? enqueue_task_fair+0x208/0x6c0
 [&amp;lt;ffffffff85edd4df&amp;gt;] ? ttwu_do_activate+0x6f/0x80
 [&amp;lt;ffffffffc096e2f7&amp;gt;] libcfs_debug_msg+0x57/0x80 [libcfs]
 [&amp;lt;ffffffffc086f32a&amp;gt;] kiblnd_cq_completion+0x11a/0x160 [ko2iblnd]
 [&amp;lt;ffffffffc0d745ae&amp;gt;] send_complete+0x3e/0x60 [rdmavt]
 [&amp;lt;ffffffff85ec298f&amp;gt;] process_one_work+0x18f/0x4a0
 [&amp;lt;ffffffff85ec3776&amp;gt;] worker_thread+0x126/0x3e0
 [&amp;lt;ffffffff85ec3650&amp;gt;] ? rescuer_thread+0x430/0x430
 [&amp;lt;ffffffff85ecafc1&amp;gt;] kthread+0xd1/0xe0
 [&amp;lt;ffffffff85ecaef0&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;ffffffff865c1ff7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
 [&amp;lt;ffffffff85ecaef0&amp;gt;] ? insert_kthread_work+0x40/0x40
---[ end trace 07f572a412f12f32 ]---
OK Started lnet management.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the warning indicates that _raw_spin_unlock_bh() was called when either IRQs are disabled, or we are in an IRQ handler.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;(gdb) l *(__local_bh_enable_ip+0x82/0xb0)
0xffffffff810a75d0 is in __local_bh_enable_ip (kernel/softirq.c:150).
145	}
146	
147	EXPORT_SYMBOL(_local_bh_enable);
148	
149	void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
150	{
151		WARN_ON_ONCE(in_irq() || irqs_disabled());
152	#ifdef CONFIG_TRACE_IRQFLAGS
153		local_irq_disable();
154	#endif 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-2.12.5_10.llnl&lt;br/&gt;
RHEL 7.9 derivative&lt;br/&gt;
3.10.0-1160.4.1.1chaos.ch6.x86_64</environment>
        <key id="62179">LU-14282</key>
            <summary>WARNING: CPU: 22 PID: 5953 at kernel/softirq.c:151 __local_bh_enable_ip+0x82/0xb0</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ssmirnov">Serguei Smirnov</assignee>
                                    <reporter username="ofaaland">Olaf Faaland</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Tue, 29 Dec 2020 20:24:04 +0000</created>
                <updated>Thu, 8 Feb 2024 16:56:18 +0000</updated>
                                            <version>Lustre 2.12.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="288487" author="ofaaland" created="Tue, 29 Dec 2020 20:25:25 +0000"  >&lt;p&gt;For my tracking, my local ticket is TOSS4981&lt;/p&gt;</comment>
                            <comment id="288509" author="pjones" created="Wed, 30 Dec 2020 13:41:29 +0000"  >&lt;p&gt;Serguei&lt;/p&gt;

&lt;p&gt;Can you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="288524" author="ssmirnov" created="Wed, 30 Dec 2020 22:27:13 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;I do believe I can see the problem in the code, I&apos;m just curious when you started seeing the warning. The related code is fairly old as far as I can tell. Did something change in the way the router is used?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="288525" author="ofaaland" created="Thu, 31 Dec 2020 00:03:40 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
Most of our clusters, including that one, switched from Lustre 2.10.x to 2.12.4 or .5 this year between roughly June and August.  And I can see that at least some of our clusters were logging that error in September and I just failed to notice.  I&apos;ll get more detail.&lt;br/&gt;
thanks!&lt;/p&gt;</comment>
                            <comment id="288526" author="ofaaland" created="Thu, 31 Dec 2020 01:04:42 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
Actually, it looks like we started seeing it in October 2020, after these clusters had already switched from Lustre 2.10 to 2.12. It appeared on multiple clusters around the same time.  I don&apos;t know what changed.&lt;/p&gt;</comment>
                            <comment id="288550" author="ssmirnov" created="Thu, 31 Dec 2020 23:47:01 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;Do you have net-level debug logging enabled on these machines? (lctl set_param debug=+net)&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="288609" author="ofaaland" created="Mon, 4 Jan 2021 22:45:01 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
We do have that enabled (at least on some).  I&apos;m looking for an instance recent enough that startup is still in the debug buffer and will upload as soon as I find it.&lt;br/&gt;
Can you identify the suspect function(s)?&lt;br/&gt;
Thanks&lt;/p&gt;</comment>
                            <comment id="288611" author="ofaaland" created="Mon, 4 Jan 2021 22:58:03 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
I&apos;ve uploaded console and debug logs (*.1609800485.txt).  This particular node was running lustre-2.12.6_2.llnl-1.ch6.x86_64 on kernel 3.10.0-1160.11.1.1chaos.ch6.x86_64.  It has an OPA interface (internal fabric of a compute cluster) and a 10GigE interface.&lt;br/&gt;
thanks&lt;/p&gt;</comment>
                            <comment id="288618" author="ssmirnov" created="Mon, 4 Jan 2021 23:57:00 +0000"  >&lt;p&gt;Hi Olaf,&lt;/p&gt;

&lt;p&gt;The suspect functions are &lt;em&gt;kiblnd_conn_addref&lt;/em&gt; and &lt;em&gt;kiblnd_conn_decref&lt;/em&gt;. They call &lt;em&gt;CDEBUG&lt;/em&gt; which resolves to&#160;&lt;em&gt;libcfs_debug_vmsg2&lt;/em&gt;&#160;which calls &lt;em&gt;cfs_trace_lock_tcd/cfs_trace_unlock_tcd&lt;/em&gt; which may call &lt;em&gt;spin_lock_bh/spin_unlock_bh&lt;/em&gt; respectively.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;kiblnd_conn_addref&lt;/em&gt; and &lt;em&gt;kiblnd_conn_decref&lt;/em&gt;&#160;are sometimes called with interrupts disabled, like in&#160;&lt;em&gt;kiblnd_cq_completion&lt;/em&gt;&#160;from the stack dump in this ticket.&lt;/p&gt;

&lt;p&gt;If the suspicion is correct, this warning shouldn&apos;t occur when running without lnet debug logging enabled.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="288619" author="ofaaland" created="Tue, 5 Jan 2021 00:09:38 +0000"  >&lt;p&gt;Hi Serguei,&lt;/p&gt;

&lt;p&gt;I see.&#160; That fits with the timing I saw.&#160; I changed our default debug mask to add +net on Oct 13th.&#160; I guess I&apos;ll be changing that back temporarily &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;.&#160; And it might explain another bug I&apos;m seeing.&#160; Thanks!&lt;/p&gt;</comment>
                            <comment id="289849" author="gerrit" created="Tue, 19 Jan 2021 19:37:45 +0000"  >&lt;p&gt;Serguei Smirnov (ssmirnov@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/41270&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/41270&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14282&quot; title=&quot;WARNING: CPU: 22 PID: 5953 at kernel/softirq.c:151 __local_bh_enable_ip+0x82/0xb0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14282&quot;&gt;LU-14282&lt;/a&gt; libcfs: refine use of locks when in softirq&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 16dead15d8e2eaf696c8f688387c2d1db993aa8f&lt;/p&gt;</comment>
                            <comment id="400223" author="ssmirnov" created="Thu, 18 Jan 2024 16:46:15 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;Andreas brought to my attention that the patch for this never got merged to master. Are you still seeing this warning sometimes?&#160;&lt;/p&gt;</comment>
                            <comment id="400241" author="ofaaland" created="Thu, 18 Jan 2024 18:06:45 +0000"  >&lt;p&gt;Hi Serguei,&lt;br/&gt;
Yes, we still see this intermittently.&lt;/p&gt;</comment>
                            <comment id="400629" author="ssmirnov" created="Mon, 22 Jan 2024 18:10:48 +0000"  >&lt;p&gt;Olaf,&lt;/p&gt;

&lt;p&gt;Are you seeing it with &quot;+net&quot; in debug mask?&lt;/p&gt;</comment>
                            <comment id="403257" author="ofaaland" created="Thu, 8 Feb 2024 16:56:18 +0000"  >&lt;p&gt;Hi Serguei, sorry for the slow response.  Actually, I do not think we are seeing this now.  My prior search was wrong and I found different stacks that are completely unrelated.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="37071" name="dk.opal187.1609800485.txt" size="18321327" author="ofaaland" created="Mon, 4 Jan 2021 22:53:39 +0000"/>
                            <attachment id="37072" name="dmesg.opal187.1609800485.txt" size="182530" author="ofaaland" created="Mon, 4 Jan 2021 22:53:32 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01ibj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>