<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:50:27 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12194] clients getting soft lockups on 2.10.7</title>
                <link>https://jira.whamcloud.com/browse/LU-12194</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Getting occasional soft lockups on 2.10.7 clients&lt;/p&gt;

&lt;p&gt;kernel: NMI watchdog: BUG: soft lockup - CPU#7 stuck for 22s! &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd_01_08:11711&amp;#93;&lt;/span&gt;&lt;/p&gt;</description>
                <environment>EL 7.4.1708</environment>
        <key id="55434">LU-12194</key>
            <summary>clients getting soft lockups on 2.10.7</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ys">Yang Sheng</assignee>
                                    <reporter username="cmcl">Campbell Mcleay</reporter>
                        <labels>
                    </labels>
                <created>Thu, 18 Apr 2019 15:02:15 +0000</created>
                <updated>Thu, 10 Dec 2020 18:47:17 +0000</updated>
                                            <version>Lustre 2.10.7</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="245993" author="bzzz" created="Thu, 18 Apr 2019 15:04:16 +0000"  >&lt;p&gt;it would be very helpful if you can provide backtraces.&lt;/p&gt;</comment>
                            <comment id="245994" author="cmcl" created="Thu, 18 Apr 2019 15:04:59 +0000"  >&lt;p&gt;Do we still need to set lru size on the MDS? We have:&lt;/p&gt;

&lt;p&gt;cmcl@mds1 ~ -bash$ sudo lctl get_param &apos;ldlm.namespaces.*.lru_size&apos;&lt;br/&gt;
ldlm.namespaces.MGC10.21.22.50@tcp.lru_size=3200&lt;br/&gt;
ldlm.namespaces.MGS.lru_size=3200&lt;br/&gt;
ldlm.namespaces.bravo-MDT0000-lwp-MDT0000.lru_size=0&lt;/p&gt;

&lt;p&gt;ldlm.namespaces.mdt-bravo-MDT0000_UUID.lru_size=3200&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="245995" author="cmcl" created="Thu, 18 Apr 2019 15:12:21 +0000"  >&lt;p&gt;A couple of backtraces:&lt;/p&gt;

&lt;p&gt;Apr 16 02:12:47 bravo2 kernel: NMI watchdog: BUG: soft lockup - CPU#5 stuck for 22s! &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd_01_02:11705&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: Modules linked in: osc(OE) mgc(OE) lustre(OE) lmv(OE) fld(OE) mdc(OE) fid(OE) lov(OE) ksocklnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) vfat fat mpt3sas 8d&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: stp dm_multipath llc serio_raw raid_class myri10ge scsi_transport_sas bnx2 drm_panel_orientation_quirks dca sunrpc dm_mirror dm_region_hash dm_log dm_mod &lt;span class=&quot;error&quot;&gt;&amp;#91;last l&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: CPU: 5 PID: 11705 Comm: ptlrpcd_01_02 Kdump: loaded Tainted: G IOEL ------------ 3.10.0-957.1.3.el7.x86_64 #1&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: Hardware name: Dell Inc. PowerEdge R610/0F0XJ6, BIOS 6.4.0 07/23/2013&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: task: ffff9ff40efb6180 ti: ffff9feff6e88000 task.ti: ffff9feff6e88000&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9cd121e6&amp;gt;&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9cd121e6&amp;gt;&amp;#93;&lt;/span&gt; native_queued_spin_lock_slowpath+0x126/0x200&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: RSP: 0018:ffff9feff6e8bb78 EFLAGS: 00000246&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: RAX: 0000000000000000 RBX: ffffffffc0d0ca40 RCX: 0000000000290000&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: RDX: ffff9ff497c9b780 RSI: 0000000000a90001 RDI: ffff9ffa8da6c640&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: RBP: ffff9feff6e8bb78 R08: ffff9ff497a9b780 R09: 0000000000000000&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: R10: 0000000000000000 R11: 000000000000000f R12: ffff9ff2dad40a00&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: R13: 0005ca25f87f7110 R14: ffff9ff2dc03a100 R15: 000000000000000a&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: FS: 0000000000000000(0000) GS:ffff9ff497a80000(0000) knlGS:0000000000000000&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: CR2: 00007f76653c03cc CR3: 00000000a7c10000 CR4: 00000000000207e0&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: Call Trace:&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9d35bfcb&amp;gt;&amp;#93;&lt;/span&gt; queued_spin_lock_slowpath+0xb/0xf&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9d36a480&amp;gt;&amp;#93;&lt;/span&gt; _raw_spin_lock+0x20/0x30&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc098bc18&amp;gt;&amp;#93;&lt;/span&gt; cfs_percpt_lock+0x58/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0a05f08&amp;gt;&amp;#93;&lt;/span&gt; LNetMDUnlink+0x78/0x180 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0c9df2f&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_unregister_reply+0xbf/0x790 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0ca2c1a&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_expire_one_request+0xba/0x480 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0ca308f&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_expired_set+0xaf/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:12:47 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0cd333c&amp;gt;&amp;#93;&lt;/span&gt; ptlrpcd+0x29c/0x550 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
5 c9 74 04 41 0f 18 09 8b 17 0f b7 c2&lt;/p&gt;

&lt;p&gt;Apr 16 02:13:03 bravo2 kernel: NMI watchdog: BUG: soft lockup - CPU#3 stuck for 22s! &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpcd_01_10:11713&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: Modules linked in: osc(OE) mgc(OE) lustre(OE) lmv(OE) fld(OE) mdc(OE) fid(OE) lov(OE) ksocklnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) vfat fat mpt3sas mp&lt;br/&gt;
tctl mptbase nfsv3 nfs fscache dell_rbu bonding intel_powerclamp coretemp kvm acpi_power_meter joydev ipmi_si ipmi_devintf iTCO_wdt irqbypass ipmi_msghandler sg iTCO_vendor_support gpio_ich dcd&lt;br/&gt;
bas wmi i7core_edac lpc_ich nfsd auth_rpcgss nfs_acl lockd grace binfmt_misc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi mgag200 i2c_algo_bit drm_kms_helpe&lt;br/&gt;
r syscopyarea sysfillrect sysimgblt fb_sys_fops ttm scsi_transport_iscsi crct10dif_pclmul ata_piix crct10dif_common crc32_pclmul crc32c_intel drm ghash_clmulni_intel libata mpt2sas aesni_intel &lt;br/&gt;
8021q lrw gf128mul garp glue_helper ablk_helper mrp cryptd&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: stp dm_multipath llc serio_raw raid_class myri10ge scsi_transport_sas bnx2 drm_panel_orientation_quirks dca sunrpc dm_mirror dm_region_hash dm_log dm_mod [last un&lt;br/&gt;
loaded: usb_storage]&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: CPU: 3 PID: 11713 Comm: ptlrpcd_01_10 Kdump: loaded Tainted: G IOEL ------------ 3.10.0-957.1.3.el7.x86_64 #1&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: Hardware name: Dell Inc. PowerEdge R610/0F0XJ6, BIOS 6.4.0 07/23/2013&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: task: ffff9ff40ffb0000 ti: ffff9ff480798000 task.ti: ffff9ff480798000&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9cd121e2&amp;gt;&amp;#93;&lt;/span&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9cd121e2&amp;gt;&amp;#93;&lt;/span&gt; native_queued_spin_lock_slowpath+0x122/0x200&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: RSP: 0018:ffff9ff48079bb78 EFLAGS: 00000246&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: RAX: 0000000000000000 RBX: ffffffffc0d0ca40 RCX: 0000000000190000&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: RDX: ffff9ffaaf65b780 RSI: 0000000000110001 RDI: ffff9ff43a7214c0&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: RBP: ffff9ff48079bb78 R08: ffff9ff497a5b780 R09: 0000000000000000&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: R10: 0000000000000000 R11: 000000000000000f R12: ffff9feeb730c400&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: R13: 0005ca25f78289c0 R14: ffff9feeeb73b300 R15: 000000000000000a&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: FS: 0000000000000000(0000) GS:ffff9ff497a40000(0000) knlGS:0000000000000000&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: CR2: 00007fe1cc1a4490 CR3: 00000000a7c10000 CR4: 00000000000207e0&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: Call Trace:&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9d35bfcb&amp;gt;&amp;#93;&lt;/span&gt; queued_spin_lock_slowpath+0xb/0xf&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9d36a480&amp;gt;&amp;#93;&lt;/span&gt; _raw_spin_lock+0x20/0x30&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc098bc18&amp;gt;&amp;#93;&lt;/span&gt; cfs_percpt_lock+0x58/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0a05f08&amp;gt;&amp;#93;&lt;/span&gt; LNetMDUnlink+0x78/0x180 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0c9df2f&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_unregister_reply+0xbf/0x790 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0ca2c1a&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_expire_one_request+0xba/0x480 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0ca308f&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_expired_set+0xaf/0x1a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0cd333c&amp;gt;&amp;#93;&lt;/span&gt; ptlrpcd+0x29c/0x550 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9ccd67b0&amp;gt;&amp;#93;&lt;/span&gt; ? wake_up_state+0x20/0x20&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffc0cd30a0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpcd_check+0x5e0/0x5e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9ccc1c31&amp;gt;&amp;#93;&lt;/span&gt; kthread+0xd1/0xe0&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9ccc1b60&amp;gt;&amp;#93;&lt;/span&gt; ? insert_kthread_work+0x40/0x40&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9d374c37&amp;gt;&amp;#93;&lt;/span&gt; ret_from_fork_nospec_begin+0x21/0x21&lt;br/&gt;
Apr 16 02:13:03 bravo2 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff9ccc1b60&amp;gt;&amp;#93;&lt;/span&gt; ? insert_kthread_work+0x40/0x40&lt;br/&gt;
d 8b 08 4d 85 c9 74 04 41 0f 18 09 8b&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="246003" author="pjones" created="Thu, 18 Apr 2019 16:36:31 +0000"  >&lt;p&gt;Yang Sheng&lt;/p&gt;

&lt;p&gt;Can you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="246344" author="cmcl" created="Thu, 25 Apr 2019 09:09:25 +0000"  >&lt;p&gt;Please let me know if you need any additional information.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="246363" author="ys" created="Thu, 25 Apr 2019 17:32:02 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Did you collect sysrq-t output while the soft lockup was happening? &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="246390" author="cmcl" created="Fri, 26 Apr 2019 10:54:56 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;They only occur occasionally (maybe once a day) so it is difficult to do it at the time of the lockup.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="246509" author="cmcl" created="Tue, 30 Apr 2019 10:12:53 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Any other suggestions as to how we can find out what is going on here?&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="246510" author="ys" created="Tue, 30 Apr 2019 10:38:53 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;For a soft lockup issue, collecting sysrq-t output is a better way, so we can find out who causes the problem. I think you can deploy a script to monitor the dmesg output and then trigger sysrq-t when a soft lockup occurs. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="246615" author="cmcl" created="Thu, 2 May 2019 10:27:30 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Attached are some sysrq-t dumps from when the soft lockups occurred.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;


&lt;p&gt;Campbell&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/32515/32515_bravo2-soft-lockups.gz&quot; title=&quot;bravo2-soft-lockups.gz attached to LU-12194&quot;&gt;bravo2-soft-lockups.gz&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;</comment>
                            <comment id="246747" author="ys" created="Mon, 6 May 2019 16:44:31 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;From stack trace:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;May  2 02:24:47 bravo2 kernel: NMI watchdog: BUG: soft lockup - CPU#12 stuck for 22s! [ptlrpcd_00_04:11695]
May  2 02:24:47 bravo2 kernel: Modules linked in: osc(OE) mgc(OE) lustre(OE) lmv(OE) fld(OE) mdc(OE) fid(OE) lov(OE) ksocklnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) vfat fat mpt3sas mptctl mptbase nfsv3 nfs fscache dell_rbu bonding intel_powerclamp coretemp kvm acpi_power_meter joydev ipmi_si ipmi_devintf iTCO_wdt irqbypass ipmi_msghandler sg iTCO_vendor_support gpio_ich dcdbas wmi i7core_edac lpc_ich nfsd auth_rpcgss nfs_acl lockd grace binfmt_misc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm scsi_transport_iscsi crct10dif_pclmul ata_piix crct10dif_common crc32_pclmul crc32c_intel drm ghash_clmulni_intel libata mpt2sas aesni_intel 8021q lrw gf128mul garp glue_helper ablk_helper mrp cryptd
May  2 02:24:47 bravo2 kernel: stp dm_multipath llc serio_raw raid_class myri10ge scsi_transport_sas bnx2 drm_panel_orientation_quirks dca sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: usb_storage]
May  2 02:24:47 bravo2 kernel: CPU: 12 PID: 11695 Comm: ptlrpcd_00_04 Kdump: loaded Tainted: G        W IOEL ------------   3.10.0-957.1.3.el7.x86_64 #1
May  2 02:24:47 bravo2 kernel: Hardware name: Dell Inc. PowerEdge R610/0F0XJ6, BIOS 6.4.0 07/23/2013
May  2 02:24:47 bravo2 kernel: task: ffff9ff4036a30c0 ti: ffff9ff42d388000 task.ti: ffff9ff42d388000
May  2 02:24:47 bravo2 kernel: RIP: 0010:[&amp;lt;ffffffffc09f7a08&amp;gt;]  [&amp;lt;ffffffffc09f7a08&amp;gt;] lnet_res_lh_lookup+0x48/0x70 [lnet]
May  2 02:24:47 bravo2 kernel: RSP: 0018:ffff9ff42d38bbc0  EFLAGS: 00000206
May  2 02:24:47 bravo2 kernel: RAX: 0000000000000000 RBX: ffffffffffffff10 RCX: ffffb22686ad0f90
May  2 02:24:47 bravo2 kernel: RDX: ffff9fef08190610 RSI: 00000008d13a57cd RDI: ffff9feeb344f000
May  2 02:24:47 bravo2 kernel: RBP: ffff9ff42d38bbc0 R08: ffff9ffaaf79b780 R09: ffff9ff497c1b780
May  2 02:24:47 bravo2 kernel: R10: 0000000000000000 R11: 000000000000000f R12: 0000000000010001
May  2 02:24:47 bravo2 kernel: R13: ffff9ffaaf61b780 R14: 0000000000610000 R15: 0000000000000000
May  2 02:24:47 bravo2 kernel: FS:  0000000000000000(0000) GS:ffff9ffaaf780000(0000) knlGS:0000000000000000
May  2 02:24:47 bravo2 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
May  2 02:24:47 bravo2 kernel: CR2: 00007f3c4fc10000 CR3: 00000000a7c10000 CR4: 00000000000207e0
May  2 02:24:47 bravo2 kernel: Call Trace:
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0a05f3c&amp;gt;] LNetMDUnlink+0xac/0x180 [lnet]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0c9df2f&amp;gt;] ptlrpc_unregister_reply+0xbf/0x790 [ptlrpc]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0ca2c1a&amp;gt;] ptlrpc_expire_one_request+0xba/0x480 [ptlrpc]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0ca308f&amp;gt;] ptlrpc_expired_set+0xaf/0x1a0 [ptlrpc]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0cd333c&amp;gt;] ptlrpcd+0x29c/0x550 [ptlrpc]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffff9ccd67b0&amp;gt;] ? wake_up_state+0x20/0x20
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffffc0cd30a0&amp;gt;] ? ptlrpcd_check+0x5e0/0x5e0 [ptlrpc]
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffff9ccc1c31&amp;gt;] kthread+0xd1/0xe0
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffff9ccc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffff9d374c37&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
May  2 02:24:47 bravo2 kernel: [&amp;lt;ffffffff9ccc1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
May  2 02:24:47 bravo2 kernel: Code: 00 48 89 f2 83 c1 02 48 d3 ea 48 89 d1 81 e1 ff 0f 00 00 48 c1 e1 04 48 03 4f 20 48 8b 11 48 39 ca 75 10 eb 17 66 0f 1f 44 00 00 &amp;lt;48&amp;gt; 8b 12 48 39 ca 74 10 48 39 72 10 75 f2 48 89 d0 5d c3 0f 1f

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If this thread holds the lock and loops over the list for a long time, then the soft lockup could be triggered. But it is still not very clear why it spends such a long time in there.  Would it be possible to apply a debug patch at your site?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="246778" author="cmcl" created="Tue, 7 May 2019 11:39:16 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Yes, we can apply a debug patch - please let me know what you need me to do.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="246782" author="ys" created="Tue, 7 May 2019 14:48:14 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;That is great. So you just use standard 2.10.7 release without other extra patch on your site?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="246784" author="cmcl" created="Tue, 7 May 2019 15:04:51 +0000"  >&lt;p&gt;Yes, it is a standard 2.10.7 release, with no patches.&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="246982" author="gerrit" created="Fri, 10 May 2019 16:38:49 +0000"  >&lt;p&gt;Yang Sheng (ys@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/34845&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34845&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12194&quot; title=&quot;clients getting soft lockups on 2.10.7&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12194&quot;&gt;LU-12194&lt;/a&gt; lnet: debug patch&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 9c51db3b1fd9d9c47c60da79c9f5fd3f27533c27&lt;/p&gt;</comment>
                            <comment id="247043" author="ys" created="Mon, 13 May 2019 02:46:35 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;The debug patch has passed tests. Do you have a chance to install it at your site?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247118" author="cmcl" created="Tue, 14 May 2019 15:58:50 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Do I need to just build and install the client packages, or do I have to build and install it for the OSSs and MDS?&lt;/p&gt;

&lt;p&gt;Regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="247122" author="pfarrell" created="Tue, 14 May 2019 18:22:38 +0000"  >&lt;p&gt;Campbell,&lt;/p&gt;

&lt;p&gt;This particular patch is only relevant where you&apos;re getting the lockups, so in this case, clients.&lt;/p&gt;</comment>
                            <comment id="247130" author="ys" created="Wed, 15 May 2019 02:44:48 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;As Patrick pointed out, only the clients need this patch.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247160" author="cmcl" created="Wed, 15 May 2019 15:34:53 +0000"  >&lt;p&gt;Thanks - should I use&#160;lustre-client-debuginfo or will lustre-client suffice?&lt;/p&gt;</comment>
                            <comment id="247161" author="pfarrell" created="Wed, 15 May 2019 15:39:40 +0000"  >&lt;p&gt;Well, this will be self regulating - if you just install lustre-client-debuginfo it won&apos;t work &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;edit&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Reading this again, I see you were just suggesting it as an additional package.&#160; Sorry for my flippancy.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;/edit&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;More seriously:&lt;br/&gt;
 Just lustre-client.&#160; lustre-client-debuginfo is additional debug information for the lustre-client package, used when examining a crash, not at runtime (and it doesn&apos;t contain the lustre-client stuff).&lt;/p&gt;</comment>
                            <comment id="247252" author="cmcl" created="Thu, 16 May 2019 13:36:01 +0000"  >&lt;p&gt;No problem, thanks Patrick. Just having issues with configure just erroring out (with no useful error, not even in the config log). It did build successfully on a different kernel source tree to the running kernel, but obviously that&apos;s no use. Will let you know when I have it patched.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 checking &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; /lib/modules/3.10.0-957.1.3.el7.x86_64/source/include/linux/kconfig.h... yes
checking &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; external module build target... configure: error: unknown; check config.log &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; details&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


</comment>
                            <comment id="247254" author="cmcl" created="Thu, 16 May 2019 13:58:47 +0000"  >&lt;p&gt;Just to check: it should build against kernel version 3.10.0-957.1.3.el7.x86_64?&lt;/p&gt;</comment>
                            <comment id="247277" author="ys" created="Thu, 16 May 2019 15:41:44 +0000"  >&lt;p&gt;Yes, It can be built on this version. Just check whether your kernel tree has been prepared proper.&lt;/p&gt;</comment>
                            <comment id="247396" author="cmcl" created="Mon, 20 May 2019 17:24:03 +0000"  >&lt;p&gt;I think it was due to some feature missing from EL 7.4 that caused it to break when compiling kernel version 3.10.0-957, since it built fine on kernel version 3.10.0-693 for example. I ended up building it on a 7.6 host and that worked, so now it is installed so I&apos;ll let you know if we get any results.&lt;/p&gt;

&lt;p&gt;Cheers,&lt;br/&gt;
Campbell&lt;/p&gt;</comment>
                            <comment id="247413" author="ys" created="Tue, 21 May 2019 04:54:56 +0000"  >&lt;p&gt;Hi&#65292; Campbell,&lt;/p&gt;

&lt;p&gt;Thanks for the info. Please monitor the output of /proc/lnet_spt/spt_table.  You can collect it periodically (I think every 1 minute is enough), especially after a soft lockup. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng &lt;/p&gt;</comment>
                            <comment id="247457" author="cmcl" created="Tue, 21 May 2019 17:47:08 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Attached is some spt_table data at the time of the lockups (there were 8 lockup events). Let me know if you need spt_table data outside these times as well as I have collection triggered by lockups only&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="247504" author="ys" created="Wed, 22 May 2019 03:21:56 +0000"  >&lt;p&gt;Hi&#65292; Campbell,&lt;/p&gt;

&lt;p&gt;Since the patch is meant to gather the maximum hold time of the cpt lock, the more recent the data, the better. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Yangsheng &lt;/p&gt;</comment>
                            <comment id="247516" author="cmcl" created="Wed, 22 May 2019 09:30:19 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;As it is collecting spt_table data when there are lockups, I assume that it is showing the maximum hold time of the lock on the cpu - or have I got that wrong? Should I just gather the data every minute? Please let me know what periods you will need.&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="247517" author="ys" created="Wed, 22 May 2019 09:51:39 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Just the latest data is enough, unless you reload the lnet module after a lockup. From the log, it looks like the delay is not so high. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247520" author="cmcl" created="Wed, 22 May 2019 11:52:40 +0000"  >&lt;p&gt;Should I use &apos;lustre_rmmod&apos; and then &apos;modprobe lustre&apos; after a lockup is detected? And should I keep sending you data after lockups or do you have enough to work with for now?&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="247522" author="ys" created="Wed, 22 May 2019 12:35:08 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;--Should I use &apos;lustre_rmmod&apos; and then &apos;modprobe lustre&apos; after a lockup is detected? &lt;br/&gt;
No, Please collect data after lockup without &apos;rmmod&apos;.&lt;/p&gt;

&lt;p&gt;--should I keep sending you data after lockups or do you have enough to work with for now?&lt;br/&gt;
Yes, please send the data after every lockup. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247528" author="cmcl" created="Wed, 22 May 2019 14:18:46 +0000"  >&lt;p&gt;Latest one (only one cpu core locked up):&lt;/p&gt;

&lt;p&gt;LNetEQAlloc:000:0,0,0,0,0,0,0,0,0,0:3:&lt;br/&gt;
LNetEQAlloc:001:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetEQAlloc:002:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetMEAttach:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetMEAttach:001:1863,90,7,0,0,0,0,0,0,0:94076607:&lt;br/&gt;
LNetMEAttach:002:8331,102,44,0,0,0,0,0,0,0:455990518:&lt;br/&gt;
LNetMDAttach:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetMDAttach:001:2874,67,14,0,0,0,0,0,0,0:94076607:&lt;br/&gt;
LNetMDAttach:002:1490,93,62,0,0,0,0,0,0,0:455990518:&lt;br/&gt;
LNetSetLazyPortal:000:0,0,0,0,0,0,0,0,0,0:1:&lt;br/&gt;
LNetSetLazyPortal:001:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetSetLazyPortal:002:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
lnet_res_lock_current:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
lnet_res_lock_current:001:0,0,0,0,0,0,0,0,0,0:208589267:&lt;br/&gt;
lnet_res_lock_current:002:0,0,0,0,0,0,0,0,0,0:337491708:&lt;br/&gt;
LNetPut:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetPut:001:903,904,904,904,213,32,61,62,1,0:208589267:&lt;br/&gt;
LNetPut:002:2042,3861,3862,16575,483,40,56,56,8,0:337491708:&lt;br/&gt;
lnet_finalize:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
lnet_finalize:001:17604,1020,23,0,0,0,0,0,0,0:301706190:&lt;br/&gt;
lnet_finalize:002:18562,110,55,0,0,0,0,0,0,0:789653414:&lt;br/&gt;
lnet_ptl_match_md:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
lnet_ptl_match_md:001:113272,1277,1160,707,0,0,0,0,0,0:94578970:&lt;br/&gt;
lnet_ptl_match_md:002:24143,1081,799,52,0,0,0,0,0,0:459115756:&lt;br/&gt;
LNetMDUnlink:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
LNetMDUnlink:001:6202,11217,15551,15551,319,67,68,69,69,46:93967533:&lt;br/&gt;
LNetMDUnlink:002:212311,212312,212314,212314,173,100,101,101,77,80:446618837:&lt;br/&gt;
lnet_ptl_match_delay:000:0,0,0,0,0,0,0,0,0,0:0:&lt;br/&gt;
lnet_ptl_match_delay:001:62,1,0,0,0,0,0,0,0,0:89980:&lt;br/&gt;
lnet_ptl_match_delay:002:36,20,0,0,0,0,0,0,0,0:47777:&lt;/p&gt;</comment>
                            <comment id="247570" author="ys" created="Thu, 23 May 2019 02:45:51 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;


&lt;p&gt;Could you please collect data as below:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lctl get_param cpu_partition_table
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;
</comment>
                            <comment id="247581" author="cmcl" created="Thu, 23 May 2019 11:18:47 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;All clients have:&lt;/p&gt;

&lt;p&gt;cpu_partition_table=&lt;br/&gt;
0	: 0 2 4 6 8 10 12 14 16 18 20 22&lt;br/&gt;
1	: 1 3 5 7 9 11 13 15 17 19 21 23&lt;/p&gt;

&lt;p&gt;Regards,&lt;br/&gt;
Campbell&lt;/p&gt;</comment>
                            <comment id="247596" author="ys" created="Thu, 23 May 2019 16:57:13 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Please add this line into /etc/modprobe.d/ko2iblnd.conf.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
options libcfs cpu_npartitions=6

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Then reload the Lustre modules to verify whether the lockup still occurs. Please confirm that the setting has taken effect by running &apos;lctl get_param cpu_partition_table&apos;. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247598" author="cmcl" created="Thu, 23 May 2019 17:50:20 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;I added the modprobe line and reloaded lustre modules, but it is not working:&lt;/p&gt;

&lt;p&gt;May 23 18:39:20 bravo2 kernel: LNet: HW NUMA nodes: 2, HW CPU cores: 24, npartitions: 2&lt;br/&gt;
May 23 18:46:16 bravo2 kernel: LNet: HW NUMA nodes: 2, HW CPU cores: 24, npartitions: 2&lt;/p&gt;

&lt;p&gt;cpu_partition_table=&lt;br/&gt;
0	: 0 2 4 6 8 10 12 14 16 18 20 22&lt;br/&gt;
1	: 1 3 5 7 9 11 13 15 17 19 21 23&lt;/p&gt;

&lt;p&gt;/etc/modprobe.d/ko2iblnd.conf&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
alias ko2iblnd-opa ko2iblnd
options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1 conns_per_peer=4 libcfs cpu_npartitions=6

install ko2iblnd /usr/sbin/ko2iblnd-probe
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Having a look at what I&apos;m doing wrong&lt;/p&gt;</comment>
                            <comment id="247611" author="ys" created="Fri, 24 May 2019 01:26:32 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Please add &quot;options libcfs cpu_npartitions=6&quot; as a NEW line. Alternatively, you can run &apos;modprobe libcfs cpu_npartitions=6&apos; &lt;br/&gt;
before mounting Lustre, which avoids changing any files.  &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247615" author="ys" created="Fri, 24 May 2019 02:35:36 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;I note that you have 2 NUMA nodes, so we need to partition explicitly as below:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;options libcfs cpu_pattern=[0,2,4,6,8,10]1[12,14,16,18,20,22]2[1,3,5,7,9,11]3[13,15,17,19,21,23]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Or you can use &apos;modprobe libcfs cpu_pattern=&lt;span class=&quot;error&quot;&gt;&amp;#91;0,2,4,6,8,10&amp;#93;&lt;/span&gt;1&lt;span class=&quot;error&quot;&gt;&amp;#91;12,14,16,18,20,22&amp;#93;&lt;/span&gt;2&lt;span class=&quot;error&quot;&gt;&amp;#91;1,3,5,7,9,11&amp;#93;&lt;/span&gt;3&lt;span class=&quot;error&quot;&gt;&amp;#91;13,15,17,19,21,23&amp;#93;&lt;/span&gt;&apos;&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247634" author="cmcl" created="Fri, 24 May 2019 09:58:09 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Had to modify it slightly to work as it complained:&lt;/p&gt;

&lt;p&gt;May 24 10:38:49 bravo2 kernel: LNetError: 21221:0:(linux-cpu.c:1151:cfs_cpu_init()) Failed to create cptab from pattern &apos;&lt;span class=&quot;error&quot;&gt;&amp;#91;0,2,4,6,8,10&amp;#93;&lt;/span&gt;1&lt;span class=&quot;error&quot;&gt;&amp;#91;12,14,16,18,20,22&amp;#93;&lt;/span&gt;2&lt;span class=&quot;error&quot;&gt;&amp;#91;1,3,5,7,9,11&amp;#93;&lt;/span&gt;3&lt;span class=&quot;error&quot;&gt;&amp;#91;13,15,17,19,21,23&amp;#93;&lt;/span&gt;&apos;&lt;/p&gt;

&lt;p&gt;Modified cpu_pattern to have a partition number for the first set, so I have:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
alias ko2iblnd-opa ko2iblnd
options ko2iblnd-opa peer_credits=128 peer_credits_hiw=64 credits=1024 concurrent_sends=256 ntx=2048 map_on_demand=32 fmr_pool_size=2048 fmr_flush_trigger=512 fmr_cache=1 conns_per_peer=4
options libcfs cpu_npartitions=6
options libcfs cpu_pattern=0[0,2,4,6,8,10]1[12,14,16,18,20,22]2[1,3,5,7,9,11]3[13,15,17,19,21,23]

install ko2iblnd /usr/sbin/ko2iblnd-probe
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So I get:&lt;/p&gt;

&lt;p&gt;cpu_partition_table=&lt;br/&gt;
0	: 0 2 4 6 8 10&lt;br/&gt;
1	: 12 14 16 18 20 22&lt;br/&gt;
2	: 1 3 5 7 9 11&lt;br/&gt;
3	: 13 15 17 19 21 23&lt;/p&gt;

&lt;p&gt;Which looks like what we want I assume.&lt;/p&gt;</comment>
                            <comment id="247635" author="ys" created="Fri, 24 May 2019 10:18:57 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Yes, I am sorry, I had a typo in my comment. Please test with this pattern to see whether the lockup can still be reproduced.&lt;/p&gt;

&lt;p&gt;BTW: The &apos;options libcfs cpu_npartitions=6&apos; can be removed.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="247758" author="ys" created="Mon, 27 May 2019 11:22:01 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;Could you please tell me the status of the site? Are you still collecting spt_table data?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                            <comment id="248351" author="cmcl" created="Tue, 4 Jun 2019 10:28:14 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;I&apos;m not collecting spt_table_data at the moment, but I also haven&apos;t seen any soft lockups since the changes were made. So what next from here? Do I just add these options to all clients on 2.10.7? Or is there a patch imminent to prevent the issue with the default CPU topology?&lt;/p&gt;

&lt;p&gt;Kind regards,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="248367" author="ys" created="Tue, 4 Jun 2019 12:45:30 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;I think you can apply this change to all clients that might be impacted by this issue. I&apos;ll try to push a patch to make this change easier, but I think it could take a long time. So can we close this one first? &lt;/p&gt;

&lt;p&gt;BTW: you can go back to your original Lustre version to remove the debug patch.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Yangsheng&lt;/p&gt;</comment>
                            <comment id="248378" author="cmcl" created="Tue, 4 Jun 2019 14:30:00 +0000"  >&lt;p&gt;Thanks Yangsheng. So the proposed patch will be to modify ko2iblnd.conf?&lt;/p&gt;</comment>
                            <comment id="248418" author="ys" created="Wed, 5 Jun 2019 01:56:13 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;No, the patch will set the CPT automatically, so we needn&apos;t set it manually. We already do this for UMA nodes, but it looks like we do not on NUMA nodes.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Yangsheng&lt;/p&gt;</comment>
                            <comment id="248551" author="cmcl" created="Thu, 6 Jun 2019 16:02:03 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;What is the general rule for setting cpu_npartitions - is it number of NUMA node cpus divided by no. of NUMA nodes?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="249138" author="cmcl" created="Wed, 12 Jun 2019 16:40:02 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;Are you able to confirm what the general rule is for partitioning?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Campbell&lt;/p&gt;</comment>
                            <comment id="249140" author="ys" created="Wed, 12 Jun 2019 17:29:04 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;You can refer to the documentation at &lt;a href=&quot;http://doc.lustre.org/lustre_manual.xhtml#dbdoclet.libcfstuning&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://doc.lustre.org/lustre_manual.xhtml#dbdoclet.libcfstuning&lt;/a&gt;. However, we do not yet have a detailed standard for CPT configuration, since it really depends on the situation. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Yangsheng &lt;/p&gt;</comment>
                            <comment id="249976" author="cmcl" created="Tue, 25 Jun 2019 15:17:12 +0000"  >&lt;p&gt;Hi YangSheng,&lt;/p&gt;

&lt;p&gt;I didn&apos;t see a patch in the 2.10.7 -&amp;gt; 2.10.8 changelog that will set NUMA topology - you mentioned it may take some time to get this patched - do you think it may get done within the next few months? I&apos;m just wondering whether to wait for the patches and upgrade.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Campbell&lt;/p&gt;</comment>
                            <comment id="250042" author="ys" created="Wed, 26 Jun 2019 05:44:23 +0000"  >&lt;p&gt;Hi, Campbell,&lt;/p&gt;

&lt;p&gt;I am testing the patch in our test cluster. Yes, I think it will land in the next few months. Until then, you can set it up via cpu_pattern. &lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
YangSheng&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="54687">LU-11895</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="56667">LU-12667</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32515" name="bravo2-soft-lockups.gz" size="139121" author="cmcl" created="Thu, 2 May 2019 10:27:25 +0000"/>
                            <attachment id="32606" name="spt-table-data-bravo4" size="11741" author="cmcl" created="Tue, 21 May 2019 17:47:18 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00f0n:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>