<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:47:52 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11895] CPU lockup in LNetMDUnlink</title>
                <link>https://jira.whamcloud.com/browse/LU-11895</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This just occurred on a Lustre 2.12.0 client, running Robinhood, many cores with 2 x AMD EPYC 7401 so 96 cpu threads total.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1488057.711176] CPU: 63 PID: 54246 Comm: ptlrpcd_07_06 Kdump: loaded Tainted: G           OEL ------------   3.10.0-957.1.3.el7_lustre.x86_64 #1
[1488057.711177] Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.3.6 04/20/2018
[1488057.711178] task: ffff8abafc522080 ti: ffff8abaf4f54000 task.ti: ffff8abaf4f54000
[1488057.711182] RIP: 0010:[&amp;lt;ffffffff897121e6&amp;gt;]  [&amp;lt;ffffffff897121e6&amp;gt;] native_queued_spin_lock_slowpath+0x126/0x200
[1488057.711183] RSP: 0018:ffff8abaf4f57b78  EFLAGS: 00000246
[1488057.711183] RAX: 0000000000000000 RBX: ffffffffc0ff5a97 RCX: 0000000001f90000
[1488057.711184] RDX: ffff8a9affa9b780 RSI: 0000000000910000 RDI: ffff8adafa7b1b00
[1488057.711185] RBP: ffff8abaf4f57b78 R08: ffff8afb3f9db780 R09: 0000000000000000
[1488057.711185] R10: 0000000000000000 R11: 000000000000000f R12: ffff8aed569e2a00
[1488057.711186] R13: 0005c4aa17c51860 R14: ffff8aed24f64b00 R15: 0000000000000007
[1488057.711187] FS:  00007ea1d389b700(0000) GS:ffff8afb3f9c0000(0000) knlGS:0000000000000000
[1488057.711188] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1488057.711189] CR2: 00007ecf3e8c7ff6 CR3: 000000276ae10000 CR4: 00000000003407e0
[1488057.711190] Call Trace:
[1488057.711192]  [&amp;lt;ffffffff89d5bfcb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[1488057.711193]  [&amp;lt;ffffffff89d6a480&amp;gt;] _raw_spin_lock+0x20/0x30
[1488057.711201]  [&amp;lt;ffffffffc0c23418&amp;gt;] cfs_percpt_lock+0x58/0x110 [libcfs]
[1488057.711211]  [&amp;lt;ffffffffc0c889d8&amp;gt;] LNetMDUnlink+0x78/0x180 [lnet]
[1488057.711250]  [&amp;lt;ffffffffc0f276bf&amp;gt;] ptlrpc_unregister_reply+0xbf/0x790 [ptlrpc]
[1488057.711287]  [&amp;lt;ffffffffc0f2c35e&amp;gt;] ptlrpc_expire_one_request+0xee/0x520 [ptlrpc]
[1488057.711324]  [&amp;lt;ffffffffc0f2c83f&amp;gt;] ptlrpc_expired_set+0xaf/0x1a0 [ptlrpc]
[1488057.711362]  [&amp;lt;ffffffffc0f5cc5c&amp;gt;] ptlrpcd+0x28c/0x550 [ptlrpc]
[1488057.711364]  [&amp;lt;ffffffff896d67b0&amp;gt;] ? wake_up_state+0x20/0x20
[1488057.711402]  [&amp;lt;ffffffffc0f5c9d0&amp;gt;] ? ptlrpcd_check+0x590/0x590 [ptlrpc]
[1488057.711404]  [&amp;lt;ffffffff896c1c31&amp;gt;] kthread+0xd1/0xe0
[1488057.711406]  [&amp;lt;ffffffff896c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
[1488057.711408]  [&amp;lt;ffffffff89d74c24&amp;gt;] ret_from_fork_nospec_begin+0xe/0x21
[1488057.711410]  [&amp;lt;ffffffff896c1b60&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;A lot of CPU were stucks in LNetMDUnlink. Server crashed with hard lockup at the end. vmcore available on demand. Attached vmcore-dmesg.txt&lt;/p&gt;

&lt;p&gt;I kept the default lru_size/lru_max_age 0/3900000 values on this server, so I&apos;ll try to reduce them as follow:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;lru_size=100&lt;/li&gt;
	&lt;li&gt;lru_max_age=1200&lt;br/&gt;
like on our Lustre 2.10 robinhood server for Oak.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Any other recommendation welcomed.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
Stephane&lt;/p&gt;
</description>
                <environment>CentOS 7.6, kernel 3.10.0-957.1.3.el7_lustre.x86_64, MOFED 4.5</environment>
        <key id="54687">LU-11895</key>
            <summary>CPU lockup in LNetMDUnlink</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Mon, 28 Jan 2019 23:36:00 +0000</created>
                <updated>Mon, 11 Jul 2022 18:51:13 +0000</updated>
                                            <version>Lustre 2.12.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="240889" author="pjones" created="Tue, 29 Jan 2019 18:56:36 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please investigate?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="240891" author="adilger" created="Tue, 29 Jan 2019 18:59:31 +0000"  >&lt;p&gt;How many CPTs on this system?  &lt;tt&gt;lctl get_param cpu*&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="240892" author="sthiell" created="Tue, 29 Jan 2019 19:05:14 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-rbh01 ~]# lctl get_param cpu*
cpu_partition_distance=
0	: 0:10 1:16 2:16 3:16 4:28 5:28 6:22 7:28
1	: 0:16 1:10 2:16 3:16 4:28 5:28 6:28 7:22
2	: 0:16 1:16 2:10 3:16 4:22 5:28 6:28 7:28
3	: 0:16 1:16 2:16 3:10 4:28 5:22 6:28 7:28
4	: 0:28 1:28 2:22 3:28 4:10 5:16 6:16 7:16
5	: 0:28 1:28 2:28 3:22 4:16 5:10 6:16 7:16
6	: 0:22 1:28 2:28 3:28 4:16 5:16 6:10 7:16
7	: 0:28 1:22 2:28 3:28 4:16 5:16 6:16 7:10
cpu_partition_table=
0	: 0 8 16 24 32 40 48 56 64 72 80 88
1	: 2 10 18 26 34 42 50 58 66 74 82 90
2	: 4 12 20 28 36 44 52 60 68 76 84 92
3	: 6 14 22 30 38 46 54 62 70 78 86 94
4	: 1 9 17 25 33 41 49 57 65 73 81 89
5	: 3 11 19 27 35 43 51 59 67 75 83 91
6	: 5 13 21 29 37 45 53 61 69 77 85 93
7	: 7 15 23 31 39 47 55 63 71 79 87 95
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I think that makes sense as each EPYC CPU socket has 4 domains.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@fir-rbh01 ~]# numactl --hardware
available: 8 nodes (0-7)
node 0 cpus: 0 8 16 24 32 40 48 56 64 72 80 88
node 0 size: 65213 MB
node 0 free: 41206 MB
node 1 cpus: 2 10 18 26 34 42 50 58 66 74 82 90
node 1 size: 65535 MB
node 1 free: 34082 MB
node 2 cpus: 4 12 20 28 36 44 52 60 68 76 84 92
node 2 size: 65535 MB
node 2 free: 63939 MB
node 3 cpus: 6 14 22 30 38 46 54 62 70 78 86 94
node 3 size: 65535 MB
node 3 free: 63611 MB
node 4 cpus: 1 9 17 25 33 41 49 57 65 73 81 89
node 4 size: 65535 MB
node 4 free: 63923 MB
node 5 cpus: 3 11 19 27 35 43 51 59 67 75 83 91
node 5 size: 65535 MB
node 5 free: 63569 MB
node 6 cpus: 5 13 21 29 37 45 53 61 69 77 85 93
node 6 size: 65535 MB
node 6 free: 63449 MB
node 7 cpus: 7 15 23 31 39 47 55 63 71 79 87 95
node 7 size: 65535 MB
node 7 free: 63535 MB
node distances:
node   0   1   2   3   4   5   6   7 
  0:  10  16  16  16  28  28  22  28 
  1:  16  10  16  16  28  28  28  22 
  2:  16  16  10  16  22  28  28  28 
  3:  16  16  16  10  28  22  28  28 
  4:  28  28  22  28  10  16  16  16 
  5:  28  28  28  22  16  10  16  16 
  6:  22  28  28  28  16  16  10  16 
  7:  28  22  28  28  16  16  16  10
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="242002" author="ashehata" created="Thu, 14 Feb 2019 20:23:22 +0000"  >&lt;p&gt;This appears similar to: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11100&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-11100&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Would you be able to try the patches suggested there?&lt;/p&gt;

&lt;p&gt;In the meantime if you&apos;re able to make the vmcore and vmlinux available so I can take a further look, that&apos;ll be helpful.&lt;/p&gt;</comment>
                            <comment id="245450" author="sthiell" created="Mon, 8 Apr 2019 23:41:57 +0000"  >&lt;p&gt;Hi Amir,&lt;/p&gt;

&lt;p&gt;Oops sorry for being so slow on this one, we had seen only the problem once so far. But today a login node started to do the same, however it did recover by itself!&lt;/p&gt;

&lt;p&gt;Anyway, we had a lot of soft lockups, like that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1230992.435041] NMI watchdog: BUG: soft lockup - CPU#8 stuck for 23s! [ptlrpcd_00_10:42801]
[1230992.444173] Modules linked in: binfmt_misc squashfs overlay(T) rpcsec_gss_krb5 nfsv4 dns_resolver fuse mgc(OE) lustre(OE) lmv(OE) mdc(OE) fid(OE) osc(OE) lov(OE) fld(OE) ptlrpc(OE) ko2iblnd(OE) obdclass(OE)
 lnet(OE) libcfs(OE) xt_multiport ip_set_hash_ip nfsv3 nfs_acl nfs lockd grace ip6t_MASQUERADE nf_nat_masquerade_ipv6 ipt_MASQUERADE nf_nat_masquerade_ipv4 xt_set ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJ
ECT nf_reject_ipv6 xt_conntrack ip_set_hash_net ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw
 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
[1230992.524464]  rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_fpga_tools(OE) mlx5_ib(OE) ib_uverbs(OE) mlx5_core(OE) mlxfw(OE) mlx4_en(OE) dell_rbu cachefiles fscache sb
_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm mgag200 i2c_algo_bit irqbypass ttm drm_kms_helper crc32_pclmul syscopyarea ghash_clmulni_intel sysfillrect sysimgblt aesni_intel lrw fb_sys_fops 
gf128mul drm iTCO_wdt iTCO_vendor_support glue_helper ablk_helper dcdbas cryptd cdc_ether usbnet mii drm_panel_orientation_quirks pcspkr sg ipmi_si joydev wmi lpc_ich ipmi_devintf mei_me mei ipmi_msghandler acpi
_pad acpi_power_meter ext4 mbcache jbd2 loop auth_rpcgss sunrpc ip_tables xfs mlx4_ib(OE) ib_core(OE) sr_mod cdrom sd_mod crc_t10dif crct10dif_generic bnx2x ahci crct10dif_pclmul
[1230992.602267]  mlx4_core(OE) crct10dif_common libahci mlx_compat(OE) crc32c_intel mdio devlink ptp libata megaraid_sas pps_core libcrc32c
[1230992.614517] CPU: 8 PID: 42801 Comm: ptlrpcd_00_10 Kdump: loaded Tainted: G           OEL ------------ T 3.10.0-957.10.1.el7.x86_64 #1
[1230992.628103] Hardware name: Dell Inc. PowerEdge R620/0GFKVD, BIOS 2.7.0 05/23/2018
[1230992.636647] task: ffff8eb9f3334100 ti: ffff8eb9f47c4000 task.ti: ffff8eb9f47c4000
[1230992.645192] RIP: 0010:[&amp;lt;ffffffff9cd12226&amp;gt;]  [&amp;lt;ffffffff9cd12226&amp;gt;] native_queued_spin_lock_slowpath+0x126/0x200
[1230992.656465] RSP: 0018:ffff8eb9f47c7b78  EFLAGS: 00000246
[1230992.662583] RAX: 0000000000000000 RBX: ffffffffc1014956 RCX: 0000000000410000
[1230992.670738] RDX: ffff8eba1f09b780 RSI: 0000000000290001 RDI: ffff8eb21a6e0a00
[1230992.678893] RBP: ffff8eb9f47c7b78 R08: ffff8eb21f91b780 R09: 0000000000000000
[1230992.687047] R10: 0000000000000000 R11: 000000000000000f R12: ffff8eb01a572a00
[1230992.695201] R13: 0005c991f29a0f30 R14: ffff8eb69a22b000 R15: 0000000000000007
[1230992.703356] FS:  0000000000000000(0000) GS:ffff8eb21f900000(0000) knlGS:0000000000000000
[1230992.712577] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1230992.719182] CR2: 00007fd45e897090 CR3: 0000000fb8c10000 CR4: 00000000001607e0
[1230992.727339] Call Trace:
[1230992.730262]  [&amp;lt;ffffffff9d35cfcb&amp;gt;] queued_spin_lock_slowpath+0xb/0xf
[1230992.737453]  [&amp;lt;ffffffff9d36b480&amp;gt;] _raw_spin_lock+0x20/0x30
[1230992.743785]  [&amp;lt;ffffffffc0bbe418&amp;gt;] cfs_percpt_lock+0x58/0x110 [libcfs]
[1230992.751181]  [&amp;lt;ffffffffc0c3c9d8&amp;gt;] LNetMDUnlink+0x78/0x180 [lnet]
[1230992.758106]  [&amp;lt;ffffffffc0fa685f&amp;gt;] ptlrpc_unregister_reply+0xbf/0x790 [ptlrpc]
[1230992.766280]  [&amp;lt;ffffffffc0fab4fe&amp;gt;] ptlrpc_expire_one_request+0xee/0x520 [ptlrpc]
[1230992.774648]  [&amp;lt;ffffffffc0fab9df&amp;gt;] ptlrpc_expired_set+0xaf/0x1a0 [ptlrpc]
[1230992.782343]  [&amp;lt;ffffffffc0fdae8c&amp;gt;] ptlrpcd+0x28c/0x550 [ptlrpc]
[1230992.789047]  [&amp;lt;ffffffff9ccd67f0&amp;gt;] ? wake_up_state+0x20/0x20
[1230992.795477]  [&amp;lt;ffffffffc0fdac00&amp;gt;] ? ptlrpcd_check+0x590/0x590 [ptlrpc]
[1230992.802956]  [&amp;lt;ffffffff9ccc1c71&amp;gt;] kthread+0xd1/0xe0
[1230992.808590]  [&amp;lt;ffffffff9ccc1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
[1230992.815585]  [&amp;lt;ffffffff9d375c37&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[1230992.823063]  [&amp;lt;ffffffff9ccc1ba0&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When I checked at this time, the login node (lustre client) was completely loaded with its 32 CPUs taking 100% cpu each (in ptlrpcd).&lt;/p&gt;

&lt;p&gt;Then, the client recovered with the following logs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1231021.312842] Lustre: 42815:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1554765376/real 1554764903]  req@ffff8eb33a496000 x1629003522849584/t0(0) o103-&amp;gt;fir-OST0007-osc-ffff8eba05ae6000@10.0.10.102@o2ib7:17/18 lens 328/224 e 0 to 1 dl 1554765478 ref 2 fl Rpc:X/2/ffffffff rc 0/-1
[1231021.345646] Lustre: 42815:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1744666 previous similar messages
[1231161.240794] LNetError: 42764:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[1231161.254982] LNetError: 42764:0:(lib-msg.c:811:lnet_is_health_check()) Skipped 1 previous similar message
[1231171.900676] LNetError: 42765:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[1231171.914863] LNetError: 42765:0:(lib-msg.c:811:lnet_is_health_check()) Skipped 1 previous similar message
[1231247.531671] Lustre: fir-OST000d-osc-ffff8eba05ae6000: Connection to fir-OST000d (at 10.0.10.104@o2ib7) was lost; in progress operations using this service will wait for recovery to complete
[1231247.550706] Lustre: Skipped 104 previous similar messages                  
[1231374.205074] LNetError: 42762:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[1231374.219259] LNetError: 42762:0:(lib-msg.c:811:lnet_is_health_check()) Skipped 1 previous similar message
[1231416.399571] LNetError: 42768:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[1231416.413765] LNetError: 42768:0:(lib-msg.c:811:lnet_is_health_check()) Skipped 1 previous similar message
[1231486.750358] LNetError: 42768:0:(lib-msg.c:811:lnet_is_health_check()) Msg is in inconsistent state, don&apos;t perform health checking (-125, 0)
[1231486.755230] LustreError: 167-0: fir-MDT0003-mdc-ffff8eba05ae6000: This client was evicted by fir-MDT0003; in progress operations using this service will fail.
[1231486.755365] LustreError: 34127:0:(file.c:4393:ll_inode_revalidate_fini()) fir: revalidate FID [0x28000f583:0x2cf0:0x0] error: rc = -5
[1231486.755368] LustreError: 34127:0:(file.c:4393:ll_inode_revalidate_fini()) Skipped 1 previous similar message
[1231486.805305] LNetError: 42768:0:(lib-msg.c:811:lnet_is_health_check()) Skipped 1 previous similar message
[1231486.844678] Lustre: fir-MDT0003-mdc-ffff8eba05ae6000: Connection restored to 10.0.10.52@o2ib7 (at 10.0.10.52@o2ib7)
[1231486.856531] Lustre: Skipped 152 previous similar messages                  
[1231555.033198] Lustre: Evicted from MGS (at 10.0.10.51@o2ib7) after server handle changed from 0x5c08f3702ce50dae to 0x5c08f37030f7e96c
[1231557.546491] Lustre: Evicted from fir-MDT0000_UUID (at 10.0.10.51@o2ib7) after server handle changed from 0x5c08f3702ce68e45 to 0x5c08f37030f7e957
[1231557.561265] LustreError: 167-0: fir-MDT0000-mdc-ffff8eba05ae6000: This client was evicted by fir-MDT0000; in progress operations using this service will fail.
[1231557.578742] LustreError: 31049:0:(file.c:4393:ll_inode_revalidate_fini()) fir: revalidate FID [0x240000406:0x138:0x0] error: rc = -5
[1231557.592241] LustreError: 31049:0:(file.c:4393:ll_inode_revalidate_fini()) Skipped 9 previous similar messages
[1231558.858554] LustreError: 32190:0:(file.c:216:ll_close_inode_openhandle()) fir-clilmv-ffff8eba05ae6000: inode [0x20000fd69:0x154bf:0x0] mdc close failed: rc = -5
[1231566.830744] LustreError: 32327:0:(file.c:4393:ll_inode_revalidate_fini()) fir: revalidate FID [0x240000406:0x138:0x0] error: rc = -5
[1231566.844246] LustreError: 32327:0:(file.c:4393:ll_inode_revalidate_fini()) Skipped 2 previous similar messages
[1231596.575109] LustreError: 166-1: MGC10.0.10.51@o2ib7: Connection to MGS (at 10.0.10.51@o2ib7) was lost; in progress operations using this service will fail
[1231621.286801] Lustre: 42791:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for sent delay: [sent 1554766045/real 0]  req@ffff8eace510a900 x1629003644418896/t0(0) o103-&amp;gt;fir-OST002b-osc-ffff8eba05ae6000@10.0.10.108@o2ib7:17/18 lens 328/224 e 0 to 1 dl 1554766077 ref 2 fl Rpc:X/0/ffffffff rc 0/-1
[1231621.318732] Lustre: 42791:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 1009312 previous similar messages
[1231723.524429] LustreError: 43001:0:(mgc_request.c:599:do_requeue()) failed processing log: -5
[1231873.637411] Lustre: DEBUG MARKER: Mon Apr  8 16:32:27 2019   
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;We are not running with the patches from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11100&quot; title=&quot;Clients hangs in LNetMDUnlink&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11100&quot;&gt;&lt;del&gt;LU-11100&lt;/del&gt;&lt;/a&gt; for now. I&apos;ll see if I can find the original crash dump.&lt;/p&gt;</comment>
                            <comment id="245453" author="sthiell" created="Tue, 9 Apr 2019 00:04:13 +0000"  >&lt;p&gt;Uploaded original crash dump of &lt;tt&gt;fir-rbh01&lt;/tt&gt; to your FTP as &lt;tt&gt;vmcore-fir-rbh01-2019-01-28-15-10-41.gz&lt;/tt&gt;&lt;/p&gt;

&lt;p&gt;kernel used is &lt;tt&gt;3.10.0-957.1.3.el7_lustre.x86_64&lt;/tt&gt;&#160;(even tough it&apos;s not a lustre server).&lt;br/&gt;
 &lt;tt&gt;kernel-debuginfo-3.10.0-957.1.3.el7_lustre.x86_64.rpm&lt;/tt&gt; and -common should be available already on the FTP&lt;/p&gt;

&lt;p&gt;The issue that occurred today is on a server with less CPUs (dual Xeon CPU E5-2650 v2) and running kernel &lt;tt&gt;3.10.0-957.10.1.el7.x86_64&lt;/tt&gt; (unpatched).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="55434">LU-12194</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="31884" name="fir-rbh01-vmcore-dmesg.txt" size="1049890" author="sthiell" created="Mon, 28 Jan 2019 23:36:24 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00afj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>