<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:50:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12162] Major issue on OSS after upgrading Oak to 2.10.7</title>
                <link>https://jira.whamcloud.com/browse/LU-12162</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We recently upgraded Oak servers to 2.10.7 but since then, the OSS are showing a lot of these errors:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[29459.670359] Pid: 184632, comm: ll_ost_io00_061 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Mon Oct 8 11:21:37 PDT 2018
[29459.682008] Call Trace:
[29459.684747]  [&amp;lt;ffffffffa9996684&amp;gt;] __lock_page+0x74/0x90
[29459.690595]  [&amp;lt;ffffffffa99971a4&amp;gt;] __find_lock_page+0x54/0x70
[29459.696924]  [&amp;lt;ffffffffa9997e14&amp;gt;] find_or_create_page+0x34/0xa0
[29459.703545]  [&amp;lt;ffffffffc1257ac5&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[29459.711052]  [&amp;lt;ffffffffc13d18fb&amp;gt;] ofd_preprw+0x6bb/0x1170 [ofd]
[29459.717672]  [&amp;lt;ffffffffc0dc7845&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[29459.724829]  [&amp;lt;ffffffffc0dc572a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[29459.732543]  [&amp;lt;ffffffffc0d6e04b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[29459.741128]  [&amp;lt;ffffffffc0d71792&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[29459.748166]  [&amp;lt;ffffffffa98bdf21&amp;gt;] kthread+0xd1/0xe0
[29459.753622]  [&amp;lt;ffffffffa9f255f7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29459.760630]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29459.766199] LustreError: dumping log to /tmp/lustre-log.1554392681.184632
[29558.439589] Lustre: 184902:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/-150), not sending early reply
  req@ffff883e21427450 x1628649065264016/t0(0) o3-&amp;gt;214046b3-82aa-4e1f-6785-53da21760b7f@10.8.7.25@o2ib6:295/0 lens 488/432 e 0 to 0 dl 1554392785 ref 2 fl Interpret:/2/0 rc 0/0
[29558.471803] Lustre: 184902:0:(service.c:1346:ptlrpc_at_send_early_reply()) Skipped 86 previous similar messages
[29565.756656] LustreError: 184652:0:(ldlm_lib.c:3199:target_bulk_io()) @@@ bulk READ failed: rc -107  req@ffff8841a2c23c50 x1629017475649200/t0(0) o3-&amp;gt;9c21af89-2623-9dd8-0966-6a7e83d1ca0c@10.9.105.61@o2ib4:106/0 lens 488/432 e 0 to 0 dl 1554392596 ref 1 fl Interpret:/2/0 rc 0/0
[29565.783924] LustreError: 184652:0:(ldlm_lib.c:3199:target_bulk_io()) Skipped 41 previous similar messages
[29565.794652] Lustre: 184652:0:(service.c:2114:ptlrpc_server_handle_request()) @@@ Request took longer than estimated (600:191s); client may timeout.  req@ffff8841a2c23c50 x1629017475649200/t0(0) o3-&amp;gt;9c21af89-2623-9dd8-0966-6a7e83d1ca0c@10.9.105.61@o2ib4:106/0 lens 488/432 e 0 to 0 dl 1554392596 ref 1 fl Complete:/2/ffffffff rc -107/-1
[29565.827647] Lustre: 184652:0:(service.c:2114:ptlrpc_server_handle_request()) Skipped 4 previous similar messages
[29592.627263] Lustre: oak-OST002f: deleting orphan objects from 0x0:23118956 to 0x0:23118977
[29652.157536] LNet: Service thread pid 184907 was inactive for 1200.29s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29652.176670] Pid: 184907, comm: ll_ost_io01_098 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Mon Oct 8 11:21:37 PDT 2018
[29652.188320] Call Trace:
[29652.191067]  [&amp;lt;ffffffffa9996684&amp;gt;] __lock_page+0x74/0x90
[29652.196917]  [&amp;lt;ffffffffa99971a4&amp;gt;] __find_lock_page+0x54/0x70
[29652.203246]  [&amp;lt;ffffffffa9997e14&amp;gt;] find_or_create_page+0x34/0xa0
[29652.209865]  [&amp;lt;ffffffffc1257ac5&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[29652.217369]  [&amp;lt;ffffffffc13d18fb&amp;gt;] ofd_preprw+0x6bb/0x1170 [ofd]
[29652.223997]  [&amp;lt;ffffffffc0dc7845&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[29652.231172]  [&amp;lt;ffffffffc0dc572a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[29652.238889]  [&amp;lt;ffffffffc0d6e04b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[29652.247474]  [&amp;lt;ffffffffc0d71792&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[29652.254504]  [&amp;lt;ffffffffa98bdf21&amp;gt;] kthread+0xd1/0xe0
[29652.259960]  [&amp;lt;ffffffffa9f255f7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29652.266967]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[29652.272539] LustreError: dumping log to /tmp/lustre-log.1554392874.184907
[29688.912990] Lustre: oak-OST0023: Connection restored to MGC10.0.2.51@o2ib5_1 (at 10.0.2.52@o2ib5)
[29688.922902] Lustre: Skipped 242 previous similar messages
[29688.929136] Lustre: oak-OST0023: deleting orphan objects from 0x0:23044807 to 0x0:23044835
[29690.017000] Lustre: oak-OST001d: deleting orphan objects from 0x0:24237516 to 0x0:24237537
[29690.216982] Lustre: oak-OST000d: deleting orphan objects from 0x0:22136235 to 0x0:22136257
[29692.305008] Lustre: oak-OST0007: deleting orphan objects from 0x0:21489097 to 0x0:21489122
[29692.721065] Lustre: oak-OST0021: deleting orphan objects from 0x0:22812231 to 0x0:22812257
[29754.557446] LNet: Service thread pid 184833 was inactive for 1201.54s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[29754.576566] Pid: 184833, comm: ll_ost_io01_080 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Mon Oct 8 11:21:37 PDT 2018
[29754.588216] Call Trace:
[29754.590953]  [&amp;lt;ffffffffa9996684&amp;gt;] __lock_page+0x74/0x90
[29754.596802]  [&amp;lt;ffffffffa99971a4&amp;gt;] __find_lock_page+0x54/0x70
[29754.603129]  [&amp;lt;ffffffffa9997e14&amp;gt;] find_or_create_page+0x34/0xa0
[29754.609750]  [&amp;lt;ffffffffc1257ac5&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[29754.617253]  [&amp;lt;ffffffffc13d18fb&amp;gt;] ofd_preprw+0x6bb/0x1170 [ofd]
[29754.623881]  [&amp;lt;ffffffffc0dc7845&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[29754.631039]  [&amp;lt;ffffffffc0dc572a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[29754.638753]  [&amp;lt;ffffffffc0d6e04b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[29754.647335]  [&amp;lt;ffffffffc0d71792&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[29754.654363]  [&amp;lt;ffffffffa98bdf21&amp;gt;] kthread+0xd1/0xe0
[29754.659821]  [&amp;lt;ffffffffa9f255f7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[29754.666831]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Users are reporting problem accessing some file data (from 2.12.0 clients).&lt;/p&gt;

&lt;p&gt;When checking with htop, the OSS are quiet, definitively too quiet, but the load keeps increasing, which makes me think there is some kind of thread deadlocks on the OSS. I&apos;ll attach logs soon.&lt;/p&gt;</description>
                <environment>Most clients are 2.12.0 (Sherlock) behind 2.12 routers, a few local clients are 2.10.6</environment>
        <key id="55355">LU-12162</key>
            <summary>Major issue on OSS after upgrading Oak to 2.10.7</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="pjones">Peter Jones</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Thu, 4 Apr 2019 15:58:17 +0000</created>
                <updated>Fri, 16 Aug 2019 17:35:51 +0000</updated>
                                            <version>Lustre 2.10.7</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="245250" author="pfarrell" created="Thu, 4 Apr 2019 16:04:34 +0000"  >&lt;p&gt;I would love, like, three more lines (earlier) of the kern log/dmesg there, Stephane. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&#160; Can&apos;t quite see &lt;b&gt;why&lt;/b&gt; that trace happened.&lt;/p&gt;

&lt;p&gt;I&apos;m guessing that&apos;s in what you attached.&lt;/p&gt;</comment>
                            <comment id="245251" author="sthiell" created="Thu, 4 Apr 2019 16:09:07 +0000"  >&lt;p&gt;We also upgraded the kernel from EL7.4 to EL7.5 so it could be it&apos;s more a kernel issue. We&apos;re using mdraid and ldiskfs. I&apos;ll prepare a kernel downgrade to revert back to the same kernel that we had and let you know.&lt;/p&gt;</comment>
                            <comment id="245252" author="sthiell" created="Thu, 4 Apr 2019 16:11:00 +0000"  >&lt;p&gt;Hi Patrick! Yes, it was just a snippet sorry, this should be in what I attached.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[Thu Apr  4 08:42:08 2019][29306.558140] Lustre: oak-OST0017: Client baf44bcb-3eb8-9027-d0b9-2a322ccc6d81 (at 10.9.105.58@o2ib4) reconnecting
[Thu Apr  4 08:42:08 2019][29306.569507] Lustre: Skipped 185 previous similar messages
[Thu Apr  4 08:42:55 2019][29353.149646] LNet: Service thread pid 184642 was inactive for 1200.77s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
[Thu Apr  4 08:42:55 2019][29353.164207] LNet: Skipped 9 previous similar messages
[Thu Apr  4 08:42:55 2019][29353.169846] LustreError: dumping log to /tmp/lustre-log.1554392575.184642
[Thu Apr  4 08:44:33 2019][29451.390645] Lustre: oak-OST001b: Bulk IO write error with 7cf5cc42-1c3f-487d-ba0a-8c0dd9f06aae (at 10.9.114.12@o2ib4), client will retry: rc = -110
[Thu Apr  4 08:44:33 2019][29451.405399] Lustre: Skipped 54 previous similar messages
[Thu Apr  4 08:44:41 2019][29459.645596] LNet: Service thread pid 184632 was inactive for 1202.75s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[Thu Apr  4 08:44:41 2019][29459.664718] LNet: Skipped 4 previous similar messages
[Thu Apr  4 08:44:41 2019][29459.670359] Pid: 184632, comm: ll_ost_io00_061 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Mon Oct 8 11:21:37 PDT 2018
[Thu Apr  4 08:44:41 2019][29459.682008] Call Trace:
[Thu Apr  4 08:44:41 2019][29459.684747]  [&amp;lt;ffffffffa9996684&amp;gt;] __lock_page+0x74/0x90
[Thu Apr  4 08:44:41 2019][29459.690595]  [&amp;lt;ffffffffa99971a4&amp;gt;] __find_lock_page+0x54/0x70
[Thu Apr  4 08:44:41 2019][29459.696924]  [&amp;lt;ffffffffa9997e14&amp;gt;] find_or_create_page+0x34/0xa0
[Thu Apr  4 08:44:41 2019][29459.703545]  [&amp;lt;ffffffffc1257ac5&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[Thu Apr  4 08:44:41 2019][29459.711052]  [&amp;lt;ffffffffc13d18fb&amp;gt;] ofd_preprw+0x6bb/0x1170 [ofd]
[Thu Apr  4 08:44:41 2019][29459.717672]  [&amp;lt;ffffffffc0dc7845&amp;gt;] tgt_brw_read+0x975/0x1860 [ptlrpc]
[Thu Apr  4 08:44:41 2019][29459.724829]  [&amp;lt;ffffffffc0dc572a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[Thu Apr  4 08:44:41 2019][29459.732543]  [&amp;lt;ffffffffc0d6e04b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[Thu Apr  4 08:44:41 2019][29459.741128]  [&amp;lt;ffffffffc0d71792&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[Thu Apr  4 08:44:41 2019][29459.748166]  [&amp;lt;ffffffffa98bdf21&amp;gt;] kthread+0xd1/0xe0
[Thu Apr  4 08:44:41 2019][29459.753622]  [&amp;lt;ffffffffa9f255f7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[Thu Apr  4 08:44:41 2019][29459.760630]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Thu Apr  4 08:44:41 2019][29459.766199] LustreError: dumping log to /tmp/lustre-log.1554392681.184632
[Thu Apr  4 08:46:20 2019][29558.439589] Lustre: 184902:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/-150), not sending early reply
[Thu Apr  4 08:46:20 2019][29558.439589]   req@ffff883e21427450 x1628649065264016/t0(0) o3-&amp;gt;214046b3-82aa-4e1f-6785-53da21760b7f@10.8.7.25@o2ib6:295/0 lens 488/432 e 0 to 0 dl 1554392785 ref 2 fl Interpret:/2/0 rc 0/0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="245258" author="sthiell" created="Thu, 4 Apr 2019 18:00:55 +0000"  >&lt;p&gt;Downgraded our kernel to the one we know work well, &lt;tt&gt;3.10.0-693.2.2.el7_lustre.pl3.x86_64&lt;/tt&gt; but same thing shortly after I mount some OSTs.&lt;/p&gt;

&lt;p&gt;We&apos;ll downgrade to a previous version of Lustre (I&apos;ll try 2.10.6 but OSS were in 2.10.5 before). This is with 2.10.7:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 04 10:53:41 oak-io2-s1 kernel: LustreError: 274592:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk WRITE after 100+0s  req@ffff883c7ad8fc50 x1628738268119872/t0(0) o4-&amp;gt;16192e1c-b70c-8fd7-57d7-f3dd08
Apr 04 10:53:41 oak-io2-s1 kernel: LustreError: 274592:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 1 previous similar message
Apr 04 10:53:41 oak-io2-s1 kernel: Lustre: oak-OST0036: Bulk IO write error with 16192e1c-b70c-8fd7-57d7-f3dd08bd3bbc (at 10.8.2.16@o2ib6), client will retry: rc = -110
Apr 04 10:53:48 oak-io2-s1 kernel: Lustre: oak-OST005a: Bulk IO read error with ab5c224c-4fd0-5e76-d8f2-f040c05793fa (at 10.9.112.17@o2ib4), client will retry: rc -110
Apr 04 10:54:49 oak-io2-s1 kernel: LustreError: 274606:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk READ after 100+0s  req@ffff883f19293450 x1628689723090960/t0(0) o3-&amp;gt;6172323c-7b29-e8ff-0f71-3b394ff
Apr 04 10:54:49 oak-io2-s1 kernel: Lustre: oak-OST005a: Bulk IO read error with 996c60db-fb86-b0ec-9a52-31daedfb544e (at 10.9.108.17@o2ib4), client will retry: rc -110
Apr 04 10:54:49 oak-io2-s1 kernel: LustreError: 274606:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 4 previous similar messages
Apr 04 10:54:49 oak-io2-s1 kernel: Lustre: oak-OST004e: Bulk IO write error with 96144f0d-ee16-e204-f990-dd9b0819c753 (at 10.9.108.40@o2ib4), client will retry: rc = -110
Apr 04 10:54:49 oak-io2-s1 kernel: Lustre: Skipped 2 previous similar messages
Apr 04 10:54:50 oak-io2-s1 kernel: LNet: Service thread pid 274648 was inactive for 200.68s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purpos
Apr 04 10:54:50 oak-io2-s1 kernel: Pid: 274648, comm: ll_ost_io00_015 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
Apr 04 10:54:50 oak-io2-s1 kernel: Call Trace:
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b4b48e&amp;gt;] target_bulk_io+0x4ae/0xab0 [ptlrpc]
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf57f2&amp;gt;] tgt_brw_read+0xf32/0x1850 [ptlrpc]
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:54:50 oak-io2-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 04 10:54:50 oak-io2-s1 kernel: LustreError: dumping log to /tmp/lustre-log.1554400490.274648
Apr 04 10:54:54 oak-io2-s1 kernel: LNet: Service thread pid 275319 was inactive for 200.48s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purpos
Apr 04 10:54:54 oak-io2-s1 kernel: Pid: 275319, comm: ll_ost_io00_036 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
Apr 04 10:54:54 oak-io2-s1 kernel: Call Trace:
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b4b48e&amp;gt;] target_bulk_io+0x4ae/0xab0 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf57f2&amp;gt;] tgt_brw_read+0xf32/0x1850 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 04 10:54:54 oak-io2-s1 kernel: LustreError: dumping log to /tmp/lustre-log.1554400494.275319
Apr 04 10:54:54 oak-io2-s1 kernel: Pid: 275455, comm: ll_ost_io01_123 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
Apr 04 10:54:54 oak-io2-s1 kernel: Call Trace:
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc103d885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc11b7728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf522f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 04 10:54:54 oak-io2-s1 kernel: Pid: 275453, comm: ll_ost_io01_122 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
Apr 04 10:54:54 oak-io2-s1 kernel: Call Trace:
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc103d885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc11b7728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf522f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 04 10:54:54 oak-io2-s1 kernel: Pid: 275451, comm: ll_ost_io01_121 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
Apr 04 10:54:54 oak-io2-s1 kernel: Call Trace:
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc103d885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc11b7728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf522f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:54:54 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:54:55 oak-io2-s1 kernel:  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
Apr 04 10:54:55 oak-io2-s1 kernel: LNet: Service thread pid 275449 was inactive for 200.86s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Apr 04 10:54:55 oak-io2-s1 kernel: LustreError: dumping log to /tmp/lustre-log.1554400495.275368
Apr 04 10:54:55 oak-io2-s1 kernel: LNet: Service thread pid 275438 was inactive for 200.22s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Apr 04 10:54:55 oak-io2-s1 kernel: LNet: Skipped 97 previous similar messages
Apr 04 10:54:56 oak-io2-s1 kernel: LustreError: dumping log to /tmp/lustre-log.1554400496.274598
Apr 04 10:54:56 oak-io2-s1 kernel: LNet: Service thread pid 274540 was inactive for 200.76s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Apr 04 10:54:56 oak-io2-s1 kernel: LNet: Skipped 8 previous similar messages
Apr 04 10:54:58 oak-io2-s1 kernel: LNet: Service thread pid 275311 was inactive for 200.31s. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
Apr 04 10:54:58 oak-io2-s1 kernel: LustreError: dumping log to /tmp/lustre-log.1554400498.275311
Apr 04 10:55:16 oak-io2-s1 kernel: INFO: task ll_ost_io00_000:264820 blocked for more than 120 seconds.
Apr 04 10:55:16 oak-io2-s1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Apr 04 10:55:16 oak-io2-s1 kernel: ll_ost_io00_000 D ffff883f18a27988     0 264820      2 0x00000080
Apr 04 10:55:16 oak-io2-s1 kernel:  ffff883f18a27820 0000000000000046 ffff883f1339cf10 ffff883f18a27fd8
Apr 04 10:55:16 oak-io2-s1 kernel:  ffff883f18a27fd8 ffff883f18a27fd8 ffff883f1339cf10 ffff881ffd3d6cc0
Apr 04 10:55:16 oak-io2-s1 kernel:  0000000000000000 7fffffffffffffff ffffffff816a7610 ffff883f18a27988
Apr 04 10:55:16 oak-io2-s1 kernel: Call Trace:
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7610&amp;gt;] ? bit_wait+0x50/0x50
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a6ff9&amp;gt;] schedule_timeout+0x239/0x2c0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc084ae97&amp;gt;] ? cfs_hash_bd_lookup_intent+0x57/0x160 [libcfs]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc03a1740&amp;gt;] ? mlx4_ib_post_send+0x510/0xb50 [mlx4_ib]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7610&amp;gt;] ? bit_wait+0x50/0x50
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8b6d&amp;gt;] io_schedule_timeout+0xad/0x130
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8c08&amp;gt;] io_schedule+0x18/0x20
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7621&amp;gt;] bit_wait_io+0x11/0x50
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a733f&amp;gt;] __wait_on_bit_lock+0x5f/0xc0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b19d0&amp;gt;] ? wake_bit_function+0x40/0x40
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc103d885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc11b7728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bb5a1d&amp;gt;] ? __req_capsule_get+0x15d/0x700 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf522f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff811de591&amp;gt;] ? __slab_free+0x81/0x2f0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b8d8d0&amp;gt;] ? lustre_msg_buf_v2+0x1b0/0x1b0 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0949549&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bc8196&amp;gt;] ? null_alloc_rs+0x176/0x330 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9004f&amp;gt;] ? lustre_pack_reply_v2+0x14f/0x280 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b901ef&amp;gt;] ? lustre_pack_reply_flags+0x6f/0x1e0 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b90371&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bf3115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9bdd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b98408&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9f512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8f00&amp;gt;] ? __schedule+0x2f0/0x8b0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9ea80&amp;gt;] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
Apr 04 10:55:16 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
Apr 04 10:55:16 oak-io2-s1 kernel: INFO: task ll_ost_io00_002:264822 blocked for more than 120 seconds.
Apr 04 10:55:16 oak-io2-s1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
Apr 04 10:55:17 oak-io2-s1 kernel: ll_ost_io00_002 D ffff883f11f73988     0 264822      2 0x00000080
Apr 04 10:55:17 oak-io2-s1 kernel:  ffff883f11f73820 0000000000000046 ffff883fb4ef0fd0 ffff883f11f73fd8
Apr 04 10:55:17 oak-io2-s1 kernel:  ffff883f11f73fd8 ffff883f11f73fd8 ffff883fb4ef0fd0 ffff881ffcf96cc0
Apr 04 10:55:17 oak-io2-s1 kernel:  0000000000000000 7fffffffffffffff ffffffff816a7610 ffff883f11f73988
Apr 04 10:55:17 oak-io2-s1 kernel: Call Trace:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="245259" author="sthiell" created="Thu, 4 Apr 2019 19:03:37 +0000"  >&lt;p&gt;Same issue with 2.10.6. Not sure what to do, perhaps this is unrelated to the recent upgrade after all.&lt;br/&gt;
&#160;&lt;br/&gt;
On the Lnet routers, I can see this from time to time, this is between routers and clients:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 04 11:45:37 sh-rtr-oak-1-1.int kernel: LNet: 9232:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) PUT_NACK from 10.8.2.18@o2ib6
Apr 04 11:45:37 sh-rtr-oak-1-1.int kernel: LNet: 9232:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) Skipped 35 previous similar messages
Apr 04 11:47:59 sh-rtr-oak-1-1.int kernel: Lustre: DEBUG MARKER: Thu Apr 4 11:47:59 2019
Apr 04 11:55:37 sh-rtr-oak-1-1.int kernel: LNet: 9230:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) PUT_NACK from 10.8.7.27@o2ib6
Apr 04 11:55:37 sh-rtr-oak-1-1.int kernel: LNet: 9230:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) Skipped 42 previous similar messages

Apr 04 11:44:19 sh-rtr-oak-1-2.int kernel: LNet: 87573:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) PUT_NACK from 10.8.17.10@o2ib6
Apr 04 11:44:19 sh-rtr-oak-1-2.int kernel: LNet: 87573:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) Skipped 24 previous similar messages
Apr 04 11:47:18 sh-rtr-oak-1-2.int kernel: Lustre: DEBUG MARKER: Thu Apr  4 11:47:18 2019
Apr 04 11:54:32 sh-rtr-oak-1-2.int kernel: LNet: 87574:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) PUT_NACK from 10.8.17.10@o2ib6
Apr 04 11:54:32 sh-rtr-oak-1-2.int kernel: LNet: 87574:0:(o2iblnd_cb.c:408:kiblnd_handle_rx()) Skipped 37 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ll check the IB fabric of the cluster now.&lt;/p&gt;

&lt;p&gt;All clients and routers are 2.12.0. Routers and some of the clients (only ~300 or 20%) have the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; title=&quot;Client got evicted when  lock callback timer expired  on OSS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12065&quot;&gt;&lt;del&gt;LU-12065&lt;/del&gt;&lt;/a&gt; lnd: increase CQ entries.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="245268" author="sthiell" created="Thu, 4 Apr 2019 22:17:15 +0000"  >&lt;p&gt;We restarted all Oak servers and routers (not clients), and soon after everything was back up, I still got this call traces on the OSS.... any ideas Patrick?? I found a single IB link in the cluster fabric that had SymbolError and we fixed it. So IB should be okay...&lt;/p&gt;

&lt;p&gt;This is a 2.10.7 servers, clients o2ib4 and o2ib6 are running 2.12.0.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;oak-io2-s1: Apr 04 15:12:00 oak-io2-s1 kernel: Lustre: Skipped 2 previous similar messages
oak-io2-s1: Apr 04 15:12:02 oak-io2-s1 kernel: LustreError: 132873:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk WRITE after 100+0s  req@ffff881f65e36850 x1629070194604080/t0(0) o4-&amp;gt;526957d3-630b-5bfe-a257-c024f120aeda@10.9.112.3@o2ib4:527/0 lens 3896/448 e 0 to 0 dl 1554416422 ref 1 fl Interpret:/2/0 rc 
oak-io2-s1: Apr 04 15:12:02 oak-io2-s1 kernel: LustreError: 132873:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 19 previous similar messages
oak-io2-s1: Apr 04 15:12:02 oak-io2-s1 kernel: Lustre: oak-OST0034: Bulk IO write error with 526957d3-630b-5bfe-a257-c024f120aeda (at 10.9.112.3@o2ib4), client will retry: rc = -110
oak-io2-s1: Apr 04 15:12:02 oak-io2-s1 kernel: Lustre: Skipped 13 previous similar messages
oak-io2-s1: Apr 04 15:12:05 oak-io2-s1 kernel: Lustre: oak-OST003a: Bulk IO read error with 485eaf5b-5ae8-c5f5-c435-15983c7561ae (at 10.8.0.68@o2ib6), client will retry: rc -110
oak-io2-s1: Apr 04 15:12:05 oak-io2-s1 kernel: Lustre: Skipped 13 previous similar messages
oak-io2-s1: Apr 04 15:12:43 oak-io2-s1 kernel: LustreError: 132881:0:(tgt_grant.c:738:tgt_grant_check()) oak-OST0030: cli 07b089ce-55cb-8869-f5ce-92a5a6d9616f claims 4218880 GRANT, real grant 0
oak-io2-s1: Apr 04 15:13:04 oak-io2-s1 kernel: LustreError: 132553:0:(tgt_grant.c:738:tgt_grant_check()) oak-OST0030: cli 07b089ce-55cb-8869-f5ce-92a5a6d9616f claims 4071424 GRANT, real grant 0
oak-io2-s1: Apr 04 15:13:04 oak-io2-s1 kernel: LustreError: 132553:0:(tgt_grant.c:738:tgt_grant_check()) Skipped 4 previous similar messages
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel: INFO: task ll_ost_io00_000:99182 blocked for more than 120 seconds.
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel: &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel: ll_ost_io00_000 D ffff881f1714b988     0 99182      2 0x00000080
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  ffff881f1714b820 0000000000000046 ffff881f18c66eb0 ffff881f1714bfd8
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  ffff881f1714bfd8 ffff881f1714bfd8 ffff881f18c66eb0 ffff881ffd096cc0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  0000000000000000 7fffffffffffffff ffffffff816a7610 ffff881f1714b988
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel: Call Trace:
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7610&amp;gt;] ? bit_wait+0x50/0x50
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a6ff9&amp;gt;] schedule_timeout+0x239/0x2c0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff811de591&amp;gt;] ? __slab_free+0x81/0x2f0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7610&amp;gt;] ? bit_wait+0x50/0x50
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8b6d&amp;gt;] io_schedule_timeout+0xad/0x130
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8c08&amp;gt;] io_schedule+0x18/0x20
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a7621&amp;gt;] bit_wait_io+0x11/0x50
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a733f&amp;gt;] __wait_on_bit_lock+0x5f/0xc0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b19d0&amp;gt;] ? wake_bit_function+0x40/0x40
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc1042885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc11bc728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bc2a1d&amp;gt;] ? __req_capsule_get+0x15d/0x700 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810cb62c&amp;gt;] ? set_next_entity+0x3c/0xe0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0c0222f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0956549&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bd5196&amp;gt;] ? null_alloc_rs+0x176/0x330 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9d04f&amp;gt;] ? lustre_pack_reply_v2+0x14f/0x280 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9d1ef&amp;gt;] ? lustre_pack_reply_flags+0x6f/0x1e0 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0b9d371&amp;gt;] ? lustre_pack_reply+0x11/0x20 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0c00115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0ba8dd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0ba5408&amp;gt;] ? ptlrpc_wait_event+0x98/0x340 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810c4810&amp;gt;] ? wake_up_state+0x20/0x20
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0bac512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816a8f00&amp;gt;] ? __schedule+0x2f0/0x8b0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffffc0baba80&amp;gt;] ? ptlrpc_register_service+0xe30/0xe30 [ptlrpc]
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel:  [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
oak-io2-s1: Apr 04 15:13:18 oak-io2-s1 kernel: INFO: task ll_ost_io00_001:99183 blocked for more than 120 seconds.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="245269" author="sthiell" created="Thu, 4 Apr 2019 22:40:06 +0000"  >&lt;p&gt;A note about &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; title=&quot;Client got evicted when  lock callback timer expired  on OSS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12065&quot;&gt;&lt;del&gt;LU-12065&lt;/del&gt;&lt;/a&gt; lnd: increase CQ entries&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;10.9.112.3@o2ib4&lt;/tt&gt; in the logs from my previous comment is one of the clients that has 2.12.0 + the patch for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; title=&quot;Client got evicted when  lock callback timer expired  on OSS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12065&quot;&gt;&lt;del&gt;LU-12065&lt;/del&gt;&lt;/a&gt; (lnd: increase CQ entries).&lt;/p&gt;

&lt;p&gt;All Oak servers are now back to 2.10.7 so they have it too.&lt;/p&gt;

&lt;p&gt;The LNet routers between Oak (o2ib5) and Sherlock (o2ib4/6) also have the patch on top of 2.12.0.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;so at least we know that this specific patch doesn&apos;t fix this issue&lt;/p&gt;</comment>
                            <comment id="245274" author="sthiell" created="Fri, 5 Apr 2019 05:34:28 +0000"  >&lt;p&gt;Hi - we have made some progress but not resolved the issue yet.&lt;/p&gt;

&lt;p&gt;Oak has 3 IO cells, each with 2 x OSS, and only a single IO cell is impacted by this problem, IOCell #2. They all run 2.10.7 now with our well-known kernel; again, only IOCell #2 exhibits the issues, on both redundant servers oak-io2-s1 and oak-io2-s2. All Oak&apos;s OSS use the same LNet routers. So if it&apos;s only localized on one IO cell, it could be the backend. But there is no indication of an issue. And today we power-cycled the full IO cell (rack). Still, what makes me think there is a problem in the backend more than the network is these backtraces:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 7874.415621] Lustre: 272943:0:(service.c:1346:ptlrpc_at_send_early_reply()) @@@ Couldn&apos;t add any time (5/5), not sending early reply
  req@ffff883ad89b0450 x1629296681700080/t0(0) o3-&amp;gt;f060da2b-47c0-9c4a-3590-7c18d2889f30@10.9.113.6@o2ib4:358/0 lens 488/432 e 0 to 0 dl 1554441168 ref 2 fl Interpret:/2/0 rc 0/0
[ 7874.447638] Lustre: 272943:0:(service.c:1346:ptlrpc_at_send_early_reply()) Skipped 73 previous similar messages
[ 7879.489456] LustreError: 272844:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk READ after 89+0s  req@ffff883ad89b2c50 x1628590488062352/t0(0) o3-&amp;gt;0ef862b6-a2d6-21a8-44a8-9efaf84bbf69@10.9.103.11@o2ib4:358/0 lens 488/432 e 0 to 0 dl 1554441168 ref 1 fl Interpret:/2/0 rc 0/0
[ 7879.517403] LustreError: 272844:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 242 previous similar messages
[ 7879.528213] Lustre: oak-OST0048: Bulk IO read error with 0ef862b6-a2d6-21a8-44a8-9efaf84bbf69 (at 10.9.103.11@o2ib4), client will retry: rc -110
[ 7879.542676] Lustre: Skipped 141 previous similar messages
[ 7885.530499] LNet: Service thread pid 274929 completed after 1762.63s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
[ 7885.548754] LNet: Skipped 29 previous similar messages
[ 7973.018750] Lustre: oak-OST0036: Bulk IO write error with f1e4ff8b-e962-43e4-68cf-4383c7c66071 (at 10.9.104.25@o2ib4), client will retry: rc = -110
[ 7973.033510] Lustre: Skipped 135 previous similar messages
[ 7992.876118] Pid: 272883, comm: ll_ost_io00_013 3.10.0-693.2.2.el7_lustre.pl3.x86_64 #1 SMP Thu Mar 15 13:06:45 PDT 2018
[ 7992.888157] Call Trace:
[ 7992.890898]  [&amp;lt;ffffffff81182094&amp;gt;] __lock_page+0x74/0x90
[ 7992.896748]  [&amp;lt;ffffffff81182a34&amp;gt;] __find_lock_page+0x54/0x70
[ 7992.903076]  [&amp;lt;ffffffff81183614&amp;gt;] find_or_create_page+0x34/0xa0
[ 7992.909695]  [&amp;lt;ffffffffc1018885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[ 7992.917201]  [&amp;lt;ffffffffc1192728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]
[ 7992.923832]  [&amp;lt;ffffffffc0c5322f&amp;gt;] tgt_brw_read+0x96f/0x1850 [ptlrpc]
[ 7992.931008]  [&amp;lt;ffffffffc0c51115&amp;gt;] tgt_request_handle+0x925/0x1370 [ptlrpc]
[ 7992.938727]  [&amp;lt;ffffffffc0bf9dd6&amp;gt;] ptlrpc_server_handle_request+0x236/0xa90 [ptlrpc]
[ 7992.947312]  [&amp;lt;ffffffffc0bfd512&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[ 7992.954345]  [&amp;lt;ffffffff810b098f&amp;gt;] kthread+0xcf/0xe0
[ 7992.959795]  [&amp;lt;ffffffff816b4f58&amp;gt;] ret_from_fork+0x58/0x90
[ 7992.965835]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;as it is in osd_ldiskfs. Slow backend leading to slow response leading to IO timeout, am I right to think this?&lt;/p&gt;

&lt;p&gt;It would be easy if the error came from a single OST, as in that case it could be a slow drive, volume or something like that, but no, it doesn&apos;t just come from a single OST, as shown by lctl dk:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-io2-s2 ~]# lctl dk | grep Bulk
00000020:02000400:21.0:1554442186.572762:0:274935:0:(tgt_handler.c:2046:tgt_brw_read()) oak-OST003f: Bulk IO read error with bf2139cf-7e53-4025-4e6e-f6f1d7d97a02 (at 10.9.101.50@o2ib4), client will retry: rc -110
00000020:02000400:8.0:1554442187.507754:0:274982:0:(tgt_handler.c:2046:tgt_brw_read()) oak-OST0048: Bulk IO read error with 0ef862b6-a2d6-21a8-44a8-9efaf84bbf69 (at 10.9.103.11@o2ib4), client will retry: rc -110
00000020:02000400:3.0:1554442187.551764:0:273000:0:(tgt_handler.c:2046:tgt_brw_read()) oak-OST0048: Bulk IO read error with 99d2fc1e-a85b-fa24-ca8b-03d73ee86475 (at 10.9.112.8@o2ib4), client will retry: rc -110
00000020:02000400:31.0:1554442187.703820:0:274997:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST003e: Bulk IO write error with 5588751e-34a5-8e15-f371-63b5b81517bb (at 10.9.101.20@o2ib4), client will retry: rc = -110
00000020:02000400:15.0:1554442187.709735:0:275042:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0042: Bulk IO write error with 51bb6bf0-2ddf-6781-f22c-360735045eba (at 10.9.107.21@o2ib4), client will retry: rc = -110
00000020:02000400:11.0:1554442187.709742:0:273003:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0036: Bulk IO write error with f1e4ff8b-e962-43e4-68cf-4383c7c66071 (at 10.9.104.25@o2ib4), client will retry: rc = -110
00000020:02000400:21.0:1554442187.713740:0:275045:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0052: Bulk IO write error with 526957d3-630b-5bfe-a257-c024f120aeda (at 10.9.112.3@o2ib4), client will retry: rc = -110
00000020:02000400:11.0:1554442187.728733:0:272998:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0030: Bulk IO write error with 526957d3-630b-5bfe-a257-c024f120aeda (at 10.9.112.3@o2ib4), client will retry: rc = -110
00000020:02000400:15.0:1554442187.816743:0:272950:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST005c: Bulk IO write error with ed020952-e5b6-7729-7a78-b61d9e90b696 (at 10.9.105.32@o2ib4), client will retry: rc = -110
00000020:02000400:9.0:1554442188.093753:0:272924:0:(tgt_handler.c:2046:tgt_brw_read()) oak-OST0033: Bulk IO read error with ace149ff-a4e3-8eeb-f9dc-177b4b8f7f6d (at 10.9.106.25@o2ib4), client will retry: rc -110
00000020:02000400:23.0:1554442188.207750:0:274976:0:(tgt_handler.c:2046:tgt_brw_read()) oak-OST005b: Bulk IO read error with 8bab6b54-7a61-a87c-68e6-da0d28bb25d6 (at 10.9.106.1@o2ib4), client will retry: rc -110
00000020:02000400:17.0:1554442188.230752:0:273030:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0048: Bulk IO write error with b312f397-6359-6d9f-b796-be326bf43e45 (at 10.9.101.29@o2ib4), client will retry: rc = -110
00000020:02000400:5.0:1554442188.259746:0:275020:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0054: Bulk IO write error with d8bbb1fe-48fb-3725-9c52-55f23effddce (at 10.9.108.24@o2ib4), client will retry: rc = -110
00000020:02000400:19.0:1554442188.260735:0:275002:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0054: Bulk IO write error with d8bbb1fe-48fb-3725-9c52-55f23effddce (at 10.9.108.24@o2ib4), client will retry: rc = -110
00000020:02000400:24.0:1554442206.739748:0:272887:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0055: Bulk IO write error with d8bbb1fe-48fb-3725-9c52-55f23effddce (at 10.9.108.24@o2ib4), client will retry: rc = -110
00000020:02000400:1.0:1554442206.739750:0:275047:0:(tgt_handler.c:2347:tgt_brw_write()) oak-OST0055: Bulk IO write error with d8bbb1fe-48fb-3725-9c52-55f23effddce (at 10.9.108.24@o2ib4), client will retry: rc = -110
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;For the night, we have set max_create_count=0 to avoid new files being allocated on these OSTs (the 48 of the cell). Any ideas would be greatly appreciated.&lt;/p&gt;</comment>
                            <comment id="245283" author="sthiell" created="Fri, 5 Apr 2019 13:25:24 +0000"  >&lt;p&gt;I think we found the issue. There is some kind of incompatibility in the 2.10 recovery for 2.12 clients. It leads to a deadlock as shown in my logs ( I attached a second dump of sysrq t last night also). After remounting all 2.10.7 OSTs doing Bulk IO errors using -o abort_recov (GRR I really don&apos;t like that, it killed a lot of jobs), things are back to normal!! I think! Logs are all super clean. And we are in 2.10.7 now at least!!&lt;/p&gt;</comment>
                            <comment id="245291" author="bzzz" created="Fri, 5 Apr 2019 15:05:47 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt; this looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12018&quot; title=&quot;deadlock on OSS: quota reintegration vs memory release&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12018&quot;&gt;&lt;del&gt;LU-12018&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="245293" author="bzzz" created="Fri, 5 Apr 2019 15:13:18 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Call Trace:
 [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
 [&amp;lt;ffffffff816a6ff9&amp;gt;] schedule_timeout+0x239/0x2c0
 [&amp;lt;ffffffff810c9ef9&amp;gt;] ? select_task_rq_fair+0x549/0x700
 [&amp;lt;ffffffff816a989d&amp;gt;] wait_for_completion+0xfd/0x140
 [&amp;lt;ffffffff810c4810&amp;gt;] ? wake_up_state+0x20/0x20
 [&amp;lt;ffffffff810b07ea&amp;gt;] kthread_create_on_node+0xaa/0x140
 [&amp;lt;ffffffffc0f92110&amp;gt;] ? qsd_reconciliation+0xa90/0xa90 [lquota]
 [&amp;lt;ffffffffc09f4549&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
 [&amp;lt;ffffffffc0f938f8&amp;gt;] qsd_start_reint_thread+0x858/0xc10 [lquota]
 [&amp;lt;ffffffff81267292&amp;gt;] ? dqput+0x162/0x1e0
 [&amp;lt;ffffffffc0f97781&amp;gt;] qsd_ready+0x231/0x3c0 [lquota]
 [&amp;lt;ffffffffc0f9a6d2&amp;gt;] qsd_adjust+0xa2/0x890 [lquota]
 [&amp;lt;ffffffffc0f8cbfa&amp;gt;] ? qsd_refresh_usage+0x6a/0x2b0 [lquota]
 [&amp;lt;ffffffffc0f9ba80&amp;gt;] qsd_op_adjust+0x4e0/0x730 [lquota]
 [&amp;lt;ffffffffc0ff5d80&amp;gt;] osd_object_delete+0x230/0x330 [osd_ldiskfs]
 [&amp;lt;ffffffffc0a106cd&amp;gt;] lu_object_free.isra.31+0x9d/0x1a0 [obdclass]
 [&amp;lt;ffffffffc0a1132e&amp;gt;] lu_site_purge_objects+0x2fe/0x520 [obdclass]
 [&amp;lt;ffffffffc0a12179&amp;gt;] lu_cache_shrink+0x259/0x2d0 [obdclass]
 [&amp;lt;ffffffff81195413&amp;gt;] shrink_slab+0x163/0x330
 [&amp;lt;ffffffff811f7521&amp;gt;] ? vmpressure+0x61/0x90
 [&amp;lt;ffffffff81198091&amp;gt;] zone_reclaim+0x1d1/0x2f0
 [&amp;lt;ffffffff8118c264&amp;gt;] get_page_from_freelist+0x2c4/0x9e0
 [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
 [&amp;lt;ffffffff8118caf6&amp;gt;] __alloc_pages_nodemask+0x176/0x420
 [&amp;lt;ffffffff811d1108&amp;gt;] alloc_pages_current+0x98/0x110
 [&amp;lt;ffffffff81182917&amp;gt;] __page_cache_alloc+0x97/0xb0
 [&amp;lt;ffffffff81183625&amp;gt;] find_or_create_page+0x45/0xa0
 [&amp;lt;ffffffffc1018885&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
 [&amp;lt;ffffffffc1192728&amp;gt;] ofd_preprw+0x6a8/0x1150 [ofd]

Call Trace:
 [&amp;lt;ffffffff816a94e9&amp;gt;] schedule+0x29/0x70
 [&amp;lt;ffffffff816aab1d&amp;gt;] rwsem_down_read_failed+0x10d/0x1a0
 [&amp;lt;ffffffff8121a554&amp;gt;] ? shrink_dentry_list+0x274/0x490
 [&amp;lt;ffffffff81332038&amp;gt;] call_rwsem_down_read_failed+0x18/0x30
 [&amp;lt;ffffffff816a8780&amp;gt;] down_read+0x20/0x40
 [&amp;lt;ffffffffc0a11f8a&amp;gt;] lu_cache_shrink+0x6a/0x2d0 [obdclass]
 [&amp;lt;ffffffff81195359&amp;gt;] shrink_slab+0xa9/0x330
 [&amp;lt;ffffffff8118861f&amp;gt;] ? zone_watermark_ok+0x1f/0x30
 [&amp;lt;ffffffff811a8853&amp;gt;] ? compaction_suitable+0xa3/0xb0
 [&amp;lt;ffffffff81198091&amp;gt;] zone_reclaim+0x1d1/0x2f0
 [&amp;lt;ffffffff8118c264&amp;gt;] get_page_from_freelist+0x2c4/0x9e0
 [&amp;lt;ffffffff81085e1b&amp;gt;] ? copy_process+0xeeb/0x1970
 [&amp;lt;ffffffff8118caf6&amp;gt;] __alloc_pages_nodemask+0x176/0x420
 [&amp;lt;ffffffff810b08c0&amp;gt;] ? insert_kthread_work+0x40/0x40
 [&amp;lt;ffffffff8108510d&amp;gt;] copy_process+0x1dd/0x1970
 [&amp;lt;ffffffff81029557&amp;gt;] ? __switch_to+0xd7/0x510
 [&amp;lt;ffffffff81086a51&amp;gt;] do_fork+0x91/0x320
 [&amp;lt;ffffffff816a8fad&amp;gt;] ? __schedule+0x39d/0x8b0
 [&amp;lt;ffffffff81086d06&amp;gt;] kernel_thread+0x26/0x30
 [&amp;lt;ffffffff810b1341&amp;gt;] kthreadd+0x2c1/0x300
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="245294" author="sthiell" created="Fri, 5 Apr 2019 16:11:50 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=bzzz&quot; class=&quot;user-hover&quot; rel=&quot;bzzz&quot;&gt;bzzz&lt;/a&gt;&#160;Thanks!!! That must be it! I noticed the same symptoms as described in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12018&quot; title=&quot;deadlock on OSS: quota reintegration vs memory release&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12018&quot;&gt;&lt;del&gt;LU-12018&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Would you be able to port the patch to b2_10?&lt;/p&gt;</comment>
                            <comment id="245295" author="pfarrell" created="Fri, 5 Apr 2019 16:18:40 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;The patch shouldn&apos;t require porting.&#160; You can just apply it to the 2.10.7 source without issue.&lt;/p&gt;</comment>
                            <comment id="245296" author="sthiell" created="Fri, 5 Apr 2019 16:38:29 +0000"  >&lt;p&gt;Ok thanks, that&apos;s great. It is very short indeed!&lt;br/&gt;
I&apos;m going to prepare a patched 2.10.7 for that then.&lt;/p&gt;</comment>
                            <comment id="245331" author="pjones" created="Sat, 6 Apr 2019 03:20:56 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;How soon will you expect to know whether this patch has helped? What is puzzling to me is that this issue has been seen (albeit quite rarely) as far back as 2.7.x releases, so it&apos;s puzzling why it just suddenly started happening frequently after upgrading to 2.10.7...&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="245333" author="sthiell" created="Sat, 6 Apr 2019 05:23:06 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=pfarrell&quot; class=&quot;user-hover&quot; rel=&quot;pfarrell&quot;&gt;pfarrell&lt;/a&gt; I confirm the patch does successfully apply on top of 2.10.7 and I was able to build new server rpms for Oak. Thanks!&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=pjones&quot; class=&quot;user-hover&quot; rel=&quot;pjones&quot;&gt;pjones&lt;/a&gt; I think it&apos;s important to clarify what exactly happened and the title of this issue probably must be changed.&lt;/p&gt;

&lt;p&gt;The deadlock happened repeatedly on two servers serving a set of OSTs in IO Cell #2. But it&apos;s critical to highlight two things:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Every time that the OST recovery finished on those servers (always successfully against about 1400 clients - that stayed up during the whole time), we shortly hit this deadlock issue. I definitely think that we hit a recovery scenario that triggered this same issue each time (and that&apos;s exactly what the recovery is for... replaying transactions) - aborting the recovery on those OSTs just broke the cycle and we were finally able to resume operation.&lt;/li&gt;
	&lt;li&gt;Last night I tried to downgrade to Lustre 2.10.6, 2.10.5 and even 2.10.3+patch! (an old version that we have run for quite some time), and EVERY TIME the same issue happened too. This is definitely not a 2.10.7-specific issue. Please feel free to change the title accordingly.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;As of what really triggered the issue, I was thinking perhaps it&apos;s due to the upgrade of Sherlock (our lustre clients) to 2.12 recently, and because we haven&apos;t restarted Oak since then we hit this now while we upgraded Oak... Do you know if &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12018&quot; title=&quot;deadlock on OSS: quota reintegration vs memory release&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12018&quot;&gt;&lt;del&gt;LU-12018&lt;/del&gt;&lt;/a&gt; occurred also with 2.12 clients? That would be interesting to check.&lt;/p&gt;

&lt;p&gt;I would also like to mention that when this issue occurred it was extremely difficult to diagnose at first. Oak was down for about 18 hours and we spent most of the time looking for wrong causes, like network or backend issues, because Lustre was reporting these timeout errors or ldiskfs watchdogs because of the deadlock. But I&apos;m glad this issue was already found and special thanks to Alex for pointing us to the right patch (at least, we hope so, but it looks like it very much).&lt;/p&gt;

&lt;p&gt;Finally, to answer your question, I have the updated rpms ready to be deployed on Oak servers but the plan is to not touch anything right now as everything is working again as expected ( both 2.10 on Oak and 2.12 on Fir ). We&apos;ll patch only if an issue occurs or at the next maintenance (TBD).&lt;/p&gt;</comment>
                            <comment id="245626" author="elwell" created="Fri, 12 Apr 2019 03:22:54 +0000"  >&lt;p&gt;I think we&apos;re seeing the same issue at Pawsey following an upgrade of our astrofs servers the other week:&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@astrofs-oss3 ~&amp;#93;&lt;/span&gt;# rpm -qa | grep lustre | sort&lt;br/&gt;
kernel-3.10.0-862.9.1.el7_lustre.x86_64&lt;br/&gt;
kernel-3.10.0-957.el7_lustre.x86_64&lt;br/&gt;
kmod-lustre-2.10.6-1.el7.x86_64&lt;br/&gt;
kmod-lustre-osd-ldiskfs-2.10.6-1.el7.x86_64&lt;br/&gt;
lustre-2.10.6-1.el7.x86_64&lt;br/&gt;
lustre-osd-ldiskfs-mount-2.10.6-1.el7.x86_64&lt;br/&gt;
lustre-resource-agents-2.10.6-1.el7.x86_64&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@astrofs-oss3 ~&amp;#93;&lt;/span&gt;# uname -r&lt;br/&gt;
3.10.0-957.el7_lustre.x86_64&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;root@astrofs-oss3 ~&amp;#93;&lt;/span&gt;#&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;and it&apos;s giving the same call trace that Stephane originally reported:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Fri Apr 12 10:35:31 2019] LustreError: dumping log to /tmp/lustre-log.1555036556.16340
[Fri Apr 12 10:35:31 2019] Pid: 16367, comm: ll_ost_io02_074 3.10.0-957.el7_lustre.x86_64 #1 SMP Wed Dec 12 15:03:08 UTC 2018
[Fri Apr 12 10:35:31 2019] Call Trace:
[Fri Apr 12 10:35:31 2019]  [&amp;lt;ffffffffb51b58d4&amp;gt;] __lock_page+0x74/0x90
[Fri Apr 12 10:35:31 2019]  [&amp;lt;ffffffffb51b65d4&amp;gt;] __find_lock_page+0x54/0x70
[Fri Apr 12 10:35:31 2019]  [&amp;lt;ffffffffb51b7244&amp;gt;] find_or_create_page+0x34/0xa0
[Fri Apr 12 10:35:31 2019]  [&amp;lt;ffffffffc1018755&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[Fri Apr 12 10:35:31 2019]  [&amp;lt;ffffffffc115d5d2&amp;gt;] ofd_preprw_write.isra.30+0x202/0xda0 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115e592&amp;gt;] ofd_preprw+0x422/0x1170 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5706a&amp;gt;] tgt_brw_write+0xc3a/0x17d0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5342a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cfbe5b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cff5a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb50c1c31&amp;gt;] kthread+0xd1/0xe0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb5774c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Fri Apr 12 10:35:32 2019] Pid: 16256, comm: ll_ost_io02_020 3.10.0-957.el7_lustre.x86_64 #1 SMP Wed Dec 12 15:03:08 UTC 2018
[Fri Apr 12 10:35:32 2019] Call Trace:
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b58d4&amp;gt;] __lock_page+0x74/0x90
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b65d4&amp;gt;] __find_lock_page+0x54/0x70
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b7244&amp;gt;] find_or_create_page+0x34/0xa0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc1018755&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115d5d2&amp;gt;] ofd_preprw_write.isra.30+0x202/0xda0 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115e592&amp;gt;] ofd_preprw+0x422/0x1170 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5706a&amp;gt;] tgt_brw_write+0xc3a/0x17d0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5342a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cfbe5b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cff5a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb50c1c31&amp;gt;] kthread+0xd1/0xe0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb5774c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Fri Apr 12 10:35:32 2019] Pid: 15878, comm: ll_ost_io02_002 3.10.0-957.el7_lustre.x86_64 #1 SMP Wed Dec 12 15:03:08 UTC 2018
[Fri Apr 12 10:35:32 2019] Call Trace:
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b58d4&amp;gt;] __lock_page+0x74/0x90
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b65d4&amp;gt;] __find_lock_page+0x54/0x70
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b7244&amp;gt;] find_or_create_page+0x34/0xa0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc1018755&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115d5d2&amp;gt;] ofd_preprw_write.isra.30+0x202/0xda0 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115e592&amp;gt;] ofd_preprw+0x422/0x1170 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5706a&amp;gt;] tgt_brw_write+0xc3a/0x17d0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5342a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cfbe5b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cff5a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb50c1c31&amp;gt;] kthread+0xd1/0xe0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb5774c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Fri Apr 12 10:35:32 2019] Pid: 16302, comm: ll_ost_io02_037 3.10.0-957.el7_lustre.x86_64 #1 SMP Wed Dec 12 15:03:08 UTC 2018
[Fri Apr 12 10:35:32 2019] Call Trace:
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b58d4&amp;gt;] __lock_page+0x74/0x90
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b65d4&amp;gt;] __find_lock_page+0x54/0x70
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb51b7244&amp;gt;] find_or_create_page+0x34/0xa0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc1018755&amp;gt;] osd_bufs_get+0x235/0x430 [osd_ldiskfs]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115d5d2&amp;gt;] ofd_preprw_write.isra.30+0x202/0xda0 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc115e592&amp;gt;] ofd_preprw+0x422/0x1170 [ofd]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5706a&amp;gt;] tgt_brw_write+0xc3a/0x17d0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0d5342a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cfbe5b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffc0cff5a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb50c1c31&amp;gt;] kthread+0xd1/0xe0
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffb5774c1d&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[Fri Apr 12 10:35:32 2019]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[Fri Apr 12 10:35:32 2019] LNet: Service thread pid 16285 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 1201.11s. Watchdog stack traces are limited to 3 per 300 seconds, skipping &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; one.
[Fri Apr 12 10:35:32 2019] LNet: Skipped 34 previous similar messages &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;as well as a bunch of other Bulk IO read and write errors.&lt;/p&gt;

&lt;p&gt;For info, the storage for this is via ib_srp to a DDN array rather than the SAS arrays we&apos;re using on our other filesystems - so I wonder if it is timing related?&lt;/p&gt;

&lt;p&gt;and we&apos;re also seeing IO blocked&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[root@astrofs-oss3 ~]# vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 1 193      0 734084 3879288 120170720    0    0  4520  8360    2    1  0  1 67 32  0
 1 193      0 733936 3879288 120170720    0    0     0    16 3474 16751  0  0  0 100  0
 0 193      0 733944 3879288 120170720    0    0     0    20 3141 15211  0  0  0 100  0
 0 193      0 733944 3879288 120170720    0    0     0     8 2111 10145  0  0  0 100  0
 0 193      0 733944 3879288 120170720    0    0     0     8 2723 13012  0  0  0 100  0
 0 193      0 733944 3879288 120170720    0    0     0    12 2938 14461  0  0  0 100  0
 0 193      0 733944 3879288 120170720    0    0     0    16 4151 20537  0  0  0 100  0
 0 193      0 734156 3879288 120171072    0    0     0    20 4384 21279  0  0  0 100  0
^C&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="245645" author="sthiell" created="Fri, 12 Apr 2019 14:38:01 +0000"  >&lt;p&gt;Hi Andrew,&lt;br/&gt;
Thanks for your comment and sorry to hear that. This is interesting that it has happened shortly after a server upgrade for you also. Were you able to recover from this? If so how? Did you abort the recovery like we did?&lt;br/&gt;
And also, I&apos;m curious, are you using any 2.12 clients?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="54992">LU-12018</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="32388" name="oak-io1-s2-kern-and-sysrq-t.txt.gz" size="524935" author="sthiell" created="Thu, 4 Apr 2019 16:02:13 +0000"/>
                            <attachment id="32389" name="oak-io2-s2-20190404_sysrq_big.log" size="6228310" author="sthiell" created="Fri, 5 Apr 2019 06:11:09 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00ej3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>