<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:55:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12772] bulk timeout after 2.12.2 clients upgrade </title>
                <link>https://jira.whamcloud.com/browse/LU-12772</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>
&lt;p&gt;After upgrading to 2.12.2 clients and 2.10.8 servers we start to see large amounts of bulk io timeouts.&lt;/p&gt;

&lt;p&gt;client side&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Sep 16 16:36:59 r323i3n6 kernel: [1568677019.825837] Lustre: nbp2-OST0008-osc-ffff9ad887dc1800: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r585i7n2 kernel: [1568677021.115645] Lustre: nbp2-OST0008-osc-ffff90c85d05e000: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i0n3 kernel: [1568677021.371165] Lustre: nbp2-OST0094-osc-ffff976bea358800: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i4n9 kernel: [1568677021.578522] Lustre: nbp2-OST0094-osc-ffff9c68adf2d000: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; sea12000: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:36:59 r323i3n6 kernel: [1568677019.825837] Lustre: nbp2-OST0008-osc-ffff9ad887dc1800: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r585i7n2 kernel: [1568677021.115645] Lustre: nbp2-OST0008-osc-ffff90c85d05e000: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i0n3 kernel: [1568677021.371165] Lustre: nbp2-OST0094-osc-ffff976bea358800: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i4n9 kernel: [1568677021.578522] Lustre: nbp2-OST0094-osc-ffff9c68adf2d000: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; seConnection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:36:59 r323i3n6 kernel: [1568677019.825837] Lustre: nbp2-OST0008-osc-ffff9ad887dc1800: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r585i7n2 kernel: [1568677021.115645] Lustre: nbp2-OST0008-osc-ffff90c85d05e000: Connection to nbp2-OST0008 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i0n3 kernel: [1568677021.371165] Lustre: nbp2-OST0094-osc-ffff976bea358800: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 16 16:37:01 r311i4n9 kernel: [1568677021.578522] Lustre: nbp2-OST0094-osc-ffff9c68adf2d000: Connection to nbp2-OST0094 (at 10.151.26.105@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; se&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;server side&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[90158.366440] LustreError: 30777:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk WRITE after 300+0s  req@ffff8affe46eb450 x1644657589074304/t0(0) o4-&amp;gt;825a80a7-da45-880e-35d1-4a750d2cf7f0@10.151.16.212@o2ib:502/0 lens 2168/448 e 0 to 0 dl 1568676837 ref 1 fl Interpret:/2/0 rc 0/0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>our github&lt;br/&gt;
&lt;a href=&quot;https://github.com/jlan/lustre-nas&quot;&gt;https://github.com/jlan/lustre-nas&lt;/a&gt;</environment>
        <key id="56924">LU-12772</key>
            <summary>bulk timeout after 2.12.2 clients upgrade </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Mon, 16 Sep 2019 23:38:43 +0000</created>
                <updated>Wed, 15 Jan 2020 23:22:43 +0000</updated>
                            <resolved>Sat, 19 Oct 2019 03:41:33 +0000</resolved>
                                    <version>Lustre 2.12.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="254849" author="mhanafi" created="Tue, 17 Sep 2019 04:20:15 +0000"  >&lt;p&gt;This looks to be an issue with lustre 2.10.8 servers. 2.12.2 clients work fine with 2.10.6 servers.&lt;/p&gt;</comment>
                            <comment id="254910" author="adilger" created="Tue, 17 Sep 2019 18:20:44 +0000"  >&lt;p&gt;Are OST0008 and OST0094 on the same OSS node?  Are the problems specific to a single client or OSS, or does any client running 2.12.2 with 2.10.8 servers have this problem?&lt;/p&gt;

&lt;p&gt;The root of the problem appears that the OSS never gets an RDMA completion during the bulk IO, so my first guess would be the network, but that doesn&apos;t match with the symptoms only appearing with 2.12.2 clients. Are you running the same OFED versions on these different Lustre versions?&lt;/p&gt;</comment>
                            <comment id="254912" author="sthiell" created="Tue, 17 Sep 2019 18:34:15 +0000"  >&lt;p&gt;Just wanted to comment on this one as we&apos;re seeing that too at Stanford, especially on Oak with 2.10.8 servers and 2.12.2++ clients, MOFED 4.5. Much less frequent on Fir with 2.12.2++ servers. A workaround is to reboot the clients, one by one, until all timeout messages stop.&lt;/p&gt;

&lt;p&gt;client 10.9.101.58@o2ib4 (2.12.2_116+3):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Sep 17 11:14:28 sh-101-58.int kernel: Lustre: 86512:0:(client.c:2132:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1568743764/real 1568743764]  req@ffff8caf167eb000 x1643132546106464/t0(0) o4-&amp;gt;oak-OST0023-osc-ffff8cc35667a000@10.0.2.102@o2ib5:6/4 lens 544/448 e 4 to 1 dl 1568744066 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Sep 17 11:14:28 sh-101-58.int kernel: Lustre: 86512:0:(client.c:2132:ptlrpc_expire_one_request()) Skipped 35 previous similar messages
Sep 17 11:14:28 sh-101-58.int kernel: Lustre: oak-OST0023-osc-ffff8cc35667a000: Connection to oak-OST0023 (at 10.0.2.102@o2ib5) was lost; in progress operations using this service will wait for recovery to complete
Sep 17 11:14:28 sh-101-58.int kernel: Lustre: Skipped 40 previous similar messages
Sep 17 11:14:28 sh-101-58.int kernel: Lustre: oak-OST0023-osc-ffff8cc35667a000: Connection restored to 10.0.2.102@o2ib5 (at 10.0.2.102@o2ib5)
Sep 17 11:14:28 sh-101-58.int kernel: Lustre: Skipped 40 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;server 10.0.2.102@o2ib5 (2.10.8+3):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Sep 17 11:15:56 oak-io1-s2 kernel: Lustre: oak-OST0025: Client a7ab87c4-94ef-8b76-5604-89835c870ef0 (at 10.9.112.10@o2ib4) reconnecting
Sep 17 11:15:56 oak-io1-s2 kernel: Lustre: Skipped 353 previous similar messages
Sep 17 11:16:56 oak-io1-s2 kernel: LustreError: 190975:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk WRITE after 100+0s  req@ffff96efe88a8c50 x1643132546112848/t0(0) o4-&amp;gt;060757d0-4357-b07c-8904-58235d7c3370@10.9.101.58@o2ib4:157/0 lens 544/448 e 0 to 0 dl 1568744442 ref 1 fl Interpret:/2/0 rc 0/0
Sep 17 11:16:56 oak-io1-s2 kernel: LustreError: 190975:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 400 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="254914" author="mhanafi" created="Tue, 17 Sep 2019 18:39:25 +0000"  >&lt;p&gt;yes those are on the same OSS. &lt;br/&gt;
This definitely started after we upgraded to 2.10.8 servers. Something seems very broken in k2oib. So at this point we can rule out the clients. &lt;/p&gt;

&lt;p&gt;The bulk IO timeouts are across all the 2.10.8 OSS with large number of client mounts. We are going to revert back to 2.10.6 for now. &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[155621.295389] Lustre: nbp2-OST006e: Connection restored to 9cddaa10-5bcb-a7c6-f68d-4eba1a376000 (at 10.149.3.236@o2ib313)
[155621.295393] Lustre: Skipped 1719 previous similar messages
[155841.846788] LustreError: 14394:0:(events.c:449:server_bulk_callback()) event type 5, status -61, desc ffff8e4d40301c00
[155841.879081] LustreError: 14394:0:(events.c:449:server_bulk_callback()) event type 3, status -61, desc ffff8e4d40301c00
[155841.911375] LustreError: 30988:0:(ldlm_lib.c:3255:target_bulk_io()) @@@ network error on bulk WRITE  req@ffff8e5a047f3050 x1644657945812448/t0(0) o4-&amp;gt;a32eba78-7a3b-187c-ccab-7b9bb580d609@10.151.37.225@o2ib:740/0 lens 4680/448 e 0 to 0 dl 1568744270 ref 1 fl Interpret:/2/0 rc 0/0
[155841.985521] LustreError: 30988:0:(ldlm_lib.c:3255:target_bulk_io()) Skipped 1 previous similar message
[155921.301100] Lustre: nbp2-OST006e: Bulk IO write error with 9cddaa10-5bcb-a7c6-f68d-4eba1a376000 (at 10.149.3.236@o2ib313), client will retry: rc = -110
[155921.341960] Lustre: Skipped 7 previous similar messages
[155923.883043] Lustre: nbp2-OST006e: Bulk IO read error with 5442e784-ac4d-d3c5-bf01-79fcadb31ec9 (at 10.149.3.37@o2ib313), client will retry: rc -110
[155923.922869] Lustre: Skipped 1 previous similar message
[155969.103933] Lustre: nbp2-OST00e6: haven&lt;span class=&quot;code-quote&quot;&gt;&apos;t heard from client 45011623-1dfa-a5aa-fe92-9ff4a40b4d8a (at 10.153.16.71@o2ib233) in 227 seconds. I think it&apos;&lt;/span&gt;s dead, and I am evicting it. exp ffff8e6abbf6dc00, cur 1568744143 expire 1568743993 last 1568743916
[155980.710433] Lustre: nbp2-OST0096: Client 9cddaa10-5bcb-a7c6-f68d-4eba1a376000 (at 10.149.3.236@o2ib313) reconnecting
[155980.742181] Lustre: Skipped 8 previous similar messages
[156104.603487] LustreError: 31030:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk WRITE after 300+0s  req@ffff8e5bcea95450 x1644719438934704/t0(0) o4-&amp;gt;9cddaa10-5bcb-a7c6-f68d-4eba1a376000@10.149.3.236@o2ib313:24/0 lens 584/448 e 0 to 0 dl 1568744309 ref 1 fl Interpret:/2/0 rc 0/0
[156104.679469] LustreError: 31030:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 7 previous similar messages
[156111.348763] LNet: 22448:0:(o2iblnd_cb.c:2351:kiblnd_passive_connect()) Can&apos;t accept conn from 10.151.43.200@o2ib (version 12): max_frags 256 too large (32 wanted)
[156111.392490] LNet: 22448:0:(o2iblnd_cb.c:2351:kiblnd_passive_connect()) Skipped 2 previous similar messages
[156296.514579] Lustre: nbp2-OST010e: Connection restored to a32eba78-7a3b-187c-ccab-7b9bb580d609 (at 10.151.37.225@o2ib)
[156296.514584] Lustre: Skipped 2797 previous similar messages
[156296.564402] LustreError: 14393:0:(events.c:449:server_bulk_callback()) event type 5, status -61, desc ffff8e5a011e7e00
[156296.596709] LustreError: 14393:0:(events.c:449:server_bulk_callback()) event type 3, status -61, desc ffff8e5a011e7e00
[156549.618678] LNet: 60518:0:(o2iblnd_cb.c:2351:kiblnd_passive_connect()) Can&apos;t accept conn from 10.151.39.217@o2ib (version 12): max_frags 256 too large (32 wanted)
[156549.662418] LNet: 60518:0:(o2iblnd_cb.c:2351:kiblnd_passive_connect()) Skipped 12 previous similar messages
[156609.272058] Lustre: nbp2-OST006e: Client a4a75f76-add8-c556-8b66-84b4ec76d557 (at 10.149.3.251@o2ib313) reconnecting
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="254917" author="mhanafi" created="Tue, 17 Sep 2019 18:43:36 +0000"  >&lt;p&gt;Do you have to reboot a client? Did you try a umount and mount? &lt;/p&gt;</comment>
                            <comment id="254918" author="sthiell" created="Tue, 17 Sep 2019 18:52:45 +0000"  >&lt;p&gt;I haven&apos;t tried that unfortunately, so I don&apos;t know if that is enough to stop the error (it&apos;s easier for us to just schedule nodes for reboot with Slurm). However, I see that you have some LNet errors too that we don&apos;t have (max_frags 256 too large), so maybe you have a different problem. We have relatively few timeouts like that, but still, and they are appearing over time.&lt;/p&gt;</comment>
                            <comment id="254919" author="mhanafi" created="Tue, 17 Sep 2019 18:56:29 +0000"  >&lt;p&gt;I wonder if this is related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-12065&lt;/a&gt;. We have that patch in our 2.10.8.&lt;/p&gt;</comment>
                            <comment id="254927" author="mhanafi" created="Tue, 17 Sep 2019 19:58:27 +0000"  >&lt;p&gt;So we downgraded one filesystem to 2.10.6. The timeouts should have stopped, but they didn&apos;t, so back to pointing the issue to the 2.12.2 client.&lt;/p&gt;</comment>
                            <comment id="254929" author="pjones" created="Tue, 17 Sep 2019 20:36:44 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Can you please advise on this one?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="254936" author="mhanafi" created="Tue, 17 Sep 2019 22:18:23 +0000"  >&lt;p&gt;I uploaded client side logs to ftp site&lt;br/&gt;
uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/r317i5n8.client.dk.log&lt;/p&gt;</comment>
                            <comment id="254949" author="mhanafi" created="Wed, 18 Sep 2019 05:15:47 +0000"  >&lt;p&gt;I was able to get debug logs from client and server for this event&lt;br/&gt;
the client is r633i7n17 (10.151.31.170@o2ib) ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/r633i7n17.out.gz&lt;br/&gt;
the server is nbp2-oss17 (10.151.26.113@o2ib) ftp:/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/nbp2-oss17.dk.out.gz&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Sep 17 22:03:18 nbp2-oss17 kernel: [194810.573564] LustreError: 31722:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk READ after 300+0s  req@ffff9c19ce233850 x1644516363853088/t0(0) o3-&amp;gt;887d1e1b-cf59-b475-8cc4-c77eefdb6691@10.151.31.170@o2ib:437/0 lens 488/432 e 0 to 0 dl 1568783227 ref 1 fl Interpret:/2/0 rc 0/0
Sep 17 22:03:18 nbp2-oss17 kernel: [194810.649004] LustreError: 31722:0:(ldlm_lib.c:3239:target_bulk_io()) Skipped 13 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="255009" author="ashehata" created="Wed, 18 Sep 2019 19:56:42 +0000"  >&lt;p&gt;I looked at the latest logs r63* and nbp*, but I couldn&apos;t find any o2iblnd issues there. I see only one &quot;timeout on bulk READ&quot;.&lt;/p&gt;

&lt;p&gt;there are a lot of errors though in r31*.log&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000800:00000100:6.0:1568588205.447894:0:7311:0:(o2iblnd_cb.c:3200:kiblnd_cm_callback()) 10.151.26.102@o2ib: ROUTE ERROR -22
00000800:00000100:22.0:1568553486.441956:0:3360:0:(o2iblnd_cb.c:507:kiblnd_rx_complete()) Rx from 10.151.26.100@o2ib failed: 5
00000800:00000100:22.0:1568553486.441961:0:3360:0:(o2iblnd_cb.c:507:kiblnd_rx_complete()) Rx from 10.151.26.100@o2ib failed: 5
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I&apos;m looking at:&lt;/p&gt;

&lt;p&gt;nas-2.10.8&lt;br/&gt;
 nas-2.12.2&lt;/p&gt;

&lt;p&gt;Branches in the NASA git repo and I see that in 2.10.8&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 #define IBLND_CQ_ENTRIES(c)         (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;but in 2.12.2&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + kiblnd_send_wrs(c))&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Can you confirm if I&apos;m right and not misinterpreting something?&lt;/p&gt;

&lt;p&gt;This will create a mismatched CQ size on both ends of the connection. I&apos;m wondering if this would cause issues.&lt;/p&gt;</comment>
                            <comment id="255012" author="mhanafi" created="Wed, 18 Sep 2019 21:26:49 +0000"  >
&lt;p&gt;Yes We have this in our 2.10.8 because we reverted &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9180&quot; title=&quot;Upstream ko2iblnd does not work with map_on_demand &amp;lt;256&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9180&quot;&gt;&lt;del&gt;LU-9180&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define IBLND_SEND_WRS(c)	\
	((c-&amp;gt;ibc_max_frags + 1) * kiblnd_concurrent_sends(c-&amp;gt;ibc_version, \
							  c-&amp;gt;ibc_peer-&amp;gt;ibp_ni))
#define IBLND_CQ_ENTRIES(c)         (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;p&gt;Time line of events:&lt;/p&gt;

&lt;p&gt;1. servers had been 2.10.6 for a long time&lt;/p&gt;

&lt;p&gt;2.&#160; all clients were update to 2.12.2 on 9/5/2019&lt;/p&gt;

&lt;p&gt;3. 2.10.6 servers started to hit &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;4. We had planned on going to 2.10.8 so patch 2.10.8 for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; and install on servers 9/8/2019&lt;/p&gt;

&lt;p&gt;5. &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt; was fixed but after the server reboot we say bulk timeout errors. (NOTE no errors before the reboot with 2.12 clients)&lt;/p&gt;

&lt;p&gt;6. 9/17/2019 downgrade 1 filesystem back to 2.10.6+&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12120&quot; title=&quot;LustreError: 15069:0:(tgt_grant.c:561:tgt_grant_incoming()) LBUG &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12120&quot;&gt;&lt;del&gt;LU-12120&lt;/del&gt;&lt;/a&gt;. It didn&apos;t resolve the timeout issue.&lt;/p&gt;

&lt;p&gt;7. As recommended by Stephane we started rebooting clients. And we have seen the timeouts clear up on the rebooted clients.&lt;/p&gt;

&lt;p&gt;We haven&apos;t rebooted all clients yet; it may take another 5 days. We have about 100 more clients that need to be rebooted.&lt;/p&gt;

&lt;p&gt;This issue could come back without knowing a root cause.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="255021" author="sthiell" created="Wed, 18 Sep 2019 23:42:42 +0000"  >&lt;p&gt;@Mahmoud FYI instead of rebooting the clients, we tried to evict them from specific OSTs, and while it&apos;s a bit early to confirm it works all the time, I&apos;ve been playing with that this afternoon and it seems ok so far! It breaks the loop of reconnect/recovery/bulk error (at least for bulk read error):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# echo e5822630-4b69-cb1b-3ba7-273be887cac8 &amp;gt; /proc/fs/lustre/obdfilter/oak-OST001c/evict_client&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[172383.896250] Lustre: 48149:0:(genops.c:1734:obd_export_evict_by_uuid()) oak-OST001c: evicting e5822630-4b69-cb1b-3ba7-273be887cac8 at adminstrative request
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;br/&gt;
 client:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[172579.060082] Lustre: 48149:0:(genops.c:1734:obd_export_evict_by_uuid()) oak-OST0002: evicting 30024d45-d33c-6bfc-4a6e-e26592b33903 at adminstrative request
[172580.013039] LustreError: 304836:0:(ldlm_lib.c:3243:target_bulk_io()) @@@ Eviction on bulk READ req@ffff9a71a7578450 x1631575659750048/t0(0) o3-&amp;gt;30024d45-d33c-6bfc-4a6e-e26592b33903@10.9.101.72@o2ib4:321/0 lens 488/432 e 0 to 0 dl 1568847286 ref 1 fl Interpret:/2/0 rc 0/0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;then... reconnect and ok&#160;&lt;/p&gt;</comment>
                            <comment id="255022" author="ashehata" created="Thu, 19 Sep 2019 01:04:38 +0000"  >&lt;p&gt;The symptom that we&apos;re not receiving RDMA completions&#160; seem to indicate that completions are not being processed or getting delayed. One suspicion is that the mismatch in the&#160; number of CQ entries between the client and the server might have some side effects&lt;/p&gt;

&lt;p&gt;Currently when a connection is created the qp_attrs values are negotiated between the two peers ensuring the values are the same. Previous to your client upgrade the CQ entries calculation would&apos;ve been the same on both the client and the server. After the upgrade, that&apos;s no longer the case.&lt;/p&gt;

&lt;p&gt;There are also a couple of other patches which fixed some problems around the calculations of the qp_attrs value:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
instead of the loop where we keep decrementing the queue depth in iterations until we successfully create the qp, we cache the maximum send_wrs the QP can support and use that as a cap.
https:&lt;span class=&quot;code-comment&quot;&gt;//review.whamcloud.com/#/c/36073/
&lt;/span&gt;
&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; we &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; endup reducing the queue depth, then make sure you take that into account when calculating the high water mark (hiw) &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the connection
https:&lt;span class=&quot;code-comment&quot;&gt;//review.whamcloud.com/#/c/35578/ &lt;/span&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The issue with miscalculating the hiw is that under certain work loads connection credits might get depleted and not returned promptly. This could lead to timeouts on the connection. This is only a problem if the queue depth is reduced. On connection creation you&apos;d see:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 937 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;CWARN(&lt;span class=&quot;code-quote&quot;&gt;&quot;peer %s - queue depth reduced from %u to %u&quot;&lt;/span&gt;
 938 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;      &lt;span class=&quot;code-quote&quot;&gt;&quot;  to allow &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; qp creation\n&quot;&lt;/span&gt;,
 939 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;      libcfs_nid2str(peer_ni-&amp;gt;ibp_nid),
 940 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;      peer_ni-&amp;gt;ibp_queue_depth,
 941 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;      conn-&amp;gt;ibc_queue_depth);
 942 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Do you see that warning?&lt;/p&gt;

&lt;p&gt;I do see a lot of &quot;no credits&quot; message in the nbp2 log.&lt;/p&gt;

&lt;p&gt;I also see this:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
nbp2-oss17.dk.out:00000800:00000200:8.0:1568783002.636317:0:14106:0:(o2iblnd_cb.c:331:kiblnd_handle_rx()) Received d0[33] from 10.151.45.62@o2ib &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This indicates that there are 33 credits being returned in one shot on the connection. That&apos;s the largest number of returned credits I see in the log and I see one instance. I also see one instance of bulk timeout. That could be a correlation.&lt;/p&gt;

&lt;p&gt;What do you have your peer_credits set to? Keep in mind that that value can be reduced. If you see the warning above that&apos;ll tell you the exact value. If 33 is greater than the queue depth, then we&apos;re sure running in to the problem fixed by the above patches.&lt;/p&gt;

&lt;p&gt;I also see 18912 instances of more than 20 credits being returned in one shot.&lt;/p&gt;

&lt;p&gt;My recommendations:&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;apply the two patches above on both your code bases, 2.12.2 and 2.10.8&lt;/li&gt;
	&lt;li&gt;Make sure that CQE calculations are identical between clients and servers. I would suggest calculating CQEs on 2.10.8 the same way as it&apos;s being calculated in 2.12.2. IE apply &quot;&lt;tt&gt;0205858f6533c7cff8fe8b43d09cece15e0b5385 &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12065&quot; title=&quot;Client got evicted when  lock callback timer expired  on OSS &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12065&quot;&gt;&lt;del&gt;LU-12065&lt;/del&gt;&lt;/a&gt; lnd: increase CQ entries&lt;/tt&gt;&quot; on 2.10.8. This way the CQ is calculated identically.&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="255023" author="mhanafi" created="Thu, 19 Sep 2019 01:43:36 +0000"  >&lt;p&gt;We see lots of queue depth warnings. both client and server side. They show up only EDR and HDR connect hosts.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Sep 18 16:41:32 elrtr1 kernel: [1568850092.715421] LNet: 43868:0:(o2iblnd.c:941:kiblnd_create_conn()) peer 10.141.3.13@o2ib417 - queue depth reduced from 63 to 42  to allow &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; qp creation
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The two patches you listed are &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12621&quot; title=&quot;o2iblnd QP sizing -EINVAL retries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12621&quot;&gt;&lt;del&gt;LU-12621&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt;. But there is a note in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt;  that your going to rework the patch as part of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt;. We have that in our 2.10.8 and 2.12.2.  And there is a note at the end of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; that it should be reverted for 2.12.x. &lt;/p&gt;

&lt;p&gt;Remember we are also struggling with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11644&quot; title=&quot;LNet: Service thread inactive for 300  causes client evictions &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11644&quot;&gt;LU-11644&lt;/a&gt;. Our workaround of setting +net for it is still working. &lt;/p&gt;

&lt;p&gt;It&apos;s not exactly clear which patches we should apply&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
Here is our service config
{code:java}
options ko2iblnd require_privileged_port=0 use_privileged_port=0
options ko2iblnd ntx=251072 credits=125536 fmr_pool_size=62769 
options ko2iblnd timeout=100 retry_count=7 peer_timeout=0 map_on_demand=32 peer_credits=63 concurrent_sends=63
#lnet
options lnet networks=o2ib(ib1) 
options lnet routes=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib233 10.151.26.[80-94]@o2ib; o2ib313 10.151.25.[195-197,202-205,222]@o2ib 10.151.26.[60,127,140-144,150-154]@o2ib; o2ib417 10.151.26.[148,149]@o2ib 10.151.25.[167-170]@o2ib&quot;&lt;/span&gt;
options lnet dead_router_check_interval=60 live_router_check_interval=30
options lnet avoid_asym_router_failure=1 check_routers_before_use=1 small_router_buffers=65536 large_router_buffers=8192
options ptlrpc at_max=600 at_min=200
 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Client side&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options ko2iblnd require_privileged_port=0 use_privileged_port=0
options ko2iblnd timeout=100 retry_count=7 peer_timeout=0 map_on_demand=32 peer_credits=63 concurrent_sends=63
options ko2iblnd credits=64
#lnet
options lnet networks=o2ib(ib1)
options lnet avoid_asym_router_failure=1 check_routers_before_use=1 small_router_buffers=65536 large_router_buffers=8192
options ptlrpc at_max=600 at_min=200
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Routers&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options ko2iblnd require_privileged_port=0 use_privileged_port=0
options ko2iblnd timeout=150 retry_count=7 map_on_demand=32 peer_credits=63 concurrent_sends=63
options ko2iblnd ntx=32768 credits=32768 fmr_pool_size=8193 
#lnet
options lnet networks=&lt;span class=&quot;code-quote&quot;&gt;&quot;o2ib(ib0,ib1),o2ib417(ib2,ib3)&quot;&lt;/span&gt; forwarding=enabled
options lnet lnet_peer_discovery_disabled=1
options lnet avoid_asym_router_failure=1 check_routers_before_use=1 small_router_buffers=65536 large_router_buffers=8192
options ptlrpc at_max=600 at_min=200
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="255071" author="ashehata" created="Thu, 19 Sep 2019 20:41:13 +0000"  >&lt;p&gt;The peer_credits_hiw if not set will be set to half of the peer_credits. However, in the current code if connection queue depth, which is based on the peer_credits is reduced, the peer_credits_hiw is not. The primary purpose of the peer_credits_hiw is to return credits on the connection if they accumulate. If that value is too high then the credits are not returned promptly, which could lead to the no credits situation and eventually the timeouts you&apos;re seeing.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12621&quot; title=&quot;o2iblnd QP sizing -EINVAL retries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12621&quot;&gt;&lt;del&gt;LU-12621&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt; resolve this issue.  Basically these two patches fix the core problem in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt;. I&apos;ll add a comment on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10213&quot; title=&quot;o2iblnd: Potential discrepancy when allocating qp&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10213&quot;&gt;&lt;del&gt;LU-10213&lt;/del&gt;&lt;/a&gt; to clarify.&lt;/p&gt;

&lt;p&gt;Both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12621&quot; title=&quot;o2iblnd QP sizing -EINVAL retries&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12621&quot;&gt;&lt;del&gt;LU-12621&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt; are planned to land on master and be ported back to 2.12.3. So the three patches which need to be applied to 2.12.2 and 2.10.8 are:&lt;/p&gt;

&lt;p&gt;1. &lt;a href=&quot;https://review.whamcloud.com/#/c/36073/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/36073/&lt;/a&gt;&lt;br/&gt;
2. &lt;a href=&quot;https://review.whamcloud.com/#/c/35578/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/35578/&lt;/a&gt;&lt;br/&gt;
3. &lt;a href=&quot;https://review.whamcloud.com/#/c/34427&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/34427&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;These three patches should correctly calculate the queue depth, adjust the hiw, and keep the CQE the same between the peers.&lt;/p&gt;

&lt;p&gt;For &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt;, it was intended to fix the issue where the queue depth &amp;lt; hiw. In this case the credits will be depleted and the algorithm which uses the hiw will not be triggered. The patch fixes this issue by:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
#define IBLND_CREDITS_HIGHWATER(t, conn) ((conn-&amp;gt;ibc_version) == IBLND_MSG_VERSION_1 ? \
           IBLND_CREDIT_HIGHWATER_V1 : \
           min(t-&amp;gt;lnd_peercredits_hiw, (__u32)conn-&amp;gt;ibc_queue_depth - 1))
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The hiw is managed as follows:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
239 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!tunables-&amp;gt;lnd_peercredits_hiw)
240 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;tunables-&amp;gt;lnd_peercredits_hiw = peer_credits_hiw;
241 
242 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (tunables-&amp;gt;lnd_peercredits_hiw &amp;lt; net_tunables-&amp;gt;lct_peer_tx_credits / 2)
243 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;tunables-&amp;gt;lnd_peercredits_hiw = net_tunables-&amp;gt;lct_peer_tx_credits / 2;
244 
245 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (tunables-&amp;gt;lnd_peercredits_hiw &amp;gt;= net_tunables-&amp;gt;lct_peer_tx_credits)
246 &#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;&#187;&#183;&#183;&#183;&#183;&#183;&#183;&#183;tunables-&amp;gt;lnd_peercredits_hiw = net_tunables-&amp;gt;lct_peer_tx_credits - 1;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In your case you don&apos;t set the HIW explicitly. tunables-&amp;gt;lnd_peercredits_hiw = 0, which would result in: tunables-&amp;gt;lnd_peercredits_hiw = net_tunables-&amp;gt;lct_peer_tx_credits / 2&lt;/p&gt;

&lt;p&gt;The &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12569&quot; title=&quot;IBLND_CREDITS_HIGHWATER does not check connection queue depth&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12569&quot;&gt;&lt;del&gt;LU-12569&lt;/del&gt;&lt;/a&gt; patch would result in the hiw value being set to conn-&amp;gt;ibc_queue_depth - 1. IMHO this violates the expectations in the system. The algorithm will be a lot less aggressive in returning the credits than the standard default, which I&apos;m not sure is what we&apos;d want. In fact, based on the many no credits we&apos;re seeing in the logs, I suspect that we&apos;d need to be more aggressive in returning the credits. One option, now that we know that the queue depth is negotiated to 42, would be to explicitly set peer_credits_hiw to 21. I have no hard recommendation, I would try 21 and observe the system. If there are still a lot of no credits, then I would try to decrease the hiw further.&lt;/p&gt;

&lt;p&gt;In summary, I think the issue is potentially an amalgamation of the two issues: the hiw credit issue and the non-matching CQE.&lt;/p&gt;</comment>
                            <comment id="255072" author="mhanafi" created="Thu, 19 Sep 2019 21:30:28 +0000"  >&lt;p&gt;what does this mean.&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;73978.224013&amp;#93;&lt;/span&gt; LNet: 232:0:(o2iblnd_cb.c:2341:kiblnd_passive_connect()) Can&apos;t accept conn from 10.151.26.122@o2ib (version 12): max_frags 256 too large (32 wanted)&lt;/p&gt;</comment>
                            <comment id="255074" author="ashehata" created="Thu, 19 Sep 2019 22:47:35 +0000"  >&lt;p&gt;As discussed on the call here is the relevant comment from Andreas regarding the statfs:&lt;/p&gt;

&lt;p&gt;I&apos;m working on patch &lt;a href=&quot;https://review.whamcloud.com/35380&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/35380&lt;/a&gt; &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12368&quot; title=&quot;concurrent statfs() calls on the client should be blocked&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12368&quot;&gt;&lt;del&gt;LU-12368&lt;/del&gt;&lt;/a&gt; obdclass: don&apos;t send multiple statfs RPCs&quot; that adds a statfs mutex that will avoid the large majority of concurrent OST_STATFS RPCs being sent that are consuming the credits. While this is a new patch, it is a relatively straightforward client-only change, unlike the statfs proxy patches which are relatively large and change the protocol and need to be installed on both the client and server. This also includes a client-side tunable &quot;llite.*.statfs_max_age&quot; that will allow increasing the statfs cache age from the default 1s to something larger (e.g. 10s or 60s) on compute nodes to further reduce the number of RPCs being sent.&lt;/p&gt;

&lt;p&gt;The statfs mutex patch will only allow a single process to send the OST_STATFS RPC per target, and other threads will block until the first thread completes (they would be blocked waiting for their own RPCs to complete anyway, so this does not negatively impact performance), and then use the cached statfs data from the first thread. This will avoid up to 95/96 (ratio of CPU cores) of OST_STATFS RPCs, but each MDT+OST will still get one RPC. Conversely, the statfs proxy patch would avoid 129/130 (ratio of OSTs to OSTs+MDTs) of RPCs, but each thread will still send one MDT an RPC. The improvement is very comparable for this system (98% vs. 99% reduction in RPCs). The two patches are orthogonal, so they can both be used in master.&lt;/p&gt;</comment>
                            <comment id="255163" author="ashehata" created="Fri, 20 Sep 2019 17:50:46 +0000"  >&lt;p&gt;For record keeping:&lt;br/&gt;
what does this mean.&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;73978.224013&amp;#93;&lt;/span&gt; LNet: 232:0:(o2iblnd_cb.c:2341:kiblnd_passive_connect()) Can&apos;t accept conn from 10.151.26.122@o2ib (version 12): max_frags 256 too large (32 wanted)&lt;/p&gt;

&lt;p&gt;When two peers are negotiating the maximum fragments that can be transferred between the peers, one peer can have a larger default max_frags. In this case it&apos;s 256. If the other peer supports less, say 32. It&apos;ll reject the initial connection attempt and will return a response saying that the maximum it can support is 32. The active side will reconnect with 32. At this point the connection should succeed.&lt;/p&gt;

&lt;p&gt;The above message is printed during this negotiation attempt&lt;/p&gt;</comment>
                            <comment id="255174" author="jaylan" created="Fri, 20 Sep 2019 21:05:03 +0000"  >&lt;p&gt;&amp;gt; 1. &lt;a href=&quot;https://review.whamcloud.com/#/c/36073/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/36073/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The above suggested patch does not apply to 2.10.8 (and b2_10). Could you provide a backport or a list of prerequisite commits?&lt;/p&gt;</comment>
                            <comment id="255283" author="mhanafi" created="Mon, 23 Sep 2019 23:37:57 +0000"  >&lt;p&gt;I noticed that the clients keep timing out the same bulk request over and over. It seems to be stuck.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
ep 23 15:45:53 r401i6n8 kernel: [1569278753.126871] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection to nbp2-OST00d9 (at 10.151.26.114@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 23 15:45:53 r401i6n8 kernel: [1569278753.162870] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection restored to 10.151.26.114@o2ib (at 10.151.26.114@o2ib)
Sep 23 15:55:59 r401i6n8 kernel: [1569279359.335922] Lustre: 3467:0:(client.c:2134:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1569278753/real 1569278753]  req@ffff99555d201680 x1645173752822896/t0(0) o3-&amp;gt;nbp2-OST00d9-osc-ffff9947f4303800@10.151.26.114@o2ib:6/4 lens 488/4536 e 6 to 1 dl 1569279359 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Sep 23 15:55:59 r401i6n8 kernel: [1569279359.335922] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection to nbp2-OST00d9 (at 10.151.26.114@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 23 15:55:59 r401i6n8 kernel: [1569279359.371921] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection restored to 10.151.26.114@o2ib (at 10.151.26.114@o2ib)
Sep 23 16:06:05 r401i6n8 kernel: [1569279965.540809] Lustre: 3467:0:(client.c:2134:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1569279359/real 1569279359]  req@ffff99555d201680 x1645173752822896/t0(0) o3-&amp;gt;nbp2-OST00d9-osc-ffff9947f4303800@10.151.26.114@o2ib:6/4 lens 488/4536 e 6 to 1 dl 1569279965 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Sep 23 16:06:05 r401i6n8 kernel: [1569279965.540809] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection to nbp2-OST00d9 (at 10.151.26.114@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 23 16:06:05 r401i6n8 kernel: [1569279965.576807] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection restored to 10.151.26.114@o2ib (at 10.151.26.114@o2ib)
Sep 23 16:16:11 r401i6n8 kernel: [1569280571.753676] Lustre: 3467:0:(client.c:2134:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1569279965/real 1569279965]  req@ffff99555d201680 x1645173752822896/t0(0) o3-&amp;gt;nbp2-OST00d9-osc-ffff9947f4303800@10.151.26.114@o2ib:6/4 lens 488/4536 e 6 to 1 dl 1569280571 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Sep 23 16:16:11 r401i6n8 kernel: [1569280571.753676] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection to nbp2-OST00d9 (at 10.151.26.114@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 23 16:16:11 r401i6n8 kernel: [1569280571.789675] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection restored to 10.151.26.114@o2ib (at 10.151.26.114@o2ib)
Sep 23 16:26:17 r401i6n8 kernel: [1569281177.958608] Lustre: 3467:0:(client.c:2134:ptlrpc_expire_one_request()) @@@ Request sent has timed out &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; slow reply: [sent 1569280571/real 1569280571]  req@ffff99555d201680 x1645173752822896/t0(0) o3-&amp;gt;nbp2-OST00d9-osc-ffff9947f4303800@10.151.26.114@o2ib:6/4 lens 488/4536 e 6 to 1 dl 1569281177 ref 2 fl Rpc:X/2/ffffffff rc -11/-1
Sep 23 16:26:17 r401i6n8 kernel: [1569281177.958608] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection to nbp2-OST00d9 (at 10.151.26.114@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Sep 23 16:26:17 r401i6n8 kernel: [1569281177.994606] Lustre: nbp2-OST00d9-osc-ffff9947f4303800: Connection restored to 10.151.26.114@o2ib (at 10.151.26.114@o2ib)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I uploaded the client-side and server-side logs for the above.&lt;/p&gt;

&lt;p&gt;uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/client.tgz&lt;br/&gt;
 uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/server.tgz&lt;/p&gt;</comment>
                            <comment id="255317" author="ashehata" created="Tue, 24 Sep 2019 18:09:49 +0000"  >&lt;p&gt;Here is a port for 36073 based on NASA&apos;s b2_10_8.&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33524/33524_0001-LU-12621-b_2_10_8-port.patch&quot; title=&quot;0001-LU-12621-b_2_10_8-port.patch attached to LU-12772&quot;&gt;0001-LU-12621-b_2_10_8-port.patch&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Mahmoud, just to clarify, did the timeout you posted above start happening after any changes?&lt;/p&gt;</comment>
                            <comment id="255318" author="mhanafi" created="Tue, 24 Sep 2019 18:29:15 +0000"  >&lt;p&gt;We updated the clients to 2.12 but didn&apos;t see these timeouts. Only after rebooting the servers into 2.10.8 did we see the timeouts. But moving 1 filesystem back to 2.10.6 didn&apos;t make the timeouts go away on that filesystem. Only a reboot of the clients stops the timeouts. I am not convinced these are actual &quot;network&quot; timeouts. The clients and server get out of sync and the client keeps trying the same bulk io over and over and the server times it out. The last set of debug logs I uploaded gives a good trace of the RPC traffic. &lt;br/&gt;
This is exactly like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12543&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-12543&lt;/a&gt;. When a user job finishes their process can&apos;t be killed because of a open file(or a lock).  Only an eviction or a reboot will clear up the client.&lt;/p&gt;
</comment>
                            <comment id="255319" author="ashehata" created="Tue, 24 Sep 2019 19:42:38 +0000"  >&lt;p&gt;I looked at the logs. The server logs show 637678 instances of &quot;no credits&quot;. I also see 24350 instances of more than 30 credits being returned in one shot.&lt;/p&gt;

&lt;p&gt;In the server logs I see:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
nbp2-oss18_4:00000100:00100000:15.0:1569279965.578558:0:30761:0:(service.c:1939:ptlrpc_server_handle_req_in()) got req x1645173752822896
nbp2-oss18_4:  req@ffff97c961286850 x1645173752822896/t0(0) o3-&amp;gt;4149294c-5096-99e7-e20f-15e803fd47f8@10.151.3.144@o2ib:36/0 lens 488/0 e 0 to 0 dl 1569280371 ref 1 fl New:/2/ffffffff rc 0/-1
nbp2-oss18_4:00002000:00100000:15.0:1569279965.578568:0:30761:0:(ofd_dev.c:2422:ofd_prolong_extent_locks()) Prolong locks &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; req ffff97c961286850 with x1645173752822896 ext(4096-&amp;gt;8191)
nbp2-oss18_4:00000100:00100000:15.0:1569279965.578575:0:30761:0:(service.c:2089:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost_io01_113:4149294c-5096-99e7-e20f-15e803fd47f8+6:0:x1645173752822896:12345-10.151.3.144@o2ib:3
nbp2-oss18_5:00010000:00020000:23.0:1569280165.581313:0:30761:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk READ after 200+0s  req@ffff97c961286850 x1645173752822896/t0(0) o3-&amp;gt;4149294c-5096-99e7-e20f-15e803fd47f8@10.151.3.144@o2ib:36/0 lens 488/432 e 0 to 0 dl 1569280371 ref 1 fl Interpret:/2/0 rc 0/0
nbp2-oss18_5:00010000:00000080:23.0:1569280165.695547:0:30761:0:(ldlm_lib.c:2883:target_committed_to_req()) @@@ not sending last_committed update (0/1)  req@ffff97c961286850 x1645173752822896/t0(0) o3-&amp;gt;4149294c-5096-99e7-e20f-15e803fd47f8@10.151.3.144@o2ib:36/0 lens 488/432 e 0 to 0 dl 1569280371 ref 1 fl Interpret:/2/ffffffff rc -110/-1
nbp2-oss18_5:00000100:00100000:23.0:1569280165.695553:0:30761:0:(service.c:2139:ptlrpc_server_handle_request()) Handled RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost_io01_113:4149294c-5096-99e7-e20f-15e803fd47f8+6:0:x1645173752822896:12345-10.151.3.144@o2ib:3 Request processed in 200116977us (200117000us total) trans 0 rc -110/-999
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Looks like the bulk read for that particular rpc request times out after 200 seconds. I see this multiple times.&lt;/p&gt;

&lt;p&gt;I&apos;m still suspecting the mismatch in the connection attributes as well as the HIW issue discussed on the call.&lt;/p&gt;

&lt;p&gt;Do you have a system with both clients and servers on 2.10.8? I&apos;m wondering if we&apos;d see the same behavior.&lt;/p&gt;

&lt;p&gt;If we&apos;re able to reproduce this issue on the test system, that would be very helpful. We can experiment with the different parameters.&lt;/p&gt;</comment>
                            <comment id="255320" author="mhanafi" created="Tue, 24 Sep 2019 19:58:21 +0000"  >&lt;p&gt;I have not been able to get a reproducer. Other than the specific IO request that gets timed out over and over (for days), other I/O requests complete just fine. On a client in this state a &apos;sync&apos; will just hang.  &lt;br/&gt;
An eviction from the OST will break the cycle of timeouts. &lt;br/&gt;
We can run 2.10.8 clients but since we don&apos;t have a reproducer it won&apos;t help.&lt;/p&gt;

&lt;p&gt;Some nodes get stuck again in repeated timeouts after reboot. I have looked at the jobs running across those nodes; there is nothing common about the jobs.&lt;/p&gt;</comment>
                            <comment id="255325" author="ashehata" created="Tue, 24 Sep 2019 21:09:25 +0000"  >&lt;p&gt;do those problems persist after changing the peer_credits and peer_credits_hiw to 32/16 respectively on clients and servers?&lt;/p&gt;</comment>
                            <comment id="255327" author="mhanafi" created="Tue, 24 Sep 2019 21:40:11 +0000"  >&lt;p&gt;yes the errors return with peer_credits of 32.&lt;/p&gt;</comment>
                            <comment id="255334" author="ashehata" created="Wed, 25 Sep 2019 01:21:31 +0000"  >&lt;p&gt;Did that reduce the &quot;no credits&quot; message seen in the logs?&lt;/p&gt;

&lt;p&gt;I think the next step is to apply the patch which makes the cq size calculations the same on both clients and servers.&lt;/p&gt;</comment>
                            <comment id="255403" author="jaylan" created="Wed, 25 Sep 2019 22:53:55 +0000"  >&lt;p&gt;&amp;gt; 1. &lt;a href=&quot;https://review.whamcloud.com/#/c/36073/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/36073/&lt;/a&gt;&lt;br/&gt;
&amp;gt; 2. &lt;a href=&quot;https://review.whamcloud.com/#/c/35578/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/35578/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Amir, you ported the above two patches to b2_12. Should I cherry-pick these two into nas-2.12.2? If I cherry-pick these patches to one of our nas-2.10.8 or nas-2.12.2 branches, we should pick them also to the other branch to avoid another out-of-sync problem. Thanks!&lt;/p&gt;</comment>
                            <comment id="255454" author="mhanafi" created="Thu, 26 Sep 2019 18:47:54 +0000"  >&lt;p&gt;Been trying to get additional debugging while we wait to update our servers.&lt;br/&gt;
 I found this exchange between the server and client.&lt;br/&gt;
 x1645242562975920 is the request that keeps timing out over and over.&lt;/p&gt;

&lt;p&gt;server&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000100:00100000:14.0:1569522293.228000:0:32458:0:(service.c:1939:ptlrpc_server_handle_req_in()) got req x1645193981558368
00002000:00100000:14.0:1569522293.228005:0:32458:0:(ofd_dev.c:2563:ofd_rw_hpreq_check()) @@@ nbp8-OST0107 ll_ost_io01_205: refresh rw locks: [0x101070000:0xb9c4b93:0x0] (1134592-&amp;gt;1138687)
00002000:00100000:14.0:1569522293.228010:0:32458:0:(ofd_dev.c:2422:ofd_prolong_extent_locks()) Prolong locks &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; req ffff885aeed2dc50 with x1645193981558368 ext(1134592-&amp;gt;1138687)
00002000:00010000:14.0:1569522293.228012:0:32458:0:(ofd_dev.c:2568:ofd_rw_hpreq_check()) nbp8-OST0107: refreshed 0 locks timeout &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; req ffff885aeed2dc50.
00000100:00100000:14.0:1569522293.228016:0:32458:0:(nrs_fifo.c:179:nrs_fifo_req_get()) NRS start fifo request from 12345-10.141.2.117@o2ib417, seq: 127724415
00000100:00100000:14.0:1569522293.228019:0:32458:0:(service.c:2089:ptlrpc_server_handle_request()) Handling RPC pname:cluuid+ref:pid:xid:nid:opc ll_ost_io01_205:4cf6649f-db20-1dab-e991-b18b25cd7717+6:0:x1645193981558368:12345-10.141.2.117@o2ib417:3
00000100:00000200:14.0:1569522293.228020:0:32458:0:(service.c:2094:ptlrpc_server_handle_request()) got req 1645193981558368
00000400:00000200:14.0:1569522293.228069:0:32458:0:(lib-move.c:2796:LNetPut()) LNetPut -&amp;gt; 12345-10.141.2.117@o2ib417
00000400:00000200:14.0:1569522293.228076:0:32458:0:(lib-move.c:1930:lnet_select_pathway()) TRACE: 10.151.27.65@o2ib(10.151.27.65@o2ib:10.151.27.65@o2ib) -&amp;gt; 10.141.2.117@o2ib417(10.141.2.117@o2ib417:10.151.25.170@o2ib) : PUT
00000800:00000200:14.0:1569522293.228079:0:32458:0:(o2iblnd_cb.c:1510:kiblnd_send()) sending 4096 bytes in 1 frags to 12345-10.151.25.170@o2ib
00000800:00000200:14.0:1569522293.228081:0:32458:0:(o2iblnd_cb.c:703:kiblnd_setup_rd_kiov()) niov 1 offset 0 nob 4096
00000800:00000200:14.0:1569522293.228084:0:32458:0:(o2iblnd.c:405:kiblnd_find_peer_locked()) got peer_ni [ffff885d82149e80] -&amp;gt; 10.151.25.170@o2ib (2) version: 12
00000800:00000200:14.0:1569522293.228085:0:32458:0:(o2iblnd_cb.c:1391:kiblnd_launch_tx()) conn[ffff88588fc32800] (130)++
00000800:00000200:14.0:1569522293.228086:0:32458:0:(o2iblnd_cb.c:1166:kiblnd_queue_tx_locked()) conn[ffff88588fc32800] (131)++
00000800:00000200:14.0:1569522293.228088:0:32458:0:(o2iblnd_cb.c:1397:kiblnd_launch_tx()) conn[ffff88588fc32800] (132)--
00000100:00000200:14.0:1569522293.228090:0:32458:0:(niobuf.c:262:ptlrpc_start_bulk_transfer()) Transferring 1 pages 4096 bytes via portal 8 id 12345-10.141.2.117@o2ib417 mbits 0x0-0x0
00010000:00020000:8.0:1569522343.227486:0:32458:0:(ldlm_lib.c:3239:target_bulk_io()) @@@ timeout on bulk READ after 50+0s  req@ffff885aeed2dc50 x1645193981558368/t0(0) o3-&amp;gt;4cf6649f-db20-1dab-e991-b18b25cd7717@10.141.2.117@o2ib417:613/0 lens 488/432 e 0 to 0 dl 1569522548 ref 1 fl Interpret:/2/0 rc 0/0

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;On the client&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
00000100:00000200:52.0:1569522293.227691:0:4645:0:(events.c:57:request_out_callback()) @@@ type 5, status 0  req@ffff9568c0c48040 x1645193981558368/t0(0) o3-&amp;gt;nbp8-OST0107-osc-ffff958574643000@10.151.27.65@o2ib:6/4 lens 488/4536 e 0 to 0 dl 1569522648 ref 3 fl Rpc:/2/ffffffff rc 0/-1
00000800:00000200:47.0:1569522293.227691:0:4643:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff95709daf1800] (70)--
00000400:00000200:52.0:1569522293.227698:0:4645:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff956f2881d770
00000400:00000200:52.0:1569522293.227700:0:4645:0:(lib-msg.c:816:lnet_is_health_check()) health check = 1, status = 0, hstatus = 0
00000400:00000200:52.0:1569522293.227703:0:4645:0:(lib-msg.c:630:lnet_health_check()) health check: 10.141.2.117@o2ib417-&amp;gt;10.141.25.169@o2ib417: PUT: OK
00000800:00000200:52.0:1569522293.227706:0:4645:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff95709daf1800] (69)--
00000800:00000200:38.2:1569522293.227889:0:0:0:(o2iblnd_cb.c:3721:kiblnd_cq_completion()) conn[ffff956f289cee00] (68)++
00000800:00000200:13.0:1569522293.227905:0:4646:0:(o2iblnd_cb.c:3843:kiblnd_scheduler()) conn[ffff956f289cee00] (69)++
00000800:00000200:13.0:1569522293.227913:0:4646:0:(o2iblnd_cb.c:343:kiblnd_handle_rx()) Received d2[0] from 10.141.25.170@o2ib417
00000800:00000200:15.0:1569522293.227915:0:4644:0:(o2iblnd_cb.c:3859:kiblnd_scheduler()) conn[ffff956f289cee00] (70)--
00000400:00000200:13.0:1569522293.227920:0:4646:0:(lib-move.c:4114:lnet_parse()) TRACE: 10.141.2.117@o2ib417(10.141.2.117@o2ib417) &amp;lt;- 10.151.27.65@o2ib : PUT - &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; me
00000400:00000200:13.0:1569522293.227928:0:4646:0:(lib-ptl.c:571:lnet_ptl_match_md()) Request from 12345-10.151.27.65@o2ib of length 4096 into portal 8 MB=0x0
00000400:00000100:13.0:1569522293.227933:0:4646:0:(lib-move.c:3753:lnet_parse_put()) Dropping PUT from 12345-10.151.27.65@o2ib portal 8 match 0 offset 0 length 4096: 4
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So it drops it because it timed out the bulk io. But I think that was the previous one. It looks to me like it&apos;s stuck timing out a previous transfer and dropping the current one.&lt;/p&gt;

&lt;p&gt;I will upload the full debug log that shows this keeps repeating.&lt;/p&gt;

&lt;p&gt;ftp:uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/oss4.out.gz&lt;/p&gt;

&lt;p&gt;ftp:uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12772&quot; title=&quot;bulk timeout after 2.12.2 clients upgrade &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12772&quot;&gt;&lt;del&gt;LU-12772&lt;/del&gt;&lt;/a&gt;/r901i3n9.out.gz&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;This looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-11951&quot; title=&quot;sanity: test_231a failure, idle disconnect &amp;amp; import generation disagreement&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-11951&quot;&gt;&lt;del&gt;LU-11951&lt;/del&gt;&lt;/a&gt; but our 2.12 clients have that patch. FYI we have turned off the osc idle timeout.&lt;/p&gt;

&lt;p&gt;osc.*.idle_timeout=0&lt;/p&gt;</comment>
                            <comment id="255455" author="ashehata" created="Thu, 26 Sep 2019 19:34:32 +0000"  >&lt;p&gt;Hi Jay,&lt;/p&gt;

&lt;p&gt;Yes those two patches should be applied to both 2.12.2 and 2.10.8&lt;/p&gt;</comment>
                            <comment id="255456" author="ashehata" created="Thu, 26 Sep 2019 19:53:29 +0000"  >&lt;p&gt;Mahmoud, I might have a clue about the issue with the timeouts. On 2.12 the LND timeouts have been hooked up to the lnet_transaction_timeout as part of the health work. The ko2iblnd timeout parameter has been deprecated.&lt;/p&gt;

&lt;p&gt;The idea is that you can set a transaction timeout and a retry count (lnet_retry_count). The LNet messages will be retried a max of lnet_retry_count within the transaction timeout. So for example if you have lnet_retry_count set to 3, and the transaction timeout set to 100. If a message fails to be sent then it&apos;ll be retried 3 times in that 100 seconds. The LND timeout will be derived as follows:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnd_timeout = lnet_transaction_timeout / lnet_retry_count &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;If you disable retries, which it is disabled by default in 2.12.2, then:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnd_timeout = lnet_transaction_timeout &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The default value of lnet_transaction_timeout is 50s&lt;/p&gt;

&lt;p&gt;In your configuration you want to set the lnd timeout to 100s.&lt;/p&gt;

&lt;p&gt;On the 2.12.2 clients you should set:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
options lnet lnet_transaction_timeout=100&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;That should give you the expected timeout. I&apos;m thinking that in your setup under heavy load you need the timeouts to be that large.&lt;/p&gt;</comment>
                            <comment id="255457" author="mhanafi" created="Thu, 26 Sep 2019 20:14:04 +0000"  >&lt;p&gt;looks like this setting can be changed after the module load correct?&lt;/p&gt;

&lt;p&gt;btw,&#160; lnet_retry_count=0 should we change that also to 3?&lt;/p&gt;</comment>
                            <comment id="255458" author="ashehata" created="Thu, 26 Sep 2019 20:45:49 +0000"  >&lt;p&gt;Yes. the new 2.12 module parameters should be dynamic. You can set them:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl set transaction_timeout 100 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;then you can verify with&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
lnetctl global show &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Let&apos;s keep the lnet_retry_count to 0. There are a few issues which have been fixed in master with health that haven&apos;t been ported back to 2.12 yet. So I don&apos;t think we should introduce any more variables.&lt;/p&gt;

&lt;p&gt;Also to clarify the ko2iblnd retry_count is used on connection establishment time. It&apos;s actually passed down to the IB stack rightaway. So we don&apos;t process it in LNet or the LND.&lt;/p&gt;</comment>
                            <comment id="255460" author="mhanafi" created="Thu, 26 Sep 2019 23:14:22 +0000"  >&lt;p&gt;The logs show that messages are getting to and from client and server. The only way to break the cycle is to evict the client. It would be good for someone to look at the rpc traffic to make sure we are not hitting a bug.&lt;/p&gt;</comment>
                            <comment id="255463" author="ashehata" created="Fri, 27 Sep 2019 00:13:44 +0000"  >&lt;p&gt;After you evict the client and set its lnet_transaction_timeout to 100, does it get back into the state of the stuck RPC message?&lt;/p&gt;

&lt;p&gt;I&apos;ll continue looking at the logs.&lt;/p&gt;</comment>
                            <comment id="255487" author="mhanafi" created="Fri, 27 Sep 2019 14:24:16 +0000"  >&lt;p&gt;The transaction_timeout change has eliminated the timeouts. It took a while to clean up all the nodes. But last night we only had errors on 3 nodes that still needed to be rebooted.&#160; We will keep an eye on it for the next few days to verify it has resolved the issue.&lt;/p&gt;

&lt;p&gt;so setting the value in lustre.conf doesn&apos;t work&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 r329i0n0 /etc/modprobe.d # cat lustre.conf 
# $Source: /cvsroot/bcfg2/bcv136/Cfg/etc/modprobe.d/lustre.conf/lustre.conf.genshi,v $
# Get rid of messages &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; missing, special-purpose hardware (LU-1599)
blacklist padlock-sha#ko2ib
# Setting map_on_demand=32 as per intel recommendation LU-7808
# Timeing testing on module load show enabling FMR cuts the load 
# time by 1/2- from 6 min to 3 min.
# On lustre servers setting credits to 1/2 the ntx value
# to help with HCA congestion. Stats gathered shows
# that during heavy reads we run into issues. 
# Reducing credits should throttle tx. 
options ko2iblnd require_privileged_port=0 use_privileged_port=0
options ko2iblnd timeout=100 retry_count=7 peer_timeout=0 map_on_demand=32 peer_credits=63 concurrent_sends=63
options ko2iblnd credits=64
#lnet
options lnet networks=o2ib(ib1) lnet_transaction_timeout=100
options lnet avoid_asym_router_failure=1 check_routers_before_use=1 small_router_buffers=65536 large_router_buffers=8192
## ptlrpc parameters
# lowering at_max. This will make recovery faster. No
# evidence that show we need at_max=700 
options ptlrpc at_max=600 at_min=200

r329i0n0 /etc/modprobe.d # modprobe lustre

r329i0n0 /etc/modprobe.d # lnetctl global show
global:
    numa_range: 0
    max_intf: 200
    discovery: 1
    drop_asym_route: 0
    retry_count: 0
    transaction_timeout: 50
    health_sensitivity: 0
    recovery_interval: 1
r329i0n0 /etc/modprobe.d # uname -a
Linux r329i0n0 4.12.14-95.19.1.20190617-nasa #1 SMP Tue Jun 11 11:04:48 UTC 2019 (a904a7f) x86_64 x86_64 x86_64 GNU/Linux
r329i0n0 /etc/modprobe.d # modinfo lnet
filename:       /lib/modules/4.12.14-95.19.1.20190617-nasa/updates/lustre-client/net/lnet.ko
license:        GPL
version:        0.7.0
description:    Lustre Networking layer
author:         OpenSFS, Inc. &amp;lt;http:&lt;span class=&quot;code-comment&quot;&gt;//www.lustre.org/&amp;gt;
&lt;/span&gt;suserelease:    SLE12-SP4
srcversion:     B54539F75D86EDF216C3DEF
depends:        libcfs
retpoline:      Y
vermagic:       4.12.14-95.19.1.20190617-nasa SMP mod_unload modversions retpoline 
parm:           accept:Accept connections (secure|all|none) (charp)
parm:           accept_port:Acceptor&apos;s port (same on all nodes) (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           accept_backlog:Acceptor&apos;s listen backlog (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           accept_timeout:Acceptor&apos;s timeout (seconds) (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           forwarding:Explicitly enable/disable forwarding between networks (charp)
parm:           tiny_router_buffers:# of 0 payload messages to buffer in the router (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           small_router_buffers:# of small (1 page) messages to buffer in the router (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           large_router_buffers:# of large messages to buffer in the router (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           peer_buffer_credits:# router buffer credits per peer (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           auto_down:Automatically mark peers down on comms error (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           check_routers_before_use:Assume routers are down and ping them before use (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           avoid_asym_router_failure:Avoid asymmetrical router failures (0 to disable) (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           dead_router_check_interval:Seconds between dead router health checks (&amp;lt;= 0 to disable) (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           live_router_check_interval:Seconds between live router health checks (&amp;lt;= 0 to disable) (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           router_ping_timeout:Seconds to wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; the reply to a router health query (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           config_on_load:configure network at module load (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           local_nid_dist_zero:Reserved (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           portal_rotor:redirect PUTs to different cpu-partitions (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           ip2nets:LNET network &amp;lt;- IP table (charp)
parm:           networks:local networks (charp)
parm:           routes:routes to non-local networks (charp)
parm:           rnet_htable_size:size of remote network hash table (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           use_tcp_bonding:Set to 1 to use socklnd bonding. 0 to use Multi-Rail (&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;)
parm:           lnet_numa_range:NUMA range to consider during Multi-Rail selection (uint)
parm:           lnet_health_sensitivity:Value to decrement the health value by on error (health_sensitivity)
parm:           lnet_recovery_interval:Interval to recover unhealthy interfaces in seconds (recovery_interval)
parm:           lnet_interfaces_max:Maximum number of interfaces in a node. (interfaces_max)
parm:           lnet_peer_discovery_disabled:Set to 1 to disable peer discovery on &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; node. (discovery_disabled)
parm:           lnet_drop_asym_route:Set to 1 to drop asymmetrical route messages. (drop_asym_route)
parm:           lnet_transaction_timeout:Maximum number of seconds to wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; a peer response. (transaction_timeout)
parm:           lnet_retry_count:Maximum number of times to retry transmitting a message (retry_count)


r329i0n0 /etc/modprobe.d # lnetctl set transaction_timeout 100
r329i0n0 /etc/modprobe.d # lnetctl global show
global:
    numa_range: 0
    max_intf: 200
    discovery: 1
    drop_asym_route: 0
    retry_count: 0
    transaction_timeout: 100
    health_sensitivity: 0
    recovery_interval: 1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="255500" author="ashehata" created="Fri, 27 Sep 2019 18:32:16 +0000"  >&lt;p&gt;That&apos;s good news.&lt;/p&gt;

&lt;p&gt;I opened a separate ticket to track the issue with setting the module parameters: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12817&quot; title=&quot;lnet: module parameters are not set correctly&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12817&quot;&gt;&lt;del&gt;LU-12817&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="256198" author="mhanafi" created="Thu, 10 Oct 2019 21:17:07 +0000"  >&lt;p&gt;Today we upgraded one of the filesystems to match the CQE size as per the 3 patches. After the reboot of the OSS we started to see bulk timeouts again from the clients to the rebooted OSSes. We have the transaction_timeout=100. &lt;br/&gt;
The timeouts are the same as before, repeated cycle over and over again. &lt;/p&gt;
</comment>
                            <comment id="256688" author="mhanafi" created="Sat, 19 Oct 2019 03:41:33 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12856&quot; title=&quot;LustreError: 82937:0:(ldlm_lib.c:3268:target_bulk_io()) @@@ truncated bulk READ 0(270336)  &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12856&quot;&gt;&lt;del&gt;LU-12856&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="57147">LU-12856</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="57022">LU-12817</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33524" name="0001-LU-12621-b_2_10_8-port.patch" size="4060" author="ashehata" created="Tue, 24 Sep 2019 18:08:30 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00muf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>