<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:03:26 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-6808] Interop 2.5.3&lt;-&gt;master sanity test_224c: Bulk IO write error</title>
                <link>https://jira.whamcloud.com/browse/LU-6808</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah_lw &amp;lt;wei3.liu@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/ff285e72-24dd-11e5-bf7b-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/ff285e72-24dd-11e5-bf7b-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_224c failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test failed to respond and timed out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;OST log&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;16:17:08:Lustre: DEBUG MARKER: == sanity test 224c: Don&apos;t hang if one of md lost during large bulk RPC ============================== 16:16:38 (1436199398)
16:17:08:LustreError: 8997:0:(ldlm_lib.c:2718:target_bulk_io()) @@@ truncated bulk GET 1048576(4194304)  req@ffff88003db95000 x1505956611442792/t0(0) o4-&amp;gt;8e3c8004-e713-9f15-13d2-22232628f874@10.1.4.159@tcp:0/0 lens 608/448 e 0 to 0 dl 1436199422 ref 1 fl Interpret:/0/0 rc 0/0
16:17:08:Lustre: lustre-OST0004: Bulk IO write error with 8e3c8004-e713-9f15-13d2-22232628f874 (at 10.1.4.159@tcp), client will retry: rc -110
16:17:08:Lustre: lustre-OST0004: Client 8e3c8004-e713-9f15-13d2-22232628f874 (at 10.1.4.159@tcp) reconnecting
16:17:08:LustreError: 8990:0:(ldlm_lib.c:2718:target_bulk_io()) @@@ truncated bulk GET 1048576(3805696)  req@ffff88004901f800 x1505956611442980/t0(0) o4-&amp;gt;8e3c8004-e713-9f15-13d2-22232628f874@10.1.4.159@tcp:0/0 lens 608/448 e 0 to 0 dl 1436199427 ref 1 fl Interpret:/2/0 rc 0/0
16:17:08:LustreError: 8990:0:(ldlm_lib.c:2718:target_bulk_io()) Skipped 1 previous similar message
16:17:08:Lustre: lustre-OST0004: Bulk IO write error with 8e3c8004-e713-9f15-13d2-22232628f874 (at 10.1.4.159@tcp), client will retry: rc -110
16:17:08:Lustre: Skipped 1 previous similar message
16:21:30:LustreError: 8990:0:(ldlm_lib.c:2718:target_bulk_io()) @@@ truncated bulk GET 1048576(3805696)  req@ffff88003596d000 x1505956611443196/t0(0) o4-&amp;gt;8e3c8004-e713-9f15-13d2-22232628f874@10.1.4.159@tcp:0/0 lens 608/448 e 0 to 0 dl 1436199432 ref 1 fl Interpret:/2/0 rc 0/0
16:21:30:LustreError: 8990:0:(ldlm_lib.c:2718:target_bulk_io()) Skipped 1 previous similar message
16:21:30:Lustre: lustre-OST0004: Bulk IO write error with 8e3c8004-e713-9f15-13d2-22232628f874 (at 10.1.4.159@tcp), client will retry: rc -110
16:21:30:Lustre: Skipped 1 previous similar message
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>server: 2.5.3&lt;br/&gt;
client: lustre-master build # 3092 EL7</environment>
        <key id="30988">LU-6808</key>
            <summary>Interop 2.5.3&lt;-&gt;master sanity test_224c: Bulk IO write error</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jay">Jinshan Xiong</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 7 Jul 2015 20:42:48 +0000</created>
                <updated>Thu, 1 Feb 2018 16:27:19 +0000</updated>
                            <resolved>Tue, 25 Oct 2016 04:03:21 +0000</resolved>
                                    <version>Lustre 2.8.0</version>
                                    <fixVersion>Lustre 2.9.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>18</watches>
                                                                            <comments>
                            <comment id="129993" author="standan" created="Fri, 9 Oct 2015 19:33:09 +0000"  >&lt;p&gt;&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/588f0ce8-6b5d-11e5-94a7-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/588f0ce8-6b5d-11e5-94a7-5254006e85c2&lt;/a&gt;&lt;br/&gt;
server:2.5.5&lt;br/&gt;
client: 2.7.60/ build# 3203&lt;/p&gt;</comment>
                            <comment id="134331" author="standan" created="Tue, 24 Nov 2015 00:52:50 +0000"  >&lt;p&gt;Instance also found for 2.7.63 tag with following config:&lt;br/&gt;
Server: 2.7.0, b2_7/29&lt;br/&gt;
Client: Master, build# 3251, RHEL 7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fd0cce2c-90a1-11e5-a833-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fd0cce2c-90a1-11e5-a833-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="136467" author="standan" created="Wed, 16 Dec 2015 00:11:15 +0000"  >&lt;p&gt;Another instance for following interop config&lt;br/&gt;
Server: 2.5.5, b2_5_fe/62&lt;br/&gt;
Client: Master, Build# 3266, Tag 2.7.64 &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8a0f2bb4-a04a-11e5-a33d-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8a0f2bb4-a04a-11e5-a33d-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="136564" author="standan" created="Wed, 16 Dec 2015 16:44:26 +0000"  >&lt;p&gt;Server: 2.5.5, b2_5_fe/62&lt;br/&gt;
Client: Master, Build# 3266, Tag 2.7.64 , RHEL 7 &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fd8752d8-a05e-11e5-90cc-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fd8752d8-a05e-11e5-90cc-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="137364" author="standan" created="Wed, 23 Dec 2015 22:13:09 +0000"  >&lt;p&gt;Another instance found for :&lt;br/&gt;
Server: b2_7_fe/34&lt;br/&gt;
Client: Master , Build# 3276&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/79afc16e-a5e2-11e5-a028-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/79afc16e-a5e2-11e5-a028-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="137429" author="standan" created="Thu, 24 Dec 2015 19:32:04 +0000"  >&lt;p&gt;Another instance found for the following config:&lt;br/&gt;
Server: 2.7.1 , b2_7_fe/34&lt;br/&gt;
Client: Master, build# 3276, RHEL 6.7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/610eff92-a602-11e5-a14c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/610eff92-a602-11e5-a14c-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="139307" author="standan" created="Tue, 19 Jan 2016 21:15:47 +0000"  >&lt;p&gt;Another instance found for interop : 2.7.1 Server/EL7 Client&lt;br/&gt;
Server: 2.7.1, b2_7_fe/34&lt;br/&gt;
Client: master, build# 3303, RHEL 7 &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/23af3494-bb03-11e5-9137-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/23af3494-bb03-11e5-9137-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="139327" author="standan" created="Tue, 19 Jan 2016 22:41:57 +0000"  >&lt;p&gt;Another instance found for interop : 2.7.1 Server/EL6.7 Client&lt;br/&gt;
Server: 2.7.1, b2_7_fe/34&lt;br/&gt;
Client: master, build# 3303, RHEL 6.7&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/04afed48-bb05-11e5-9137-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/04afed48-bb05-11e5-9137-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="141861" author="standan" created="Wed, 10 Feb 2016 22:00:46 +0000"  >&lt;p&gt;Another instance found for interop tag 2.7.66 - 2.7.1 Server/EL7 Client, build# 3316&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/5e49e47c-ccde-11e5-8b0e-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/5e49e47c-ccde-11e5-8b0e-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Another instance found for interop tag 2.7.66 - 2.7.1 Server/EL6.7 Client, build# 3316&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/c23f2696-ccdd-11e5-b80c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/c23f2696-ccdd-11e5-b80c-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Another instance found for interop tag 2.7.66 -2.5.5 Server/EL6.7 Client, build# 3316&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fad4bcda-ccf8-11e5-b1fa-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fad4bcda-ccf8-11e5-b1fa-5254006e85c2&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Another instance found for interop tag 2.7.66 - 2.5.5 Server/EL7 Client, build# 3316&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/f385c93c-ccc7-11e5-b80c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/f385c93c-ccc7-11e5-b80c-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="143588" author="standan" created="Wed, 24 Feb 2016 17:10:31 +0000"  >&lt;p&gt;Another instance found for interop - 2.7.1 Server/EL7 Client, tag 2.7.90. &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/3b9722f8-d2f8-11e5-bf08-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/3b9722f8-d2f8-11e5-bf08-5254006e85c2&lt;/a&gt;&lt;br/&gt;
Another instance found for interop - 2.7.1 Server/EL6.7 Client, tag 2.7.90. &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/f371534e-d573-11e5-bc47-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/f371534e-d573-11e5-bc47-5254006e85c2&lt;/a&gt;&lt;br/&gt;
Another instance found for interop - 2.5.5 Server/EL6.7 Client, tag 2.7.90. &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/e16b8c0c-d634-11e5-82a0-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/e16b8c0c-d634-11e5-82a0-5254006e85c2&lt;/a&gt;&lt;br/&gt;
Another instance found for interop - 2.5.5 Server/EL7 Client, tag 2.7.90. &lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/ba9d84fe-d300-11e5-be5c-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/ba9d84fe-d300-11e5-be5c-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="143591" author="standan" created="Wed, 24 Feb 2016 17:14:41 +0000"  >&lt;p&gt;This issue has been seen almost 30 times in past 30 days&lt;/p&gt;</comment>
                            <comment id="143637" author="jgmitter" created="Wed, 24 Feb 2016 19:11:58 +0000"  >&lt;p&gt;Hi Emoly,&lt;br/&gt;
Can you investigate this one?&lt;br/&gt;
Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="162868" author="ake_s" created="Tue, 23 Aug 2016 17:14:09 +0000"  >&lt;p&gt;Is there any progress on this problem?&lt;/p&gt;

&lt;p&gt;We are trying to use 2.8.55 client (we need support for a 4.4 kernel) against our 2.5.41 servers and are hitting Bulk IO write errors (and Bulk IO read errors) very repeatedly on normal client activity.&lt;/p&gt;</comment>
                            <comment id="162876" author="simmonsja" created="Tue, 23 Aug 2016 18:18:41 +0000"  >&lt;p&gt;Can you reproduce this? If so can you run lctl set_param debug=+&quot;net neterror nettrace&quot; and collect debug logs.&lt;/p&gt;</comment>
                            <comment id="162879" author="ake_s" created="Tue, 23 Aug 2016 18:35:01 +0000"  >&lt;p&gt;I can reproduce it easily (at least the IO read error)&lt;/p&gt;

&lt;p&gt;But i haven&apos;t used set_param debug before, so  in case i need to do anything else you need to tell me exactly what.&lt;br/&gt;
Meanwhile i&apos;ll do just the above on the client and the OSS:es where the problem shows up.&lt;/p&gt;</comment>
                            <comment id="163129" author="ake_s" created="Thu, 25 Aug 2016 14:46:17 +0000"  >&lt;p&gt;Collected debug logs from both client and server.&lt;br/&gt;
Where do you want them? client ~100M, server ~500M uncompressed&lt;/p&gt;</comment>
                            <comment id="163138" author="yujian" created="Thu, 25 Aug 2016 15:47:07 +0000"  >&lt;p&gt;Hi Ake,&lt;/p&gt;

&lt;p&gt;Please refer to &lt;a href=&quot;https://wiki.hpdd.intel.com/display/WC/Uploading+Large+Logs&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/WC/Uploading+Large+Logs&lt;/a&gt;. Thank you.&lt;/p&gt;</comment>
                            <comment id="163139" author="pjones" created="Thu, 25 Aug 2016 15:52:30 +0000"  >&lt;p&gt;Jian&lt;/p&gt;

&lt;p&gt;Ake will not be able to access that page on the wiki&lt;/p&gt;

&lt;p&gt;Ake&lt;/p&gt;

&lt;p&gt;It looks like you are using an Intel EE release rather than a community release so you should report this issue through your support channels.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="163147" author="ake_s" created="Thu, 25 Aug 2016 16:29:11 +0000"  >&lt;p&gt;The client is taken from git://git.hpdd.intel.com/fs/lustre-release.git  using tag 2.8.55&lt;br/&gt;
The server is from DDN.&lt;/p&gt;

&lt;p&gt;So, what route should i take?&lt;/p&gt;</comment>
                            <comment id="163356" author="yujian" created="Sun, 28 Aug 2016 22:52:50 +0000"  >&lt;p&gt;Hi Ake,&lt;/p&gt;

&lt;p&gt;Sorry for the late reply and thank you for collecting the debug logs.&lt;/p&gt;

&lt;p&gt;I reproduced the failure on the following combination and gathered full debug logs:&lt;br/&gt;
Lustre client build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-master/3425/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-master/3425/&lt;/a&gt;&lt;br/&gt;
Lustre server build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_7_fe/95/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_7_fe/95/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/8cc945de-6d71-11e6-8afd-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/8cc945de-6d71-11e6-8afd-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="164877" author="yujian" created="Sat, 3 Sep 2016 00:43:30 +0000"  >&lt;p&gt;Hi Nasf,&lt;/p&gt;

&lt;p&gt;The server side patch &lt;a href=&quot;http://review.whamcloud.com/14399&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14399&lt;/a&gt; was included in build &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_7_fe/95/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_7_fe/95/&lt;/a&gt;. Could you please look into the issue? Thank you.&lt;/p&gt;</comment>
                            <comment id="165242" author="yong.fan" created="Thu, 8 Sep 2016 02:38:13 +0000"  >&lt;p&gt;The root reason is the interoperability issue introduced by the patch &lt;a href=&quot;http://review.whamcloud.com/15421&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/15421&lt;/a&gt;. In such patch, we replaced the &quot;rq_xid&quot; with &quot;rq_mbits&quot; as matchbits of bulk data. For be interoperable with old server, it introduced new connection flag: OBD_CONNECT_BULK_MBITS. If the server does not support such feature, then the &quot;rq_xid&quot; would be set the same value as &quot;rq_mbits&quot;. Unfortunately, it forgot to handle multiple bulk operations, for example 4MB IO. If the new client (such as master, b2_8_fe) wants to make 4MB IO, then the new client may send a small &quot;rq_xid&quot; to the old server (such as b2_7_fe, IEEL2), as to the old server will regard it as an 1MB IO or 2MB IO. So the data transfer will NOT complete because of only part of data transferred. Then the client will get timeout failure and retry again and again.&lt;/p&gt;</comment>
                            <comment id="165244" author="gerrit" created="Thu, 8 Sep 2016 02:39:23 +0000"  >&lt;p&gt;Fan Yong (fan.yong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/22373&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/22373&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; ptlrpc: properly set &quot;rq_xid&quot; for 4MB IO&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 56766651d8a124156c615aef60c850cf41d9d5e6&lt;/p&gt;</comment>
                            <comment id="165269" author="ake_s" created="Thu, 8 Sep 2016 07:21:06 +0000"  >&lt;p&gt;That patch fixes our problem at least.&lt;br/&gt;
I.e., I can&apos;t trigger it the way I could before so I&apos;m very happy.&lt;br/&gt;
Thanks alot.&lt;/p&gt;</comment>
                            <comment id="165914" author="gerrit" created="Tue, 13 Sep 2016 20:02:30 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/22373/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/22373/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; ptlrpc: properly set &quot;rq_xid&quot; for 4MB IO&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: bb75072cb679bf52e00537c19e42f8e4e95255b6&lt;/p&gt;</comment>
                            <comment id="165924" author="pjones" created="Tue, 13 Sep 2016 20:07:18 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                            <comment id="168275" author="adilger" created="Tue, 4 Oct 2016 22:01:04 +0000"  >&lt;p&gt;Reopen to get fix &lt;a href=&quot;http://review.whamcloud.com/22906&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/22906&lt;/a&gt; landed for 2.9.0.&lt;/p&gt;</comment>
                            <comment id="168348" author="gerrit" created="Wed, 5 Oct 2016 15:25:19 +0000"  >&lt;p&gt;Jinshan Xiong (jinshan.xiong@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/22957&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/22957&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; ptlrpc: BULK_BITS recovery for old servers&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: d58ad7550497e5766fe46582272ca3d2636bd0f5&lt;/p&gt;</comment>
                            <comment id="168903" author="gerrit" created="Mon, 10 Oct 2016 11:23:04 +0000"  >&lt;p&gt;Niu Yawei (yawei.niu@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/23048&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23048&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; ptlrpc: no need to reassign mbits for replay&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 487586d8d40506f1e9980ecd8f8aad5b5277f519&lt;/p&gt;</comment>
                            <comment id="170862" author="gerrit" created="Tue, 25 Oct 2016 02:20:20 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/23048/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/23048/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; ptlrpc: no need to reassign mbits for replay&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 48488fa271e4444e4a2ab4f3babb6c91b779634e&lt;/p&gt;</comment>
                            <comment id="170890" author="pjones" created="Tue, 25 Oct 2016 04:03:25 +0000"  >&lt;p&gt;Landed for 2.9&lt;/p&gt;</comment>
                            <comment id="207108" author="twhitehead" created="Thu, 31 Aug 2017 19:05:52 +0000"  >&lt;p&gt;Just finished running into this in the in-tree lustre client driver for the 4.12 kernel.&lt;/p&gt;

&lt;p&gt;That is, larger reading/writing operations on a mount would hang any further file reading/writing operations on that mount and the system would endlessly spit&lt;/p&gt;

&lt;p&gt;...&lt;br/&gt;
Connection to &amp;lt;OST&amp;gt; was lost; in progress operations using this service will wait for recovery to complete &lt;br/&gt;
Connection restored to &amp;lt;OST&amp;gt; &lt;br/&gt;
Connection to &amp;lt;OST&amp;gt; was lost; in progress operations using this service will wait for recovery to complete &lt;br/&gt;
Connection restored to &amp;lt;OST&amp;gt; &lt;br/&gt;
...&lt;/p&gt;

&lt;p&gt;Applying the two merged patches from this ticket resolved the issue&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/22373/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/22373/&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://review.whamcloud.com/#/c/23048/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/23048/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It would be great if these could both see their way into the in-tree lustre client driver.&lt;/p&gt;

&lt;p&gt;Thanks!  -Tyson&lt;/p&gt;</comment>
                            <comment id="207124" author="pjones" created="Thu, 31 Aug 2017 19:18:23 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=simmonsja&quot; class=&quot;user-hover&quot; rel=&quot;simmonsja&quot;&gt;simmonsja&lt;/a&gt; are these patches on your radar?&lt;/p&gt;</comment>
                            <comment id="207134" author="simmonsja" created="Thu, 31 Aug 2017 19:38:45 +0000"  >&lt;p&gt;No. I missed those. Will port them to upstream client.&lt;/p&gt;</comment>
                            <comment id="207301" author="adilger" created="Fri, 1 Sep 2017 23:08:22 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=twhitehead&quot; class=&quot;user-hover&quot; rel=&quot;twhitehead&quot;&gt;twhitehead&lt;/a&gt; if you have already ported those patches to the upstream kernel, please feel free to submit the patches directly to the mailing list.  This has the advantage that:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;James doesn&apos;t have to do more work to port and test the patches himself&lt;/li&gt;
	&lt;li&gt;it shows the kernel maintainers that people are interested and using the upstream kernel code&lt;/li&gt;
	&lt;li&gt;you become familiar with the process yourself and can potentially help with other patches in the future&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;See &lt;a href=&quot;https://wiki.hpdd.intel.com/display/PUB/Commit+Comments#Additional_commit_tags&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/PUB/Commit+Comments#Additional_commit_tags&lt;/a&gt; for labels to include in patches ported from the Lustre master tree to the upstream kernel, and the script &lt;tt&gt;scripts/get_maintainer.pl drivers/staging/lustre&lt;/tt&gt; to get a list of email addresses to sent patches to.&lt;/p&gt;</comment>
                            <comment id="207303" author="twhitehead" created="Sat, 2 Sep 2017 03:19:29 +0000"  >&lt;p&gt;Will do.  The script gives me 7 addresses, 3 of which are lists.  Do I send it to all of them?&lt;/p&gt;

&lt;p&gt;With regard to the commit message, I just did a format-patch on the two commit above and then applied them with am and fixed up the issues.  This retained the entire original commit message.  It references the issue number, but it doesn&apos;t include any info about the exact original commit.&lt;/p&gt;

&lt;p&gt;Is this sufficient, or should I be trying to add something more?&lt;/p&gt;

&lt;p&gt;Thanks!  -Tyson&lt;/p&gt;</comment>
                            <comment id="207480" author="adilger" created="Tue, 5 Sep 2017 20:52:06 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=twhitehead&quot; class=&quot;user-hover&quot; rel=&quot;twhitehead&quot;&gt;twhitehead&lt;/a&gt;, thanks for following up on this.  The &lt;a href=&quot;https://wiki.hpdd.intel.com/display/PUB/Commit+Comments&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/PUB/Commit+Comments&lt;/a&gt; page has a section &lt;tt&gt;Patch porting examples&lt;/tt&gt; that lists the commit comment tags that should be used when porting a patch to the upstream kernel.  The patch should be sent to at least Greg KH, CC Oleg, James, and myself, along with &lt;tt&gt;lustre-devel&lt;/tt&gt; and &lt;tt&gt;devel&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="207607" author="twhitehead" created="Wed, 6 Sep 2017 14:21:58 +0000"  >&lt;p&gt;Ok.  Have it ready to go, but am holding off because I would like to know everything is stable first and we are still experiencing instabilities that I believe are lustre related.&lt;/p&gt;

&lt;p&gt;That is, bulk operations seems okay now, but within 24hrs the machine falls off the network and the kernel spits soft lockup messages with osc_io_unplug0 showing up in the backtrace.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://photos.app.goo.gl/zcbG6GGmB0eBgo0G3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://photos.app.goo.gl/zcbG6GGmB0eBgo0G3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;I believe it is possibly &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8680&quot; title=&quot;replay-single test_20b: BUG: soft lockup - osc_makes_rpc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8680&quot;&gt;&lt;del&gt;LU-8680&lt;/del&gt;&lt;/a&gt;, so I&apos;ve ported that patch as well.  Just rebooted it into the new kernel last night.  So far it is still up with no signs of problems.  Fingers crossed!&lt;/p&gt;</comment>
                            <comment id="207625" author="simmonsja" created="Wed, 6 Sep 2017 15:32:18 +0000"  >&lt;p&gt;I already push &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8680&quot; title=&quot;replay-single test_20b: BUG: soft lockup - osc_makes_rpc()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8680&quot;&gt;&lt;del&gt;LU-8680&lt;/del&gt;&lt;/a&gt; but it wouldn&apos;t show up in Linus tree until the 4.14-rc1 merge window. Luckly 4.14 is a LTS kernel so the client will be usable for general users. The tree to work against is the staging tree. Specifically the staging-next branch. Normally the staging-tree is about 100+ patches ahead of Linus tree at any time. If you need help moving to the staging tree let me know. I will gladly help.&lt;/p&gt;

&lt;p&gt;Feel free to send me the patches for the upstream client. I will look at them. My email is in the MAINTAINERS file for lustre (James  Simmons)&lt;/p&gt;</comment>
                            <comment id="208364" author="twhitehead" created="Thu, 14 Sep 2017 14:45:04 +0000"  >&lt;p&gt;Just a quick follow up on this to let you know I believe things are now stable in 4.12 with this and the LU 8680 patch.&lt;/p&gt;

&lt;p&gt;The machine has been up under load for 8 days now without issue.  Prior to these patches it wouldn&apos;t last 24hrs.  Very happy about this.  &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;I&apos;ve ported the patch to staging (and, IIRC, emailed it to James), but am not in a position to test as we really need to avoid any further downtime on this system.&lt;/p&gt;

&lt;p&gt;Thanks!  -Tyson&lt;/p&gt;</comment>
                            <comment id="208754" author="simmonsja" created="Tue, 19 Sep 2017 16:15:25 +0000"  >&lt;p&gt;Sorry I have been busy with sysfs porting/fixing. So I looked at the latest staging I see one of the patches for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-6808&quot; title=&quot;Interop 2.5.3&amp;lt;-&amp;gt;master sanity test_224c: Bulk IO write error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-6808&quot;&gt;&lt;del&gt;LU-6808&lt;/del&gt;&lt;/a&gt; already landed. I missed the other one. I just got the latest staging-next tree to build and I&apos;m looking to port stuff so I will include your other patch. Thanks.&lt;/p&gt;</comment>
                            <comment id="219670" author="twhitehead" created="Thu, 1 Feb 2018 16:26:11 +0000"  >&lt;p&gt;Rebuilt for the spectre/meltdown bugs and discovered this hasn&apos;t fully landed in upstream yet. Specifically, if I try and do something like a copy on a larger file, it just hangs while the kernel endlessly spits messages like these&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;60679.700856&amp;#93;&lt;/span&gt; Lustre: covework-OST0017-osc-ffff888d70ebf800: Connection to covework-OST0017 (at 10.18.26.8@tcp) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;60679.719316&amp;#93;&lt;/span&gt; Lustre: Skipped 49 previous similar messages&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;60679.740153&amp;#93;&lt;/span&gt; Lustre: covework-OST0017-osc-ffff888d70ebf800: Connection restored to covework-OST0017 (at 10.18.26.8@tcp)&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;60679.751843&amp;#93;&lt;/span&gt; Lustre: Skipped 49 previous similar messages&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;61279.724063&amp;#93;&lt;/span&gt; Lustre: 1144:0:(client.c:2068:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1517498129/real 1517498129&amp;#93;&lt;/span&gt;  req@ffff888d3c128c00 x1591147922869136/t0(0) o3-&amp;gt;covework-OST0017-osc-ffff888d70ebf800@10.18.26.8@tcp:6/4 lens 608/432 e 0 to 1 dl 1517498141 ref 2 fl Rpc:X/2/ffffffff rc 0/-1&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;61279.756551&amp;#93;&lt;/span&gt; Lustre: 1144:0:(client.c:2068:ptlrpc_expire_one_request()) Skipped 49 previous similar messages&lt;/p&gt;

&lt;p&gt;Dug through the prior patches and compared against the current master kernel. Looks like part of the patch set never landed upstream&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://review.whamcloud.com/#/c/22373/5/lustre/ptlrpc/client.c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/22373/5/lustre/ptlrpc/client.c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://github.com/torvalds/linux/blob/255442c93843f52b6891b21d0b485bf2c97f93c3/drivers/staging/lustre/lustre/ptlrpc/client.c#L3133&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/torvalds/linux/blob/255442c93843f52b6891b21d0b485bf2c97f93c3/drivers/staging/lustre/lustre/ptlrpc/client.c#L3133&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://github.com/torvalds/linux/blob/255442c93843f52b6891b21d0b485bf2c97f93c3/drivers/staging/lustre/lustre/ptlrpc/client.c#L3163&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/torvalds/linux/blob/255442c93843f52b6891b21d0b485bf2c97f93c3/drivers/staging/lustre/lustre/ptlrpc/client.c#L3163&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Thanks!  -Tyson&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                                        </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="29428">LU-6441</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="11275">LU-484</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzxhm7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>