<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:13:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1135] connection between MDS and OSS constantly being dropped and reestablished.</title>
                <link>https://jira.whamcloud.com/browse/LU-1135</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After an IOR job was launched on our client nodes, if one attempts to go into the directory where the many files are being created, the OSTs would start going into recovery every few minutes. &lt;/p&gt;</description>
                <environment>Lustre 2.1.56 servers with Lustre 2.1.56 clients on a cray system.</environment>
        <key id="13297">LU-1135</key>
            <summary>connection between MDS and OSS constantly being dropped and reestablished.</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="2">Won&apos;t Fix</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="simmonsja">James A Simmons</reporter>
                        <labels>
                    </labels>
                <created>Fri, 24 Feb 2012 08:07:07 +0000</created>
                <updated>Thu, 8 Mar 2012 11:40:24 +0000</updated>
                            <resolved>Thu, 8 Mar 2012 11:15:26 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="29801" author="simmonsja" created="Fri, 24 Feb 2012 08:08:44 +0000"  >&lt;p&gt;I also placed the debug logs from the servers at the ftp site in uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1135&quot; title=&quot;connection between MDS and OSS constantly being dropped and reestablished.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1135&quot;&gt;&lt;del&gt;LU-1135&lt;/del&gt;&lt;/a&gt;/logs.tar&lt;/p&gt;</comment>
                            <comment id="29802" author="ian" created="Fri, 24 Feb 2012 09:36:48 +0000"  >&lt;p&gt;This must be fixed prior to the IR test at ORNL.&lt;/p&gt;</comment>
                            <comment id="29803" author="pjones" created="Fri, 24 Feb 2012 09:57:07 +0000"  >&lt;p&gt;Oleg&lt;/p&gt;

&lt;p&gt;Could you please look into this one as your top priority?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="29805" author="simmonsja" created="Fri, 24 Feb 2012 12:10:02 +0000"  >&lt;p&gt;Okay been busy bisecting and I think I found the source of the problem. It was commit 0204171fd3e1b393c53bd374aff228e80080a55a from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1028&quot; title=&quot;Bus error  (core dumped) during fsx test&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1028&quot;&gt;&lt;del&gt;LU-1028&lt;/del&gt;&lt;/a&gt;. Running some final tests to determine if this is the cause.&lt;/p&gt;</comment>
                            <comment id="29808" author="simmonsja" created="Fri, 24 Feb 2012 13:59:31 +0000"  >&lt;p&gt;Uploading more logs to uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1135&quot; title=&quot;connection between MDS and OSS constantly being dropped and reestablished.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1135&quot;&gt;&lt;del&gt;LU-1135&lt;/del&gt;&lt;/a&gt;/bug.tbz&lt;/p&gt;</comment>
                            <comment id="30287" author="simmonsja" created="Fri, 2 Mar 2012 09:47:02 +0000"  >&lt;p&gt;Doing some more testing I discovered that the problem went away when I upgraded to OFED 1.5.4 when on RHEL5.7. It is unknown if this is a Lustre bug or an OFED bug at this point. Will investigate with an image with an older OFED.&lt;/p&gt;</comment>
                            <comment id="30650" author="simmonsja" created="Wed, 7 Mar 2012 13:32:46 +0000"  >&lt;p&gt;Testing with the RHEL6 image with the default OFED stack shows the same problem.&lt;/p&gt;

&lt;p&gt;On the OSS&lt;/p&gt;

&lt;p&gt;Lustre: 4473:0:(ldlm_lib.c:634:target_handle_reconnect()) lustre-OST0018: lustre-MDT0000-mdtlov_UUID reconnecting&lt;br/&gt;
Lustre: 4473:0:(filter.c:2692:filter_connect_internal()) lustre-OST0018: Received MDS connection for group 0&lt;br/&gt;
Lustre: lustre-OST0018: received MDS connection from 10.37.248.61@o2ib1&lt;br/&gt;
Lustre: 4473:0:(llog_net.c:168:llog_receptor_accept()) changing the import ffff880432273000 - ffff8803f7954800&lt;br/&gt;
Lustre: 4473:0:(llog_net.c:168:llog_receptor_accept()) changing the import ffff880432273000 - ffff8803f7954800&lt;br/&gt;
Lustre: 4473:0:(filter.c:2548:filter_llog_connect()) lustre-OST0018: Recovery from log 0x3d7c29a/0x0:a1d23d49&lt;/p&gt;

&lt;p&gt;And on the mds&lt;br/&gt;
LustreError: 10220:0:(osc_create.c:175:osc_interpret_create()) @@@ Unknown rc -107 from async create: failing oscc  req@ffff880391376800 x1395817301711434/t0(0) o5-&amp;gt;lustre-OST0001-osc-MDT0000@10.37.248.63@o2ib1:7/4 lens 400/400 e 0 to 1 dl 1331144444 ref 1 fl Interpret:RXN/0/ffffffff rc -107/-1&lt;br/&gt;
Lustre: 10220:0:(client.c:1789:ptlrpc_expire_one_request()) @@@ Request  sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1331144519/real 1331144519&amp;#93;&lt;/span&gt;  req@ffff8803f9d06000 x1395817301712253/t0(0) o400-&amp;gt;lustre-OST0005-osc-MDT0000@10.37.248.63@o2ib1:28/4 lens 192/192 e 0 to 1 dl 1331144526 ref 1 fl Rpc:RXN/0/ffffffff rc 0/-1&lt;br/&gt;
Lustre: lustre-OST0005-osc-MDT0000: Connection to service lustre-OST0005 via nid 10.37.248.63@o2ib1 was lost; in progress operations using this service will wait for recovery to complete.&lt;br/&gt;
Lustre: lustre-OST0005-osc-MDT0000: Connection restored to service lustre-OST0005 using nid 10.37.248.63@o2ib1.&lt;br/&gt;
LustreError: 18947:0:(lov_obd.c:1069:lov_clear_orphans()) error in orphan recovery on OST idx 1/28: rc = -5&lt;br/&gt;
LustreError: 18947:0:(mds_lov.c:882:__mds_lov_synchronize()) lustre-OST0001_UUID failed at mds_lov_clear_orphans: -5&lt;br/&gt;
LustreError: 18947:0:(mds_lov.c:903:__mds_lov_synchronize()) lustre-OST0001_UUID sync failed -5, deactivating&lt;/p&gt;

&lt;p&gt;Oleg has an idea that it&apos;s a race condition in the ptlrpc layer. I observed in the rhel5 distro with OFED 1.5.4 the problem was reduced.&lt;/p&gt;</comment>
                            <comment id="30651" author="simmonsja" created="Wed, 7 Mar 2012 13:35:28 +0000"  >&lt;p&gt;This error also happened on the MDS&lt;/p&gt;

&lt;p&gt;2012-03-07 13:32:17 Lustre: 10224:0:(import.c:525:import_select_connection()) lustre-OST0002-osc-MDT0000: tried all connections, increasing latency to 10s&lt;br/&gt;
2012-03-07 13:32:17 Lustre: 10224:0:(import.c:525:import_select_connection()) Skipped 2 previous similar messages&lt;br/&gt;
2012-03-07 13:33:32 LustreError: 10221:0:(lov_request.c:558:lov_update_create_set()) error creating fid 0x4c sub-object on OST idx 2/25: rc = -11&lt;br/&gt;
2012-03-07 13:33:32 LustreError: 10221:0:(lov_request.c:558:lov_update_create_set()) Skipped 15 previous similar messages&lt;br/&gt;
2012-03-07 13:33:32 LustreError: 10218:0:(lov_request.c:558:lov_update_create_set()) error creating fid 0x4c sub-object on OST idx 2/25: rc = -11&lt;br/&gt;
2012-03-07 13:33:33 LustreError: 10216:0:(lov_request.c:558:lov_update_create_set()) error creating fid 0x4c sub-object on OST idx 26/25: rc = -11&lt;br/&gt;
2012-03-07 13:33:33 LustreError: 10216:0:(lov_request.c:558:lov_update_create_set()) Skipped 53 previous similar messages&lt;br/&gt;
2012-03-07 13:33:33 Lustre: 11252:0:(lov_qos.c:472:qos_shrink_lsm()) using fewer stripes for object 76: old 25 new 19&lt;/p&gt;</comment>
                            <comment id="30722" author="simmonsja" created="Thu, 8 Mar 2012 11:09:15 +0000"  >&lt;p&gt;The reason for the errors when using the rhel6 image was due to the file system not being rebuilt. Previously I built the file system using the rhel5 image. After moving to a rhel6 image the problem was still present. I attempted to test IR but it toppled my client so the next time I reformatted the file system. After the reformat all the problems went away.&lt;/p&gt;</comment>
                            <comment id="30723" author="pjones" created="Thu, 8 Mar 2012 11:15:26 +0000"  >&lt;p&gt;ok thanks for letting us know James.&lt;/p&gt;</comment>
                            <comment id="30726" author="simmonsja" created="Thu, 8 Mar 2012 11:40:24 +0000"  >&lt;p&gt;Just as a note: anyone migrating from a RHEL5 environment to RHEL6 with Lustre pre-2.2 should reformat their file system before use.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="10884" name="dmesg-mds" size="78386" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                            <attachment id="10883" name="dmesg-mgs" size="772" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                            <attachment id="10885" name="dmesg-oss1" size="64756" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                            <attachment id="10886" name="dmesg-oss2" size="55411" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                            <attachment id="10887" name="dmesg-oss3" size="56287" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                            <attachment id="10888" name="dmesg-oss4" size="62925" author="simmonsja" created="Fri, 24 Feb 2012 08:07:07 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvh9z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6448</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>