<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:17:08 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1495] Hyperion -recovery-scale evictions </title>
                <link>https://jira.whamcloud.com/browse/LU-1495</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Seeing a repeated error where a single client is evicted during mds-recovery, followed by a number of other client evictions.&lt;br/&gt;
debug logs, logs uploaded to FTP. - LLNL.212rc.tar.gz  &lt;br/&gt;
The failover script is failing over MDS, however OSTs are also experiencing disconnect/reconnect events&lt;br/&gt;
Typical example:&lt;br/&gt;
Jun  7 15:17:43 ehyperion260 kernel: Lustre: 19716:0:(client.c:1780:ptlrpc_expire_one_request()) @@@ Request  sent has failed due to network error: [sent 1339107463&lt;br/&gt;
/real 1339107463]  req@ffff880144351800 x1404138588851383/t0(0) o8-&amp;gt;lustre-OST0002-osc-ffff880226efb000@192.168.127.60@o2ib1:28/4 lens 368/512 e 0 to 1 dl 133910751&lt;br/&gt;
8 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
Jun  7 15:17:43 ehyperion260 kernel: Lustre: 19716:0:(client.c:1780:ptlrpc_expire_one_request()) Skipped 217 previous similar messages&lt;br/&gt;
Jun  7 15:17:43 ehyperion260 kernel: Lustre: lustre-OST0001-osc-ffff880226efb000: Connection restored to lustre-OST0001 (at 192.168.127.61@o2ib1)&lt;br/&gt;
Jun  7 15:17:43 ehyperion260 kernel: Lustre: Skipped 4 previous similar messages&lt;br/&gt;
Jun  7 15:17:48 ehyperion260 kernel: Lustre: lustre-MDT0000-mdc-ffff880226efb000: Connection restored to lustre-MDT0000 (at 192.168.127.6@o2ib1)&lt;br/&gt;
Jun  7 15:17:48 ehyperion260 kernel: Lustre: Skipped 4 previous similar messages&lt;br/&gt;
Jun  7 15:18:08 ehyperion260 kernel: Lustre: lustre-OST0002-osc-ffff880226efb000: Connection restored to lustre-OST0002 (at 192.168.127.60@o2ib1)&lt;br/&gt;
Jun  7 15:18:33 ehyperion260 kernel: LustreError: 167-0: This client was evicted by lustre-OST0003; in progress operations using this service will fail.&lt;br/&gt;
Jun  7 15:18:33 ehyperion260 kernel: LustreError: 167-0: This client was evicted by lustre-OST0004; in progress operations using this service will fail.&lt;/p&gt;
</description>
                <environment>Hyperion RHEL6 - 2.1.2-RC2, servers and clients</environment>
        <key id="14805">LU-1495</key>
            <summary>Hyperion -recovery-scale evictions </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                    </labels>
                <created>Thu, 7 Jun 2012 20:22:11 +0000</created>
                <updated>Thu, 7 Jan 2016 23:42:13 +0000</updated>
                            <resolved>Thu, 7 Jan 2016 23:42:13 +0000</resolved>
                                    <version>Lustre 2.1.2</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="40248" author="pjones" created="Fri, 8 Jun 2012 01:04:42 +0000"  >&lt;p&gt;Hongchao is looking into this one.&lt;/p&gt;</comment>
                            <comment id="40250" author="cliffw" created="Fri, 8 Jun 2012 01:20:34 +0000"  >&lt;p&gt;Please let me know if you need more data. The systems with set with dump_on_eviction=1.&lt;/p&gt;</comment>
                            <comment id="40262" author="hongchao.zhang" created="Fri, 8 Jun 2012 06:40:45 +0000"  >&lt;p&gt;from the logs, it seems the bad network is the reason to cause eviction, take OST0016 as an example,&lt;/p&gt;

&lt;p&gt;1, the server side:&lt;/p&gt;

&lt;p&gt;output of &apos; grep &quot;OST0016&quot; oss6/server-log.133910* |grep &quot;evict&quot; &apos;&lt;/p&gt;

&lt;p&gt;oss6/server-log.1339107443.28502.txt:00000100:00080000:9.0:1339107438.377395:0:28621:0:(service.c:829:ptlrpc_update_export_timer()) lustre-OST0016: Think about evicting 192.168.115.106@o2ib1 from 1339107065&lt;br/&gt;
oss6/server-log.1339107515.28820.txt:00000100:00080000:5.0:1339107514.029089:0:28820:0:(pinger.c:642:ping_evictor_main()) evicting all exports of obd lustre-OST0016 older than 1339107364&lt;br/&gt;
oss6/server-log.1339107515.28820.txt:00000100:02000400:5.0:1339107514.029092:0:28820:0:(pinger.c:668:ping_evictor_main()) lustre-OST0016: haven&apos;t heard from client b437de75-d7b5-c8e1-cda3-0fc115566ca1 (at 192.168.113.136@o2ib1) in 430 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8802eceaac00, cur 1339107514 expire 1339107364 last 1339107084&lt;br/&gt;
oss6/server-log.1339107515.28820.txt:00000100:02000400:5.0:1339107514.029197:0:28820:0:(pinger.c:668:ping_evictor_main()) lustre-OST0016: haven&apos;t heard from client 502ca4cf-aa86-f823-123d-b12e548e49ba (at 192.168.114.16@o2ib1) in 425 seconds. I think it&apos;s dead, and I am evicting it. exp ffff880489661000, cur 1339107514 expire 1339107364 last 1339107089&lt;br/&gt;
oss6/server-log.1339107515.28820.txt:00000100:02000400:5.0:1339107514.029308:0:28820:0:(pinger.c:668:ping_evictor_main()) lustre-OST0016: haven&apos;t heard from client ad17b65f-2e05-a801-a642-28187f3ab454 (at 192.168.114.12@o2ib1) in 425 seconds. I think it&apos;s dead, and I am evicting it. exp ffff8801ec20f400, cur 1339107514 expire 1339107364 last 1339107089&lt;br/&gt;
oss6/server-log.1339107515.28820.txt:00000100:02000400:5.0:1339107514.029382:0:28820:0:(pinger.c:668:ping_evictor_main()) lustre-OST0016: haven&apos;t heard from client 5d184491-373a-66ab-0e22-d1fb789b66ee (at 192.168.114.13@o2ib1) in 425 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88053cc38400, cur 1339107514 expire 1339107364 last 1339107089&lt;br/&gt;
oss6/server-log.1339108693.6659.txt:00000100:00080000:4.0:1339108687.119017:0:28626:0:(service.c:829:ptlrpc_update_export_timer()) lustre-OST0016: Think about evicting 192.168.113.111@o2ib1 from 1339107938&lt;/p&gt;

&lt;p&gt;the OST0016 evicts these clients for it don&apos;t send ping requests to keep alive&lt;/p&gt;


&lt;p&gt;2, the client side&lt;br/&gt;
the excerpt of output &apos; grep &quot;OST0016&quot; hy260.debug.log hy281/* &apos;&lt;br/&gt;
...&lt;br/&gt;
hy260.debug.log:00000100:00000400:1.0:1339107138.280452:0:19715:0:(client.c:1780:ptlrpc_expire_one_request()) @@@ Request  sent has failed due to network error: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1339107114/real 1339107138&amp;#93;&lt;/span&gt;  req@ffff8801d4d6e000 x1404138588850748/t0(0) o400-&amp;gt;lustre-OST0016-osc-ffff880226efb000@192.168.127.65@o2ib1:28/4 lens 192/192 e 0 to 1 dl 1339107220 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
hy260.debug.log:00000100:00100000:1.0:1339107138.280460:0:19715:0:(client.c:1823:ptlrpc_expire_one_request()) @@@ TIMEOUT-NORESEND:  req@ffff8801d4d6e000 x1404138588850748/t0(0) o400-&amp;gt;lustre-OST0016-osc-ffff880226efb000@192.168.127.65@o2ib1:28/4 lens 192/192 e 0 to 1 dl 1339107220 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
hy260.debug.log:00000100:02000400:1.0:1339107138.280465:0:19715:0:(import.c:170:ptlrpc_set_import_discon()) lustre-OST0016-osc-ffff880226efb000: Connection to lustre-OST0016 (at 192.168.127.65@o2ib1) was lost; in progress operations using this service will wait for recovery to complete&lt;br/&gt;
hy260.debug.log:00000100:00080000:1.0:1339107138.280466:0:19715:0:(import.c:180:ptlrpc_set_import_discon()) ffff880110a9c000 lustre-OST0016_UUID: changing import state from FULL to DISCONN&lt;br/&gt;
hy260.debug.log:00000100:00080000:1.0:1339107138.280467:0:19715:0:(import.c:415:ptlrpc_fail_import()) lustre-OST0016_UUID: waking up pinger&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.280493:0:19717:0:(recover.c:71:ptlrpc_initiate_recovery()) lustre-OST0016_UUID: starting recovery&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.280494:0:19717:0:(import.c:626:ptlrpc_connect_import()) ffff880110a9c000 lustre-OST0016_UUID: changing import state from DISCONN to CONNECTING&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.280496:0:19717:0:(import.c:482:import_select_connection()) lustre-OST0016-osc-ffff880226efb000: connect to NID 192.168.127.65@o2ib1 last attempt 4309519608&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.280499:0:19717:0:(import.c:561:import_select_connection()) lustre-OST0016-osc-ffff880226efb000: import ffff880110a9c000 using connection 192.168.127.65@o2ib1/192.168.127.65@o2ib1&lt;br/&gt;
hy260.debug.log:00000100:00100000:4.0:1339107138.280505:0:19717:0:(import.c:725:ptlrpc_connect_import()) @@@ (re)connect request (timeout 5)  req@ffff8801dee51c00 x1404138588850806/t0(0) o8-&amp;gt;lustre-OST0016-osc-ffff880226efb000@192.168.127.65@o2ib1:28/4 lens 368/512 e 0 to 0 dl 0 ref 1 fl New:N/0/ffffffff rc 0/-1&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.280707:0:19717:0:(pinger.c:258:ptlrpc_pinger_process_import()) not pinging lustre-OST0016_UUID (in recovery  or recovery disabled: CONNECTING)&lt;br/&gt;
hy260.debug.log:00000100:00080000:4.0:1339107138.281232:0:19717:0:(pinger.c:258:ptlrpc_pinger_process_import()) not pinging lustre-OST0016_UUID (in recovery  or recovery disabled: CONNECTING)&lt;br/&gt;
hy260.debug.log:00000100:00000400:2.0:1339107138.283718:0:19716:0:(client.c:1780:ptlrpc_expire_one_request()) @@@ Request  sent has failed due to network error: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1339107138/real 1339107138&amp;#93;&lt;/span&gt;  req@ffff8801dee51c00 x1404138588850806/t0(0) o8-&amp;gt;lustre-OST0016-osc-ffff880226efb000@192.168.127.65@o2ib1:28/4 lens 368/512 e 0 to 1 dl 1339107188 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
hy260.debug.log:00000100:00100000:2.0:1339107138.283723:0:19716:0:(client.c:1812:ptlrpc_expire_one_request()) @@@ err &lt;del&gt;110, sent_state=CONNECTING (now=CONNECTING)  req@ffff8801dee51c00 x1404138588850806/t0(0) o8&lt;/del&gt;&amp;gt;lustre-OST0016-osc-ffff880226efb000@192.168.127.65@o2ib1:28/4 lens 368/512 e 0 to 1 dl 1339107188 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
hy260.debug.log:00000100:00080000:2.0:1339107138.283728:0:19716:0:(import.c:1111:ptlrpc_connect_interpret()) ffff880110a9c000 lustre-OST0016_UUID: changing import state from CONNECTING to DISCONN&lt;br/&gt;
hy260.debug.log:00000100:00080000:2.0:1339107138.283729:0:19716:0:(import.c:1157:ptlrpc_connect_interpret()) recovery of lustre-OST0016_UUID on 192.168.127.65@o2ib1 failed (-110)&lt;br/&gt;
...&lt;/p&gt;

&lt;p&gt;the client try several times before it can reach the server!&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv3cf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4041</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>