<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:20:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1926] Reboots during test runs</title>
                <link>https://jira.whamcloud.com/browse/LU-1926</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;See &lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/9ddf73ba-f23d-11e1-87d6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/9ddf73ba-f23d-11e1-87d6-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/94e74180-f296-11e1-807d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/94e74180-f296-11e1-807d-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment></environment>
        <key id="15634">LU-1926</key>
            <summary>Reboots during test runs</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="yujian">Jian Yu</assignee>
                                    <reporter username="pjones">Peter Jones</reporter>
                        <labels>
                            <label>releases</label>
                    </labels>
                <created>Thu, 30 Aug 2012 12:34:03 +0000</created>
                <updated>Wed, 31 Jul 2013 15:19:49 +0000</updated>
                            <resolved>Thu, 13 Sep 2012 08:31:02 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="44002" author="chris" created="Thu, 30 Aug 2012 14:18:27 +0000"  >&lt;p&gt;I have no idea, we don&apos;t even know for sure if some other entity caused the reboots. Nothing in the logs suggests that autotest rebooted the nodes.&lt;/p&gt;

&lt;p&gt;We shall just have to monitor the situation.&lt;/p&gt;</comment>
                            <comment id="44077" author="pjones" created="Sun, 2 Sep 2012 11:40:21 +0000"  >&lt;p&gt;Didn&apos;t you isolate and fix this one?&lt;/p&gt;</comment>
                            <comment id="44099" author="chris" created="Mon, 3 Sep 2012 08:55:32 +0000"  >&lt;p&gt;Fixed by TT-852 and upgrade of the weekend 1/9/2012&lt;/p&gt;</comment>
                            <comment id="44223" author="sarah" created="Wed, 5 Sep 2012 15:47:49 +0000"  >&lt;p&gt;It looks like the MDS reboot for some reason&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/f50e5332-f6d3-11e1-b320-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/f50e5332-f6d3-11e1-b320-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="44481" author="chris" created="Mon, 10 Sep 2012 09:14:57 +0000"  >&lt;p&gt;Could you provide details of where the reboot was, the traces are long and the test looks not particularly successful. As you have already found the unexpected MDS reboots in the logs it would be help to pass that information on to me.&lt;/p&gt;</comment>
                            <comment id="44497" author="yujian" created="Mon, 10 Sep 2012 10:52:03 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_3/16&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_3/16&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;parallel-scale-nfsv4 test compilebench timed out: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f7a291b0-fa72-11e1-a03c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f7a291b0-fa72-11e1-a03c-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;There is no error message in the test output.&lt;/p&gt;

&lt;p&gt;Console log on Client 2 (client-28vm2) showed that:&lt;/p&gt;
&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;02:18:51:Lustre: DEBUG MARKER: == parallel-scale-nfsv4 test compilebench: compilebench == 02:18:50 (1347182330)&lt;br/&gt;
02:18:51:Lustre: DEBUG MARKER: /usr/sbin/lctl mark .\/compilebench -D \/mnt\/lustre\/d0.compilebench -i 2         -r 2 --makej&lt;br/&gt;
02:18:51:Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;02:30:43:nfs: server client-28vm7 not responding, still trying&lt;/font&gt;&lt;br/&gt;
03:18:59:********** Timeout by autotest system **********03:19:54:&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on MDS (client-28vm7) showed that:&lt;/p&gt;
&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;02:18:56:Lustre: DEBUG MARKER: == parallel-scale-nfsv4 test compilebench: compilebench == 02:18:50 (1347182330)&lt;br/&gt;
02:18:56:Lustre: ctl-lustre-MDT0000: super-sequence allocation rc = 0 [0x0000000200000400-0x0000000240000400):0:mdt&lt;br/&gt;
02:18:56:Lustre: DEBUG MARKER: /usr/sbin/lctl mark .\/compilebench -D \/mnt\/lustre\/d0.compilebench -i 2         -r 2 --makej&lt;br/&gt;
02:18:56:Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
03:18:59:********** Timeout by autotest system **********03:20:39:&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The MDS (client-28vm7) was not responding during the test, however there was no error message in its console log. Now, let&apos;s take a look at its syslong from /scratch/logs/syslog/client-28vm7.log:&lt;/p&gt;
&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;Sep  9 02:18:51 client-28vm7 kernel: Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
Sep  9 02:18:51 client-28vm7 rshd&lt;span class=&quot;error&quot;&gt;&amp;#91;5258&amp;#93;&lt;/span&gt;: pam_unix(rsh:session): session closed for user root&lt;br/&gt;
Sep  9 02:18:51 client-28vm7 xinetd&lt;span class=&quot;error&quot;&gt;&amp;#91;1661&amp;#93;&lt;/span&gt;: EXIT: shell status=0 pid=5258 duration=0(sec)&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;Sep  9 02:19:38 client-28vm7 ntpd&lt;span class=&quot;error&quot;&gt;&amp;#91;1669&amp;#93;&lt;/span&gt;: synchronized to 10.10.0.2, stratum 3&lt;/font&gt;&lt;br/&gt;
Sep  9 02:19:38 client-28vm7 ntpd&lt;span class=&quot;error&quot;&gt;&amp;#91;1669&amp;#93;&lt;/span&gt;: time reset -0.393295 s&lt;br/&gt;
Sep  9 02:19:38 client-28vm7 ntpd&lt;span class=&quot;error&quot;&gt;&amp;#91;1669&amp;#93;&lt;/span&gt;: kernel time sync status change 2001&lt;br/&gt;
Sep  9 02:24:25 client-28vm7 ntpd&lt;span class=&quot;error&quot;&gt;&amp;#91;1669&amp;#93;&lt;/span&gt;: synchronized to 10.10.0.1, stratum 2&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;Sep  9 03:21:09 client-28vm7 kernel: MTRR variable ranges enabled:&lt;/font&gt;&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  0 base 00E0000000 mask FFE0000000 uncachable&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  1 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  2 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  3 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  4 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  5 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  6 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel:  7 disabled&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel: PAT not supported by CPU.&lt;br/&gt;
Sep  9 03:21:09 client-28vm7 kernel: initial memory mapped : 0 - 20000000&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As we can see, before the MDS node was rebooted on 03:21:09, there was no error message on both the console and syslog. The only culprit is the ntpd run on MDS node during the test. Will check more test reports associated with this ticket.&lt;/p&gt;</comment>
                            <comment id="44506" author="yujian" created="Mon, 10 Sep 2012 11:42:03 +0000"  >&lt;p&gt;More instances on b2_3 with the same issue:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8ec57d46-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8ec57d46-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/891ac004-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/891ac004-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="44507" author="yujian" created="Mon, 10 Sep 2012 11:54:15 +0000"  >&lt;p&gt;performance-sanity: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The above report showed that the MDS (client-30vm3) was out of service during the test. Its syslog showed that:&lt;/p&gt;
&lt;div class=&quot;panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;panelContent&quot;&gt;
&lt;p&gt;&lt;font color=&quot;red&quot;&gt;Sep  8 20:30:43 client-30vm3 kernel: Lustre: DEBUG MARKER: ===== mdsrate-create-small.sh&lt;/font&gt;&lt;br/&gt;
Sep  8 20:30:43 client-30vm3 rshd&lt;span class=&quot;error&quot;&gt;&amp;#91;23422&amp;#93;&lt;/span&gt;: pam_unix(rsh:session): session closed for user root&lt;br/&gt;
Sep  8 20:30:43 client-30vm3 xinetd&lt;span class=&quot;error&quot;&gt;&amp;#91;1549&amp;#93;&lt;/span&gt;: EXIT: shell status=0 pid=23422 duration=0(sec)&lt;br/&gt;
&lt;font color=&quot;red&quot;&gt;Sep  8 20:31:33 client-30vm3 kernel: MTRR variable ranges enabled:&lt;/font&gt;&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  0 base 00E0000000 mask FFE0000000 uncachable&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  1 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  2 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  3 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  4 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  5 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  6 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel:  7 disabled&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel: PAT not supported by CPU.&lt;br/&gt;
Sep  8 20:31:33 client-30vm3 kernel: initial memory mapped : 0 - 20000000&lt;/p&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The MDS node was really rebooted with unknown reason during the test.&lt;/p&gt;</comment>
                            <comment id="44577" author="chris" created="Tue, 11 Sep 2012 07:13:11 +0000"  >&lt;p&gt;I&apos;m looking at somemore that look similar, we now have stack traces etc.&lt;/p&gt;

&lt;p&gt;1////&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/0b9ab03c-fbed-11e1-a4a6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/0b9ab03c-fbed-11e1-a4a6-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Conf Sanity reboots MDS &lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_logs/7b2c562e-fbef-11e1-a4a6-52540035b04c/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_logs/7b2c562e-fbef-11e1-a4a6-52540035b04c/show_text&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;but prior to the reboot we see an assertion&lt;/p&gt;

&lt;p&gt;21:29:47:LustreError: 32529:0:(obd_class.h:527:obd_set_info_async()) obd_set_info_async: dev 0 no operation&lt;br/&gt;
21:29:48:LustreError: 402:0:(mgs_internal.h:283:mgs_env_info()) ASSERTION( info != ((void *)0) ) failed: &lt;br/&gt;
21:29:48:LustreError: 402:0:(mgs_internal.h:283:mgs_env_info()) LBUG&lt;br/&gt;
21:29:48:Pid: 402, comm: llog_process_th&lt;/p&gt;


&lt;p&gt;2////&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/a41f27a2-fbde-11e1-bde2-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/a41f27a2-fbde-11e1-bde2-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;same on conf-sanity&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_logs/2e3dc9cc-fbe2-11e1-bde2-52540035b04c/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_logs/2e3dc9cc-fbe2-11e1-bde2-52540035b04c/show_text&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;again a reboot, but an assertion before that&lt;/p&gt;

&lt;p&gt;20:16:48:Lustre: MGS: Client 9711f524-c291-d894-f430-704c69e8e03d (at 0@lo) reconnecting&lt;br/&gt;
20:16:48:LustreError: 32139:0:(obd_class.h:527:obd_set_info_async()) obd_set_info_async: dev 0 no operation&lt;br/&gt;
20:16:48:LustreError: 32478:0:(mgs_internal.h:288:mgs_env_info()) ASSERTION( info != ((void *)0) ) failed: &lt;br/&gt;
20:16:48:LustreError: 32478:0:(mgs_internal.h:288:mgs_env_info()) LBUG&lt;br/&gt;
20:16:48:Pid: 32478, comm: llog_process_th&lt;/p&gt;


&lt;p&gt;same with the other failures this morning, I can&apos;t see why the previous did not have the LBUG but I can see any that replicate it.&lt;/p&gt;

&lt;p&gt;I&apos;m going to create a ticket for the bugs I found this morning.&lt;/p&gt;</comment>
                            <comment id="44579" author="chris" created="Tue, 11 Sep 2012 07:34:23 +0000"  >&lt;p&gt;Of the 4 examples of Yu Jian,&lt;/p&gt;

&lt;p&gt;1st&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Timeouts correctly and reboot, the timeout message can be seen. Strangly the reboot is still captured after the timeout but the timeout is correct.&lt;/p&gt;

&lt;p&gt;02:10:44:Lustre: DEBUG MARKER: == parallel-scale-nfsv4 test compilebench: compilebench == 02:10:36 (1347181836)&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: /usr/sbin/lctl mark .\/compilebench -D \/mnt\/lustre\/d0.compilebench -i 2         -r 2 --makej&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
02:21:56:nfs: server client-30vm3 not responding, still trying&lt;br/&gt;
03:10:59:********** Timeout by autotest system **********03:11:08:&lt;br/&gt;
03:11:08:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; disconnected from &amp;lt;client-30:6005&amp;gt; at 09-09 03:11.&lt;br/&gt;
03:11:29:&lt;br/&gt;
03:11:29:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; connected to &amp;lt;client-30:6005&amp;gt; at 09-09 03:11.&lt;br/&gt;
03:11:29:&lt;br/&gt;
Press any key to continue.&lt;br/&gt;
03:11:29:&lt;/p&gt;

&lt;p&gt;2nd&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/8ec57d46-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/8ec57d46-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Timeouts correctly and reboot, the timeout message can be seen. Strangly the reboot is still captured after the timeout but the timeout is correct.&lt;/p&gt;

&lt;p&gt;00:02:38:Lustre: 2964:0:(client.c:1917:ptlrpc_expire_one_request()) Skipped 151 previous similar messages&lt;br/&gt;
00:12:49:Lustre: 2964:0:(client.c:1917:ptlrpc_expire_one_request()) @@@ Request  sent has timed out for slow reply: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1347174744/real 1347174747&amp;#93;&lt;/span&gt;  req@ffff880076483000 x1412612131656402/t0(0) o250-&amp;gt;MGC10.10.4.182@tcp@10.10.4.182@tcp:26/25 lens 400/544 e 0 to 1 dl 1347174769 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
00:12:49:Lustre: 2964:0:(client.c:1917:ptlrpc_expire_one_request()) Skipped 146 previous similar messages&lt;br/&gt;
00:23:01:Lustre: 2964:0:(client.c:1917:ptlrpc_expire_one_request()) @@@ Request  sent has failed due to network error: &lt;span class=&quot;error&quot;&gt;&amp;#91;sent 1347175374/real 1347175374&amp;#93;&lt;/span&gt;  req@ffff880076483000 x1412612131657436/t0(0) o250-&amp;gt;MGC10.10.4.182@tcp@10.10.4.182@tcp:26/25 lens 400/544 e 0 to 1 dl 1347175399 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1&lt;br/&gt;
00:23:01:Lustre: 2964:0:(client.c:1917:ptlrpc_expire_one_request()) Skipped 151 previous similar messages&lt;br/&gt;
00:26:58:********** Timeout by autotest system **********00:28:03:&lt;br/&gt;
00:28:03:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; disconnected from &amp;lt;client-30:6005&amp;gt; at 09-09 00:27.&lt;br/&gt;
00:28:24:&lt;br/&gt;
00:28:24:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; connected to &amp;lt;client-30:6005&amp;gt; at 09-09 00:28.&lt;br/&gt;
00:28:24:&lt;br/&gt;
Press any key to continue.&lt;br/&gt;
00:28:24:&lt;/p&gt;


&lt;p&gt;3rd &lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/dc68d638-fa73-11e1-887d-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Timeouts correctly and reboot, the timeout message can be seen. Strangly the reboot is still captured after the timeout but the timeout is correct.&lt;/p&gt;

&lt;p&gt;02:10:44:Lustre: DEBUG MARKER: == parallel-scale-nfsv4 test compilebench: compilebench == 02:10:36 (1347181836)&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: /usr/sbin/lctl mark .\/compilebench -D \/mnt\/lustre\/d0.compilebench -i 2         -r 2 --makej&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
02:21:56:nfs: server client-30vm3 not responding, still trying&lt;br/&gt;
03:10:59:********** Timeout by autotest system **********03:11:08:&lt;br/&gt;
03:11:08:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; disconnected from &amp;lt;client-30:6005&amp;gt; at 09-09 03:11.&lt;br/&gt;
03:11:29:&lt;br/&gt;
03:11:29:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; connected to &amp;lt;client-30:6005&amp;gt; at 09-09 03:11.&lt;br/&gt;
03:11:29:&lt;br/&gt;
Press any key to continue.&lt;/p&gt;


&lt;p&gt;4th&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/757ba820-fb85-11e1-8e05-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/757ba820-fb85-11e1-8e05-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Timeouts correctly and reboot, the timeout message can be seen. Strangly the reboot is still captured after the timeout but the timeout is correct.&lt;/p&gt;

&lt;p&gt;02:10:44:Lustre: DEBUG MARKER: == parallel-scale-nfsv4 test compilebench: compilebench == 02:10:36 (1347181836)&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: /usr/sbin/lctl mark .\/compilebench -D \/mnt\/lustre\/d0.compilebench -i 2         -r 2 --makej&lt;br/&gt;
02:10:44:Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.compilebench -i 2 -r 2 --makej&lt;br/&gt;
02:21:56:nfs: server client-30vm3 not responding, still trying&lt;br/&gt;
03:10:59:********** Timeout by autotest system **********03:11:08:&lt;br/&gt;
03:11:08:&amp;lt;ConMan&amp;gt; Console &lt;span class=&quot;error&quot;&gt;&amp;#91;client-30vm6&amp;#93;&lt;/span&gt; disconnected from &amp;lt;client-30:6005&amp;gt; at 09-09 03:11.&lt;br/&gt;
03:11:29:&lt;br/&gt;
03:11:29&lt;/p&gt;

</comment>
                            <comment id="44580" author="chris" created="Tue, 11 Sep 2012 07:35:26 +0000"  >&lt;p&gt;An issue did exist for a short while where the console log was not being shut before the report. This lead to the reboot appearing in the test log when the test was being correctly timedout.&lt;/p&gt;

&lt;p&gt;I&apos;m closing this issue as this is I beleive not resolved.&lt;/p&gt;</comment>
                            <comment id="44581" author="bobijam" created="Tue, 11 Sep 2012 08:14:54 +0000"  >&lt;p&gt;we need collect all relevent logs before rebooting the node. Or else we know nothing about the issue.&lt;/p&gt;</comment>
                            <comment id="44582" author="yujian" created="Tue, 11 Sep 2012 08:32:36 +0000"  >&lt;blockquote&gt;&lt;p&gt;same with the other failures this morning, I can&apos;t see why the previous did not have the LBUG but I can see any that replicate it.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The two conf-sanity failures are caused by &lt;a href=&quot;http://review.whamcloud.com/#change,3671&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,3671&lt;/a&gt; and &lt;a href=&quot;http://review.whamcloud.com/#change,3670&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,3670&lt;/a&gt;, which are still under development.&lt;/p&gt;

&lt;p&gt;The failure instances I reported in this ticket are on the main b2_3 branch. There is no such LBUG.&lt;/p&gt;</comment>
                            <comment id="44585" author="chris" created="Tue, 11 Sep 2012 09:10:15 +0000"  >&lt;p&gt;The failures are real timeouts is my point. The reboot happens after the test has timed out.&lt;/p&gt;

&lt;p&gt;and we do capture the logs, autotest was restarted because it had stopped capturing logs.&lt;/p&gt;</comment>
                            <comment id="44586" author="yujian" created="Tue, 11 Sep 2012 09:20:25 +0000"  >&lt;p&gt;Hi Chris,&lt;br/&gt;
Please take a look at my above comments on the syslog of performance-sanity: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/7c320096-fa73-11e1-887d-52540035b04c&lt;/a&gt;, the MDS (client-30vm3) was rebooted before timeout.&lt;/p&gt;
</comment>
                            <comment id="44588" author="yujian" created="Tue, 11 Sep 2012 09:21:55 +0000"  >&lt;p&gt;BTW, autotest did not gather syslogs for the above reports, I had to find out the syslogs on brent:/scratch/logs/syslog.&lt;/p&gt;</comment>
                            <comment id="44683" author="yujian" created="Wed, 12 Sep 2012 10:24:34 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_3/17&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_3/17&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The issue still exists:&lt;br/&gt;
parallel-scale-nfsv4 test_compilebench: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f2b8c2b8-fc85-11e1-a4a6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f2b8c2b8-fc85-11e1-a4a6-52540035b04c&lt;/a&gt;&lt;br/&gt;
parallel-scale-nfsv3 test_compilebench: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/d241d4ca-fc85-11e1-a4a6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/d241d4ca-fc85-11e1-a4a6-52540035b04c&lt;/a&gt;&lt;br/&gt;
parallel-scale test_compilebench: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/3b4a8f4e-fc85-11e1-a4a6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/3b4a8f4e-fc85-11e1-a4a6-52540035b04c&lt;/a&gt;&lt;br/&gt;
large-scale test_3a: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/733bf24e-fc85-11e1-a4a6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/733bf24e-fc85-11e1-a4a6-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;BTW, all of the syslogs in the above reports are empty. I checked the syslogs from brent node but still found nothing useful for debugging.&lt;/p&gt;

&lt;p&gt;However, comparing to the results on b2_3 build #16, although performance-saniy test_3 and sanity test_32n also hit MDS reboot issue, there are error messages on the MDS console logs on build #17 (no such messages on build #16), please refer to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1906&quot; title=&quot;performance-sanity subtest test_3: Oops: RIP: __find_get_block_slow+0x87/0x130&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1906&quot;&gt;&lt;del&gt;LU-1906&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1909&quot; title=&quot;performance-sanity subtest test_3: Oops: RIP: __do_softirq+0x73/0x1e0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1909&quot;&gt;&lt;del&gt;LU-1909&lt;/del&gt;&lt;/a&gt; and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1863&quot; title=&quot;Test failure with MDS spontaneous rebooting (test suite sanity, subtest test_32n)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1863&quot;&gt;&lt;del&gt;LU-1863&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;So, I&apos;m not sure whether the above parallel-scale* and large-scale failures were caused by Lustre issues or not although there were no specific error messages on their logs.&lt;/p&gt;</comment>
                            <comment id="44684" author="yujian" created="Wed, 12 Sep 2012 10:27:57 +0000"  >&lt;p&gt;Oleg, Bobi, do you have any ideas about whether the above MDS reboot issue is a Lustre issue or test environment issue? Thanks.&lt;br/&gt;
This is really blocking the Lustre b2_3 testing now.&lt;/p&gt;</comment>
                            <comment id="44771" author="chris" created="Thu, 13 Sep 2012 05:53:04 +0000"  >&lt;p&gt;From Skype&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;13/09/2012 08:21:29&amp;#93;&lt;/span&gt; Yu Jian&lt;/p&gt;

&lt;p&gt;aha, parallel-scale test compilebench hung in manual run, the MDS is being rebooted&lt;br/&gt;
more logs occurr&lt;br/&gt;
ok, Chris, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1926&quot; title=&quot;Reboots during test runs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1926&quot;&gt;&lt;del&gt;TT-851&lt;/del&gt;&lt;/a&gt; is not a test environment issue, it&apos;s a Lustre issue, I&apos;ll gather logs, file new ticket and update &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1926&quot; title=&quot;Reboots during test runs&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1926&quot;&gt;&lt;del&gt;TT-851&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
thank you, Chris&lt;/p&gt;</comment>
                            <comment id="44772" author="chris" created="Thu, 13 Sep 2012 05:53:51 +0000"  >&lt;p&gt;Just assigning this to you for clarity.&lt;/p&gt;</comment>
                            <comment id="44773" author="yujian" created="Thu, 13 Sep 2012 06:16:28 +0000"  >&lt;p&gt;Chris, I&apos;m still creating new LU tickets per the manual test runs. After all of the tests are done and the corresponding LU tickets are created, we can close this one.&lt;/p&gt;

&lt;p&gt;The remaining issue for TT is that the Oops on consoles were not gathered, and syslogs on Maloo were empty.&lt;/p&gt;</comment>
                            <comment id="44780" author="yujian" created="Thu, 13 Sep 2012 08:31:02 +0000"  >&lt;p&gt;After running those failed tests manually, it turned out to be Lustre failures:&lt;/p&gt;

&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_3/17&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_3/17&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;parallel-scale-nfsv4 test_compilebench: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1928&quot; title=&quot;parallel-scale-nfsv3 subtest test_compilebench: Oops: RIP: put_page+0x9/0x40&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1928&quot;&gt;&lt;del&gt;LU-1928&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
parallel-scale-nfsv3 test_compilebench: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1928&quot; title=&quot;parallel-scale-nfsv3 subtest test_compilebench: Oops: RIP: put_page+0x9/0x40&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1928&quot;&gt;&lt;del&gt;LU-1928&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
parallel-scale test_compilebench: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1925&quot; title=&quot;kernel tried to execute NX-protected page - exploit attempt? (uid: 0)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1925&quot;&gt;&lt;del&gt;LU-1925&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
large-scale test_3a: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1927&quot; title=&quot;large-scale subtest test_3a: Oops: RIP: _spin_lock+0xe/0x30&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1927&quot;&gt;&lt;del&gt;LU-1927&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
performance-sanity test_3: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1906&quot; title=&quot;performance-sanity subtest test_3: Oops: RIP: __find_get_block_slow+0x87/0x130&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1906&quot;&gt;&lt;del&gt;LU-1906&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1909&quot; title=&quot;performance-sanity subtest test_3: Oops: RIP: __do_softirq+0x73/0x1e0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1909&quot;&gt;&lt;del&gt;LU-1909&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1929&quot; title=&quot;performance-sanity subtest test_3: list_add corruption&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1929&quot;&gt;&lt;del&gt;LU-1929&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
sanity test_32n: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1863&quot; title=&quot;Test failure with MDS spontaneous rebooting (test suite sanity, subtest test_32n)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1863&quot;&gt;&lt;del&gt;LU-1863&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Console log and syslog missing issue: TT-875&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="15838">LU-1863</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                <customfield id="customfield_10003" key="com.atlassian.jira.plugin.system.customfieldtypes:float">
                        <customfieldname>Business Value</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>0.1</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzus4v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>2220</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                    <customfield id="customfield_10002" key="com.atlassian.jira.plugin.system.customfieldtypes:float">
                        <customfieldname>Story Points</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>