<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:44:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
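For instance, restricted to those two fields, a request for this issue would typically look like the URL below
(the path shape is the usual JIRA issue-XML view and is shown for illustration only; the 'field' parameters come from the note above):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-4621/LU-4621.xml?field=key&field=summary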
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4621] recovery-mds-scale: test_failover_ost</title>
                <link>https://jira.whamcloud.com/browse/LU-4621</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah &amp;lt;sarah@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;http://maloo.whamcloud.com/test_sets/2a24816e-9088-11e3-91ee-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://maloo.whamcloud.com/test_sets/2a24816e-9088-11e3-91ee-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_failover_ost failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;test_failover_ost returned 1&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;The Client 3 console shows a process stuck in the D (uninterruptible sleep) state:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;11:05:02:Lustre: DEBUG MARKER: == recovery-mds-scale test failover_ost: failover OST == 11:04:36 (1391713476)
11:05:03:Lustre: DEBUG MARKER: /usr/sbin/lctl mark Started client load: dd on client-32vm5
11:05:03:Lustre: DEBUG MARKER: Started client load: dd on client-32vm5
11:05:04:Lustre: DEBUG MARKER: PATH=/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/utils/gss:/usr/lib64/lustre/utils:/usr/lib64/openmpi/bin:/usr/bin:/bin:
11:05:04:Lustre: DEBUG MARKER: /usr/sbin/lctl mark Started client load: tar on client-32vm6
11:05:05:Lustre: DEBUG MARKER: Started client load: tar on client-32vm6
11:05:05:Lustre: DEBUG MARKER: cat /tmp/client-load.pid
11:05:06:Lustre: DEBUG MARKER: /usr/sbin/lctl mark ==== Checking the clients loads BEFORE failover -- failure NOT OK              ELAPSED=0 DURATION=86400 PERIOD=1200
11:05:06:Lustre: DEBUG MARKER: ==== Checking the clients loads BEFORE failover -- failure NOT OK ELAPSED=0 DURATION=86400 PERIOD=1200
11:05:07:Lustre: DEBUG MARKER: rc=$([ -f /proc/sys/lnet/catastrophe ] &amp;amp;&amp;amp;
11:05:07:		echo $(&amp;lt; /proc/sys/lnet/catastrophe) || echo 0);
11:05:08:		if [ $rc -ne 0 ]; then echo $(hostname): $rc; fi
11:05:08:		exit $rc
11:05:09:Lustre: DEBUG MARKER: ps auxwww | grep -v grep | grep -q run_tar.sh
11:05:09:Lustre: DEBUG MARKER: /usr/sbin/lctl mark Wait ost7 recovery complete before doing next failover...
11:05:09:Lustre: DEBUG MARKER: Wait ost7 recovery complete before doing next failover...
11:05:10:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff880028c28400 x1459227849618364/t309237768099(309237768099) o101-&amp;gt;lustre-MDT0000-mdc-ffff88007a375c00@10.10.4.198@tcp:12/10 lens 568/544 e 0 to 0 dl 1391713537 ref 2 fl Interpret:R/4/0 rc 301/301
11:05:10:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) Skipped 53 previous similar messages
11:05:11:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) @@@ status 301, old was 0  req@ffff88002fdea800 x1459227849618852/t309237768160(309237768160) o101-&amp;gt;lustre-MDT0000-mdc-ffff88007a375c00@10.10.4.198@tcp:12/10 lens 568/544 e 0 to 0 dl 1391713537 ref 2 fl Interpret:R/4/0 rc 301/301
11:05:11:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) Skipped 13 previous similar messages
11:05:11:Lustre: DEBUG MARKER: /usr/sbin/lctl mark Checking clients are in FULL state before doing next failover...
11:05:12:Lustre: DEBUG MARKER: Checking clients are in FULL state before doing next failover...
11:05:12:Lustre: DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u
11:05:13:Lustre: lustre-MDT0000-mdc-ffff88007a375c00: Connection restored to lustre-MDT0000 (at 10.10.4.198@tcp)
11:05:13:Lustre: DEBUG MARKER: lctl get_param -n at_max
11:05:14:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:14:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:15:Lustre: DEBUG MARKER: osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:38:Lustre: DEBUG MARKER: osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:38:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:39:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:39:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:39:Lustre: DEBUG MARKER: osc.lustre-OST0000-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:40:Lustre: DEBUG MARKER: osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:40:Lustre: DEBUG MARKER: osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:41:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:42:Lustre: DEBUG MARKER: osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:42:Lustre: DEBUG MARKER: osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:43:Lustre: DEBUG MARKER: osc.lustre-OST0001-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:43:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:43:Lustre: DEBUG MARKER: osc.lustre-OST0002-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:44:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:44:Lustre: DEBUG MARKER: osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:44:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:45:Lustre: DEBUG MARKER: osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:45:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:46:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:46:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:48:Lustre: DEBUG MARKER: osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:48:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:49:Lustre: DEBUG MARKER: osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:49:Lustre: DEBUG MARKER: osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:49:Lustre: DEBUG MARKER: osc.lustre-OST0003-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:49:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:50:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:50:Lustre: DEBUG MARKER: osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:50:Lustre: DEBUG MARKER: osc.lustre-OST0004-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:51:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:51:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:52:Lustre: DEBUG MARKER: osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:52:Lustre: DEBUG MARKER: osc.lustre-OST0005-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:52:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:52:Lustre: DEBUG MARKER: /usr/sbin/lctl mark osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:53:Lustre: DEBUG MARKER: osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:53:Lustre: DEBUG MARKER: osc.lustre-OST0006-osc-*.ost_server_uuid in FULL state after 0 sec
11:05:53:Lustre: DEBUG MARKER: /usr/sbin/lctl mark Starting failover on ost7
11:05:54:Lustre: DEBUG MARKER: Starting failover on ost7
11:10:01:Lustre: 1984:0:(client.c:1912:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1391713512/real 1391713512]  req@ffff880024f1fc00 x1459227849666748/t0(0) o2-&amp;gt;lustre-OST0004-osc-ffff88007a375c00@10.10.4.199@tcp:28/4 lens 440/432 e 0 to 1 dl 1391713530 ref 1 fl Rpc:X/0/ffffffff rc 0/-1
11:10:01:Lustre: 1984:0:(client.c:1912:ptlrpc_expire_one_request()) Skipped 2 previous similar messages
11:10:02:Lustre: lustre-OST0004-osc-ffff88007a375c00: Connection to lustre-OST0004 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
11:10:02:Lustre: lustre-OST0002-osc-ffff88007a375c00: Connection to lustre-OST0002 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
11:10:02:Lustre: lustre-OST0005-osc-ffff88007a375c00: Connection to lustre-OST0005 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
11:10:02:Lustre: Skipped 1 previous similar message
11:10:02:LNet: Host 10.10.4.199 reset our connection while we were sending data; it may have rebooted.
11:10:02:Lustre: lustre-OST0000-osc-ffff88007a375c00: Connection to lustre-OST0000 (at 10.10.4.199@tcp) was lost; in progress operations using this service will wait for recovery to complete
11:10:02:Lustre: Skipped 2 previous similar messages
11:10:02:Lustre: lustre-OST0000-osc-ffff88007a375c00: Connection restored to lustre-OST0000 (at 10.10.4.203@tcp)
11:10:02:Lustre: lustre-OST0001-osc-ffff88007a375c00: Connection restored to lustre-OST0001 (at 10.10.4.203@tcp)
11:10:03:Lustre: 1982:0:(client.c:1912:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1391713662/real 1391713662]  req@ffff88002438c400 x1459227849671944/t0(0) o8-&amp;gt;lustre-OST0002-osc-ffff88007a375c00@10.10.4.199@tcp:28/4 lens 400/544 e 0 to 1 dl 1391713687 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
11:10:03:Lustre: 1982:0:(client.c:1912:ptlrpc_expire_one_request()) Skipped 128 previous similar messages
11:10:04:Lustre: lustre-OST0002-osc-ffff88007a375c00: Connection restored to lustre-OST0002 (at 10.10.4.203@tcp)
11:10:04:INFO: task tar:24176 blocked for more than 120 seconds.
11:10:04:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
11:10:05:tar           D 0000000000000001     0 24176  24163 0x00000080
11:10:05: ffff880062a21a18 0000000000000086 ffff880062a219e0 ffff880062a219dc
11:10:05: ffff88007ae95538 ffff88007f823240 ffff880002216700 0000000000000400
11:10:05: ffff88002fcc9058 ffff880062a21fd8 000000000000fb88 ffff88002fcc9058
11:10:05:Call Trace:
11:10:05: [&amp;lt;ffffffff8150f035&amp;gt;] schedule_timeout+0x215/0x2e0
11:10:05: [&amp;lt;ffffffffa0743d90&amp;gt;] ? lustre_swab_ost_body+0x0/0x10 [ptlrpc]
11:10:05: [&amp;lt;ffffffff8150ecb3&amp;gt;] wait_for_common+0x123/0x180
11:10:06: [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
11:10:06: [&amp;lt;ffffffff8150edcd&amp;gt;] wait_for_completion+0x1d/0x20
11:10:06: [&amp;lt;ffffffffa0b5d88c&amp;gt;] osc_io_setattr_end+0xbc/0x190 [osc]
11:10:06: [&amp;lt;ffffffffa093b3a0&amp;gt;] ? lov_io_end_wrapper+0x0/0x100 [lov]
11:10:07: [&amp;lt;ffffffffa055d100&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
11:10:07: [&amp;lt;ffffffffa055dc80&amp;gt;] ? cl_io_start+0x0/0x140 [obdclass]
11:10:07: [&amp;lt;ffffffffa093b491&amp;gt;] lov_io_end_wrapper+0xf1/0x100 [lov]
11:10:07: [&amp;lt;ffffffffa093b07e&amp;gt;] lov_io_call+0x8e/0x130 [lov]
11:10:07: [&amp;lt;ffffffffa093ce0c&amp;gt;] lov_io_end+0x4c/0xf0 [lov]
11:10:07: [&amp;lt;ffffffffa055d100&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
11:10:08: [&amp;lt;ffffffffa0561e52&amp;gt;] cl_io_loop+0xc2/0x1b0 [obdclass]
11:10:09: [&amp;lt;ffffffffa0a1bb58&amp;gt;] cl_setattr_ost+0x218/0x2f0 [lustre]
11:10:09: [&amp;lt;ffffffffa09e67f5&amp;gt;] ll_setattr_raw+0xa45/0x10c0 [lustre]
11:10:09: [&amp;lt;ffffffffa09e6ecd&amp;gt;] ll_setattr+0x5d/0xf0 [lustre]
11:10:09: [&amp;lt;ffffffff8119ea78&amp;gt;] notify_change+0x168/0x340
11:10:10: [&amp;lt;ffffffff811b2b1c&amp;gt;] utimes_common+0xdc/0x1b0
11:10:10: [&amp;lt;ffffffff81182bf1&amp;gt;] ? __fput+0x1a1/0x210
11:10:10: [&amp;lt;ffffffff811b2cce&amp;gt;] do_utimes+0xde/0xf0
11:10:10: [&amp;lt;ffffffff811b2de2&amp;gt;] sys_utimensat+0x32/0x90
11:10:10: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
11:10:10:Lustre: lustre-OST0003-osc-ffff88007a375c00: Connection restored to lustre-OST0003 (at 10.10.4.203@tcp)
11:10:10:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) @@@ status -2, old was 0  req@ffff880023a03800 x1459227849663432/t4295715092(4295715092) o2-&amp;gt;lustre-OST0004-osc-ffff88007a375c00@10.10.4.203@tcp:28/4 lens 440/400 e 0 to 0 dl 1391713884 ref 2 fl Interpret:R/4/0 rc -2/-2
11:10:10:LustreError: 1982:0:(client.c:2777:ptlrpc_replay_interpret()) Skipped 117 previous similar messages
11:12:36:LustreError: 11-0: lustre-OST0004-osc-ffff88007a375c00: Communicating with 10.10.4.203@tcp, operation ldlm_enqueue failed with -12.
11:12:36:LustreError: 11-0: lustre-OST0004-osc-ffff88007a375c00: Communicating with 10.10.4.203@tcp, operation ldlm_enqueue failed with -12.
11:12:36:LustreError: 1982:0:(import.c:631:ptlrpc_connect_import()) already connecting
11:12:36:LustreError: 1982:0:(import.c:631:ptlrpc_connect_import()) already connecting
11:12:37:Lustre: lustre-OST0005-osc-ffff88007a375c00: Connection restored to lustre-OST0005 (at 10.10.4.203@tcp)
11:12:37:INFO: task tar:24176 blocked for more than 120 seconds.
11:12:37:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
11:12:37:tar           D 0000000000000001     0 24176  24163 0x00000080
11:12:39: ffff880062a21a18 0000000000000086 ffff880062a219e0 ffff880062a219dc
11:12:39: ffff88007ae95538 ffff88007f823240 ffff880002216700 0000000000000400
11:12:40: ffff88002fcc9058 ffff880062a21fd8 000000000000fb88 ffff88002fcc9058
11:12:40:Call Trace:
11:12:40: [&amp;lt;ffffffff8150f035&amp;gt;] schedule_timeout+0x215/0x2e0
11:12:40: [&amp;lt;ffffffffa0743d90&amp;gt;] ? lustre_swab_ost_body+0x0/0x10 [ptlrpc]
11:12:40: [&amp;lt;ffffffff8150ecb3&amp;gt;] wait_for_common+0x123/0x180
11:12:40: [&amp;lt;ffffffff81063990&amp;gt;] ? default_wake_function+0x0/0x20
11:12:41: [&amp;lt;ffffffff8150edcd&amp;gt;] wait_for_completion+0x1d/0x20
11:12:41: [&amp;lt;ffffffffa0b5d88c&amp;gt;] osc_io_setattr_end+0xbc/0x190 [osc]
11:12:41: [&amp;lt;ffffffffa093b3a0&amp;gt;] ? lov_io_end_wrapper+0x0/0x100 [lov]
11:12:42: [&amp;lt;ffffffffa055d100&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
11:12:42: [&amp;lt;ffffffffa055dc80&amp;gt;] ? cl_io_start+0x0/0x140 [obdclass]
11:12:42: [&amp;lt;ffffffffa093b491&amp;gt;] lov_io_end_wrapper+0xf1/0x100 [lov]
11:12:42: [&amp;lt;ffffffffa093b07e&amp;gt;] lov_io_call+0x8e/0x130 [lov]
11:12:42: [&amp;lt;ffffffffa093ce0c&amp;gt;] lov_io_end+0x4c/0xf0 [lov]
11:12:43: [&amp;lt;ffffffffa055d100&amp;gt;] cl_io_end+0x60/0x150 [obdclass]
11:12:44: [&amp;lt;ffffffffa0561e52&amp;gt;] cl_io_loop+0xc2/0x1b0 [obdclass]
11:12:44: [&amp;lt;ffffffffa0a1bb58&amp;gt;] cl_setattr_ost+0x218/0x2f0 [lustre]
11:12:44: [&amp;lt;ffffffffa09e67f5&amp;gt;] ll_setattr_raw+0xa45/0x10c0 [lustre]
11:12:44: [&amp;lt;ffffffffa09e6ecd&amp;gt;] ll_setattr+0x5d/0xf0 [lustre]
11:12:44: [&amp;lt;ffffffff8119ea78&amp;gt;] notify_change+0x168/0x340
11:12:45: [&amp;lt;ffffffff811b2b1c&amp;gt;] utimes_common+0xdc/0x1b0
11:12:45: [&amp;lt;ffffffff81182bf1&amp;gt;] ? __fput+0x1a1/0x210
11:12:45: [&amp;lt;ffffffff811b2cce&amp;gt;] do_utimes+0xde/0xf0
11:12:46: [&amp;lt;ffffffff811b2de2&amp;gt;] sys_utimensat+0x32/0x90
11:12:46: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
11:12:46:Lustre: DEBUG MARKER: /usr/sbin/lctl mark ==== Checking the clients loads AFTER failover -- failure NOT OK
11:12:47:Lustre: DEBUG MARKER: ==== Checking the clients loads AFTER failover -- failure NOT OK
11:12:48:Lustre: DEBUG MARKER: rc=$([ -f /proc/sys/lnet/catastrophe ] &amp;amp;&amp;amp;
11:12:48:		echo $(&amp;lt; /proc/sys/lnet/catastrophe) || echo 0);
11:12:48:		if [ $rc -ne 0 ]; then echo $(hostname): $rc; fi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</description>
                <environment>client and server: lustre-master build # 1877</environment>
        <key id="23130">LU-4621</key>
            <summary>recovery-mds-scale: test_failover_ost</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>22pl</label>
                    </labels>
                <created>Wed, 12 Feb 2014 22:41:37 +0000</created>
                <updated>Wed, 4 Oct 2017 13:54:37 +0000</updated>
                            <resolved>Mon, 27 Apr 2015 22:04:26 +0000</resolved>
                                    <version>Lustre 2.6.0</version>
                    <version>Lustre 2.5.3</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="82646" author="adilger" created="Mon, 28 Apr 2014 17:35:14 +0000"  >&lt;p&gt;Sarah, when running the failover testing, how many times can the servers properly do failover before they hit a problem?  I&apos;m trying to figure out whether failover is working most of the time and occasionally fails, or if it mostly fails and rarely or never works?&lt;/p&gt;</comment>
                            <comment id="83215" author="sarah" created="Mon, 5 May 2014 17:55:46 +0000"  >&lt;p&gt;Hello Andreas, &lt;/p&gt;

&lt;p&gt;In this case, the OST hit the error after only 1 failover:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2014-02-06 11:24:44 Terminating clients loads ...
Duration:               86400
Server failover period: 1200 seconds
Exited after:           384 seconds
Number of failovers before exit:
mds1: 0 times
ost1: 0 times
ost2: 0 times
ost3: 0 times
ost4: 0 times
ost5: 0 times
ost6: 0 times
ost7: 1 times
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="83631" author="pjones" created="Fri, 9 May 2014 14:59:14 +0000"  >&lt;p&gt;Hongchao&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="85479" author="adilger" created="Mon, 2 Jun 2014 17:28:07 +0000"  >&lt;p&gt;Hi Hongchao, could you please take a look at this. If needed, please run this test manually a few times to see if it can handle failovers. I&apos;m most interested that failover is working in general, and not necessarily in this particular failure unless it is causing the recovery to break often. &lt;/p&gt;</comment>
                            <comment id="85581" author="hongchao.zhang" created="Tue, 3 Jun 2014 14:08:36 +0000"  >&lt;p&gt;In the recent failed reports of this issue in Maloo, there are some suspected logs related to the bug.&lt;br/&gt;
the clients needed to recover have connected and queued its replay request, but the &quot;transno&quot; order is still broken&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: lustre-OST0004: Will be in recovery for at least 1:00, or until 3 clients reconnect
LustreError: 9129:0:(ldlm_lib.c:1744:check_for_next_transno()) lustre-OST0004: waking for gap in transno, VBR is OFF (skip: 4295715089, ql: 2, comp: 1, conn: 3, next: 4295715092, last_committed: 4295715015)
LustreError: 9129:0:(ldlm_lib.c:1744:check_for_next_transno()) lustre-OST0004: waking for gap in transno, VBR is OFF (skip: 4295715093, ql: 2, comp: 1, conn: 3, next: 4295715095, last_committed: 4295715015)
LustreError: 9129:0:(ldlm_lib.c:1744:check_for_next_transno()) lustre-OST0004: waking for gap in transno, VBR is OFF (skip: 4295715121, ql: 2, comp: 1, conn: 3, next: 4295715130, last_committed: 4295715015)
LustreError: 9129:0:(ldlm_resource.c:1154:ldlm_resource_get()) lustre-OST0004: lvbo_init failed for resource 0x2f90d:0x0: rc = -2
LustreError: 9129:0:(ldlm_resource.c:1154:ldlm_resource_get()) lustre-OST0004: lvbo_init failed for resource 0x2f90c:0x0: rc = -2
Lustre: lustre-OST0004: Client 8710edc6-b33b-5d92-cf10-e2fffadf506e (at 10.10.4.201@tcp) reconnecting, waiting for 3 clients in recovery for 1:45
Lustre: lustre-OST0004: Client 8710edc6-b33b-5d92-cf10-e2fffadf506e (at 10.10.4.201@tcp) reconnecting, waiting for 3 clients in recovery for 1:45
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is caused by the multiple object-creation operations (ofd_precreate_objects) in &quot;ofd_create_hdl&quot;: each creation operation starts and stops its own transaction, which increases the transno, so the final transno returned is&lt;br/&gt;
the transno produced by the last creation operation.&lt;/p&gt;

&lt;p&gt;This issue could be related to the replayed creation request producing a different result from the original one; the replayed request could create fewer objects if, for instance, there is less free space left for the &quot;dd&quot; on the other node.&lt;/p&gt;</comment>
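<!--
A minimal standalone sketch (not Lustre code) of the mechanism described in the comment above: one
precreate RPC internally runs several transactions, each consuming a transno, while only the last
transno is returned in the reply and therefore replayed, so a recovering server sees gaps between the
transnos of consecutive replayed requests. The function names and the starting transno value here are
made up for illustration; the real logic lives in ofd_create_hdl()/ofd_precreate_objects() and
check_for_next_transno().

#include <stdio.h>

static unsigned long long next_transno = 4295715089ULL;   /* hypothetical starting value */

/* each object-creation batch runs in its own transaction and consumes one transno */
static unsigned long long create_one_batch(void)
{
        unsigned long long t = next_transno;
        next_transno = next_transno + 1;
        return t;
}

/* one precreate RPC = several transactions; only the last transno goes into the reply */
static unsigned long long precreate_rpc(int nbatches)
{
        unsigned long long last = 0;
        int i;

        for (i = 0; i < nbatches; i++)
                last = create_one_batch();
        return last;
}

int main(void)
{
        /* a single precreate RPC that internally runs 3 transactions */
        unsigned long long replayed = precreate_rpc(3);

        printf("client replays only transno %llu\n", replayed);
        printf("transnos %llu and %llu are consumed but never replayed,\n",
               replayed - 2, replayed - 1);
        printf("so the recovering server waits and then logs a gap in transno\n");
        return 0;
}
-->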
                            <comment id="85694" author="hongchao.zhang" created="Wed, 4 Jun 2014 13:41:42 +0000"  >&lt;p&gt;the debug patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/10626/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10626/&lt;/a&gt;, it will check the result of the replayed creation request from OSP during recovery.&lt;/p&gt;</comment>
                            <comment id="86463" author="tappro" created="Thu, 12 Jun 2014 20:05:32 +0000"  >&lt;p&gt;well, you are right, the precreate is &apos;multi-transaction&apos; request, like OUT batches, and during replay we will see not continuos transaction sequence but with gaps. Technically it is not a problem. Meanwhile I don&apos;t see how replay can be differ from original request, but let&apos;s see what your patch will show. Is that possible to create test case for that?&lt;/p&gt;</comment>
                            <comment id="87188" author="adilger" created="Fri, 20 Jun 2014 17:50:05 +0000"  >&lt;p&gt;Looking at this test, it seems unlikely that you will hit the same problem just in a single autotest run.  I think the patch needs to be fixed up and landed to master in order to be able to get enough runs to hit the failure.&lt;/p&gt;</comment>
                            <comment id="88206" author="adilger" created="Fri, 4 Jul 2014 17:57:05 +0000"  >&lt;p&gt;Note that the &quot;debug&quot; patch &lt;a href=&quot;http://review.whamcloud.com/10626&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10626&lt;/a&gt; has now turned into a &quot;fix&quot; patch.&lt;/p&gt;</comment>
                            <comment id="88692" author="jlevi" created="Thu, 10 Jul 2014 12:46:09 +0000"  >&lt;p&gt;Patch has landed to Master.&lt;/p&gt;</comment>
                            <comment id="91479" author="yujian" created="Tue, 12 Aug 2014 21:23:03 +0000"  >&lt;p&gt;Hi Hongchao,&lt;/p&gt;

&lt;p&gt;I&apos;m trying to back-port the patch &lt;a href=&quot;http://review.whamcloud.com/10626&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10626&lt;/a&gt; to Lustre b2_5 branch but found that the function ofd_create_hdl() does not exist on b2_5. The function was introduced by patch &lt;a href=&quot;http://review.whamcloud.com/7130&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7130&lt;/a&gt; on master branch.&lt;/p&gt;

&lt;p&gt;Could you please take a look to see how to fix the issue (originally reported in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3325&quot; title=&quot;recovery-mds-scale test_failover_mds: tar: Cannot write: Input/output error&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3325&quot;&gt;&lt;del&gt;LU-3325&lt;/del&gt;&lt;/a&gt;) on Lustre b2_5 branch?&lt;/p&gt;</comment>
                            <comment id="92122" author="hongchao.zhang" created="Thu, 21 Aug 2014 11:23:48 +0000"  >&lt;p&gt;the patch against b2_5 is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/11541/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/11541/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="92996" author="yujian" created="Tue, 2 Sep 2014 17:59:04 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;https://build.hpdd.intel.com/job/lustre-b2_5/86/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://build.hpdd.intel.com/job/lustre-b2_5/86/&lt;/a&gt; (2.5.3 RC1)&lt;br/&gt;
TEST_GROUP=failover&lt;/p&gt;

&lt;p&gt;The same failure occurred:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/cd75fb30-3269-11e4-8c3a-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/cd75fb30-3269-11e4-8c3a-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/fd86748a-3269-11e4-bbc4-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/fd86748a-3269-11e4-bbc4-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="97916" author="yujian" created="Thu, 30 Oct 2014 06:37:37 +0000"  >&lt;p&gt;More instances on Lustre b2_5 branch:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6143ccd0-5ea6-11e4-badb-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6143ccd0-5ea6-11e4-badb-5254006e85c2&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/567427ca-80b9-11e4-9ec8-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/567427ca-80b9-11e4-9ec8-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="27154">LU-5785</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="18910">LU-3325</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="25058">LU-5157</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwezj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>12647</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>