<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:35:49 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10519] replay-vbr fails to start running tests due to MDS not restarting</title>
                <link>https://jira.whamcloud.com/browse/LU-10519</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;replay-vbr doesn&#8217;t start running any tests due to problems with unmounting the clients. The following description is based on the replay-vbr information at&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/22a8e51e-f750-11e7-a10a-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/22a8e51e-f750-11e7-a10a-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;In the replay-vbr suite_log, we see a problem contacting the MGS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;onyx-42vm1: mount.lustre: mount onyx-42vm7:onyx-42vm8:/lustre at /mnt/lustre2 failed: Input/output error
onyx-42vm1: Is the MGS running?
onyx-42vm4: mount.lustre: mount onyx-42vm7:onyx-42vm8:/lustre at /mnt/lustre2 failed: Input/output error
onyx-42vm4: Is the MGS running?
onyx-42vm3: mount.lustre: mount onyx-42vm7:onyx-42vm8:/lustre at /mnt/lustre2 failed: Input/output error
onyx-42vm3: Is the MGS running?
Stopping clients: onyx-42vm1.onyx.hpdd.intel.com,onyx-42vm3,onyx-42vm4 /mnt/lustre (opts:)
CMD: onyx-42vm1.onyx.hpdd.intel.com,onyx-42vm3,onyx-42vm4 running=\$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);
if [ \$running -ne 0 ] ; then
echo Stopping client \$(hostname) /mnt/lustre opts:;
lsof /mnt/lustre || need_kill=no;
if [ x != x -a x\$need_kill != xno ]; then
    pids=\$(lsof -t /mnt/lustre | sort -u);
    if [ -n \&quot;\$pids\&quot; ]; then
             kill -9 \$pids;
    fi
fi;
while umount  /mnt/lustre 2&amp;gt;&amp;amp;1 | grep -q busy; do
    echo /mnt/lustre is still busy, wait one second &amp;amp;&amp;amp; sleep 1;
done;
fi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;In the console log of client 4 (vm4), we see a problem unmounting the file system at the end of replay-dual and this impacts replay-vbr:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 3200.984760] Lustre: DEBUG MARKER: == replay-dual test complete, duration 2281 sec ====================================================== 00:16:26 (1515716186)
[ 3201.183181] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre2&apos; &apos; /proc/mounts);
[ 3201.183181] if [ $running -ne 0 ] ; then
[ 3201.183181] echo Stopping client $(hostname) /mnt/lustre2 opts:;
[ 3201.183181] lsof /mnt/lustre2 || need_kill=no;
[ 3201.183181] if [ x != x -a x$need_kill != xno ]; then
[ 3201.183181]     pids=$(lsof -t /mnt/lustre2 | sort -u);
[ 3201.183181]    
[ 3360.201681] INFO: task umount:17035 blocked for more than 120 seconds.
[ 3360.202477] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 3360.203291] umount          D ffff880069f5bf40     0 17035  17028 0x00000080
[ 3360.204040] Call Trace:
[ 3360.204299]  [&amp;lt;ffffffff816ac5f9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 3360.204982]  [&amp;lt;ffffffff816aa427&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
[ 3360.205732]  [&amp;lt;ffffffff816a983f&amp;gt;] mutex_lock+0x1f/0x2f
[ 3360.206277]  [&amp;lt;ffffffffc061e147&amp;gt;] mgc_process_config+0x207/0x13f0 [mgc]
[ 3360.206996]  [&amp;lt;ffffffffc0704a75&amp;gt;] obd_process_config.constprop.13+0x85/0x2d0 [obdclass]
[ 3360.207837]  [&amp;lt;ffffffffc05babc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[ 3360.208560]  [&amp;lt;ffffffffc06f1de9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 3360.209290]  [&amp;lt;ffffffffc0705d9f&amp;gt;] lustre_end_log+0x1ff/0x550 [obdclass]
[ 3360.210025]  [&amp;lt;ffffffffc0afcc0d&amp;gt;] ll_put_super+0x8d/0xaa0 [lustre]
[ 3360.210672]  [&amp;lt;ffffffff81248c61&amp;gt;] ? fsnotify_destroy_marks+0x81/0x90
[ 3360.211327]  [&amp;lt;ffffffff8113ae4d&amp;gt;] ? call_rcu_sched+0x1d/0x20
[ 3360.211922]  [&amp;lt;ffffffffc0b2794c&amp;gt;] ? ll_destroy_inode+0x1c/0x20 [lustre]
[ 3360.212634]  [&amp;lt;ffffffff8121f5a8&amp;gt;] ? destroy_inode+0x38/0x60
[ 3360.213206]  [&amp;lt;ffffffff8121f6da&amp;gt;] ? evict+0x10a/0x180
[ 3360.213789]  [&amp;lt;ffffffff8121f78e&amp;gt;] ? dispose_list+0x3e/0x50
[ 3360.214338]  [&amp;lt;ffffffff81220446&amp;gt;] ? evict_inodes+0xd6/0x140
[ 3360.214903]  [&amp;lt;ffffffff812054d2&amp;gt;] generic_shutdown_super+0x72/0x100
[ 3360.215628]  [&amp;lt;ffffffff812058a2&amp;gt;] kill_anon_super+0x12/0x20
[ 3360.216207]  [&amp;lt;ffffffffc0704115&amp;gt;] lustre_kill_super+0x45/0x50 [obdclass]
[ 3360.216925]  [&amp;lt;ffffffff81205c59&amp;gt;] deactivate_locked_super+0x49/0x60
[ 3360.217570]  [&amp;lt;ffffffff812063c6&amp;gt;] deactivate_super+0x46/0x60
[ 3360.218186]  [&amp;lt;ffffffff8122375f&amp;gt;] cleanup_mnt+0x3f/0x80
[ 3360.218753]  [&amp;lt;ffffffff812237f2&amp;gt;] __cleanup_mnt+0x12/0x20
[ 3360.219318]  [&amp;lt;ffffffff810aee05&amp;gt;] task_work_run+0xc5/0xf0
[ 3360.219860]  [&amp;lt;ffffffff8102ab52&amp;gt;] do_notify_resume+0x92/0xb0
[ 3360.220504]  [&amp;lt;ffffffff816b8d37&amp;gt;] int_signal+0x12/0x17
[ 3365.065664] LustreError: 166-1: MGC10.2.8.151@tcp: Connection to MGS (at 10.2.8.152@tcp) was lost; in progress operations using this service will fail
[ 3365.067092] LustreError: Skipped 2 previous similar messages
[ 3365.067690] LustreError: 3103:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1515716050, 300s ago), entering recovery for MGS@10.2.8.152@tcp ns: MGC10.2.8.151@tcp lock: ffff88007bb0a000/0xa3346a06e12d0d8e lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x62460a664e0e6625 expref: -99 pid: 3103 timeout: 0 lvb_type: 0
[ 3365.071478] LustreError: 17045:0:(ldlm_resource.c:1100:ldlm_resource_complain()) MGC10.2.8.151@tcp: namespace resource [0x65727473756c:0x2:0x0].0x0 (ffff880069efcb40) refcount nonzero (1) after lock cleanup; forcing cleanup.
[ 3365.073441] LustreError: 17045:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x65727473756c:0x2:0x0].0x0 (ffff880069efcb40) refcount = 2
[ 3365.074760] LustreError: 17045:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:
[ 3365.075569] LustreError: 17045:0:(ldlm_resource.c:1705:ldlm_resource_dump()) ### ### ns: ?? lock: ffff88007bb0a000/0xa3346a06e12d0d8e lrc: 4/1,0 mode: --/CR res: ?? rrc=?? type: ??? flags: 0x1106400000000 nid: local remote: 0x62460a664e0e6625 expref: -99 pid: 3103 timeout: 0 lvb_type: 0
[ 3365.092970] Lustre: Unmounted lustre-client
[ 3377.067710] Lustre: lustre-MDT0000-mdc-ffff88007a24d800: Connection to lustre-MDT0000 (at 10.2.8.152@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 3377.069582] Lustre: Skipped 31 previous similar messages
[ 3379.942361] Lustre: DEBUG MARKER: /usr/sbin/lctl mark  auster : @@@@@@ FAIL: Restart of mds1 failed! 
[ 3380.113928] Lustre: DEBUG MARKER: auster : @@@@@@ FAIL: Restart of mds1 failed!
[ 3380.325749] Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /tmp/test_logs/1515713768/auster..debug_log.$(hostname -s).1515716366.log;
[ 3380.325749]          dmesg &amp;gt; /tmp/test_logs/1515713768/auster..dmesg.$(hostname -s).1515716366.log
[ 3380.730891] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[ 3381.125455] Lustre: DEBUG MARKER: /usr/sbin/lctl mark -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3381.320973] Lustre: DEBUG MARKER: -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3382.105085] Lustre: DEBUG MARKER: /usr/sbin/lctl mark excepting tests: 12a
[ 3382.287300] Lustre: DEBUG MARKER: excepting tests: 12a
[ 3382.470266] Lustre: DEBUG MARKER: 
[ 3382.470266] running=$(mount | grep -c /mnt/lustre2&apos; &apos;);
[ 3382.470266] rc=0;
[ 3382.470266] if [ $running -eq 0 ] ; then
[ 3382.470266] 	mkdir -p /mnt/lustre2;
[ 3382.470266] 	mount -t lustre  -o user_xattr,flock onyx-42vm7:onyx-42vm8:/lustre /mnt/lustre2;
[ 3382.470266] 	rc=$?;
[ 3382.470266] fi;
[ 3382.470266] exit $rc
[ 3402.626707] LustreError: 15c-8: MGC10.2.8.151@tcp: The configuration from log &apos;lustre-client&apos; failed (-5). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
[ 3402.629189] Lustre: Unmounted lustre-client
[ 3402.629745] LustreError: 17479:0:(obd_mount.c:1506:lustre_fill_super()) Unable to mount  (-5)
[ 3403.290421] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);
[ 3403.290421] if [ $running -ne 0 ] ; then
[ 3403.290421] echo Stopping client $(hostname) /mnt/lustre opts:;
[ 3403.290421] lsof /mnt/lustre || need_kill=no;
[ 3403.290421] if [ x != x -a x$need_kill != xno ]; then
[ 3403.290421]     pids=$(lsof -t /mnt/lustre | sort -u);
[ 3403.290421]     if 
[ 3600.220677] INFO: task umount:17558 blocked for more than 120 seconds.
[ 3600.221454] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 3600.222245] umount          D ffff88007a1a5ee0     0 17558  17551 0x00000080
[ 3600.223132] Call Trace:
[ 3600.223389]  [&amp;lt;ffffffff816ac5f9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 3600.224028]  [&amp;lt;ffffffff816aa427&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
[ 3600.224762]  [&amp;lt;ffffffff816a983f&amp;gt;] mutex_lock+0x1f/0x2f
[ 3600.225339]  [&amp;lt;ffffffffc061e147&amp;gt;] mgc_process_config+0x207/0x13f0 [mgc]
[ 3600.226023]  [&amp;lt;ffffffffc0704a75&amp;gt;] obd_process_config.constprop.13+0x85/0x2d0 [obdclass]
[ 3600.226895]  [&amp;lt;ffffffffc05babc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[ 3600.227594]  [&amp;lt;ffffffffc06f1de9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 3600.228319]  [&amp;lt;ffffffffc0705d9f&amp;gt;] lustre_end_log+0x1ff/0x550 [obdclass]
[ 3600.229078]  [&amp;lt;ffffffffc0afcc0d&amp;gt;] ll_put_super+0x8d/0xaa0 [lustre]
[ 3600.229740]  [&amp;lt;ffffffff81248c61&amp;gt;] ? fsnotify_destroy_marks+0x81/0x90
[ 3600.230428]  [&amp;lt;ffffffff8113ae4d&amp;gt;] ? call_rcu_sched+0x1d/0x20
[ 3600.231006]  [&amp;lt;ffffffffc0b2794c&amp;gt;] ? ll_destroy_inode+0x1c/0x20 [lustre]
[ 3600.231737]  [&amp;lt;ffffffff8121f5a8&amp;gt;] ? destroy_inode+0x38/0x60
[ 3600.232336]  [&amp;lt;ffffffff8121f6da&amp;gt;] ? evict+0x10a/0x180
[ 3600.232858]  [&amp;lt;ffffffff8121f78e&amp;gt;] ? dispose_list+0x3e/0x50
[ 3600.233474]  [&amp;lt;ffffffff81220446&amp;gt;] ? evict_inodes+0xd6/0x140
[ 3600.234032]  [&amp;lt;ffffffff812054d2&amp;gt;] generic_shutdown_super+0x72/0x100
[ 3600.234729]  [&amp;lt;ffffffff812058a2&amp;gt;] kill_anon_super+0x12/0x20
[ 3600.235330]  [&amp;lt;ffffffffc0704115&amp;gt;] lustre_kill_super+0x45/0x50 [obdclass]
[ 3600.236001]  [&amp;lt;ffffffff81205c59&amp;gt;] deactivate_locked_super+0x49/0x60
[ 3600.236716]  [&amp;lt;ffffffff812063c6&amp;gt;] deactivate_super+0x46/0x60
[ 3600.237317]  [&amp;lt;ffffffff8122375f&amp;gt;] cleanup_mnt+0x3f/0x80
[ 3600.237858]  [&amp;lt;ffffffff812237f2&amp;gt;] __cleanup_mnt+0x12/0x20
[ 3600.238460]  [&amp;lt;ffffffff810aee05&amp;gt;] task_work_run+0xc5/0xf0
[ 3600.239006]  [&amp;lt;ffffffff8102ab52&amp;gt;] do_notify_resume+0x92/0xb0
[ 3600.239647]  [&amp;lt;ffffffff816b8d37&amp;gt;] int_signal+0x12/0x17
[ 3628.070881] Lustre: 3090:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has failed due to network error: [sent 1515716613/real 1515716613]  req@ffff88006a659e00 x1589341203865152/t0(0) o250-&amp;gt;MGC10.2.8.151@tcp@10.2.8.151@tcp:26/25 lens 520/544 e 0 to 1 dl 1515716638 ref 1 fl Rpc:eXN/0/ffffffff rc 0/-1
[ 3628.074097] Lustre: 3090:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 276 previous similar messages
[ 3665.080686] LustreError: 3103:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1515716350, 300s ago), entering recovery for MGS@10.2.8.152@tcp ns: MGC10.2.8.151@tcp lock: ffff88007bb0b800/0xa3346a06e12d0db8 lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x62460a664e0e6acb expref: -99 pid: 3103 timeout: 0 lvb_type: 0
[ 3720.239678] INFO: task umount:17558 blocked for more than 120 seconds.
[ 3720.240435] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 3720.241277] umount          D ffff88007a1a5ee0     0 17558  17551 0x00000080
[ 3720.242105] Call Trace:
[ 3720.242379]  [&amp;lt;ffffffff816ac5f9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 3720.243024]  [&amp;lt;ffffffff816aa427&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
[ 3720.243723]  [&amp;lt;ffffffff816a983f&amp;gt;] mutex_lock+0x1f/0x2f
[ 3720.244271]  [&amp;lt;ffffffffc061e147&amp;gt;] mgc_process_config+0x207/0x13f0 [mgc]
[ 3720.244954]  [&amp;lt;ffffffffc0704a75&amp;gt;] obd_process_config.constprop.13+0x85/0x2d0 [obdclass]
[ 3720.245805]  [&amp;lt;ffffffffc05babc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[ 3720.246498]  [&amp;lt;ffffffffc06f1de9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 3720.247224]  [&amp;lt;ffffffffc0705d9f&amp;gt;] lustre_end_log+0x1ff/0x550 [obdclass]
[ 3720.247956]  [&amp;lt;ffffffffc0afcc0d&amp;gt;] ll_put_super+0x8d/0xaa0 [lustre]
[ 3720.248601]  [&amp;lt;ffffffff81248c61&amp;gt;] ? fsnotify_destroy_marks+0x81/0x90
[ 3720.249241]  [&amp;lt;ffffffff8113ae4d&amp;gt;] ? call_rcu_sched+0x1d/0x20
[ 3720.249876]  [&amp;lt;ffffffffc0b2794c&amp;gt;] ? ll_destroy_inode+0x1c/0x20 [lustre]
[ 3720.250558]  [&amp;lt;ffffffff8121f5a8&amp;gt;] ? destroy_inode+0x38/0x60
[ 3720.251118]  [&amp;lt;ffffffff8121f6da&amp;gt;] ? evict+0x10a/0x180
[ 3720.251684]  [&amp;lt;ffffffff8121f78e&amp;gt;] ? dispose_list+0x3e/0x50
[ 3720.252278]  [&amp;lt;ffffffff81220446&amp;gt;] ? evict_inodes+0xd6/0x140
[ 3720.252837]  [&amp;lt;ffffffff812054d2&amp;gt;] generic_shutdown_super+0x72/0x100
[ 3720.253521]  [&amp;lt;ffffffff812058a2&amp;gt;] kill_anon_super+0x12/0x20
[ 3720.254095]  [&amp;lt;ffffffffc0704115&amp;gt;] lustre_kill_super+0x45/0x50 [obdclass]
[ 3720.254820]  [&amp;lt;ffffffff81205c59&amp;gt;] deactivate_locked_super+0x49/0x60
[ 3720.255464]  [&amp;lt;ffffffff812063c6&amp;gt;] deactivate_super+0x46/0x60
[ 3720.256028]  [&amp;lt;ffffffff8122375f&amp;gt;] cleanup_mnt+0x3f/0x80
[ 3720.256624]  [&amp;lt;ffffffff812237f2&amp;gt;] __cleanup_mnt+0x12/0x20
[ 3720.257206]  [&amp;lt;ffffffff810aee05&amp;gt;] task_work_run+0xc5/0xf0
[ 3720.257816]  [&amp;lt;ffffffff8102ab52&amp;gt;] do_notify_resume+0x92/0xb0
[ 3720.258412]  [&amp;lt;ffffffff816b8d37&amp;gt;] int_signal+0x12/0x17
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;From client 1 (vm1) console log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 3077.173859] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == replay-dual test complete, duration 2281 sec ====================================================== 00:16:26 \(1515716186\)
[ 3077.357365] Lustre: DEBUG MARKER: == replay-dual test complete, duration 2281 sec ====================================================== 00:16:26 (1515716186)
[ 3077.572239] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre2&apos; &apos; /proc/mounts);
[ 3077.572239] if [ $running -ne 0 ] ; then
[ 3077.572239] echo Stopping client $(hostname) /mnt/lustre2 opts:;
[ 3077.572239] lsof /mnt/lustre2 || need_kill=no;
[ 3077.572239] if [ x != x -a x$need_kill != xno ]; then
[ 3077.572239]     pids=$(lsof -t /mnt/lustre2 | sort -u);
[ 3077.572239]    
[ 3240.203972] INFO: task umount:20148 blocked for more than 120 seconds.
[ 3240.204906] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 3240.205860] umount          D ffff88007b438000     0 20148  20141 0x00000080
[ 3240.206815] Call Trace:
[ 3240.207101]  [&amp;lt;ffffffff816ac5f9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 3240.207823]  [&amp;lt;ffffffff816aa427&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
[ 3240.208472]  [&amp;lt;ffffffff816a983f&amp;gt;] mutex_lock+0x1f/0x2f
[ 3240.209093]  [&amp;lt;ffffffffc0a7b147&amp;gt;] mgc_process_config+0x207/0x13f0 [mgc]
[ 3240.209783]  [&amp;lt;ffffffffc0750a75&amp;gt;] obd_process_config.constprop.13+0x85/0x2d0 [obdclass]
[ 3240.210610]  [&amp;lt;ffffffffc05d9bc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[ 3240.211394]  [&amp;lt;ffffffffc073dde9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 3240.212154]  [&amp;lt;ffffffffc0751d9f&amp;gt;] lustre_end_log+0x1ff/0x550 [obdclass]
[ 3240.212914]  [&amp;lt;ffffffffc0bd9c0d&amp;gt;] ll_put_super+0x8d/0xaa0 [lustre]
[ 3240.213548]  [&amp;lt;ffffffff81248c61&amp;gt;] ? fsnotify_destroy_marks+0x81/0x90
[ 3240.214268]  [&amp;lt;ffffffff8113ae4d&amp;gt;] ? call_rcu_sched+0x1d/0x20
[ 3240.214871]  [&amp;lt;ffffffffc0c0494c&amp;gt;] ? ll_destroy_inode+0x1c/0x20 [lustre]
[ 3240.215545]  [&amp;lt;ffffffff8121f5a8&amp;gt;] ? destroy_inode+0x38/0x60
[ 3240.216205]  [&amp;lt;ffffffff8121f6da&amp;gt;] ? evict+0x10a/0x180
[ 3240.216723]  [&amp;lt;ffffffff8121f78e&amp;gt;] ? dispose_list+0x3e/0x50
[ 3240.217303]  [&amp;lt;ffffffff81220446&amp;gt;] ? evict_inodes+0xd6/0x140
[ 3240.217941]  [&amp;lt;ffffffff812054d2&amp;gt;] generic_shutdown_super+0x72/0x100
[ 3240.218626]  [&amp;lt;ffffffff812058a2&amp;gt;] kill_anon_super+0x12/0x20
[ 3240.218951] LustreError: 166-1: MGC10.2.8.151@tcp: Connection to MGS (at 10.2.8.152@tcp) was lost; in progress operations using this service will fail
[ 3240.218952] LustreError: Skipped 2 previous similar messages
[ 3240.218977] LustreError: 23661:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1515716050, 300s ago), entering recovery for MGS@10.2.8.152@tcp ns: MGC10.2.8.151@tcp lock: ffff880062da5600/0x9eda9d6180716331 lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x62460a664e0e661e expref: -99 pid: 23661 timeout: 0 lvb_type: 0
[ 3240.221609] LustreError: 2809:0:(ldlm_resource.c:1100:ldlm_resource_complain()) MGC10.2.8.151@tcp: namespace resource [0x65727473756c:0x2:0x0].0x0 (ffff880060c1d480) refcount nonzero (1) after lock cleanup; forcing cleanup.
[ 3240.221611] LustreError: 2809:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x65727473756c:0x2:0x0].0x0 (ffff880060c1d480) refcount = 2
[ 3240.221612] LustreError: 2809:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:
[ 3240.221617] LustreError: 2809:0:(ldlm_resource.c:1705:ldlm_resource_dump()) ### ### ns: MGC10.2.8.151@tcp lock: ffff880062da5600/0x9eda9d6180716331 lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 3 type: PLN flags: 0x1106400000000 nid: local remote: 0x62460a664e0e661e expref: -99 pid: 23661 timeout: 0 lvb_type: 0
[ 3240.232538]  [&amp;lt;ffffffffc0750115&amp;gt;] lustre_kill_super+0x45/0x50 [obdclass]
[ 3240.233322]  [&amp;lt;ffffffff81205c59&amp;gt;] deactivate_locked_super+0x49/0x60
[ 3240.234090]  [&amp;lt;ffffffff812063c6&amp;gt;] deactivate_super+0x46/0x60
[ 3240.234676]  [&amp;lt;ffffffff8122375f&amp;gt;] cleanup_mnt+0x3f/0x80
[ 3240.235214]  [&amp;lt;ffffffff812237f2&amp;gt;] __cleanup_mnt+0x12/0x20
[ 3240.235766]  [&amp;lt;ffffffff810aee05&amp;gt;] task_work_run+0xc5/0xf0
[ 3240.236339]  [&amp;lt;ffffffff8102ab52&amp;gt;] do_notify_resume+0x92/0xb0
[ 3240.236916]  [&amp;lt;ffffffff816b8d37&amp;gt;] int_signal+0x12/0x17
[ 3240.246310] Lustre: Unmounted lustre-client
[ 3252.218981] Lustre: lustre-MDT0000-mdc-ffff880060f37000: Connection to lustre-MDT0000 (at 10.2.8.152@tcp) was lost; in progress operations using this service will wait for recovery to complete
[ 3252.220948] Lustre: Skipped 30 previous similar messages
[ 3256.312576] Lustre: DEBUG MARKER: /usr/sbin/lctl mark  auster : @@@@@@ FAIL: Restart of mds1 failed! 
[ 3256.492539] Lustre: DEBUG MARKER: auster : @@@@@@ FAIL: Restart of mds1 failed!
[ 3256.704074] Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /tmp/test_logs/1515713768/auster..debug_log.$(hostname -s).1515716366.log;
[ 3256.704074]          dmesg &amp;gt; /tmp/test_logs/1515713768/auster..dmesg.$(hostname -s).1515716366.log
[ 3257.096964] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[ 3257.506302] Lustre: DEBUG MARKER: /usr/sbin/lctl mark -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3257.698451] Lustre: DEBUG MARKER: -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3257.983880] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == replay-vbr test complete, duration -o sec ========================================================= 00:19:27 \(1515716367\)
[ 3258.157503] Lustre: DEBUG MARKER: == replay-vbr test complete, duration -o sec ========================================================= 00:19:27 (1515716367)
[ 3258.484817] Lustre: DEBUG MARKER: /usr/sbin/lctl mark excepting tests: 12a
[ 3258.667646] Lustre: DEBUG MARKER: excepting tests: 12a
[ 3258.852092] Lustre: DEBUG MARKER: 
[ 3258.852092] running=$(mount | grep -c /mnt/lustre2&apos; &apos;);
[ 3258.852092] rc=0;
[ 3258.852092] if [ $running -eq 0 ] ; then
[ 3258.852092] 	mkdir -p /mnt/lustre2;
[ 3258.852092] 	mount -t lustre  -o user_xattr,flock onyx-42vm7:onyx-42vm8:/lustre /mnt/lustre2;
[ 3258.852092] 	rc=$?;
[ 3258.852092] fi;
[ 3258.852092] exit $rc
[ 3279.009972] LustreError: 15c-8: MGC10.2.8.151@tcp: The configuration from log &apos;lustre-client&apos; failed (-5). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
[ 3279.012386] Lustre: Unmounted lustre-client
[ 3279.012960] LustreError: 4457:0:(obd_mount.c:1506:lustre_fill_super()) Unable to mount  (-5)
[ 3279.662897] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);
[ 3279.662897] if [ $running -ne 0 ] ; then
[ 3279.662897] echo Stopping client $(hostname) /mnt/lustre opts:;
[ 3279.662897] lsof /mnt/lustre || need_kill=no;
[ 3279.662897] if [ x != x -a x$need_kill != xno ]; then
[ 3279.662897]     pids=$(lsof -t /mnt/lustre | sort -u);
[ 3279.662897]     if 
[ 3304.220947] Lustre: 12857:0:(client.c:2114:ptlrpc_expire_one_request()) @@@ Request sent has timed out for slow reply: [sent 1515716398/real 1515716398]  req@ffff880060594600 x1589341091652896/t0(0) o38-&amp;gt;lustre-MDT0000-mdc-ffff880060f37000@10.2.8.152@tcp:12/10 lens 520/544 e 0 to 1 dl 1515716414 ref 1 fl Rpc:XN/0/ffffffff rc 0/-1
[ 3304.223822] Lustre: 12857:0:(client.c:2114:ptlrpc_expire_one_request()) Skipped 198 previous similar messages
[ 3480.236958] INFO: task umount:4583 blocked for more than 120 seconds.
[ 3480.237631] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 3480.238417] umount          D ffff8800368adee0     0  4583   4576 0x00000080
[ 3480.239168] Call Trace:
[ 3480.239436]  [&amp;lt;ffffffff816ac5f9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 3480.240083]  [&amp;lt;ffffffff816aa427&amp;gt;] __mutex_lock_slowpath+0xc7/0x1d0
[ 3480.240822]  [&amp;lt;ffffffff816a983f&amp;gt;] mutex_lock+0x1f/0x2f
[ 3480.241356]  [&amp;lt;ffffffffc0a7b147&amp;gt;] mgc_process_config+0x207/0x13f0 [mgc]
[ 3480.242104]  [&amp;lt;ffffffffc0750a75&amp;gt;] obd_process_config.constprop.13+0x85/0x2d0 [obdclass]
[ 3480.242891]  [&amp;lt;ffffffffc05d9bc7&amp;gt;] ? libcfs_debug_msg+0x57/0x80 [libcfs]
[ 3480.243557]  [&amp;lt;ffffffffc073dde9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[ 3480.244358]  [&amp;lt;ffffffffc0751d9f&amp;gt;] lustre_end_log+0x1ff/0x550 [obdclass]
[ 3480.245040]  [&amp;lt;ffffffffc0bd9c0d&amp;gt;] ll_put_super+0x8d/0xaa0 [lustre]
[ 3480.245705]  [&amp;lt;ffffffff81248c61&amp;gt;] ? fsnotify_destroy_marks+0x81/0x90
[ 3480.246346]  [&amp;lt;ffffffff8113ae4d&amp;gt;] ? call_rcu_sched+0x1d/0x20
[ 3480.246993]  [&amp;lt;ffffffffc0c0494c&amp;gt;] ? ll_destroy_inode+0x1c/0x20 [lustre]
[ 3480.247637]  [&amp;lt;ffffffff8121f5a8&amp;gt;] ? destroy_inode+0x38/0x60
[ 3480.248208]  [&amp;lt;ffffffff8121f6da&amp;gt;] ? evict+0x10a/0x180
[ 3480.248769]  [&amp;lt;ffffffff8121f78e&amp;gt;] ? dispose_list+0x3e/0x50
[ 3480.249331]  [&amp;lt;ffffffff81220446&amp;gt;] ? evict_inodes+0xd6/0x140
[ 3480.249954]  [&amp;lt;ffffffff812054d2&amp;gt;] generic_shutdown_super+0x72/0x100
[ 3480.250612]  [&amp;lt;ffffffff812058a2&amp;gt;] kill_anon_super+0x12/0x20
[ 3480.251193]  [&amp;lt;ffffffffc0750115&amp;gt;] lustre_kill_super+0x45/0x50 [obdclass]
[ 3480.251914]  [&amp;lt;ffffffff81205c59&amp;gt;] deactivate_locked_super+0x49/0x60
[ 3480.252539]  [&amp;lt;ffffffff812063c6&amp;gt;] deactivate_super+0x46/0x60
[ 3480.253170]  [&amp;lt;ffffffff8122375f&amp;gt;] cleanup_mnt+0x3f/0x80
[ 3480.253681]  [&amp;lt;ffffffff812237f2&amp;gt;] __cleanup_mnt+0x12/0x20
[ 3480.254231]  [&amp;lt;ffffffff810aee05&amp;gt;] task_work_run+0xc5/0xf0
[ 3480.254824]  [&amp;lt;ffffffff8102ab52&amp;gt;] do_notify_resume+0x92/0xb0
[ 3480.255398]  [&amp;lt;ffffffff816b8d37&amp;gt;] int_signal+0x12/0x17
[ 3540.237952] LustreError: 23661:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1515716350, 300s ago), entering recovery for MGS@10.2.8.151@tcp ns: MGC10.2.8.151@tcp lock: ffff880061b52800/0x9eda9d61807163e7 lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x62460a664e0e6abd expref: -99 pid: 23661 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;From client 3 (vm3) console log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; [ 3200.888758] Lustre: DEBUG MARKER: == replay-dual test complete, duration 2281 sec ====================================================== 00:16:26 (1515716186)
[ 3201.086884] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre2&apos; &apos; /proc/mounts);
[ 3201.086884] if [ $running -ne 0 ] ; then
[ 3201.086884] echo Stopping client $(hostname) /mnt/lustre2 opts:;
[ 3201.086884] lsof /mnt/lustre2 || need_kill=no;
[ 3201.086884] if [ x != x -a x$need_kill != xno ]; then
[ 3201.086884]     pids=$(lsof -t /mnt/lustre2 | sort -u);
[ 3201.086884]    
[ 3201.417836] Lustre: Unmounted lustre-client
[ 3362.842127] LustreError: 166-1: MGC10.2.8.151@tcp: Connection to MGS (at 10.2.8.152@tcp) was lost; in progress operations using this service will fail
[ 3362.843597] LustreError: Skipped 2 previous similar messages
[ 3362.844193] LustreError: 3109:0:(ldlm_request.c:148:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1515716048, 300s ago), entering recovery for MGS@10.2.8.152@tcp ns: MGC10.2.8.151@tcp lock: ffff88007b650000/0x66776ab671d1d0f5 lrc: 4/1,0 mode: --/CR res: [0x65727473756c:0x2:0x0].0x0 rrc: 2 type: PLN flags: 0x1000000000000 nid: local remote: 0x62460a664e0e6617 expref: -99 pid: 3109 timeout: 0 lvb_type: 0
[ 3362.852523] LustreError: 28374:0:(ldlm_resource.c:1100:ldlm_resource_complain()) MGC10.2.8.151@tcp: namespace resource [0x65727473756c:0x2:0x0].0x0 (ffff88000010ac00) refcount nonzero (1) after lock cleanup; forcing cleanup.
[ 3362.854550] LustreError: 28374:0:(ldlm_resource.c:1682:ldlm_resource_dump()) --- Resource: [0x65727473756c:0x2:0x0].0x0 (ffff88000010ac00) refcount = 2
[ 3362.855911] LustreError: 28374:0:(ldlm_resource.c:1703:ldlm_resource_dump()) Waiting locks:
[ 3362.856734] LustreError: 28374:0:(ldlm_resource.c:1705:ldlm_resource_dump()) ### ### ns: ?? lock: ffff88007b650000/0x66776ab671d1d0f5 lrc: 4/1,0 mode: --/CR res: ?? rrc=?? type: ??? flags: 0x1106400000000 nid: local remote: 0x62460a664e0e6617 expref: -99 pid: 3109 timeout: 0 lvb_type: 0
[ 3379.845411] Lustre: DEBUG MARKER: /usr/sbin/lctl mark  auster : @@@@@@ FAIL: Restart of mds1 failed! 
[ 3380.015373] Lustre: DEBUG MARKER: auster : @@@@@@ FAIL: Restart of mds1 failed!
[ 3380.229705] Lustre: DEBUG MARKER: /usr/sbin/lctl dk &amp;gt; /tmp/test_logs/1515713768/auster..debug_log.$(hostname -s).1515716366.log;
[ 3380.229705]          dmesg &amp;gt; /tmp/test_logs/1515713768/auster..dmesg.$(hostname -s).1515716366.log
[ 3380.621370] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[ 3381.043662] Lustre: DEBUG MARKER: /usr/sbin/lctl mark -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3381.222383] Lustre: DEBUG MARKER: -----============= acceptance-small: replay-vbr ============----- Fri Jan 12 00:19:27 UTC 2018
[ 3382.008755] Lustre: DEBUG MARKER: /usr/sbin/lctl mark excepting tests: 12a
[ 3382.191345] Lustre: DEBUG MARKER: excepting tests: 12a
[ 3382.375529] Lustre: DEBUG MARKER: 
[ 3382.375529] running=$(mount | grep -c /mnt/lustre2&apos; &apos;);
[ 3382.375529] rc=0;
[ 3382.375529] if [ $running -eq 0 ] ; then
[ 3382.375529] 	mkdir -p /mnt/lustre2;
[ 3382.375529] 	mount -t lustre  -o user_xattr,flock onyx-42vm7:onyx-42vm8:/lustre /mnt/lustre2;
[ 3382.375529] 	rc=$?;
[ 3382.375529] fi;
[ 3382.375529] exit $rc
[ 3402.532825] LustreError: 15c-8: MGC10.2.8.151@tcp: The configuration from log &apos;lustre-client&apos; failed (-5). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
[ 3402.535372] Lustre: Unmounted lustre-client
[ 3402.536342] LustreError: 32090:0:(obd_mount.c:1506:lustre_fill_super()) Unable to mount  (-5)
[ 3403.185319] Lustre: DEBUG MARKER: running=$(grep -c /mnt/lustre&apos; &apos; /proc/mounts);
[ 3403.185319] if [ $running -ne 0 ] ; then
[ 3403.185319] echo Stopping client $(hostname) /mnt/lustre opts:;
[ 3403.185319] lsof /mnt/lustre || need_kill=no;
[ 3403.185319] if [ x != x -a x$need_kill != xno ]; then
[ 3403.185319]     pids=$(lsof -t /mnt/lustre | sort -u);
[ 3403.185319]     if 
[ 3410.455336] Lustre: Unmounted lustre-client
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There are several failures like this one, but I can&apos;t find any that match the issues seen here on the clients.&lt;/p&gt;</description>
                <environment></environment>
        <key id="50257">LU-10519</key>
            <summary>replay-vbr fails to start running tests due to MDS not restarting</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="emoly.liu">Emoly Liu</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                    </labels>
                <created>Tue, 16 Jan 2018 16:54:00 +0000</created>
                <updated>Thu, 16 Aug 2018 17:34:01 +0000</updated>
                                            <version>Lustre 2.11.0</version>
                    <version>Lustre 2.10.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="218553" author="pjones" created="Thu, 18 Jan 2018 18:44:26 +0000"  >&lt;p&gt;Emoly&lt;/p&gt;

&lt;p&gt;Could you please advise on this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="218555" author="green" created="Thu, 18 Jan 2018 18:45:30 +0000"  >&lt;p&gt;So the important problem here is mds/mgs start failure. client unmount hanging is just the sideeffect of that.&lt;/p&gt;</comment>
                            <comment id="218629" author="yong.fan" created="Fri, 19 Jan 2018 08:35:14 +0000"  >&lt;p&gt;Where can we find the kernel debug logs: /tmp/test_logs/1515713768/auster..debug_log.$(hostname -s).1515716366.log? It may contain some useful information about the failure to restart the MDS.&lt;/p&gt;</comment>
                            <comment id="218645" author="emoly.liu" created="Fri, 19 Jan 2018 15:14:05 +0000"  >&lt;p&gt;Here is the time line that the logs showed,&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00:16:26  kernel: Lustre: DEBUG MARKER: == replay-dual test complete, duration 2281 sec 
00:19:06 onyx-42vm4 kernel: Call Trace: (10.2.8.148)
00:19:08 onyx-42vm8 kernel: Lustre: MGS: Received new LWP connection from 10.2.8.147@tcp, removing former export from same NID
00:19:10 onyx-42vm1 kernel: Call Trace: (10.2.8.145)
00:19:11 onyx-42vm8 kernel: Lustre: MGS: Received new LWP connection from 10.2.8.148@tcp, removing former export from same NID
00:19:26 onyx-42vm1 mrshd[3200]: root@onyx-42vm1 as root: cmd=&apos;/usr/sbin/lctl mark &quot;/usr/sbin/lctl mark  auster : @@@@@@ FAIL: Restart of mds1 failed! &quot;
00:19:46 onyx-42vm8: restarted
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So as Fanyong said, that *.1515716366.log will tell us why the mds1 restart failed.&lt;/p&gt;</comment>
                            <comment id="218670" author="jamesanunez" created="Fri, 19 Jan 2018 18:28:56 +0000"  >&lt;p&gt;Minh pointed out that, at the end of the replay-dual test suite log, replay-dual.suite_log.onyx-42vm1.log, we see a failure&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;CMD: onyx-42vm7 hostname
mount facets: mds1
CMD: onyx-42vm7 lsmod | grep zfs &amp;gt;&amp;amp;/dev/null || modprobe zfs;
			zpool list -H lustre-mdt1 &amp;gt;/dev/null 2&amp;gt;&amp;amp;1 ||
			zpool import -f -o cachefile=none -d /dev/lvm-Role_MDS lustre-mdt1
onyx-42vm7: cannot import &apos;lustre-mdt1&apos;: no such pool available
 auster : @@@@@@ FAIL: Restart of mds1 failed! 
  Trace dump:
  = /usr/lib64/lustre/tests/test-framework.sh:5314:error()
  = /usr/lib64/lustre/tests/test-framework.sh:1469:mount_facets()
  = /usr/lib64/lustre/tests/test-framework.sh:2900:facet_failover()
  = /usr/lib64/lustre/tests/test-framework.sh:2994:fail()
  = /usr/lib64/lustre/tests/test-framework.sh:3705:stopall()
  = auster:113:reset_lustre()
  = auster:217:run_suite()
  = auster:234:run_suite_logged()
  = auster:298:run_suites()
  = auster:334:main()&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="218798" author="jamesanunez" created="Mon, 22 Jan 2018 14:31:41 +0000"  >&lt;p&gt;We reran the failed test session to see if this is reproducible and to see if we could get more information. replay-dual test 16 fails due to &apos;Restart of mds1 failed!&apos; and test 17 hangs. More information at &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/db4cfddc-ff78-11e7-a6ad-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/db4cfddc-ff78-11e7-a6ad-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="218955" author="emoly.liu" created="Wed, 24 Jan 2018 02:33:06 +0000"  >&lt;p&gt;Same as last time, the logs from mds1 (onyx-47vm7) didn&apos;t show anything special during the zfs import in the period &lt;span class=&quot;error&quot;&gt;&amp;#91;07:41:19 - 07:41:22&amp;#93;&lt;/span&gt;.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan 22 07:41:20 onyx-47vm7 mrshd[1469]: root@onyx-47vm1 as root: cmd=&apos;(PATH=$PATH:/usr/lib64/lustre/utils:/usr/lib64/lustre/tests:/sbin:/usr/sbin; cd /usr/lib64/lustre/tests; LUSTRE=&quot;/usr/lib64/lustre&quot; sh -c &quot;lsmod | grep zfs &amp;gt;&amp;amp;/dev/null || modprobe zfs;#012#011#011#011zpool list -H lustre-mdt1 &amp;gt;/dev/null 2&amp;gt;&amp;amp;1 ||#012#011#011#011zpool import -f -o cachefile=none -d /dev/lvm-Role_MDS lustre-mdt1&quot;);echo XXRETCODE:$?&apos;
Jan 22 07:41:20 onyx-47vm7 kernel: spl: loading out-of-tree module taints kernel.
Jan 22 07:41:20 onyx-47vm7 kernel: spl: module verification failed: signature and/or required key missing - tainting kernel
Jan 22 07:41:20 onyx-47vm7 kernel: SPL: Loaded module v0.7.5-1
Jan 22 07:41:20 onyx-47vm7 kernel: znvpair: module license &apos;CDDL&apos; taints kernel.
Jan 22 07:41:20 onyx-47vm7 kernel: Disabling lock debugging due to kernel taint
Jan 22 07:41:21 onyx-47vm7 kernel: ZFS: Loaded module v0.7.5-1, ZFS pool version 5000, ZFS filesystem version 5
Jan 22 07:41:21 onyx-47vm7 xinetd[941]: EXIT: mshell status=0 pid=1467 duration=1(sec)
Jan 22 07:41:21 onyx-47vm7 systemd-logind: Removed session c8.
Jan 22 07:41:21 onyx-47vm7 systemd: Removed slice User Slice of root.
Jan 22 07:41:21 onyx-47vm7 systemd: Stopping User Slice of root.
Jan 22 07:41:22 onyx-47vm7 xinetd[941]: START: mshell pid=1531 from=::ffff:10.2.8.210
Jan 22 07:41:22 onyx-47vm7 systemd: Created slice User Slice of root.
Jan 22 07:41:22 onyx-47vm7 systemd: Starting User Slice of root.
Jan 22 07:41:22 onyx-47vm7 systemd: Started Session c9 of user root.
Jan 22 07:41:22 onyx-47vm7 systemd-logind: New session c9 of user root.
Jan 22 07:41:22 onyx-47vm7 systemd: Starting Session c9 of user root.
Jan 22 07:41:22 onyx-47vm7 mrshd[1532]: root@onyx-47vm1 as root: cmd=&apos;/usr/sbin/lctl mark &quot;/usr/sbin/lctl mark  replay-dual test_16: @@@@@@ FAIL: Restart of mds1 failed! &quot;;echo XXRETCODE:$?&apos;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="219136" author="jamesanunez" created="Thu, 25 Jan 2018 14:22:31 +0000"  >&lt;p&gt;We ran failover ZFS testing again with Lustre 2.10.3 RC1 and the previous version of ZFS, 0.7.3. There are still some issues with replay-vbr; test 4d fails and 4e hangs. Logs at &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0a61b734-01b3-11e8-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0a61b734-01b3-11e8-bd00-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;We do see replay-vbr test 4d fails with the following in the test_log, &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mount facets: mds1
CMD: onyx-42vm7 lsmod | grep zfs &amp;gt;&amp;amp;/dev/null || modprobe zfs;
			zpool list -H lustre-mdt1 &amp;gt;/dev/null 2&amp;gt;&amp;amp;1 ||
			zpool import -f -o cachefile=none -d /dev/lvm-Role_MDS lustre-mdt1
onyx-42vm7: cannot import &apos;lustre-mdt1&apos;: no such pool available
 replay-vbr test_4d: @@@@@@ FAIL: Restart of mds1 failed! 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

</comment>
                            <comment id="219215" author="adilger" created="Fri, 26 Jan 2018 01:11:25 +0000"  >&lt;p&gt;I was trying to see if there was anything interesting on the MDS console.  I was looking at the test run from 2018-01-25 05:02:19 UTC:&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/0a61b734-01b3-11e8-bd00-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/0a61b734-01b3-11e8-bd00-52540065bddc&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;In the &lt;a href=&quot;https://testing.hpdd.intel.com/test_logs/0ab418ee-01b3-11e8-bd00-52540065bddc/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;onyx-42vm7 (MDS1) console log&lt;/a&gt; it has test markers in the log from replay-vbr test_0a (05:02:35) until test_1b (05:02:43), then it has some conman reconnects (01-25 05:02 to 05:10), then the node reboots and it is 01-25 06:14 and chef proceeds to reinstall the MDS and start running replay-single...&lt;/p&gt;

&lt;p&gt;In the &lt;a href=&quot;https://testing.hpdd.intel.com/test_logs/0ab41a38-01b3-11e8-bd00-52540065bddc/show_text&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;onyx-42vm8 (MDS2) console log&lt;/a&gt; I see test markers up to the end of replay-dual (04:56:33), then a few conman disconnect/connect messages (01-25 04:59 to 05:12, which is after replay-vbr has already run), and then it reboots and it is 01-25 06:14 and chef reinstalls the node for replay-single.&lt;/p&gt;

&lt;p&gt;This appears to possibly be some kind of problem with the console server, since virtually none of the other node console logs have anything in this period either, which makes this problem difficult to debug.  The MDS dmesg logs show only a reboot, and nothing from the actual tests.&lt;/p&gt;
</comment>
                            <comment id="224560" author="standan" created="Mon, 26 Mar 2018 18:27:21 +0000"  >&lt;p&gt;Also on master build 3730 2.11RC1 for Failover&#160;RHEL 7.4 Server/ZFS RHEL 7.4 Client&lt;br/&gt;
&lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/6ad7901a-2e0f-11e8-b6a0-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/6ad7901a-2e0f-11e8-b6a0-52540065bddc&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="52985">LU-11256</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzr6f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>