<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:23 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10027] Unable to finish mount  on MDS while Rolling downgrade </title>
                <link>https://jira.whamcloud.com/browse/LU-10027</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;MDS hung when trying to mount MDS trying to downgrade it from 2.10.1 RC1 to b_ieel3_0 build 222.&lt;br/&gt;
This happens for both ldiskfs as well as zfs&lt;/p&gt;

&lt;p&gt;ldiskfs&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@onyx-63 ~]# mount -t lustre -o acl,user_xattr /dev/sdb1 /mnt/mds0
mount.lustre: increased /sys/block/sdb/queue/max_sectors_kb from 512 to 32767
mount.lustre: change scheduler of /sys/block/sdb[ 2004.864821] libcfs: loading out-of-tree module taints kernel.
/queue/scheduler[ 2004.871947] libcfs: module verification failed: signature and/or required key missing - tainting kernel
 from cfq to deadline
[ 2004.888945] LNet: HW CPU cores: 32, npartitions: 4
[ 2004.898521] alg: No test for adler32 (adler32-zlib)
[ 2004.904129] alg: No test for crc32 (crc32-table)
[ 2009.942002] sha512_ssse3: Using AVX optimized SHA-512 implementation
[ 2012.937978] Lustre: Lustre: Build Version: 2.7.19.10--PRISTINE-3.10.0-514.10.2.el7_lustre.x86_64
[ 2012.975643] LNet: Added LNI 10.2.5.135@tcp [8/256/0/180]
[ 2012.981679] LNet: Accept secure, port 988
[ 2013.034692] LDISKFS-fs (sdb1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,user_xattr,errors=remount-ro,no_mbcache

[ 2065.369959] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.51@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2067.236139] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.18@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2067.255509] LustreError: Skipped 1 previous similar message
[ 2076.647185] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.19@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.

[ 2090.368245] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.51@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2101.644288] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.19@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2101.663651] LustreError: Skipped 2 previous similar messages
[ 2115.367485] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.51@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2140.366559] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.51@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2140.385928] LustreError: Skipped 4 previous similar messages

[ 2160.784144] INFO: task ll_mgs_0002:21386 blocked for more than 120 seconds.
[ 2160.791943] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 2160.800724] ll_mgs_0002     D ffffffffa0f78660     0 21386      2 0x00000080
[ 2160.808691]  ffff88040aa43a10 0000000000000046 ffff880429e04e70 ffff88040aa43fd8
[ 2160.818118]  ffff88040aa43fd8 ffff88040aa43fd8 ffff880429e04e70 ffff88081df59d48
[ 2160.827408]  ffff88081df59d50 7fffffffffffffff ffff880429e04e70 ffffffffa0f78660
[ 2160.836732] Call Trace:
[ 2160.840486]  [&amp;lt;ffffffffa0f78660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 2160.850251]  [&amp;lt;ffffffff8168bd09&amp;gt;] schedule+0x29/0x70
[ 2160.856792]  [&amp;lt;ffffffff81689759&amp;gt;] schedule_timeout+0x239/0x2d0
[ 2160.864288]  [&amp;lt;ffffffff810c7f05&amp;gt;] ? sched_clock_cpu+0x85/0xc0
[ 2160.871684]  [&amp;lt;ffffffff810c1b15&amp;gt;] ? check_preempt_curr+0x75/0xa0
[ 2160.879360]  [&amp;lt;ffffffff810c1b59&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
[ 2160.886649]  [&amp;lt;ffffffffa0f78660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 2160.896367]  [&amp;lt;ffffffff8168c0e6&amp;gt;] wait_for_completion+0x116/0x170
[ 2160.904146]  [&amp;lt;ffffffff810c5080&amp;gt;] ? wake_up_state+0x20/0x20
[ 2160.911392]  [&amp;lt;ffffffffa095a047&amp;gt;] llog_process_or_fork+0x1d7/0x590 [obdclass]
[ 2160.920354]  [&amp;lt;ffffffffa095a414&amp;gt;] llog_process+0x14/0x20 [obdclass]
[ 2160.928335]  [&amp;lt;ffffffffa0f80c9f&amp;gt;] mgs_find_or_make_fsdb+0x72f/0x7e0 [mgs]
[ 2160.936899]  [&amp;lt;ffffffffa0f80db2&amp;gt;] mgs_check_index+0x62/0x2f0 [mgs]
[ 2160.944786]  [&amp;lt;ffffffffa0f689fe&amp;gt;] mgs_target_reg+0x38e/0x1320 [mgs]
[ 2160.952844]  [&amp;lt;ffffffffa0c74adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 2160.961512]  [&amp;lt;ffffffffa0c18b8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 2160.971041]  [&amp;lt;ffffffffa084fce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 2160.979619]  [&amp;lt;ffffffffa0c15c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 2160.988168]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 2160.995664]  [&amp;lt;ffffffffa0c1c4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 2161.003618]  [&amp;lt;ffffffff81029569&amp;gt;] ? __switch_to+0xd9/0x4c0
[ 2161.010710]  [&amp;lt;ffffffffa0c1b8b0&amp;gt;] ? ptlrpc_register_service+0x1070/0x1070 [ptlrpc]
[ 2161.020100]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 2161.026468]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2161.034663]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 2161.041573]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2176.641784] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.19@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2176.662876] LustreError: Skipped 5 previous similar messages
[ 2214.124574] LNet: Service thread pid 21386 was inactive for 200.36s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[ 2214.145286] Pid: 21386, comm: ll_mgs_0002
[ 2214.150532] 
[ 2214.150532] Call Trace:
[ 2214.156430]  [&amp;lt;ffffffffa0f78660&amp;gt;] ? mgs_fsdb_handler+0x0/0x10a0 [mgs]
[ 2214.164352]  [&amp;lt;ffffffff8168bd09&amp;gt;] schedule+0x29/0x70
[ 2214.170643]  [&amp;lt;ffffffff81689759&amp;gt;] schedule_timeout+0x239/0x2d0
[ 2214.177911]  [&amp;lt;ffffffff810c7f05&amp;gt;] ? sched_clock_cpu+0x85/0xc0
[ 2214.185069]  [&amp;lt;ffffffff810c1b15&amp;gt;] ? check_preempt_curr+0x75/0xa0
[ 2214.192491]  [&amp;lt;ffffffff810c1b59&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
[ 2214.199520]  [&amp;lt;ffffffffa0f78660&amp;gt;] ? mgs_fsdb_handler+0x0/0x10a0 [mgs]
[ 2214.207414]  [&amp;lt;ffffffff8168c0e6&amp;gt;] wait_for_completion+0x116/0x170
[ 2214.214917]  [&amp;lt;ffffffff810c5080&amp;gt;] ? default_wake_function+0x0/0x20
[ 2214.222514]  [&amp;lt;ffffffffa095a047&amp;gt;] llog_process_or_fork+0x1d7/0x590 [obdclass]
[ 2214.231181]  [&amp;lt;ffffffffa095a414&amp;gt;] llog_process+0x14/0x20 [obdclass]
[ 2214.238870]  [&amp;lt;ffffffffa0f80c9f&amp;gt;] mgs_find_or_make_fsdb+0x72f/0x7e0 [mgs]
[ 2214.247145]  [&amp;lt;ffffffffa0f80db2&amp;gt;] mgs_check_index+0x62/0x2f0 [mgs]
[ 2214.254738]  [&amp;lt;ffffffffa0f689fe&amp;gt;] mgs_target_reg+0x38e/0x1320 [mgs]
[ 2214.262432]  [&amp;lt;ffffffffa0c74adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 2214.270794]  [&amp;lt;ffffffffa0c18b8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 2214.280035]  [&amp;lt;ffffffffa084fce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 2214.288315]  [&amp;lt;ffffffffa0c15c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 2214.296584]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 2214.303799]  [&amp;lt;ffffffffa0c1c4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 2214.311475]  [&amp;lt;ffffffff81029569&amp;gt;] ? __switch_to+0xd9/0x4c0
[ 2214.318290]  [&amp;lt;ffffffffa0c1b8b0&amp;gt;] ? ptlrpc_main+0x0/0x1f60 [ptlrpc]
[ 2214.325964]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 2214.332082]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 2214.338288]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 2214.344981]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 2214.351182] 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;ZFS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@onyx-78 ~]# mount -t lustre -o acl,user_xattr lustre-mdt1/mdt1 /mnt/mds0
[ 2214.245461] LNet: HW CPU cores: 72, npartitions: 8
[ 2214.253142] alg: No test for adler32 (adler32-zlib)
[ 2214.258723] alg: No test for crc32 (crc32-table)
[ 2219.277933] sha512_ssse3: Using AVX2 optimized SHA-512 implementation
[ 2222.279787] Lustre: Lustre: Build Version: 2.7.19.10--PRISTINE-3.10.0-514.10.2.el7_lustre.x86_64
[ 2222.309748] LNet: Added LNI 10.2.2.50@tcp [8/512/0/180]
[ 2222.315678] LNet: Accept secure, port 988
[ 2273.432429] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.52@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2275.484816] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.16@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2275.504279] LustreError: Skipped 1 previous similar message
[ 2298.432064] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.52@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2298.451527] LustreError: Skipped 1 previous similar message
[ 2300.482820] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.4.16@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2300.502281] LustreError: Skipped 1 previous similar message
[ 2323.431994] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.52@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2323.451546] LustreError: Skipped 1 previous similar message
[ 2348.432062] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.52@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2348.455763] LustreError: Skipped 3 previous similar messages
[ 2373.432088] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.52@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 2373.455409] LustreError: Skipped 3 previous similar messages
[ 2401.707691] INFO: task ll_mgs_0001:32818 blocked for more than 120 seconds.
[ 2401.717283] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 2401.727712] ll_mgs_0001     D ffff8810583fae30     0 32818      2 0x00000080
[ 2401.737119]  ffff88103479bba0 0000000000000046 ffff881058a91f60 ffff88103479bfd8
[ 2401.746898]  ffff88103479bfd8 ffff88103479bfd8 ffff881058a91f60 ffff8810583fae28
[ 2401.756643]  ffff8810583fae2c ffff881058a91f60 00000000ffffffff ffff8810583fae30
[ 2401.766335] Call Trace:
[ 2401.770395]  [&amp;lt;ffffffff8168cdf9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 2401.778859]  [&amp;lt;ffffffff8168aa55&amp;gt;] __mutex_lock_slowpath+0xc5/0x1c0
[ 2401.786989]  [&amp;lt;ffffffff81689ebf&amp;gt;] mutex_lock+0x1f/0x2f
[ 2401.793954]  [&amp;lt;ffffffffa11d829f&amp;gt;] mgs_fsc_attach+0x22f/0x600 [mgs]
[ 2401.802032]  [&amp;lt;ffffffffa11b247a&amp;gt;] mgs_llog_open+0x1fa/0x430 [mgs]
[ 2401.810052]  [&amp;lt;ffffffffa0fd6adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 2401.818939]  [&amp;lt;ffffffffa0f7ab8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 2401.828681]  [&amp;lt;ffffffffa0be7ce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 2401.837460]  [&amp;lt;ffffffffa0f77c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 2401.846192]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 2401.853882]  [&amp;lt;ffffffffa0f7e4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 2401.862057]  [&amp;lt;ffffffffa0f7d8b0&amp;gt;] ? ptlrpc_register_service+0x1070/0x1070 [ptlrpc]
[ 2401.871654]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 2401.878227]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2401.886661]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 2401.893816]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2401.902243] INFO: task ll_mgs_0002:32819 blocked for more than 120 seconds.
[ 2401.911156] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 2401.921038] ll_mgs_0002     D ffffffffa11c4660     0 32819      2 0x00000080
[ 2401.930084]  ffff88103523ba10 0000000000000046 ffff881058a94e70 ffff88103523bfd8
[ 2401.939561]  ffff88103523bfd8 ffff88103523bfd8 ffff881058a94e70 ffff88085a687748
[ 2401.949014]  ffff88085a687750 7fffffffffffffff ffff881058a94e70 ffffffffa11c4660
[ 2401.958448] Call Trace:
[ 2401.962252]  [&amp;lt;ffffffffa11c4660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 2401.972072]  [&amp;lt;ffffffff8168bd09&amp;gt;] schedule+0x29/0x70
[ 2401.978673]  [&amp;lt;ffffffff81689759&amp;gt;] schedule_timeout+0x239/0x2d0
[ 2401.986230]  [&amp;lt;ffffffff810c7f05&amp;gt;] ? sched_clock_cpu+0x85/0xc0
[ 2401.993663]  [&amp;lt;ffffffff810c1b15&amp;gt;] ? check_preempt_curr+0x75/0xa0
[ 2402.001376]  [&amp;lt;ffffffff810c1b59&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
[ 2402.008685]  [&amp;lt;ffffffffa11c4660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 2402.018404]  [&amp;lt;ffffffff8168c0e6&amp;gt;] wait_for_completion+0x116/0x170
[ 2402.026180]  [&amp;lt;ffffffff810c5080&amp;gt;] ? wake_up_state+0x20/0x20
[ 2402.033374]  [&amp;lt;ffffffffa0cbc047&amp;gt;] llog_process_or_fork+0x1d7/0x590 [obdclass]
[ 2402.042291]  [&amp;lt;ffffffffa0cbc414&amp;gt;] llog_process+0x14/0x20 [obdclass]
[ 2402.050215]  [&amp;lt;ffffffffa11ccc9f&amp;gt;] mgs_find_or_make_fsdb+0x72f/0x7e0 [mgs]
[ 2402.058710]  [&amp;lt;ffffffffa11ccdb2&amp;gt;] mgs_check_index+0x62/0x2f0 [mgs]
[ 2402.066494]  [&amp;lt;ffffffffa11b49fe&amp;gt;] mgs_target_reg+0x38e/0x1320 [mgs]
[ 2402.074414]  [&amp;lt;ffffffffa0fd6adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 2402.082981]  [&amp;lt;ffffffffa0f7ab8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 2402.092385]  [&amp;lt;ffffffffa0be7ce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 2402.100831]  [&amp;lt;ffffffffa0f77c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 2402.109235]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 2402.116588]  [&amp;lt;ffffffffa0f7e4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 2402.124421]  [&amp;lt;ffffffffa0f7d8b0&amp;gt;] ? ptlrpc_register_service+0x1070/0x1070 [ptlrpc]
[ 2402.133680]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 2402.139934]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2402.148035]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 2402.154864]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 2402.162962] INFO: task ll_mgs_0003:32907 blocked for more than 120 seconds.
[ 2402.171544] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 2402.181108] ll_mgs_0003     D ffff8810583fae30     0 32907      2 0x00000080
[ 2402.189851]  ffff88102ae2bba0 0000000000000046 ffff881047b58000 ffff88102ae2bfd8
[ 2402.199037]  ffff88102ae2bfd8 ffff88102ae2bfd8 ffff881047b58000 ffff8810583fae28
[ 2402.208210]  ffff8810583fae2c ffff881047b58000 00000000ffffffff ffff8810583fae30
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Steps followed for Rolling Upgrade Downgrade:&lt;br/&gt;
1. Started with all clients and servers with b_ieel3_0 build 222 and build the Lustre File system&lt;br/&gt;
2. Upgraded OSS to 2.10.1RC1 re-mounted and ran sanity.sh&lt;br/&gt;
3. Upgraded MDS to 2.10.1 RC1 , remounted and ran sanity.sh&lt;br/&gt;
4. Upgraded Clients to 2.10.1RC1, remounted and ran sanity.sh&lt;br/&gt;
5. Downgraded Clients to b_ieel3_0 build 222 remounted and ran sanity.sh&lt;br/&gt;
6. Downgraded MDS to b_ieel3_0 build 222.&lt;br/&gt;
For both ldiskfs and zfs when trying to mount , the system hangs&lt;/p&gt;</description>
                <environment>Older Version: b_ieel3_0, build 222&lt;br/&gt;
Newer Version: b2_10 build 26</environment>
        <key id="48447">LU-10027</key>
            <summary>Unable to finish mount  on MDS while Rolling downgrade </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="standan">Saurabh Tandan</reporter>
                        <labels>
                    </labels>
                <created>Sat, 23 Sep 2017 21:16:16 +0000</created>
                <updated>Tue, 19 Mar 2019 05:43:21 +0000</updated>
                                            <version>Lustre 2.10.1</version>
                    <version>Lustre 2.10.2</version>
                    <version>Lustre 2.10.3</version>
                    <version>Lustre 2.10.5</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="209662" author="standan" created="Tue, 26 Sep 2017 20:58:09 +0000"  >&lt;p&gt;Took following steps to reproduce the issue and the issue was not reproduced. &lt;br/&gt;
Steps for both zfs and ldiskfs:&lt;br/&gt;
1. Clients and Servers with b_ieel3_0 build 222. Created Lustre file system. &lt;br/&gt;
2. Unmounted and upgraded all clients and Servers to 2.10.1. Mounted lustre again.&lt;br/&gt;
3. Unmounted Clients and Downgraded them to b_ieel3_0 build 222. Mounted Lustre again when nodes were up.&lt;br/&gt;
4. Unmounted MDS and downgraded it to b_ieel3_0 build 222. Mounted lustre and it worked this time.&lt;br/&gt;
5. Ran sanity now and the test ended abruptly after the completion of test_117 for both zfs and ldiskfs.&lt;/p&gt;

&lt;p&gt;When I checked the states of nodes, MDT and one client were unmounted, OSTs and 2nd client were mounted.&lt;/p&gt;

&lt;p&gt;Results:&lt;br/&gt;
ldiskfs - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/df43fbf4-a2fb-11e7-b786-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/df43fbf4-a2fb-11e7-b786-5254006e85c2&lt;/a&gt;&lt;br/&gt;
zfs - &lt;a href=&quot;https://testing.hpdd.intel.com/test_sessions/335a5516-a2fd-11e7-bb19-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sessions/335a5516-a2fd-11e7-bb19-5254006e85c2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="209764" author="adilger" created="Wed, 27 Sep 2017 19:03:06 +0000"  >&lt;p&gt;Filed the sanity test failure as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10039&quot; title=&quot;ioctl error after downgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10039&quot;&gt;LU-10039&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="209767" author="adilger" created="Wed, 27 Sep 2017 19:33:24 +0000"  >&lt;p&gt;In regards to the original &quot;can&apos;t mount MDS&quot; error, I see in the sanity test runs the following message:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 5983.313562] LustreError: 25318:0:(obd_config.c:1618:class_config_llog_handler()) Unknown llog record type 0x68 encountered
[ 5983.318235] LustreError: 25318:0:(obd_config.c:1624:class_config_llog_handler()) MGC10.2.5.248@tcp: cfg command failed: rc = -22
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It isn&apos;t clear where &quot;0x68&quot; comes from, since llog record types should be something like &lt;tt&gt;OBD_CFG_REC&lt;/tt&gt; = &lt;tt&gt;0x10620000&lt;/tt&gt;.  The MDS stacks above appear to be stuck in llog processing, so it is possible there is some kind of problem with the llog records, but there is no way to know.&lt;/p&gt;</comment>
                            <comment id="215284" author="sarah" created="Tue, 5 Dec 2017 05:46:33 +0000"  >&lt;p&gt;Hit the same issue when rolling downgrade MDS from 2.10.2 to EE3.x build #222  ZFS&lt;br/&gt;
similar steps as Saurabh listed in the first description.&lt;br/&gt;
1. setup and format system as EE3.x ZFS&lt;br/&gt;
2. rolling upgrade OSS, MDS, clients to 2.10.2 ZFS, run sanity&lt;br/&gt;
3. downgrade clients and MDS back to EE3.x again, mount MDS hang&lt;/p&gt;


&lt;p&gt;MDS console&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 1524.602445] LustreError: 137-5: lustre-MDT0000_UUID: not available for connect from 10.2.2.44@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
[ 1524.625171] LustreError: Skipped 3 previous similar messages
[ 1532.711986] LNet: Service thread pid 32831 was inactive for 200.75s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[ 1532.734540] Pid: 32831, comm: ll_mgs_0001
[ 1532.740666] 
[ 1532.740666] Call Trace:
[ 1532.748287]  [&amp;lt;ffffffff8168cdf9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 1532.756975]  [&amp;lt;ffffffff8168aa55&amp;gt;] __mutex_lock_slowpath+0xc5/0x1c0
[ 1532.765337]  [&amp;lt;ffffffff81689ebf&amp;gt;] mutex_lock+0x1f/0x2f
[ 1532.772421]  [&amp;lt;ffffffffa11e729f&amp;gt;] mgs_fsc_attach+0x22f/0x600 [mgs]
[ 1532.780638]  [&amp;lt;ffffffffa11c147a&amp;gt;] mgs_llog_open+0x1fa/0x430 [mgs]
[ 1532.788684]  [&amp;lt;ffffffffa0fe5adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 1532.797608]  [&amp;lt;ffffffffa0f89b8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 1532.807318]  [&amp;lt;ffffffffa0bd6ce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 1532.816023]  [&amp;lt;ffffffffa0f86c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 1532.824681]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 1532.832249]  [&amp;lt;ffffffffa0f8d4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 1532.840318]  [&amp;lt;ffffffffa0f8c8b0&amp;gt;] ? ptlrpc_main+0x0/0x1f60 [ptlrpc]
[ 1532.848288]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 1532.854713]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 1532.861175]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 1532.868119]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread+0x0/0xe0
[ 1532.874546] 
[ 1532.877074] LustreError: dumping log to /tmp/lustre-log.1512449773.32831
[ 1561.996681] INFO: task ll_mgs_0001:32831 blocked for more than 120 seconds.
[ 1562.005988] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 1562.016152] ll_mgs_0001     D ffff88085bcc5c30     0 32831      2 0x00000080
[ 1562.025392]  ffff881037aabba0 0000000000000046 ffff8810392f5e20 ffff881037aabfd8
[ 1562.035011]  ffff881037aabfd8 ffff881037aabfd8 ffff8810392f5e20 ffff88085bcc5c28
[ 1562.044550]  ffff88085bcc5c2c ffff8810392f5e20 00000000ffffffff ffff88085bcc5c30
[ 1562.054014] Call Trace:
[ 1562.057842]  [&amp;lt;ffffffff8168cdf9&amp;gt;] schedule_preempt_disabled+0x29/0x70
[ 1562.066165]  [&amp;lt;ffffffff8168aa55&amp;gt;] __mutex_lock_slowpath+0xc5/0x1c0
[ 1562.074135]  [&amp;lt;ffffffff81689ebf&amp;gt;] mutex_lock+0x1f/0x2f
[ 1562.080945]  [&amp;lt;ffffffffa11e729f&amp;gt;] mgs_fsc_attach+0x22f/0x600 [mgs]
[ 1562.088859]  [&amp;lt;ffffffffa11c147a&amp;gt;] mgs_llog_open+0x1fa/0x430 [mgs]
[ 1562.096707]  [&amp;lt;ffffffffa0fe5adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 1562.105383]  [&amp;lt;ffffffffa0f89b8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 1562.114923]  [&amp;lt;ffffffffa0bd6ce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 1562.123478]  [&amp;lt;ffffffffa0f86c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 1562.132003]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 1562.139448]  [&amp;lt;ffffffffa0f8d4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 1562.147374]  [&amp;lt;ffffffffa0f8c8b0&amp;gt;] ? ptlrpc_register_service+0x1070/0x1070 [ptlrpc]
[ 1562.156714]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 1562.163012]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 1562.171156]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 1562.178025]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 1562.186117] INFO: task ll_mgs_0002:32832 blocked for more than 120 seconds.
[ 1562.194697] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[ 1562.204242] ll_mgs_0002     D ffffffffa11d3660     0 32832      2 0x00000080
[ 1562.212947]  ffff881037aafa10 0000000000000046 ffff8810392f6dd0 ffff881037aaffd8
[ 1562.222074]  ffff881037aaffd8 ffff881037aaffd8 ffff8810392f6dd0 ffff8800354d67a8
[ 1562.231201]  ffff8800354d67b0 7fffffffffffffff ffff8810392f6dd0 ffffffffa11d3660
[ 1562.240335] Call Trace:
[ 1562.243864]  [&amp;lt;ffffffffa11d3660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 1562.253425]  [&amp;lt;ffffffff8168bd09&amp;gt;] schedule+0x29/0x70
[ 1562.259785]  [&amp;lt;ffffffff81689759&amp;gt;] schedule_timeout+0x239/0x2d0
[ 1562.267114]  [&amp;lt;ffffffff810c7f05&amp;gt;] ? sched_clock_cpu+0x85/0xc0
[ 1562.274349]  [&amp;lt;ffffffff810c1b15&amp;gt;] ? check_preempt_curr+0x75/0xa0
[ 1562.281873]  [&amp;lt;ffffffff810c1b59&amp;gt;] ? ttwu_do_wakeup+0x19/0xd0
[ 1562.288998]  [&amp;lt;ffffffffa11d3660&amp;gt;] ? mgs_steal_client_llog_handler+0x1110/0x1110 [mgs]
[ 1562.298548]  [&amp;lt;ffffffff8168c0e6&amp;gt;] wait_for_completion+0x116/0x170
[ 1562.306162]  [&amp;lt;ffffffff810c5080&amp;gt;] ? wake_up_state+0x20/0x20
[ 1562.313210]  [&amp;lt;ffffffffa0ccb047&amp;gt;] llog_process_or_fork+0x1d7/0x590 [obdclass]
[ 1562.322013]  [&amp;lt;ffffffffa0ccb414&amp;gt;] llog_process+0x14/0x20 [obdclass]
[ 1562.329837]  [&amp;lt;ffffffffa11dbc9f&amp;gt;] mgs_find_or_make_fsdb+0x72f/0x7e0 [mgs]
[ 1562.338244]  [&amp;lt;ffffffffa11dbdb2&amp;gt;] mgs_check_index+0x62/0x2f0 [mgs]
[ 1562.345968]  [&amp;lt;ffffffffa11c39fe&amp;gt;] mgs_target_reg+0x38e/0x1320 [mgs]
[ 1562.353828]  [&amp;lt;ffffffffa0fe5adb&amp;gt;] tgt_request_handle+0x8fb/0x11f0 [ptlrpc]
[ 1562.362382]  [&amp;lt;ffffffffa0f89b8b&amp;gt;] ptlrpc_server_handle_request+0x21b/0xa90 [ptlrpc]
[ 1562.371783]  [&amp;lt;ffffffffa0bd6ce8&amp;gt;] ? lc_watchdog_touch+0x68/0x180 [libcfs]
[ 1562.380230]  [&amp;lt;ffffffffa0f86c58&amp;gt;] ? ptlrpc_wait_event+0x98/0x330 [ptlrpc]
[ 1562.388827]  [&amp;lt;ffffffff810ba2e8&amp;gt;] ? __wake_up_common+0x58/0x90
[ 1562.396390]  [&amp;lt;ffffffffa0f8d4b0&amp;gt;] ptlrpc_main+0xc00/0x1f60 [ptlrpc]
[ 1562.404377]  [&amp;lt;ffffffff81029569&amp;gt;] ? __switch_to+0xd9/0x4c0
[ 1562.411516]  [&amp;lt;ffffffffa0f8c8b0&amp;gt;] ? ptlrpc_register_service+0x1070/0x1070 [ptlrpc]
[ 1562.420970]  [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
[ 1562.427449]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
[ 1562.435723]  [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
[ 1562.442728]  [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="215311" author="pjones" created="Tue, 5 Dec 2017 15:09:24 +0000"  >&lt;p&gt;Bruno&lt;/p&gt;

&lt;p&gt;Can you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="218438" author="standan" created="Wed, 17 Jan 2018 18:07:32 +0000"  >&lt;p&gt;Same issue hit while doing rolling upgrade/downgrade for 2.10.3_RC1. Followed same steps as mentioned before.&lt;/p&gt;</comment>
                            <comment id="218444" author="sarah" created="Wed, 17 Jan 2018 19:52:15 +0000"  >&lt;p&gt;It seems the different the steps that cannot reproduce the problem is we didn&apos;t run sanity.sh between each upgrade&lt;/p&gt;</comment>
                            <comment id="218491" author="bfaccini" created="Thu, 18 Jan 2018 09:23:08 +0000"  >&lt;p&gt;Well I guess that to have a better understanding of what is going when this situation/problem occurs, I will need to get hands-on or at least work on a forced crashed-dump at the time the stacks for the inactive threads start to be printed on the Console.&lt;/p&gt;

&lt;p&gt;Sarah, Saurabh, do you still have a test platform setup or should I need to reproduce by myself?&lt;/p&gt;</comment>
                            <comment id="218565" author="bfaccini" created="Thu, 18 Jan 2018 19:49:55 +0000"  >&lt;p&gt;Ok, running crash on MDS node with hung MDT mount, I have found that a llog_process_thread is busy running on a CPU and it appears that this thread, that has been spawned to parse the MGS LLOG is busy looping reading it with the following stacks (obtained by dumping active CPUs stacks) :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan 18 19:39:04 onyx-78 kernel: NMI backtrace for cpu 6
Jan 18 19:39:04 onyx-78 kernel: CPU: 6 PID: 32808 Comm: llog_process_th Tainted: P          IOE  ------------   3.10.0-514.10.2.el7_lustre.x86_64 #1
Jan 18 19:39:04 onyx-78 kernel: Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015
Jan 18 19:39:04 onyx-78 kernel: task: ffff8808283f4e70 ti: ffff880825b48000 task.ti: ffff880825b48000
Jan 18 19:39:04 onyx-78 kernel: RIP: 0010:[&amp;lt;ffffffff81689e54&amp;gt;]  [&amp;lt;ffffffff81689e54&amp;gt;] mutex_unlock+0x14/0x20
Jan 18 19:39:04 onyx-78 kernel: RSP: 0018:ffff880825b4bad8  EFLAGS: 00000202
Jan 18 19:39:04 onyx-78 kernel: RAX: 000000001b361b36 RBX: ffff88101bff1d00 RCX: ffff880825b4bfd8
Jan 18 19:39:04 onyx-78 kernel: RDX: 0000000000001b36 RSI: ffff88101bff1d48 RDI: ffff88101bff1d00
Jan 18 19:39:04 onyx-78 kernel: RBP: ffff880825b4bad8 R08: ffff88101bff1d48 R09: 9c930b31ddab80d5
Jan 18 19:39:04 onyx-78 kernel: R10: ffffe8f7ff583bc0 R11: ffffea0001d91a00 R12: ffff88101bff1d28
Jan 18 19:39:04 onyx-78 kernel: R13: ffff881056216008 R14: 0000000000000000 R15: ffff881056216008
Jan 18 19:39:04 onyx-78 kernel: FS:  0000000000000000(0000) GS:ffff88085ed80000(0000) knlGS:0000000000000000
Jan 18 19:39:04 onyx-78 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 18 19:39:04 onyx-78 kernel: CR2: 00007f7be5ea3160 CR3: 00000000019ba000 CR4: 00000000001407e0
Jan 18 19:39:04 onyx-78 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 18 19:39:04 onyx-78 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 18 19:39:04 onyx-78 kernel: Stack:
Jan 18 19:39:04 onyx-78 kernel: ffff880825b4bb00 ffffffffa09180f4 ffff881056216008 0000000000000000
Jan 18 19:39:04 onyx-78 kernel: ffffffffa0a45480 ffff880825b4bb28 ffffffffa08cadc9 ffff88085bea96c0
Jan 18 19:39:04 onyx-78 kernel: 0000000000052690 ffffffffa0a177f0 ffff880825b4bb60 ffffffffa08cf283
Jan 18 19:39:04 onyx-78 kernel: Call Trace:
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa09180f4&amp;gt;] multilist_insert+0x94/0xb0 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08cadc9&amp;gt;] remove_reference.isra.15+0x59/0x80 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08cf283&amp;gt;] arc_buf_remove_ref+0xc3/0x150 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08d563e&amp;gt;] dbuf_rele_and_unlock+0xae/0x400 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa0648e8d&amp;gt;] ? spl_kmem_cache_free+0x14d/0x1d0 [spl]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa097d1f7&amp;gt;] ? zio_destroy+0x77/0x80 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa0980afa&amp;gt;] ? zio_wait+0x16a/0x210 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08d5ae6&amp;gt;] dbuf_rele+0x36/0x40 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08df800&amp;gt;] dmu_buf_rele_array.part.4+0x40/0x70 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa08e080c&amp;gt;] dmu_read+0x15c/0x1a0 [zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa118ac69&amp;gt;] osd_read+0x109/0x240 [osd_zfs]
Jan 18 19:39:04 onyx-78 kernel: [&amp;lt;ffffffffa0d1ed5b&amp;gt;] dt_read+0x1b/0x50 [obdclass]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffffa0cdaf28&amp;gt;] llog_osd_next_block+0x388/0xb50 [obdclass]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffffa11d6ef3&amp;gt;] ? mgs_fsdb_handler+0x893/0x10a0 [mgs]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffffa0ccd045&amp;gt;] llog_process_thread+0x305/0x1130 [obdclass]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffffa0cce920&amp;gt;] ? llog_backup+0x500/0x500 [obdclass]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffffa0cce96c&amp;gt;] llog_process_thread_daemonize+0x4c/0x80 [obdclass]
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
Jan 18 19:39:05 onyx-78 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 19:39:05 onyx-78 kernel: Code: 63 ff ff ba fc ff ff ff 85 c0 5b 0f 45 c2 5d eb e8 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 c7 47 18 00 00 00 00 48 89 e5 f0 ff 07 &amp;lt;7f&amp;gt; 05 e8 e5 0a 00 00 5d c3 0f 1f 00 0f 1f 44 00 00 55 48 8b 47

...................................

Jan 18 19:44:20 onyx-78 kernel: NMI backtrace for cpu 6
Jan 18 19:44:20 onyx-78 kernel: CPU: 6 PID: 32808 Comm: llog_process_th Tainted: P          IOE  ------------   3.10.0-514.10.2.el7_lustre.x86_64 #1
Jan 18 19:44:20 onyx-78 kernel: Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0008.021120151325 02/11/2015
Jan 18 19:44:20 onyx-78 kernel: task: ffff8808283f4e70 ti: ffff880825b48000 task.ti: ffff880825b48000
Jan 18 19:44:20 onyx-78 kernel: RIP: 0010:[&amp;lt;ffffffffa08d46b5&amp;gt;]  [&amp;lt;ffffffffa08d46b5&amp;gt;] dbuf_hash+0xb5/0xc0 [zfs]
Jan 18 19:44:20 onyx-78 kernel: RSP: 0018:ffff880825b4baf0  EFLAGS: 00000202
Jan 18 19:44:20 onyx-78 kernel: RAX: 65549fe7bd13c18c RBX: ffff88085c3a4000 RCX: 000000000000001e
Jan 18 19:44:20 onyx-78 kernel: RDX: 00000000000000c5 RSI: 0000000000000090 RDI: 0003fffe20216fb4
Jan 18 19:44:20 onyx-78 kernel: RBP: ffff880825b4bb50 R08: 0073937c5d5e2787 R09: e0525e8c9b98c0a6
Jan 18 19:44:20 onyx-78 kernel: R10: ffff88017fc07400 R11: ffffffffa0647b50 R12: 0000008000000000
Jan 18 19:44:20 onyx-78 kernel: R13: ffff88085bed3000 R14: 0000000000000000 R15: ffff8810537486d0
Jan 18 19:44:20 onyx-78 kernel: FS:  0000000000000000(0000) GS:ffff88085ed80000(0000) knlGS:0000000000000000
Jan 18 19:44:20 onyx-78 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 18 19:44:20 onyx-78 kernel: CR2: 00007f7be5ea3160 CR3: 00000000019ba000 CR4: 00000000001407e0
Jan 18 19:44:20 onyx-78 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 18 19:44:21 onyx-78 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 18 19:44:21 onyx-78 kernel: Stack:
Jan 18 19:44:21 onyx-78 kernel: ffffffffa08d475f 0000000000000000 0000000000000000 0000000000000000
Jan 18 19:44:21 onyx-78 kernel: 000000000000001e 0000000000000000 0000000000000090 ffff88085c3a4000
Jan 18 19:44:21 onyx-78 kernel: 0000008000000000 ffff8808283f4e70 0000000000000000 ffff8810537486d0
Jan 18 19:44:21 onyx-78 kernel: Call Trace:
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08d475f&amp;gt;] ? dbuf_find+0x2f/0x1e0 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08d7b71&amp;gt;] __dbuf_hold_impl+0xc1/0x500 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08d8023&amp;gt;] dbuf_hold_impl+0x73/0xa0 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08d835f&amp;gt;] dbuf_hold+0x2f/0x60 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08dfd0f&amp;gt;] dmu_buf_hold_array_by_dnode+0x10f/0x4f0 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa08e0757&amp;gt;] dmu_read+0xa7/0x1a0 [zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0ceeb92&amp;gt;] ? lprocfs_oh_tally+0x32/0x40 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa118ac69&amp;gt;] osd_read+0x109/0x240 [osd_zfs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0d1ed5b&amp;gt;] dt_read+0x1b/0x50 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0cdaf28&amp;gt;] llog_osd_next_block+0x388/0xb50 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa11d6ef3&amp;gt;] ? mgs_fsdb_handler+0x893/0x10a0 [mgs]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0ccd045&amp;gt;] llog_process_thread+0x305/0x1130 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0cce920&amp;gt;] ? llog_backup+0x500/0x500 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffffa0cce96c&amp;gt;] llog_process_thread_daemonize+0x4c/0x80 [obdclass]
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
Jan 18 19:44:21 onyx-78 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 19:44:21 onyx-78 kernel: Code: 14 c5 60 47 9c a0 48 31 d0 49 89 d0 0f b6 d0 48 89 c8 49 c1 e8 08 48 31 f0 48 c1 e8 10 48 31 f8 4c 31 c0 48 33 04 d5 60 47 9c a0 &amp;lt;c3&amp;gt; 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 0f b6 4f 6c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;thus causing MDT mount to hang.&lt;/p&gt;</comment>
                            <comment id="218568" author="bfaccini" created="Thu, 18 Jan 2018 20:09:55 +0000"  >&lt;p&gt;And on the other hand, I have just been able to reproduce the hang on a single-node+ldiskfs setup by using your indications!!&lt;br/&gt;
And again, the spawned thread to parse LLOG is busy/looping reading it :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Jan 18 20:10:20 onyx-62 kernel: NMI backtrace for cpu 25
Jan 18 20:10:20 onyx-62 kernel: CPU: 25 PID: 35361 Comm: llog_process_th Tainted: G           OE  ------------   3.10.0-514.10.2.el7_lustre.x86_64 #1
Jan 18 20:10:20 onyx-62 kernel: Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
Jan 18 20:10:20 onyx-62 kernel: task: ffff8803f4803ec0 ti: ffff8803f4868000 task.ti: ffff8803f4868000
Jan 18 20:10:20 onyx-62 kernel: RIP: 0010:[&amp;lt;ffffffff81326aed&amp;gt;]  [&amp;lt;ffffffff81326aed&amp;gt;] memcpy+0xd/0x110
Jan 18 20:10:20 onyx-62 kernel: RSP: 0018:ffff8803f486bc90  EFLAGS: 00010246
Jan 18 20:10:20 onyx-62 kernel: RAX: ffff8808016ed000 RBX: ffff8808016ed000 RCX: 00000000000000cc
Jan 18 20:10:20 onyx-62 kernel: RDX: 0000000000000000 RSI: ffff88081249c9a0 RDI: ffff8808016ed9a0
Jan 18 20:10:20 onyx-62 kernel: RBP: ffff8803f486bcf8 R08: 0000000000000000 R09: 0000000000000000
Jan 18 20:10:20 onyx-62 kernel: R10: 0000000000002000 R11: 0000000000000000 R12: 0000000000001000
Jan 18 20:10:20 onyx-62 kernel: R13: 0000000000001000 R14: ffff8803f486be28 R15: ffff88081c796ea0
Jan 18 20:10:20 onyx-62 kernel: FS:  0000000000000000(0000) GS:ffff88082da40000(0000) knlGS:0000000000000000
Jan 18 20:10:20 onyx-62 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jan 18 20:10:20 onyx-62 kernel: CR2: 00007f84dc914550 CR3: 00000000019ba000 CR4: 00000000000407e0
Jan 18 20:10:20 onyx-62 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jan 18 20:10:20 onyx-62 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Jan 18 20:10:20 onyx-62 kernel: Stack:
Jan 18 20:10:20 onyx-62 kernel: ffffffffa0f583e9 0000000000000000 0000200000002000 0000100000000fff
Jan 18 20:10:20 onyx-62 kernel: 0000000000001000 ffff8800aed6d078 00000000f486be28 00000000f7a30a1f
Jan 18 20:10:20 onyx-62 kernel: ffff880822ab4240 ffff880826e65970 ffff8803f486be28 ffff8800aed6d078
Jan 18 20:10:20 onyx-62 kernel: Call Trace:
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa0f583e9&amp;gt;] ? osd_ldiskfs_read+0x129/0x2d0 [osd_ldiskfs]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa0f5867f&amp;gt;] osd_read+0xef/0x130 [osd_ldiskfs]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa09fad5b&amp;gt;] dt_read+0x1b/0x50 [obdclass]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa09b6f28&amp;gt;] llog_osd_next_block+0x388/0xb50 [obdclass]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa0fc8ef3&amp;gt;] ? mgs_fsdb_handler+0x893/0x10a0 [mgs]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa09a9045&amp;gt;] llog_process_thread+0x305/0x1130 [obdclass]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa09aa920&amp;gt;] ? llog_backup+0x500/0x500 [obdclass]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffffa09aa96c&amp;gt;] llog_process_thread_daemonize+0x4c/0x80 [obdclass]
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffff810b06ff&amp;gt;] kthread+0xcf/0xe0
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffff81696b98&amp;gt;] ret_from_fork+0x58/0x90
Jan 18 20:10:20 onyx-62 kernel: [&amp;lt;ffffffff810b0630&amp;gt;] ? kthread_create_on_node+0x140/0x140
Jan 18 20:10:20 onyx-62 kernel: Code: 43 4e 5b 5d c3 66 0f 1f 84 00 00 00 00 00 e8 7b fc ff ff eb e2 90 90 90 90 90 90 90 90 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 &amp;lt;f3&amp;gt; 48 a5 89 d1 f3 a4 c3 20 4c 8b 06 4c 8b 4e 08 4c 8b 56 10 4c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;More to come.&lt;/p&gt;</comment>
                            <comment id="218913" author="bfaccini" created="Tue, 23 Jan 2018 16:17:31 +0000"  >&lt;p&gt;After more live analysis of this forked+looping llog_process_thread and thus the hung MDT mount, my guess is that this is caused by master/2.10 patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7050&quot; title=&quot;llog_skip_over skip the record by too little minimum record size.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7050&quot;&gt;&lt;del&gt;LU-7050&lt;/del&gt;&lt;/a&gt; (&quot;llog: record the minimum record size&quot;) which puzzles (wrong computation of next block offset causing the same block to be read for ever by llog_osd_next_block()) the b_ieel3_0/2.7 version of llog_skip_over() routine because it wrongly assumes that records are of a fixed size due to the minimum record size now recorded in LLOG header&apos;s llh_size field.&lt;/p&gt;

&lt;p&gt;I doubt, as it could be quite tricky due to multiple dependencies with other patches..., if we can simply fix this by back-porting &lt;a href=&quot;http://review.whamcloud.com/16103&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/16103&lt;/a&gt; (&quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-7050&quot; title=&quot;llog_skip_over skip the record by too little minimum record size.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-7050&quot;&gt;&lt;del&gt;LU-7050&lt;/del&gt;&lt;/a&gt; llog: record the minimum record size&quot;) to b_ieel3_0 to allow for new LLOG behavior/format compliance, or maybe we can think to have a new tool to return the LLOGs with variable size records to the old header format?&lt;/p&gt;

&lt;p&gt;Also, since the involved MGS/config LLOG is the &quot;&amp;lt;fsname&amp;gt;-client&quot; one, but there may be other plain LLOG types/instances affected, as a work-around I have tried a &quot;tunefs.lustre --writeconf&quot; on MGS+MDT device prior to mount it, but without success and have failed into the same hung situation. Then I mounted the MGS+MDT device and manually removed the CONFIGS/lustre-client LLOG, and next mount has worked successfully. After that I had to also writeconf the OSTs to be able to mount/start them.&lt;/p&gt;
</comment>
                            <comment id="219177" author="adilger" created="Thu, 25 Jan 2018 18:48:25 +0000"  >&lt;p&gt;Bruno, could you port the 16103 patch to b_ieel3_0?&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="31745">LU-7050</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="47200">LU-9764</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="48499">LU-10039</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzkpr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>