<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:28:30 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2823] large-scale test_3a hung: LBUG: ASSERTION(ergo(!obd-&gt;obd_recovering, diff &gt;= 0)) failed: lustre-OST0001: 1013476 - 1024134 = -10658</title>
                <link>https://jira.whamcloud.com/browse/LU-2823</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The large-scale test_3a hung as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;1 : Starting failover on mds
Failing mds on node client-20-ib
CMD: client-20-ib grep -c /mnt/mds&apos; &apos; /proc/mounts
Stopping /mnt/mds (opts:)
CMD: client-20-ib umount -d /mnt/mds
CMD: client-20-ib lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
affected facets: mds
df pid is 29886
Failover mds to client-20-ib
05:15:08 (1360934108) waiting for client-20-ib network 900 secs ...
05:15:08 (1360934108) network interface is UP
CMD: client-20-ib hostname
Starting mds: -o user_xattr,acl  /dev/lvm-MDS/P1 /mnt/mds
CMD: client-20-ib mkdir -p /mnt/mds; mount -t lustre -o user_xattr,acl  /dev/lvm-MDS/P1 /mnt/mds
CMD: client-20-ib PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/tests:/usr/lib64/openmpi/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/usr/lib64/lustre/utils:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey::/usr/lib64/lustre/tests/mpi NAME=autotest_config sh rpc.sh set_default_debug \&quot;-1\&quot; \&quot; 0xffb7e3ff\&quot; 32 
client-20-ib: lnet.debug=-1
client-20-ib: lnet.subsystem_debug=0xffb7e3ff
client-20-ib: lnet.debug_mb=32
CMD: client-20-ib e2label /dev/lvm-MDS/P1
Started lustre-MDT0000
affected facets: mds
CMD: client-20-ib PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/tests:/usr/lib64/openmpi/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/usr/lib64/lustre/utils:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey::/usr/lib64/lustre/tests/mpi NAME=autotest_config sh rpc.sh _wait_recovery_complete *.lustre-MDT0000.recovery_status 200 
client-20-ib: *.lustre-MDT0000.recovery_status status: RECOVERING
client-20-ib: Waiting 195 secs for *.lustre-MDT0000.recovery_status recovery done. status: RECOVERING
client-20-ib: *.lustre-MDT0000.recovery_status status: COMPLETE
CMD: client-20-ib lctl get_param -n *.lustre-MDT0000.recovery_status
RECOVERY TIME: NFILES=50000 number of clients: 1  recovery_duration: 2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on the client (client-22-ib) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;05:15:13:Lustre: MGC192.168.4.20@o2ib: Connection restored to service MGS using nid 192.168.4.20@o2ib.
05:15:13:Lustre: Skipped 5 previous similar messages
05:18:15:INFO: task mdsrate:9347 blocked for more than 120 seconds.
05:18:15:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
05:18:15:mdsrate       D 0000000000000002     0  9347   9346 0x00000080
05:18:15: ffff8802f9f71d38 0000000000000082 ffff8802181a4200 0000000000000001
05:18:15: ffff88029d9e2018 0000000000640c40 ffffffff8100bb8e ffff8802f9f71d38
05:18:15: ffff880313dcfab8 ffff8802f9f71fd8 000000000000fb88 ffff880313dcfab8
05:18:15:Call Trace:
05:18:15: [&amp;lt;ffffffff8100bb8e&amp;gt;] ? apic_timer_interrupt+0xe/0x20
05:18:15: [&amp;lt;ffffffff8104dfbf&amp;gt;] ? mutex_spin_on_owner+0x9f/0xc0
05:18:15: [&amp;lt;ffffffff814eb2ae&amp;gt;] __mutex_lock_slowpath+0x13e/0x180
05:18:15: [&amp;lt;ffffffff8115eb22&amp;gt;] ? kmem_cache_alloc+0x182/0x190
05:18:15: [&amp;lt;ffffffff814eb14b&amp;gt;] mutex_lock+0x2b/0x50
05:18:15: [&amp;lt;ffffffff81186e4e&amp;gt;] do_filp_open+0x2be/0xd60
05:18:15: [&amp;lt;ffffffff8104338c&amp;gt;] ? __do_page_fault+0x1ec/0x480
05:18:15: [&amp;lt;ffffffff811b9cae&amp;gt;] ? ep_poll+0x12e/0x330
05:18:15: [&amp;lt;ffffffff81193272&amp;gt;] ? alloc_fd+0x92/0x160
05:18:15: [&amp;lt;ffffffff81173a39&amp;gt;] do_sys_open+0x69/0x140
05:18:15: [&amp;lt;ffffffff81173b50&amp;gt;] sys_open+0x20/0x30
05:18:15: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on the MDS (client-20-ib) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;05:15:03:Lustre: DEBUG MARKER: 1 : Starting failover on mds
05:15:03:Lustre: DEBUG MARKER: grep -c /mnt/mds&apos; &apos; /proc/mounts
05:15:03:Lustre: DEBUG MARKER: umount -d /mnt/mds
05:15:03:Lustre: Failing over lustre-MDT0000
05:15:03:Lustre: Skipped 17 previous similar messages
05:15:03:LustreError: 3780:0:(osc_create.c:595:osc_create()) lustre-OST0000-osc: oscc recovery failed: -4
05:15:03:LustreError: 3782:0:(osc_create.c:595:osc_create()) lustre-OST0001-osc: oscc recovery failed: -4
05:15:03:LustreError: 3782:0:(lov_obd.c:1153:lov_clear_orphans()) error in orphan recovery on OST idx 1/7: rc = -4
05:15:03:LustreError: 3781:0:(osc_create.c:595:osc_create()) lustre-OST0002-osc: oscc recovery failed: -4
05:15:03:LustreError: 3782:0:(mds_lov.c:1057:__mds_lov_synchronize()) lustre-OST0001_UUID failed at mds_lov_clear_orphans: -4
05:15:03:LustreError: 3782:0:(mds_lov.c:1066:__mds_lov_synchronize()) lustre-OST0001_UUID sync failed -4, deactivating
05:15:03:LustreError: 3785:0:(osc_create.c:595:osc_create()) lustre-OST0005-osc: oscc recovery failed: -4
05:15:03:LustreError: 3786:0:(osc_create.c:595:osc_create()) lustre-OST0006-osc: oscc recovery failed: -4
05:15:03:LustreError: 6245:0:(ldlm_request.c:1039:ldlm_cli_cancel_req()) Got rc -108 from cancel RPC: canceling anyway
05:15:03:LustreError: 6245:0:(ldlm_request.c:1039:ldlm_cli_cancel_req()) Skipped 1 previous similar message
05:15:03:LustreError: 6245:0:(ldlm_request.c:1597:ldlm_cli_cancel_list()) ldlm_cli_cancel_list: -108
05:15:03:LustreError: 6245:0:(ldlm_request.c:1597:ldlm_cli_cancel_list()) Skipped 1 previous similar message
05:15:03:LustreError: 3780:0:(lov_obd.c:1153:lov_clear_orphans()) error in orphan recovery on OST idx 0/7: rc = -4
05:15:03:LustreError: 3784:0:(osc_create.c:595:osc_create()) lustre-OST0004-osc: oscc recovery failed: -4
05:15:03:LustreError: 3784:0:(mds_lov.c:1057:__mds_lov_synchronize()) lustre-OST0004_UUID failed at mds_lov_clear_orphans: -4
05:15:03:LustreError: 3784:0:(mds_lov.c:1057:__mds_lov_synchronize()) Skipped 3 previous similar messages
05:15:03:LustreError: 3784:0:(mds_lov.c:1066:__mds_lov_synchronize()) lustre-OST0004_UUID sync failed -4, deactivating
05:15:03:LustreError: 3784:0:(mds_lov.c:1066:__mds_lov_synchronize()) Skipped 3 previous similar messages
05:15:03:LustreError: 3783:0:(osc_create.c:595:osc_create()) lustre-OST0003-osc: oscc recovery failed: -4
05:15:03:LustreError: 3478:0:(mds_open.c:442:mds_create_objects()) error creating objects for inode 524309: rc = -5
05:15:03:LustreError: 3478:0:(mds_open.c:827:mds_finish_open()) mds_create_objects: rc = -5
05:15:03:Lustre: 3478:0:(mds_reint.c:257:mds_finish_transno()) commit transaction for disconnected client 3e5acf3c-82ca-17ee-2609-2dd7736ba638: rc -5
&amp;lt;~snip~&amp;gt;
05:31:03:Lustre: lustre-OST0001-osc: Connection to service lustre-OST0001 via nid 192.168.4.21@o2ib was lost; in progress operations using this service will wait for recovery to complete.
05:31:03:LustreError: 11-0: an error occurred while communicating with 192.168.4.21@o2ib. The ost_connect operation failed with -16
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on the OSS (client-21-ib) showed the following:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;05:14:54:Lustre: DEBUG MARKER: 1 : Starting failover on mds
05:15:16:Lustre: Service thread pid 14788 completed after 122.91s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
05:15:16:Lustre: Skipped 1 previous similar message
05:15:16:Lustre: 14830:0:(ldlm_lib.c:820:target_handle_connect()) lustre-OST0000: received new MDS connection from NID 192.168.4.20@o2ib, removing former export from same NID
05:15:16:Lustre: 14830:0:(ldlm_lib.c:820:target_handle_connect()) Skipped 16 previous similar messages
05:15:16:Lustre: lustre-OST0000: received MDS connection from 192.168.4.20@o2ib
05:15:16:Lustre: 14857:0:(filter.c:3129:filter_destroy_precreated()) lustre-OST0002: deleting orphan objects from 1013476 to 1017190, orphan objids won&apos;t be reused any more.
05:15:16:Lustre: Skipped 11 previous similar messages
05:15:16:LustreError: 11-0: an error occurred while communicating with 192.168.4.20@o2ib. The obd_ping operation failed with -107
05:15:16:LustreError: 166-1: MGC192.168.4.20@o2ib: Connection to service MGS via nid 192.168.4.20@o2ib was lost; in progress operations using this service will fail.
05:15:16:Lustre: 9791:0:(import.c:855:ptlrpc_connect_interpret()) MGS@MGC192.168.4.20@o2ib_0 changed server handle from 0x3e8c8b7064aac07f to 0x3e8c8b7064b57220
05:15:16:Lustre: MGC192.168.4.20@o2ib: Reactivating import
05:15:27:LustreError: 14877:0:(filter.c:3120:filter_destroy_precreated()) lustre-OST0003: destroy_in_progress already cleared
05:15:27:Lustre: Service thread pid 14809 completed after 137.70s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
05:15:27:LustreError: 14872:0:(filter.c:3120:filter_destroy_precreated()) lustre-OST0000: destroy_in_progress already cleared
05:15:27:Lustre: Service thread pid 14787 completed after 140.41s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
05:15:38:LustreError: 14884:0:(filter.c:3120:filter_destroy_precreated()) lustre-OST0004: destroy_in_progress already cleared
05:15:38:Lustre: Service thread pid 14810 completed after 148.92s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
05:15:38:LustreError: 14865:0:(filter.c:3120:filter_destroy_precreated()) lustre-OST0005: destroy_in_progress already cleared
05:15:38:LustreError: 17526:0:(filter.c:3234:filter_handle_precreate()) ASSERTION(ergo(!obd-&amp;gt;obd_recovering, diff &amp;gt;= 0)) failed: lustre-OST0001: 1013476 - 1024134 = -10658
05:15:38:LustreError: 17526:0:(filter.c:3234:filter_handle_precreate()) LBUG
05:15:38:Pid: 17526, comm: ll_ost_creat_04
05:15:38:
05:15:38:Call Trace:
05:15:38: [&amp;lt;ffffffff8afc26a1&amp;gt;] libcfs_debug_dumpstack+0x51/0x60 [libcfs]
05:15:38: [&amp;lt;ffffffff8afc2bda&amp;gt;] lbug_with_loc+0x7a/0xd0 [libcfs]
05:15:38: [&amp;lt;ffffffff8b4782c6&amp;gt;] filter_create+0x1186/0x17d0 [obdfilter]
05:15:38: [&amp;lt;ffffffff8afcacfd&amp;gt;] libcfs_debug_vmsg2+0x70d/0x970 [libcfs]
05:15:38: [&amp;lt;ffffffff8b15cec9&amp;gt;] lustre_pack_reply+0x29/0xb0 [ptlrpc]
05:15:38: [&amp;lt;ffffffff8b44d801&amp;gt;] ost_handle+0x1281/0x55c0 [ost]
05:15:38: [&amp;lt;ffffffff8afc7868&amp;gt;] libcfs_ip_addr2str+0x38/0x40 [libcfs]
05:15:38: [&amp;lt;ffffffff8b168874&amp;gt;] ptlrpc_server_handle_request+0x984/0xe00 [ptlrpc]
05:15:38: [&amp;lt;ffffffff8b169f16&amp;gt;] ptlrpc_main+0xf16/0x10e0 [ptlrpc]
05:15:38: [&amp;lt;ffffffff8005dfc1&amp;gt;] child_rip+0xa/0x11
05:15:38: [&amp;lt;ffffffff8b169000&amp;gt;] ptlrpc_main+0x0/0x10e0 [ptlrpc]
05:15:38: [&amp;lt;ffffffff8005dfb7&amp;gt;] child_rip+0x0/0x11
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/bbb3f1bc-779a-11e2-987d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/bbb3f1bc-779a-11e2-987d-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Tag: v1_8_9_WC1_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/256&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/256&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.9/x86_64(server), RHEL6.3/x86_64(client)&lt;br/&gt;
Network: IB (in-kernel OFED)&lt;br/&gt;
ENABLE_QUOTA=yes&lt;br/&gt;
</environment>
        <key id="17596">LU-2823</key>
            <summary>large-scale test_3a hung: LBUG: ASSERTION(ergo(!obd-&gt;obd_recovering, diff &gt;= 0)) failed: lustre-OST0001: 1013476 - 1024134 = -10658</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                    </labels>
                <created>Sat, 16 Feb 2013 00:08:24 +0000</created>
                <updated>Wed, 17 Apr 2013 06:20:40 +0000</updated>
                            <resolved>Wed, 17 Apr 2013 06:20:40 +0000</resolved>
                                    <version>Lustre 1.8.9</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="52524" author="pjones" created="Sat, 16 Feb 2013 00:26:42 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Is this a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2822&quot; title=&quot;softlockups, evictions during recovery-scale&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2822&quot;&gt;&lt;del&gt;LU-2822&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="52525" author="yujian" created="Sat, 16 Feb 2013 00:38:54 +0000"  >&lt;p&gt;The same test run by autotest passed over TCP network: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ad2ba008-7683-11e2-bc2f-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ad2ba008-7683-11e2-bc2f-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52529" author="niu" created="Sat, 16 Feb 2013 04:45:31 +0000"  >&lt;p&gt;Looks it&apos;s not a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2822&quot; title=&quot;softlockups, evictions during recovery-scale&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2822&quot;&gt;&lt;del&gt;LU-2822&lt;/del&gt;&lt;/a&gt;. The mds_lov_synchronize failure could probably triggered the assertion at the end, but I don&apos;t see how so far.&lt;/p&gt;</comment>
                            <comment id="52546" author="niu" created="Sat, 16 Feb 2013 21:59:13 +0000"  >&lt;p&gt;Looks the story like:&lt;/p&gt;

&lt;p&gt;The obdfilter was very slow for some reason, which left the OST thread stuck in orphan cleanup for a very long time:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Lustre: Service thread pid 14810 was inactive &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 40.00s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; debugging purposes:
Pid: 14810, comm: ll_ost_32

Call Trace:
 [&amp;lt;ffffffff8006ed48&amp;gt;] do_gettimeofday+0x40/0x90
 [&amp;lt;ffffffff800155b4&amp;gt;] sync_buffer+0x0/0x3f
 [&amp;lt;ffffffff800637de&amp;gt;] io_schedule+0x3f/0x67
 [&amp;lt;ffffffff800155ef&amp;gt;] sync_buffer+0x3b/0x3f
 [&amp;lt;ffffffff80063a0a&amp;gt;] __wait_on_bit+0x40/0x6e
 [&amp;lt;ffffffff800155b4&amp;gt;] sync_buffer+0x0/0x3f
 [&amp;lt;ffffffff80063aa4&amp;gt;] out_of_line_wait_on_bit+0x6c/0x78
 [&amp;lt;ffffffff800a3c15&amp;gt;] wake_bit_function+0x0/0x23
 [&amp;lt;ffffffff800175ee&amp;gt;] ll_rw_block+0x8c/0xab
 [&amp;lt;ffffffff8b3dde61&amp;gt;] ldiskfs_bread+0x51/0x80 [ldiskfs]
 [&amp;lt;ffffffff8b3eee05&amp;gt;] ldiskfs_find_entry+0x1c5/0x670 [ldiskfs]
 [&amp;lt;ffffffff8afcacfd&amp;gt;] libcfs_debug_vmsg2+0x70d/0x970 [libcfs]
 [&amp;lt;ffffffff8b3f0097&amp;gt;] ldiskfs_lookup+0x57/0x200 [ldiskfs]
 [&amp;lt;ffffffff8003722b&amp;gt;] __lookup_hash+0x10b/0x130
 [&amp;lt;ffffffff800ed617&amp;gt;] lookup_one_len+0x53/0x61
 [&amp;lt;ffffffff8b46d3ed&amp;gt;] filter_fid2dentry+0x42d/0x740 [obdfilter]
 [&amp;lt;ffffffff8b21d18f&amp;gt;] filter_quota_adjust+0x27f/0x2b0 [lquota]
 [&amp;lt;ffffffff8000d585&amp;gt;] dput+0x2c/0x114
 [&amp;lt;ffffffff8b475304&amp;gt;] filter_destroy+0x154/0x1f90 [obdfilter]
 [&amp;lt;ffffffff8006474d&amp;gt;] __down_failed_trylock+0x35/0x3a
 [&amp;lt;ffffffff8b477df9&amp;gt;] filter_create+0xcb9/0x17d0 [obdfilter]
 [&amp;lt;ffffffff8b15cec9&amp;gt;] lustre_pack_reply+0x29/0xb0 [ptlrpc]
 [&amp;lt;ffffffff8b44d801&amp;gt;] ost_handle+0x1281/0x55c0 [ost]
 [&amp;lt;ffffffff8afc7868&amp;gt;] libcfs_ip_addr2str+0x38/0x40 [libcfs]
 [&amp;lt;ffffffff8b168874&amp;gt;] ptlrpc_server_handle_request+0x984/0xe00 [ptlrpc]
 [&amp;lt;ffffffff8b169f16&amp;gt;] ptlrpc_main+0xf16/0x10e0 [ptlrpc]
 [&amp;lt;ffffffff8005dfc1&amp;gt;] child_rip+0xa/0x11
 [&amp;lt;ffffffff8b169000&amp;gt;] ptlrpc_main+0x0/0x10e0 [ptlrpc]
 [&amp;lt;ffffffff8005dfb7&amp;gt;] child_rip+0x0/0x11
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And the orphan recovery RPC from the MDS was interrupted (rc = -4, i.e. -EINTR):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 3782:0:(osc_create.c:595:osc_create()) lustre-OST0001-osc: oscc recovery failed: -4
LustreError: 3782:0:(lov_obd.c:1153:lov_clear_orphans()) error in orphan recovery on OST idx 1/7: rc = -4
LustreError: 3781:0:(osc_create.c:595:osc_create()) lustre-OST0002-osc: oscc recovery failed: -4
LustreError: 3782:0:(mds_lov.c:1057:__mds_lov_synchronize()) lustre-OST0001_UUID failed at mds_lov_clear_orphans: -4
LustreError: 3782:0:(mds_lov.c:1066:__mds_lov_synchronize()) lustre-OST0001_UUID sync failed -4, deactivating
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After MDS failover, a new orphan recovery request was sent, but it was skipped because the obdfilter was still working on the previous orphan cleanup:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 14884:0:(filter.c:3120:filter_destroy_precreated()) lustre-OST0004: destroy_in_progress already cleared
Lustre: Service thread pid 14810 completed after 148.92s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So the oscc_last_id on the MDS was never re-synced with the last_id on the obdfilter, and the assertion was triggered when creating objects because the id requested by the MDS was lower than the last_id on the obdfilter (1013476 - 1024134 = -10658).&lt;/p&gt;
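&lt;p&gt;For reference, the check behind the LBUG in filter_handle_precreate() boils down to something like the following (a simplified sketch, not the verbatim 1.8 source):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        /* id requested by the MDS minus the last object id known to the OST */
        diff = oa-&amp;gt;o_id - filter_last_id(filter, group);
        /* outside of recovery the MDS must never ask for an id below the
         * OST last_id; here 1013476 - 1024134 = -10658, hence the LBUG */
        LASSERT(ergo(!obd-&amp;gt;obd_recovering, diff &amp;gt;= 0));
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>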
                            <comment id="52547" author="niu" created="Sat, 16 Feb 2013 22:42:47 +0000"  >&lt;p&gt;I don&apos;t see any reason why we should skip orhpan recovery in the filter_destroy_precreated():&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!filter-&amp;gt;fo_destroy_in_progress) {
                CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;%s: destroy_in_progress already cleared\n&quot;&lt;/span&gt;,
                        exp-&amp;gt;exp_obd-&amp;gt;obd_name);
                RETURN(0);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That can leave the MDS out of sync with the obdfilter on last_id; in fact, redoing the orphan destroy won&apos;t hurt anything.&lt;/p&gt;
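&lt;p&gt;A minimal sketch of that idea (the actual change is in the patch below and may differ): keep the message but do not bail out, so a repeated orphan cleanup request still re-syncs last_id with the MDS:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        /* sketch: note the cleared flag but fall through and redo the destroy */
        if (!filter-&amp;gt;fo_destroy_in_progress)
                CERROR(&quot;%s: destroy_in_progress already cleared, redoing &quot;
                       &quot;orphan destroy\n&quot;, exp-&amp;gt;exp_obd-&amp;gt;obd_name);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;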

&lt;p&gt;patch for b1_8: &lt;a href=&quot;http://review.whamcloud.com/5448&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5448&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52584" author="yujian" created="Sun, 17 Feb 2013 22:33:37 +0000"  >&lt;p&gt;Lustre Tag: v1_8_9_WC1_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/256&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/256&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.9/x86_64(server), RHEL6.3/x86_64(client)&lt;br/&gt;
Network: IB (in-kernel OFED)&lt;br/&gt;
ENABLE_QUOTA=yes &lt;/p&gt;

&lt;p&gt;MGS/MDS node: fat-amd-2-ib&lt;br/&gt;
OSS node: fat-amd-4-ib&lt;br/&gt;
Client nodes: client-1-ib,client-3-ib&lt;/p&gt;

&lt;p&gt;The large-scale test passed over an IB network on the same distro/arch and Lustre build in a manual run:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sessions/7ec8cc4c-7979-11e2-9a74-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sessions/7ec8cc4c-7979-11e2-9a74-52540035b04c&lt;/a&gt; &lt;/p&gt;</comment>
                            <comment id="56442" author="niu" created="Wed, 17 Apr 2013 06:20:40 +0000"  >&lt;p&gt;patch landed for 1.8.9&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvj9b:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6834</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>