<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:42:04 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4365] recovery-small test_51: LBUG: (qsd_lib.c:294:qsd_qtype_fini()) ASSERTION( atomic_read(&amp;qqi-&gt;qqi_ref) == 1 ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-4365</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;recovery-small test_51 hung as follows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Failing mds1 on client-30vm3
CMD: client-30vm3 grep -c /mnt/mds1&apos; &apos; /proc/mounts
Stopping /mnt/mds1 (opts:) on client-30vm3
CMD: client-30vm3 umount -d /mnt/mds1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;LBUG occurred on MDS:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;23:04:50:Lustre: DEBUG MARKER: umount -d /mnt/mds1
23:04:50:LustreError: 7276:0:(ldlm_resource.c:804:ldlm_resource_complain()) lustre-MDT0000-lwp-MDT0000: namespace resource [0x200000006:0x1010000:0x0].0 (ffff8800709d3980) refcount nonzero (1) after lock cleanup; forcing cleanup.
23:04:50:LustreError: 7276:0:(ldlm_resource.c:804:ldlm_resource_complain()) Skipped 1 previous similar message
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1415:ldlm_resource_dump()) --- Resource: [0x200000006:0x1010000:0x0].0 (ffff8800709d3980) refcount = 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1418:ldlm_resource_dump()) Granted locks (in reverse order):
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1421:ldlm_resource_dump()) ### ### ns: lustre-MDT0000-lwp-MDT0000 lock: ffff880063768b40/0x3b627f62d662915 lrc: 2/1,0 mode: CR/CR res: [0x200000006:0x1010000:0x0].0 rrc: 2 type: PLN flags: 0x1106400000000 nid: local remote: 0x3b627f62d66293f expref: -99 pid: 6945 timeout: 0 lvb_type: 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1421:ldlm_resource_dump()) Skipped 1 previous similar message
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1415:ldlm_resource_dump()) --- Resource: [0x200000006:0x10000:0x0].0 (ffff8800709d3ac0) refcount = 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1418:ldlm_resource_dump()) Granted locks (in reverse order):
23:04:50:LustreError: 7276:0:(ldlm_lib.c:2137:target_stop_recovery_thread()) lustre-MDT0000: Aborting recovery
23:04:50:Lustre: 6947:0:(ldlm_lib.c:1801:target_recovery_overseer()) recovery is aborted, evict exports in recovery
23:04:50:Lustre: 6947:0:(ldlm_lib.c:1801:target_recovery_overseer()) Skipped 2 previous similar messages
23:04:50:LustreError: 6937:0:(osp_precreate.c:737:osp_precreate_cleanup_orphans()) lustre-OST0000-osc-MDT0000: cannot cleanup orphans: rc = -5
23:04:50:LustreError: 7276:0:(qsd_lib.c:294:qsd_qtype_fini()) ASSERTION( atomic_read(&amp;amp;qqi-&amp;gt;qqi_ref) == 1 ) failed: 
23:04:50:LustreError: 7276:0:(qsd_lib.c:294:qsd_qtype_fini()) LBUG
23:04:50:Pid: 7276, comm: umount
23:04:50:
23:04:50:Call Trace:
23:04:50: [&amp;lt;ffffffffa05c2895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
23:04:50: [&amp;lt;ffffffffa05c2e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
23:04:51: [&amp;lt;ffffffffa0efcc23&amp;gt;] qsd_qtype_fini+0x363/0x3a0 [lquota]
23:04:51: [&amp;lt;ffffffffa0efcfc8&amp;gt;] qsd_fini+0xd8/0x460 [lquota]
23:04:51: [&amp;lt;ffffffffa0d751a8&amp;gt;] osd_shutdown+0x38/0xe0 [osd_zfs]
23:04:51: [&amp;lt;ffffffffa0d78571&amp;gt;] osd_process_config+0x141/0x190 [osd_zfs]
23:04:51: [&amp;lt;ffffffffa0843b41&amp;gt;] lod_process_config+0x4e1/0x17a0 [lod]
23:04:51: [&amp;lt;ffffffffa05d32d1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
23:04:51: [&amp;lt;ffffffff8100bb8e&amp;gt;] ? apic_timer_interrupt+0xe/0x20
23:04:52: [&amp;lt;ffffffffa118ed00&amp;gt;] mdd_process_config+0x210/0x610 [mdd]
23:04:52: [&amp;lt;ffffffffa1722c56&amp;gt;] mdt_stack_fini+0x176/0xbe0 [mdt]
23:04:52: [&amp;lt;ffffffffa1189950&amp;gt;] ? mdd_init_capa_ctxt+0x120/0x130 [mdd]
23:04:52: [&amp;lt;ffffffffa1723dca&amp;gt;] mdt_device_fini+0x70a/0xd90 [mdt]
23:04:52: [&amp;lt;ffffffffa071fbc7&amp;gt;] class_cleanup+0x577/0xda0 [obdclass]
23:04:52: [&amp;lt;ffffffffa06f4b06&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
23:04:52: [&amp;lt;ffffffffa07214ac&amp;gt;] class_process_config+0x10bc/0x1c80 [obdclass]
23:04:52: [&amp;lt;ffffffffa05cdd98&amp;gt;] ? libcfs_log_return+0x28/0x40 [libcfs]
23:04:52: [&amp;lt;ffffffffa071ad11&amp;gt;] ? lustre_cfg_new+0x391/0x7e0 [obdclass]
23:04:52: [&amp;lt;ffffffffa07221e9&amp;gt;] class_manual_cleanup+0x179/0x6f0 [obdclass]
23:04:52: [&amp;lt;ffffffffa06f4b06&amp;gt;] ? class_name2dev+0x56/0xe0 [obdclass]
23:04:52: [&amp;lt;ffffffffa07571dc&amp;gt;] server_put_super+0x5bc/0xf00 [obdclass]
23:04:52: [&amp;lt;ffffffff8118366b&amp;gt;] generic_shutdown_super+0x5b/0xe0
23:04:52: [&amp;lt;ffffffff81183756&amp;gt;] kill_anon_super+0x16/0x60
23:04:52: [&amp;lt;ffffffffa07240a6&amp;gt;] lustre_kill_super+0x36/0x60 [obdclass]
23:04:52: [&amp;lt;ffffffff81183ef7&amp;gt;] deactivate_super+0x57/0x80
23:04:52: [&amp;lt;ffffffff811a21ef&amp;gt;] mntput_no_expire+0xbf/0x110
23:04:52: [&amp;lt;ffffffff811a2c5b&amp;gt;] sys_umount+0x7b/0x3a0
23:04:52: [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
23:04:52:
23:04:52:Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/beee4bb4-60cb-11e3-bd66-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/beee4bb4-60cb-11e3-bd66-52540035b04c&lt;/a&gt;&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/63/&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/63/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64&lt;br/&gt;
FSTYPE=zfs&lt;br/&gt;
</environment>
        <key id="22387">LU-4365</key>
            <summary>recovery-small test_51: LBUG: (qsd_lib.c:294:qsd_qtype_fini()) ASSERTION( atomic_read(&amp;qqi-&gt;qqi_ref) == 1 ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="yujian">Jian Yu</reporter>
                        <labels>
                    </labels>
                <created>Mon, 9 Dec 2013 15:36:57 +0000</created>
                <updated>Tue, 31 Dec 2013 15:38:02 +0000</updated>
                            <resolved>Tue, 17 Dec 2013 16:43:17 +0000</resolved>
                                    <version>Lustre 2.4.1</version>
                    <version>Lustre 2.5.0</version>
                                    <fixVersion>Lustre 2.6.0</fixVersion>
                    <fixVersion>Lustre 2.4.2</fixVersion>
                    <fixVersion>Lustre 2.5.1</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="73078" author="yujian" created="Mon, 9 Dec 2013 15:44:39 +0000"  >&lt;p&gt;The same test &lt;b&gt;passed&lt;/b&gt; in another ZFS full group test session on Lustre b2_4 build #63:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/40487e62-5fb6-11e3-85c5-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/40487e62-5fb6-11e3-85c5-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;By searching on Maloo, I found all of the previous recovery-small test 51 failures were &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3460&quot; title=&quot;recovery-small test_51 timeout: lqe_iter_cb(): Inuse quota entry&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3460&quot;&gt;&lt;del&gt;LU-3460&lt;/del&gt;&lt;/a&gt;. Is this a potential regression introduced by the fix of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3460&quot; title=&quot;recovery-small test_51 timeout: lqe_iter_cb(): Inuse quota entry&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3460&quot;&gt;&lt;del&gt;LU-3460&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="73159" author="pjones" created="Tue, 10 Dec 2013 00:02:35 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please comment?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="73343" author="bobijam" created="Thu, 12 Dec 2013 07:31:10 +0000"  >&lt;p&gt;Hi Niu, &lt;/p&gt;

&lt;p&gt;I have a question about qsd_glb_blocking_ast(), in which&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;case&lt;/span&gt; LDLM_CB_CANCELING: {
                struct qsd_qtype_info *qqi;

                LDLM_DEBUG(lock, &lt;span class=&quot;code-quote&quot;&gt;&quot;canceling global quota lock&quot;&lt;/span&gt;);

                qqi = qsd_glb_ast_data_get(lock, &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (qqi == NULL)
                        &lt;span class=&quot;code-keyword&quot;&gt;break&lt;/span&gt;;

...
                lu_ref_del(&amp;amp;qqi-&amp;gt;qqi_reference, &lt;span class=&quot;code-quote&quot;&gt;&quot;ast_data_get&quot;&lt;/span&gt;, lock);
                qqi_putref(qqi);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;since it calls qsd_glb_ast_data_get(lock, true) to reset the lock-&amp;gt;l_ast_data and in it the qqi&apos;s ref has been put, why here blocking ast put its ref again?&lt;/p&gt;

&lt;p&gt;While in qsd_glb_glimpse_ast(), it calls qsd_glb_ast_data_get(lock, false) and qqi&apos;s ref is kept.&lt;/p&gt;</comment>
                            <comment id="73458" author="yujian" created="Fri, 13 Dec 2013 08:42:42 +0000"  >&lt;p&gt;Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_4/67/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_4/67/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL6.4/x86_64 (server), SLES11SP2/x86_64 (client)&lt;/p&gt;

&lt;p&gt;sanity-scrub test 3 also hit this LBUG:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/4bdfe456-6347-11e3-8c76-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/4bdfe456-6347-11e3-8c76-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73550" author="niu" created="Mon, 16 Dec 2013 05:00:42 +0000"  >&lt;p&gt;Hi, Bobi&lt;/p&gt;

&lt;p&gt;The blocking ast calls qsd_glb_ast_data_get(lock, true) to put the qqi ref which is held by the lock (since the lock is going to be canceled), and then put the ref which was just acquired in qsd_glb_ast_data_get() (for the caller). In qsd_glb_glimpse_ast(), it only needs to put the ref held for the caller.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;23:04:50:LustreError: 7276:0:(ldlm_resource.c:804:ldlm_resource_complain()) lustre-MDT0000-lwp-MDT0000: namespace resource [0x200000006:0x1010000:0x0].0 (ffff8800709d3980) refcount nonzero (1) after lock cleanup; forcing cleanup.
23:04:50:LustreError: 7276:0:(ldlm_resource.c:804:ldlm_resource_complain()) Skipped 1 previous similar message
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1415:ldlm_resource_dump()) --- Resource: [0x200000006:0x1010000:0x0].0 (ffff8800709d3980) refcount = 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1418:ldlm_resource_dump()) Granted locks (in reverse order):
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1421:ldlm_resource_dump()) ### ### ns: lustre-MDT0000-lwp-MDT0000 lock: ffff880063768b40/0x3b627f62d662915 lrc: 2/1,0 mode: CR/CR res: [0x200000006:0x1010000:0x0].0 rrc: 2 type: PLN flags: 0x1106400000000 nid: local remote: 0x3b627f62d66293f expref: -99 pid: 6945 timeout: 0 lvb_type: 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1421:ldlm_resource_dump()) Skipped 1 previous similar message
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1415:ldlm_resource_dump()) --- Resource: [0x200000006:0x10000:0x0].0 (ffff8800709d3ac0) refcount = 2
23:04:50:LustreError: 7276:0:(ldlm_resource.c:1418:ldlm_resource_dump()) Granted locks (in reverse order):
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The qqi reference is held by the global quota locks, and the two global quota locks were not cleaned up because the lock reader count was not 0, but I don&apos;t see who is holding the lock.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;23:04:38:LustreError: 5385:0:(qsd_reint.c:58:qsd_reint_completion()) lustre-MDT0000: failed to enqueue global quota lock, glb fid:[0x200000006:0x1010000:0x0], rc:-5
23:04:38:LustreError: 5385:0:(qsd_reint.c:58:qsd_reint_completion()) Skipped 1 previous similar message
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I see there were some lock enqueue failures in the log, maybe there is some bug in the error path of lock enqueue which caused the lock not dropped properly?&lt;/p&gt;</comment>
                            <comment id="73554" author="niu" created="Mon, 16 Dec 2013 06:25:55 +0000"  >&lt;p&gt;Looks like there is a race: on server umount, if the quota reint process is still holding the global locks for reintegration, the locks will not be cleared by server_put_super() &lt;del&gt;&amp;gt; lustre_disconnect_lwp(), and later on when qsd_fini() stops the reint process, the global locks will be dropped; however, because lock cancel is done asynchronously, when qsd_fini() reaches LASSERT(cfs_atomic_read(&amp;amp;qqi&lt;/del&gt;&amp;gt;qqi_ref) == 1), the global locks haven&apos;t been canceled yet and they are still holding the qqi refcount.&lt;/p&gt;

&lt;p&gt;It seems we&apos;d better wait a while for such a case in qsd_fini(); I&apos;ll try to cook a patch soon.&lt;/p&gt;</comment>
                            <comment id="73557" author="niu" created="Mon, 16 Dec 2013 07:13:54 +0000"  >&lt;p&gt;patch for master: &lt;a href=&quot;http://review.whamcloud.com/8583&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8583&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73558" author="niu" created="Mon, 16 Dec 2013 08:10:05 +0000"  >&lt;p&gt;patch for b2_4: &lt;a href=&quot;http://review.whamcloud.com/8586&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8586&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="73652" author="yujian" created="Tue, 17 Dec 2013 06:05:50 +0000"  >&lt;p&gt;Patch landed on master branch for 2.6.0 and b2_4 branch for 2.4.2.&lt;/p&gt;</comment>
                            <comment id="73696" author="jlevi" created="Tue, 17 Dec 2013 16:40:17 +0000"  >&lt;p&gt;Patches have landed. Can this ticket be closed, or is more work expected/needed in this ticket?&lt;/p&gt;</comment>
                            <comment id="73697" author="pjones" created="Tue, 17 Dec 2013 16:43:17 +0000"  >&lt;p&gt;Landed for 2.4.2 and 2.5. Will land to 2.5.1 when work starts on that release&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwaxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11948</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>