<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:26:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2630] Client hang in mdc_enqueue()</title>
                <link>https://jira.whamcloud.com/browse/LU-2630</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;On a number of Sequoia IONs (ppc64 lustre clients) we hit an mdc hang under mdc_enqueue().  Many processes are stuck in do_path_lookup() like this:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-16 13:33:21.160679 {DefaultControlEventListener} [mmcs]{474}.0.0: sysiod        D 00000fffaea576bc     0 15111   3083 0x00000002
2013-01-16 13:33:21.160730 {DefaultControlEventListener} [mmcs]{474}.0.0: Call Trace:
2013-01-16 13:33:21.160782 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416aa30] [8000000004012b20] nscheds+0x21dc/0xfffffffffffec004 [ko2iblnd] (unreliable)
2013-01-16 13:33:21.160835 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416ac00] [c000000000009b2c] .__switch_to+0xc4/0x100
2013-01-16 13:33:21.160886 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416ac90] [c00000000042a418] .schedule+0x7d4/0x944
2013-01-16 13:33:21.160937 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416af40] [c00000000042b4a8] .__mutex_lock_slowpath+0x208/0x390
2013-01-16 13:33:21.160988 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b040] [c00000000042bf44] .mutex_lock+0x38/0x58
2013-01-16 13:33:21.161040 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b0c0] [8000000005ac1830] .mdc_enqueue+0x6e0/0x1c60 [mdc]
2013-01-16 13:33:21.161092 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b280] [8000000005ac3060] .mdc_intent_lock+0x2b0/0x820 [mdc]
2013-01-16 13:33:21.161158 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b400] [8000000007411f64] .lmv_intent_open+0x4e4/0x1940 [lmv]
2013-01-16 13:33:21.161232 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b590] [80000000074136dc] .lmv_intent_lock+0x31c/0x4b0 [lmv]
2013-01-16 13:33:21.161314 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b690] [80000000068d3a3c] .ll_revalidate_it+0x41c/0x2220 [lustre]
2013-01-16 13:33:21.161410 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b7f0] [80000000068d59ec] .ll_revalidate_nd+0x1ac/0x560 [lustre]
2013-01-16 13:33:21.161539 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b8b0] [c0000000000dc318] .do_lookup+0x180/0x218
2013-01-16 13:33:21.161626 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416b970] [c0000000000dee8c] .__link_path_walk+0xb90/0x11d4
2013-01-16 13:33:21.161694 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416ba90] [c0000000000df830] .path_walk+0x98/0x148
2013-01-16 13:33:21.161752 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416bb30] [c0000000000dfa34] .do_path_lookup+0x38/0x78
2013-01-16 13:33:21.161817 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416bbc0] [c0000000000e08a4] .do_filp_open+0x11c/0xadc
2013-01-16 13:33:21.161880 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416bd80] [c0000000000cf498] .do_sys_open+0x8c/0x18c
2013-01-16 13:33:21.161942 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e416be30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;because this process is sleeping while holding the lock:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-16 13:33:21.078061 {DefaultControlEventListener} [mmcs]{474}.0.0: sysiod        S 00000fff965b7148     0 15097   3083 0x00000000
2013-01-16 13:33:21.078112 {DefaultControlEventListener} [mmcs]{474}.0.0: Call Trace:
2013-01-16 13:33:21.078163 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2630] [c000000000027d1c] .resched_task+0x9c/0xb4 (unreliable)
2013-01-16 13:33:21.078214 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2800] [c000000000009b2c] .__switch_to+0xc4/0x100
2013-01-16 13:33:21.078266 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2890] [c00000000042a418] .schedule+0x7d4/0x944
2013-01-16 13:33:21.078317 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2b40] [8000000000ae2090] .cfs_waitq_wait+0x10/0x30 [libcfs]
2013-01-16 13:33:21.078368 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2bb0] [8000000003b56c3c] .ldlm_completion_ast+0x6dc/0xe30 [ptlrpc]
2013-01-16 13:33:21.078420 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2cf0] [8000000003b542c4] .ldlm_cli_enqueue_fini+0xa34/0x16b0 [ptlrpc]
2013-01-16 13:33:21.078472 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2e20] [8000000003b553b4] .ldlm_cli_enqueue+0x474/0x970 [ptlrpc]
2013-01-16 13:33:21.078524 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae2f60] [8000000005ac18ac] .mdc_enqueue+0x75c/0x1c60 [mdc]
2013-01-16 13:33:21.078576 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3120] [80000000073ffbf4] .lmv_enqueue+0x504/0x19f0 [lmv]
2013-01-16 13:33:21.078627 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3280] [80000000068de4a4] .ll_get_dir_page+0x354/0x1ee0 [lustre]
2013-01-16 13:33:21.078679 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3450] [8000000006956a28] .do_statahead_enter+0x6d8/0x21f0 [lustre]
2013-01-16 13:33:21.078730 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3610] [800000000693c84c] .ll_lookup_it+0xb8c/0xfc0 [lustre]
2013-01-16 13:33:21.078782 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3770] [800000000693cd50] .ll_lookup_nd+0xd0/0x580 [lustre]
2013-01-16 13:33:21.078834 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3830] [c0000000000dc2b8] .do_lookup+0x120/0x218
2013-01-16 13:33:21.078885 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae38f0] [c0000000000dee8c] .__link_path_walk+0xb90/0x11d4
2013-01-16 13:33:21.078936 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3a10] [c0000000000df830] .path_walk+0x98/0x148
2013-01-16 13:33:21.078988 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3ab0] [c0000000000dfa34] .do_path_lookup+0x38/0x78
2013-01-16 13:33:21.079039 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3b40] [c0000000000e05e0] .user_path_at+0x68/0xc0
2013-01-16 13:33:21.079090 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3c90] [c0000000000d5e30] .vfs_fstatat+0x5c/0xa8
2013-01-16 13:33:21.079141 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3d30] [c0000000000d5f00] .SyS_newfstatat+0x2c/0x58
2013-01-16 13:33:21.079193 {DefaultControlEventListener} [mmcs]{474}.0.0: [c0000003e0ae3e30] [c000000000000580] syscall_exit+0x0/0x2c
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It could be that this situation was a result of an eviction event.  But note also that the client claimed to be fully recovered with the mds.  See attached console log with sysrq-t output.&lt;/p&gt;

&lt;p&gt;Also note that sending a &quot;kill -9&quot; to process 15097 did break the system out of its hang.&lt;/p&gt;</description>
                <environment>Sequoia, lustre 2.3.58-5chaos (github.com/chaos/lustre)</environment>
        <key id="17199">LU-2630</key>
            <summary>Client hang in mdc_enqueue()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="morrone">Christopher Morrone</reporter>
                        <labels>
                            <label>sequoia</label>
                    </labels>
                <created>Wed, 16 Jan 2013 18:03:32 +0000</created>
                <updated>Thu, 8 May 2014 14:01:43 +0000</updated>
                            <resolved>Fri, 12 Apr 2013 16:27:26 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="50664" author="pjones" created="Thu, 17 Jan 2013 10:13:25 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please help with this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="50702" author="adilger" created="Thu, 17 Jan 2013 13:34:03 +0000"  >&lt;p&gt;Chris, what kernel version is running on the clients?  I&apos;m wondering if this might relate to the client-side VFS dcache changes?&lt;/p&gt;</comment>
                            <comment id="50704" author="morrone" created="Thu, 17 Jan 2013 13:47:57 +0000"  >&lt;p&gt;It is 2.6.32-220.23.3.bgq.18llnl.V1R1M2.bgq62_16.ppc64, a RHEL6.2-ish kernel.&lt;/p&gt;</comment>
                            <comment id="50771" author="laisiyao" created="Fri, 18 Jan 2013 01:51:29 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-16 06:14:13.281231 {DefaultControlEventListener} [mmcs]{474}.8.1: LustreError: 15097:0:(ldlm_request.c:137:ldlm_expired_completion_wait()) ### lock timed out (enqueued at 1358344459, 300s ago), entering recovery for ls1-MDT0000_UUID@172.20.5.1@o2ib500 ns: ls1-MDT0000-mdc-c00000036832f400 lock: c0000002d4f2ed80/0x10212001ce8912e0 lrc: 4/1,0 mode: --/PR res: 8589991817/133 bits 0x2 rrc: 1 type: IBT flags: 0x0 nid: local remote: 0x481c21d383474cbf expref: -99 pid: 15097 timeout: 0 lvb_type: 0
2013-01-16 06:14:14.460938 {DefaultControlEventListener} [mmcs]{474}.3.0: Lustre: ls1-MDT0000-mdc-c00000036832f400: Connection restored to ls1-MDT0000 (at 172.20.5.1@o2ib500)
2013-01-16 06:21:59.693333 {DefaultControlEventListener} [mmcs]{474}.7.1: 2013-01-16 06:16:30.670 (INFO ) [0xfff77bc7680] ibm.cios.jobctld.HwJobController: Job 44223 added with 128 compute nodes
2013-01-16 08:17:03.466123 {DefaultControlEventListener} [mmcs]{474}.8.3: LustreError: 12612:0:(ldlm_lock.c:1570:ldlm_fill_lvb()) ### Unexpected LVB type ns: ls1-MDT0000-mdc-c00000036832f400 lock: c0000002d4f2ed80/0x10212001ce8912e0 lrc: 5/1,0 mode: --/PR res: 8589991817/133 bits 0x2 rrc: 1 type: IBT flags: 0x400400000000 nid: local remote: 0x481c21d383474cbf expref: -99 pid: 15097 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The log shows process 15097 is in a loop waiting for lock to be granted or canceled, but I can&apos;t find function ldlm_fill_lvb() in lustre code, could you verify this? BTW could you attach MDT logs?&lt;/p&gt;</comment>
                            <comment id="50772" author="morrone" created="Fri, 18 Jan 2013 02:19:44 +0000"  >&lt;p&gt;Lai, make sure you are on 2.3.58 (or our 2.3.58-5chaos tag).  ldlm_fill_lvb() is here:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ git grep ldlm_fill_lvb
lustre/ldlm/ldlm_internal.h:int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
lustre/ldlm/ldlm_lock.c:int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
lustre/ldlm/ldlm_lockd.c:               rc = ldlm_fill_lvb(lock, &amp;amp;req-&amp;gt;rq_pill, RCL_CLIENT,
lustre/ldlm/ldlm_request.c:                     rc = ldlm_fill_lvb(lock, &amp;amp;req-&amp;gt;rq_pill, RCL_SERVER,
lustre/ldlm/ldlm_request.c:                     rc = ldlm_fill_lvb(lock, &amp;amp;req-&amp;gt;rq_pill, RCL_SERVER,
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ll see what I can get from the MDS logs at work tomorrow.&lt;/p&gt;</comment>
                            <comment id="51322" author="laisiyao" created="Mon, 28 Jan 2013 09:53:32 +0000"  >&lt;p&gt;Chris, I was on vacation last week, how is the MDS logs? I&apos;m looking into the right branch now, last time I was checking 2.3.57 branch.&lt;/p&gt;</comment>
                            <comment id="51461" author="laisiyao" created="Wed, 30 Jan 2013 11:02:26 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-01-16 08:17:03.466123 {DefaultControlEventListener} [mmcs]{474}.8.3: LustreError: 12612:0:(ldlm_lock.c:1570:ldlm_fill_lvb()) ### Unexpected LVB type ns: ls1-MDT0000-mdc-c00000036832f400 lock: c0000002d4f2ed80/0x10212001ce8912e0 lrc: 5/1,0 mode: --/PR res: 8589991817/133 bits 0x2 rrc: 1 type: IBT flags: 0x400400000000 nid: local remote: 0x481c21d383474cbf expref: -99 pid: 15097 timeout: 0 lvb_type: 0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This line looks quite suspicious, because an IBIT lock shouldn&apos;t have lvb (currently only extent/quota/layout lock has lvb), but I can&apos;t find any place in the code this could happen. If this can be reproduced, and with more logs (debug logs with ldlm is best) of both client and MDS, it will help a lot.&lt;/p&gt;</comment>
                            <comment id="51880" author="adilger" created="Wed, 6 Feb 2013 13:31:47 +0000"  >&lt;p&gt;Fan Yong, any comment here about the LVB on the IBIT lock?&lt;/p&gt;</comment>
                            <comment id="51929" author="yong.fan" created="Wed, 6 Feb 2013 19:04:03 +0000"  >&lt;p&gt;It is more similar as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2645&quot; title=&quot;1.8&amp;lt;-&amp;gt;2.4 interop: enqueue objid 0x2 subobj 0x1 on OST idx 0: rc -5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2645&quot;&gt;&lt;del&gt;LU-2645&lt;/del&gt;&lt;/a&gt;, the client evicted due to lock completion callback timeout, although &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2645&quot; title=&quot;1.8&amp;lt;-&amp;gt;2.4 interop: enqueue objid 0x2 subobj 0x1 on OST idx 0: rc -5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2645&quot;&gt;&lt;del&gt;LU-2645&lt;/del&gt;&lt;/a&gt; is under interop mode. I am working on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2645&quot; title=&quot;1.8&amp;lt;-&amp;gt;2.4 interop: enqueue objid 0x2 subobj 0x1 on OST idx 0: rc -5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2645&quot;&gt;&lt;del&gt;LU-2645&lt;/del&gt;&lt;/a&gt;. So if possible, we can see what will happen after &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2645&quot; title=&quot;1.8&amp;lt;-&amp;gt;2.4 interop: enqueue objid 0x2 subobj 0x1 on OST idx 0: rc -5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2645&quot;&gt;&lt;del&gt;LU-2645&lt;/del&gt;&lt;/a&gt; resolved, or just re-assign this one to me also.&lt;/p&gt;</comment>
                            <comment id="53506" author="yong.fan" created="Thu, 7 Mar 2013 01:41:12 +0000"  >&lt;p&gt;Chris, have you seen the failure recently? Would you please to show me the MDT side log? Thanks!&lt;/p&gt;</comment>
                            <comment id="53550" author="morrone" created="Thu, 7 Mar 2013 14:10:30 +0000"  >&lt;p&gt;No, haven&apos;t seen recently.&lt;/p&gt;</comment>
                            <comment id="53554" author="pjones" created="Thu, 7 Mar 2013 14:42:11 +0000"  >&lt;p&gt;ok then I will drop the priority for now&lt;/p&gt;</comment>
                            <comment id="56200" author="yong.fan" created="Fri, 12 Apr 2013 12:39:35 +0000"  >&lt;p&gt;Can we close it since we have not seen it for months since LU-2645 was resolved? We can re-open it when it is reproduced in the future.&lt;/p&gt;</comment>
                            <comment id="56205" author="morrone" created="Fri, 12 Apr 2013 16:20:15 +0000"  >&lt;p&gt;That is fine.&lt;/p&gt;</comment>
                            <comment id="56206" author="yong.fan" created="Fri, 12 Apr 2013 16:27:26 +0000"  >&lt;p&gt;We cannot reproduce it anymore since &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2645&quot; title=&quot;1.8&amp;lt;-&amp;gt;2.4 interop: enqueue objid 0x2 subobj 0x1 on OST idx 0: rc -5&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2645&quot;&gt;&lt;del&gt;LU-2645&lt;/del&gt;&lt;/a&gt; patch applied. Reopen it if reproduced it in the future.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="12166" name="R20-ID-J02.log" size="2559240" author="morrone" created="Wed, 16 Jan 2013 18:03:32 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvfhj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6154</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>