<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:55:36 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5913] client stuck in ptlrpc_invalidate_import</title>
                <link>https://jira.whamcloud.com/browse/LU-5913</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;?DUP of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10&quot; title=&quot;Client stuck in ptlrpc_invalidate_import() after eviction&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10&quot;&gt;&lt;del&gt;LU-10&lt;/del&gt;&lt;/a&gt;?&lt;br/&gt;
1. Client gets an obd_ping failure&lt;br/&gt;
2. Client is evicted&lt;br/&gt;
3. Client can&apos;t reconnect and is stuck in ptlrpc_invalidate_import&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Nov 12 04:32:30 pfe22 kernel: [69643.816473] LustreError: 11-0: nbp9-OST0075-osc-ffff880a28404400: Communicating with 10.151.26.11@o2ib, operation obd_ping failed with -107.
Nov 12 04:32:30 pfe22 kernel: [69643.854158] Lustre: nbp9-OST0075-osc-ffff880a28404400: Connection to nbp9-OST0075 (at 10.151.26.11@o2ib) was lost; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will wait &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; recovery to complete
Nov 12 04:32:30 pfe22 kernel: [69643.922758] LustreError: 167-0: nbp9-OST0075-osc-ffff880a28404400: This client was evicted by nbp9-OST0075; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.
Nov 12 04:34:11 pfe22 kernel: [69743.757653] LustreError: 92481:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:324:ptlrpc_invalidate_import()) nbp9-OST0075_UUID: rc = -110 waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; callback (1 != 0)
Nov 12 04:34:11 pfe22 kernel: [69743.793518] LustreError: 92481:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:350:ptlrpc_invalidate_import()) @@@ still on sending list  req@ffff8802204cec00 x1484496199485132/t0(0) o4-&amp;gt;nbp9-OST0075-osc-ffff880a28404400@10.151.26.11@o2ib:6/4 lens 488/448 e 0 to 0 dl 1415726912 ref 2 fl Rpc:RE/0/ffffffff rc -5/-1
Nov 12 04:34:11 pfe22 kernel: [69743.866993] LustreError: 92481:0:(&lt;span class=&quot;code-keyword&quot;&gt;import&lt;/span&gt;.c:366:ptlrpc_invalidate_import()) nbp9-OST0075_UUID: RPCs in &lt;span class=&quot;code-quote&quot;&gt;&quot;Unregistering&quot;&lt;/span&gt; phase found (0). Network is sluggish? Waiting them to error out.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We find ldlm_bl_ threads stuck in D state.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Stack traceback &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; pid 8080
0xffff8802569b2540     8080        2  0    1   D  0xffff8802569b2bb0  ldlm_bl_04
 [&amp;lt;ffffffff8146fb6b&amp;gt;] thread_return+0x0/0x295
 [&amp;lt;ffffffff81470c58&amp;gt;] __mutex_lock_slowpath+0xf8/0x150
 [&amp;lt;ffffffff814706ea&amp;gt;] mutex_lock+0x1a/0x40
 [&amp;lt;ffffffffa08bd28e&amp;gt;] cl_lock_mutex_get+0x6e/0xc0 [obdclass]
 [&amp;lt;ffffffffa0b8988e&amp;gt;] osc_dlm_blocking_ast0+0x5e/0x210 [osc]
 [&amp;lt;ffffffffa0b89a8c&amp;gt;] osc_ldlm_blocking_ast+0x4c/0x100 [osc]
 [&amp;lt;ffffffffa09e30a0&amp;gt;] ldlm_handle_bl_callback+0xc0/0x420 [ptlrpc]
 [&amp;lt;ffffffffa09e3609&amp;gt;] ldlm_bl_thread_main+0x209/0x430 [ptlrpc]
 [&amp;lt;ffffffff8147ade4&amp;gt;] kernel_thread_helper+0x4/0x10
[0]kdb&amp;gt; btp 8090
Stack traceback &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; pid 8090
0xffff88024c9a6640     8090        2  0    6   D  0xffff88024c9a6cb0  ldlm_bl_12
 [&amp;lt;ffffffff8146fb6b&amp;gt;] thread_return+0x0/0x295
 [&amp;lt;ffffffff81470c58&amp;gt;] __mutex_lock_slowpath+0xf8/0x150
 [&amp;lt;ffffffff814706ea&amp;gt;] mutex_lock+0x1a/0x40
 [&amp;lt;ffffffffa08bd28e&amp;gt;] cl_lock_mutex_get+0x6e/0xc0 [obdclass]
 [&amp;lt;ffffffffa0b8988e&amp;gt;] osc_dlm_blocking_ast0+0x5e/0x210 [osc]
 [&amp;lt;ffffffffa0b89a8c&amp;gt;] osc_ldlm_blocking_ast+0x4c/0x100 [osc]
 [&amp;lt;ffffffffa09e30a0&amp;gt;] ldlm_handle_bl_callback+0xc0/0x420 [ptlrpc]
 [&amp;lt;ffffffffa09e3609&amp;gt;] ldlm_bl_thread_main+0x209/0x430 [ptlrpc]
 [&amp;lt;ffffffff8147ade4&amp;gt;] kernel_thread_helper+0x4/0x10
[0]kdb&amp;gt; go
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>client and server version lustre2.4.3-7nas</environment>
        <key id="27570">LU-5913</key>
            <summary>client stuck in ptlrpc_invalidate_import</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 12 Nov 2014 19:30:00 +0000</created>
                <updated>Fri, 16 Oct 2015 04:27:42 +0000</updated>
                            <resolved>Fri, 16 Oct 2015 04:27:42 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="99027" author="bfaccini" created="Thu, 13 Nov 2014 09:41:58 +0000"  >&lt;p&gt;Hello Mahmoud,&lt;br/&gt;
Have you had the chance to at least collect the full stack traces and the Lustre debug log? Or maybe a full crash dump of the client has been taken?&lt;/p&gt;</comment>
                            <comment id="99030" author="pjones" created="Thu, 13 Nov 2014 13:00:59 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="99097" author="mhanafi" created="Thu, 13 Nov 2014 20:27:52 +0000"  >&lt;p&gt;I have uploaded lustre debug logs and stacktrace for all processes to the ftp site.&lt;/p&gt;

&lt;p&gt;ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5913&quot; title=&quot;client stuck in ptlrpc_invalidate_import&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5913&quot;&gt;&lt;del&gt;LU-5913&lt;/del&gt;&lt;/a&gt;/r447i2n10.stacktrace.gz&lt;br/&gt;
ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5913&quot; title=&quot;client stuck in ptlrpc_invalidate_import&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5913&quot;&gt;&lt;del&gt;LU-5913&lt;/del&gt;&lt;/a&gt;/r447i2n10.lustredebug.gz&lt;/p&gt;</comment>
                            <comment id="99317" author="jay" created="Mon, 17 Nov 2014 06:29:44 +0000"  >&lt;p&gt;I saw that the statahead thread below was trying to enqueue a lock, and the enqueue process was then stuck writing back dirty pages. This is bad because it does so while holding the parent lock, which may cause a deadlock.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[1415909369.169600] ll_sa_96389     S ffff880734597460     0 96438      2 0x00000000
[1415909369.169600]  ffff880734597390 0000000000000046 ffff880734596010 0000000000011800
[1415909369.169600]  0000000000011800 0000000000011800 0000000000011800 ffff880734597fd8
[1415909369.169600]  ffff880734597fd8 0000000000011800 ffff880688818500 ffffffff81a11020
[1415909369.169600] Call Trace:
[1415909369.169600]  [&amp;lt;ffffffffa09a4b7b&amp;gt;] osc_extent_wait+0x5eb/0x680 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa09a5146&amp;gt;] osc_cache_wait_range+0x536/0x820 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa09a5f55&amp;gt;] osc_cache_writeback_range+0xb25/0x1140 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa099025d&amp;gt;] osc_lock_flush+0x7d/0x260 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa0990539&amp;gt;] osc_lock_cancel+0xf9/0x1e0 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa06c3225&amp;gt;] cl_lock_cancel0+0x65/0x150 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa06c3f7b&amp;gt;] cl_lock_cancel+0x14b/0x150 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa099107d&amp;gt;] osc_lock_blocking+0x5d/0xf0 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa0991929&amp;gt;] osc_dlm_blocking_ast0+0xf9/0x210 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa0991a8c&amp;gt;] osc_ldlm_blocking_ast+0x4c/0x100 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa07d4e0f&amp;gt;] ldlm_cancel_callback+0x5f/0x180 [ptlrpc]
[1415909369.169600]  [&amp;lt;ffffffffa07e320f&amp;gt;] ldlm_cli_cancel_local+0x7f/0x480 [ptlrpc]
[1415909369.169600]  [&amp;lt;ffffffffa07e65a2&amp;gt;] ldlm_cli_cancel_list_local+0xf2/0x290 [ptlrpc]
[1415909369.169600]  [&amp;lt;ffffffffa07e839f&amp;gt;] ldlm_prep_elc_req+0x3df/0x490 [ptlrpc]
[1415909369.169600]  [&amp;lt;ffffffffa07e846f&amp;gt;] ldlm_prep_enqueue_req+0x1f/0x30 [ptlrpc]
[1415909369.169600]  [&amp;lt;ffffffffa0977280&amp;gt;] osc_enqueue_base+0x150/0x660 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa09914a0&amp;gt;] osc_lock_enqueue+0x310/0x4c0 [osc]
[1415909369.169600]  [&amp;lt;ffffffffa06c4722&amp;gt;] cl_enqueue_kick+0x62/0x150 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa06c7f31&amp;gt;] cl_enqueue_try+0xc1/0x210 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa0a27e2d&amp;gt;] lov_lock_enqueue_one+0x4d/0x170 [lov]
[1415909369.169600]  [&amp;lt;ffffffffa0a2b07b&amp;gt;] lov_lock_enqueue+0x16b/0x3e0 [lov]
[1415909369.169600]  [&amp;lt;ffffffffa06c4722&amp;gt;] cl_enqueue_kick+0x62/0x150 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa06c7f31&amp;gt;] cl_enqueue_try+0xc1/0x210 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa06c8817&amp;gt;] cl_enqueue_locked+0x77/0x1e0 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa06c9509&amp;gt;] cl_lock_request+0x79/0x280 [obdclass]
[1415909369.169600]  [&amp;lt;ffffffffa0afcd2d&amp;gt;] cl_glimpse_lock+0x17d/0x490 [lustre]
[1415909369.169600]  [&amp;lt;ffffffffa0afd1ef&amp;gt;] cl_glimpse_size0+0x1af/0x1e0 [lustre]
[1415909369.169600]  [&amp;lt;ffffffffa0af5aae&amp;gt;] ll_agl_trigger+0x2ae/0x4d0 [lustre]
[1415909369.169600]  [&amp;lt;ffffffffa0afbeac&amp;gt;] ll_statahead_thread+0x40c/0xd10 [lustre]
[1415909369.169600]  [&amp;lt;ffffffff8147ae64&amp;gt;] kernel_thread_helper+0x4/0x10
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I guess this is a reproduction of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4300&quot; title=&quot;ptlrpcd threads deadlocked in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4300&quot;&gt;&lt;del&gt;LU-4300&lt;/del&gt;&lt;/a&gt;. If we can get the log from the OST side, we can verify this.&lt;/p&gt;

&lt;p&gt;Anyway, let&apos;s try the patch at &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4300&quot; title=&quot;ptlrpcd threads deadlocked in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4300&quot;&gt;&lt;del&gt;LU-4300&lt;/del&gt;&lt;/a&gt;, master commit: 44cdf59 and see if it can fix the problem. &lt;/p&gt;</comment>
                            <comment id="99504" author="mhanafi" created="Tue, 18 Nov 2014 19:22:20 +0000"  >&lt;p&gt;Which logs would you like from the OST side?&lt;/p&gt;</comment>
                            <comment id="99662" author="bobijam" created="Thu, 20 Nov 2014 10:18:28 +0000"  >&lt;p&gt;Lustre debug log.&lt;/p&gt;</comment>
                            <comment id="100732" author="mhanafi" created="Thu, 4 Dec 2014 20:20:35 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4300&quot; title=&quot;ptlrpcd threads deadlocked in cl_lock_mutex_get&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4300&quot;&gt;&lt;del&gt;LU-4300&lt;/del&gt;&lt;/a&gt; appears to have fixed the issue. Since applying the patch, we have not hit the bug.&lt;/p&gt;
</comment>
                            <comment id="130571" author="mhanafi" created="Fri, 16 Oct 2015 00:03:10 +0000"  >&lt;p&gt;Please close this&lt;/p&gt;</comment>
                            <comment id="130592" author="pjones" created="Fri, 16 Oct 2015 04:27:42 +0000"  >&lt;p&gt;Thanks Mahmoud&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx0tr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16510</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>