<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:29:28 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2931] OST umount hangs for over 1 hour</title>
                <link>https://jira.whamcloud.com/browse/LU-2931</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After scheduled maintenance, Yale was attempting to failback their OSTs from the failover server to the primary server, but the umounts hung on the failover server for over an hour until the machine was rebooted. Here is an example of the messages we have seen:&lt;/p&gt;

&lt;p&gt;Feb 28 09:31:12 oss9 kernel: Lustre: Service thread pid 2708 was inactive for 200.00s. The thread might be hung, or it might only be slow and will resume later. Dumping&lt;br/&gt;
 the stack trace for debugging purposes:&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: Pid: 2708, comm: ll_ost_11&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: Call Trace:&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80063002&amp;gt;&amp;#93;&lt;/span&gt; thread_return+0x62/0xfe&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8002dee8&amp;gt;&amp;#93;&lt;/span&gt; __wake_up+0x38/0x4f&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a12828&amp;gt;&amp;#93;&lt;/span&gt; jbd2_log_wait_commit+0xa3/0xf5 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff800a34a7&amp;gt;&amp;#93;&lt;/span&gt; autoremove_wake_function+0x0/0x2e&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a0d5ae&amp;gt;&amp;#93;&lt;/span&gt; jbd2_journal_stop+0x1e6/0x215 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88af0d05&amp;gt;&amp;#93;&lt;/span&gt; filter_sync+0xc5/0x5c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdfilter&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887c30c1&amp;gt;&amp;#93;&lt;/span&gt; ldlm_pool_add+0x131/0x190 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887b39af&amp;gt;&amp;#93;&lt;/span&gt; ldlm_export_lock_put+0x6f/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887c40a5&amp;gt;&amp;#93;&lt;/span&gt; interval_next+0xf5/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a9edac&amp;gt;&amp;#93;&lt;/span&gt; ost_blocking_ast+0x79c/0x9b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88728cf0&amp;gt;&amp;#93;&lt;/span&gt; class_handle2object+0xe0/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8879a270&amp;gt;&amp;#93;&lt;/span&gt; ldlm_resource_putref_internal+0x230/0x460 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff80064b09&amp;gt;&amp;#93;&lt;/span&gt; _spin_lock_bh+0x9/0x14&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887932fd&amp;gt;&amp;#93;&lt;/span&gt; ldlm_cancel_callback+0x6d/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88797580&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_cancel+0xc0/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887b58e5&amp;gt;&amp;#93;&lt;/span&gt; ldlm_request_cancel+0x265/0x330 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887d94a1&amp;gt;&amp;#93;&lt;/span&gt; lustre_swab_buf+0x81/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887b6d50&amp;gt;&amp;#93;&lt;/span&gt; ldlm_server_glimpse_ast+0x0/0x3b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887bc290&amp;gt;&amp;#93;&lt;/span&gt; ldlm_server_completion_ast+0x0/0x5e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88a9e610&amp;gt;&amp;#93;&lt;/span&gt; ost_blocking_ast+0x0/0x9b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887b9106&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue+0x1d6/0x1210 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887d7ff5&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_get_version+0x35/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887d7f05&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_get_opc+0x35/0xf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887d80b8&amp;gt;&amp;#93;&lt;/span&gt; lustre_msg_check_version_v2+0x8/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88aa64e3&amp;gt;&amp;#93;&lt;/span&gt; ost_handle+0x4ff3/0x55c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ost&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887e76d9&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x989/0xe00 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887e7e35&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_wait_event+0x2e5/0x310 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8008d299&amp;gt;&amp;#93;&lt;/span&gt; __wake_up_common+0x3e/0x68&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887e8dc6&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xf66/0x1120 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8005dfb1&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x11&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff887e7e60&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0x0/0x1120 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8005dfa7&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0x0/0x11&lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: &lt;br/&gt;
Feb 28 09:31:12 oss9 kernel: LustreError: dumping log to /tmp/lustre-log.1362061872.2708&lt;br/&gt;
Feb 28 09:32:25 oss9 kernel: Lustre: Service thread pid 2708 completed after 272.45s. This indicates the system was overloaded (too many service threads, or there were not enough hardware resources).&lt;br/&gt;
Feb 28 09:35:11 oss9 kernel: Lustre: 3146:0:(quota_interface.c:475:quota_chk_acq_common()) still haven&apos;t managed to acquire quota space from the quota master after 10 retries (err=0, rc=0)&lt;/p&gt;

&lt;p&gt;We are planning a downtime to gather more information. Are there any debugging flags we should use? ldlm, quota, rpctrace? I was also thinking of seeing if 1.8.9 might help, though I don&apos;t see any commits that really deal with this issue.&lt;/p&gt;</description>
                <environment></environment>
        <key id="17805">LU-2931</key>
            <summary>OST umount hangs for over 1 hour</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="cliffw">Cliff White</assignee>
                                    <reporter username="orentas">Oz Rentas</reporter>
                        <labels>
                    </labels>
                <created>Fri, 8 Mar 2013 12:50:31 +0000</created>
                <updated>Fri, 11 Jul 2014 20:43:22 +0000</updated>
                            <resolved>Fri, 11 Jul 2014 20:43:22 +0000</resolved>
                                    <version>Lustre 1.8.8</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="53628" author="cliffw" created="Fri, 8 Mar 2013 16:46:10 +0000"  >&lt;p&gt;Are you certain this is the first timeout? Are there any errors prior to the timeout? 1.8.9 might be a good idea.&lt;/p&gt;</comment>
                            <comment id="54812" author="kitwestneat" created="Tue, 26 Mar 2013 01:20:56 +0000"  >&lt;p&gt;Hi Cliff, sorry for not getting back to you sooner, I missed your response. It was the first that day, there were some the previous day. It&apos;s actually kind of a strange log. The previous failover on the 21st makes more sense. I&apos;ll attach both kern.log files though so you can check it out.&lt;/p&gt;</comment>
                            <comment id="83660" author="cliffw" created="Fri, 9 May 2014 19:29:44 +0000"  >&lt;p&gt;Have you upgraded to 1.8.9? &lt;/p&gt;</comment>
                            <comment id="88864" author="cliffw" created="Fri, 11 Jul 2014 19:28:37 +0000"  >&lt;p&gt;Do you have any updates on this situation?&lt;/p&gt;</comment>
                            <comment id="88876" author="orentas" created="Fri, 11 Jul 2014 20:15:03 +0000"  >&lt;p&gt;This is way old.  Opened by Kit.&lt;br/&gt;
The system has since been upgraded and is now running Lustre 2.4.&lt;br/&gt;
This is no longer a problem.  Please close this.&lt;/p&gt;</comment>
                            <comment id="88878" author="cliffw" created="Fri, 11 Jul 2014 20:43:18 +0000"  >&lt;p&gt;Great, will do&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="12426" name="oss9-2013-02-21" size="281198" author="kitwestneat" created="Tue, 26 Mar 2013 01:21:43 +0000"/>
                            <attachment id="12425" name="oss9-2013-02-28" size="233189" author="kitwestneat" created="Tue, 26 Mar 2013 01:21:43 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvkhj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7048</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>