<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:16:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1368] lctl abort_recovery deadlocked</title>
                <link>https://jira.whamcloud.com/browse/LU-1368</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Recovery was aborted on an OST because it was taking too long (possibly due to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1352&quot; title=&quot;spurious recovery timer resets&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1352&quot;&gt;&lt;del&gt;LU-1352&lt;/del&gt;&lt;/a&gt;).  Recovery never completed for one OST.  The &apos;lctl abort_recovery --device=3&apos; process was hung with the following backtrace.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 25072  TASK: ffff880311ab2080  CPU: 9   COMMAND: &quot;lctl&quot;
 #0 [ffff880311b5fae8] schedule at ffffffff814eeee0
 #1 [ffff880311b5fbb0] schedule_timeout at ffffffff814efd95
 #2 [ffff880311b5fc60] wait_for_common at ffffffff814efa13
 #3 [ffff880311b5fcf0] wait_for_completion at ffffffff814efb2d
 #4 [ffff880311b5fd00] target_stop_recovery_thread at ffffffffa063b360 [ptlrpc]
 #5 [ffff880311b5fd20] filter_iocontrol at ffffffffa0b07ceb [obdfilter]
 #6 [ffff880311b5fd90] class_handle_ioctl at ffffffffa0509c37 [obdclass]
 #7 [ffff880311b5fe40] obd_class_ioctl at ffffffffa04fa21b [obdclass]
 #8 [ffff880311b5fe60] vfs_ioctl at ffffffff8118ab72
 #9 [ffff880311b5fea0] do_vfs_ioctl at ffffffff8118ad14
#10 [ffff880311b5ff30] sys_ioctl at ffffffff8118b291
#11 [ffff880311b5ff80] system_call_fastpath at ffffffff8100b0f2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also, tgt_recov  backtrace:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;PID: 23416  TASK: ffff88033603b500  CPU: 15  COMMAND: &quot;tgt_recov&quot;
 #0 [ffff880310bd38f0] schedule at ffffffff814eeee0
 #1 [ffff880310bd39b8] schedule_timeout at ffffffff814efd12
 #2 [ffff880310bd3a68] cfs_waitq_timedwait at ffffffffa0422521 [libcfs]
 #3 [ffff880310bd3a78] target_bulk_io at ffffffffa0641ea0 [ptlrpc]
 #4 [ffff880310bd3b48] ost_brw_write at ffffffffa0abc21b [ost]
 #5 [ffff880310bd3cb8] ost_handle at ffffffffa0abf0e8 [ost]
 #6 [ffff880310bd3de8] handle_recovery_req at ffffffffa063bcac [ptlrpc]
 #7 [ffff880310bd3e28] target_recovery_thread at ffffffffa063c0b8 [ptlrpc]
 #8 [ffff880310bd3f48] kernel_thread at ffffffff8100c14a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also attaching complete &apos;foreach bt&apos; output from crash.&lt;/p&gt;

&lt;p&gt;LLNL-bugzilla-ID: 1607&lt;/p&gt;</description>
                <environment>&lt;a href=&quot;https://github.com/chaos/lustre/commits/2.1.1-4chaos&quot;&gt;https://github.com/chaos/lustre/commits/2.1.1-4chaos&lt;/a&gt;</environment>
        <key id="14254">LU-1368</key>
            <summary>lctl abort_recovery deadlocked</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Thu, 3 May 2012 17:38:15 +0000</created>
                <updated>Tue, 12 Dec 2017 19:06:19 +0000</updated>
                            <resolved>Tue, 12 Dec 2017 19:06:19 +0000</resolved>
                                    <version>Lustre 2.1.1</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="38120" author="pjones" created="Thu, 3 May 2012 18:12:19 +0000"  >&lt;p&gt;Hi Hongchao&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="38121" author="nedbass" created="Thu, 3 May 2012 18:21:58 +0000"  >&lt;p&gt;From the affected server&apos;s logs, note the negative recovery time.  &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2012-05-03 13:24:54 Lustre: lsc-OST0132: Denying connection for new client 192.168.117.130@o2ib10 (at 9bbecaa5-b07c-320a-ef31-4620f1b26f3c), waiting for 7 clients in recovery for -12:-06
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This seems to support the comments in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1352&quot; title=&quot;spurious recovery timer resets&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1352&quot;&gt;&lt;del&gt;LU-1352&lt;/del&gt;&lt;/a&gt;.  Time here is calculated as&lt;/p&gt;

&lt;p&gt;t = cfs_timer_deadline(&amp;amp;target-&amp;gt;obd_recovery_timer);&lt;br/&gt;
t = cfs_time_sub(t, cfs_time_current());&lt;/p&gt;

&lt;p&gt;So we must have cfs_time_current &amp;gt; target-&amp;gt;obd_recovery_timer.&lt;/p&gt;</comment>
                            <comment id="38157" author="hongchao.zhang" created="Fri, 4 May 2012 11:29:34 +0000"  >&lt;p&gt;during recovery, the request will be kept forever if the server can&apos;t complete it,&lt;br/&gt;
in ptlrpc_at_send_early_reply,&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;...
  if (req-&amp;gt;rq_export &amp;amp;&amp;amp;
      lustre_msg_get_flags(req-&amp;gt;rq_reqmsg) &amp;amp;
      (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
          /* During recovery, we don&apos;t want to send too many early
           * replies, but on the other hand we want to make sure the
           * client has enough time to resend if the rpc is lost. So
           * during the recovery period send at least 4 early replies,
           * spacing them every at_extra if we can. at_estimate should
           * always equal this fixed value during recovery. */
          at_measured(&amp;amp;svc-&amp;gt;srv_at_estimate, min(at_extra,
                      req-&amp;gt;rq_export-&amp;gt;exp_obd-&amp;gt;obd_recovery_timeout / 4));
  } else {
        ...
  }
  newdl = cfs_time_current_sec() + at_get(&amp;amp;svc-&amp;gt;srv_at_estimate);        &amp;lt;--- here, the new deadline is set.
  ...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and in target_bulk_io, the server waits on the request until the request&apos;s deadline is reached!&lt;/p&gt;

&lt;p&gt;_&lt;/p&gt;</comment>
                            <comment id="38317" author="green" created="Tue, 8 May 2012 13:13:02 +0000"  >&lt;p&gt;hm, negative recovery time.&lt;br/&gt;
I saw a couple bugs like this.&lt;br/&gt;
might be a dup of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="38507" author="hongchao.zhang" created="Thu, 10 May 2012 07:17:25 +0000"  >&lt;p&gt;the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1352&quot; title=&quot;spurious recovery timer resets&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1352&quot;&gt;&lt;del&gt;LU-1352&lt;/del&gt;&lt;/a&gt; should be caused by negative recovery time, and this one is related to it but it could contain a new issue,&lt;br/&gt;
will attach a possible patch for it.&lt;/p&gt;</comment>
                            <comment id="38627" author="hongchao.zhang" created="Fri, 11 May 2012 10:29:38 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#change,2720&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,2720&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="39995" author="tappro" created="Tue, 5 Jun 2012 02:41:30 +0000"  >&lt;p&gt;This looks like just another &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; case. As for the replay request issue, I don&apos;t see an issue here: why is the request waiting, and what is it waiting for? ptlrpc_at_send_early_reply() is only about sending an early reply; it doesn&apos;t affect request processing time on the server. Can you explain a bit more what problem you see here?&lt;/p&gt;</comment>
                            <comment id="40724" author="hongchao.zhang" created="Mon, 18 Jun 2012 02:37:54 +0000"  >&lt;p&gt;The main problem here is that if LNetPut or LNetGet takes a very long time (longer than srv_at_estimate) to transfer data,&lt;br/&gt;
the client can still see the AT reply and continues to wait instead of failing the request.&lt;/p&gt;</comment>
                            <comment id="88196" author="hongchao.zhang" created="Fri, 4 Jul 2014 15:06:06 +0000"  >&lt;p&gt;Hi, Mikhail&lt;/p&gt;

&lt;p&gt;Is the patch still needed, and to be updated against master? Thanks! &lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11286" name="foreach_bt.txt" size="530072" author="nedbass" created="Thu, 3 May 2012 17:38:15 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 4 Jul 2014 17:38:15 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvycn:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9749</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Thu, 3 May 2012 17:38:15 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>