<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:10:27 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-14518] allow slow request processing to be removed from health check</title>
                <link>https://jira.whamcloud.com/browse/LU-14518</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;If a request is not being processed in a timely manner, it will mark the service unhealthy, which can lead to STONITH.  However, in some cases when the server is very heavily loaded, requests may take longer than &lt;tt&gt;at_max&lt;/tt&gt; to be processed and this shouldn&apos;t cause the server to be killed, since that will slow down request processing even further and put extra load on the backup server(s), slowing down their processing and possibly causing them to fail in a similar manner.&lt;/p&gt;</description>
                <environment></environment>
        <key id="63331">LU-14518</key>
            <summary>allow slow request processing to be removed from health check</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="adilger">Andreas Dilger</assignee>
                                    <reporter username="adilger">Andreas Dilger</reporter>
                        <labels>
                    </labels>
                <created>Fri, 12 Mar 2021 22:27:29 +0000</created>
                <updated>Tue, 23 Jan 2024 01:14:27 +0000</updated>
                                            <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="361160" author="tappro" created="Wed, 1 Feb 2023 07:07:24 +0000"  >&lt;p&gt;I am duplicating my review comment here: we could consider the history of timeouts, e.g. &lt;tt&gt;ost.OSS.ost_io.timeouts&lt;/tt&gt;, to decide on service health, taking into account the number of timeouts per time unit.&lt;/p&gt;</comment>
                            <comment id="361207" author="tappro" created="Wed, 1 Feb 2023 14:58:05 +0000"  >&lt;p&gt;Another note about health checking based on services: it stops the whole node if it fails, so all servers will fail over, which looks like an overreaction in some cases, e.g. when a thread is stuck waiting for some event such as the recovery of another server, or even worse, when storage becomes overloaded for a moment, in which case moving the server to another node will not help, since the storage remains the same; that would cause constant ping-pong failovers. The only case when that is helpful is a deadlock situation or any other case when the node requires a reboot. Detecting that by a single thread timeout is too aggressive.&lt;/p&gt;</comment>
                            <comment id="394116" author="gerrit" created="Fri, 24 Nov 2023 07:19:48 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53225&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53225&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14518&quot; title=&quot;allow slow request processing to be removed from health check&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14518&quot;&gt;LU-14518&lt;/a&gt; ptlrpc: WIP avoid server STONITH for slow requests&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 515100fb45dda4587c9d60b73a44258c7aba5bdc&lt;/p&gt;</comment>
                            <comment id="396757" author="gerrit" created="Thu, 14 Dec 2023 03:03:11 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53451&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53451&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14518&quot; title=&quot;allow slow request processing to be removed from health check&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14518&quot;&gt;LU-14518&lt;/a&gt; libcfs: print CFS_FAIL_CHECK() location&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 3f1231a9deb9cde5f2b81d8899d80359ff73dbe7&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i01pdz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>