<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:28:44 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16639] job_stat_exit() should not have any items</title>
                <link>https://jira.whamcloud.com/browse/LU-16639</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;It looks like a patch has somehow resulted in a leak of job_stats on the server that result in a message being printed at unmount time:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 1095.059674] Lustre: Failing over lustre-MDT0000
[ 1095.090215] LustreError: 31568:0:(lprocfs_jobstats.c:137:job_stat_exit()) should not have any items
[ 1095.093184] LustreError: 31568:0:(lprocfs_jobstats.c:137:job_stat_exit()) Skipped 20 previous similar messages
[ 1095.146954] Lustre: server umount lustre-MDT0000 complete
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This started around 2022-03-03  but I haven&apos;t isolated it to a specific patch yet. &lt;/p&gt;</description>
                <environment></environment>
        <key id="75038">LU-16639</key>
            <summary>job_stat_exit() should not have any items</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="adilger">Andreas Dilger</assignee>
                                    <reporter username="adilger">Andreas Dilger</reporter>
                        <labels>
                    </labels>
                <created>Mon, 13 Mar 2023 18:58:55 +0000</created>
                <updated>Fri, 10 Nov 2023 09:31:46 +0000</updated>
                            <resolved>Wed, 29 Mar 2023 03:31:31 +0000</resolved>
                                    <version>Lustre 2.16.0</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>3</watches>
                                                                            <comments>
                            <comment id="365810" author="adilger" created="Mon, 13 Mar 2023 22:00:17 +0000"  >&lt;p&gt;&lt;a href=&quot;https://50.222.100.39/kibana/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:&amp;#39;2022-02-01T15:00:00.000Z&amp;#39;,to:&amp;#39;2022-03-05T18:00:00.000Z&amp;#39;))&amp;amp;_a=(columns:!(test_set_name,sub_test_name,test_session_group,test_set_link),filters:!(),index:ffc3bad0-3e61-11ec-b0b9-8ba7a9c0d3d3,interval:auto,query:(language:kuery,query:&amp;#39;!!%20console_log%20~%20job_stat_exit%20and%20test_set_name:sanity%20and%20sub_test_name:test_232a&amp;#39;),sort:!(!(sub_test_date,desc)))&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;According to this Kibana search&lt;/a&gt;, it looks like the first reports of this &lt;tt&gt;job_stat_exit()&lt;/tt&gt; message are actually back on 2022-03-03.&lt;/p&gt;

&lt;p&gt;That was when 2.15.0 was released, and most patches have parent commit v2_15_0-RC2-2-g94f4e1f517 &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-15512&quot; title=&quot;Infinite loop in lnet_discover_peer_locked()&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-15512&quot;&gt;&lt;del&gt;LU-15512&lt;/del&gt;&lt;/a&gt; lnet: Stop discovery on deleted peer NI&lt;/tt&gt;&quot;, which was the only patch that had been landed in a couple of weeks.  However, there are a few patches in an LNet series &lt;a href=&quot;https://review.whamcloud.com/#/c/fs/lustre-release/+/46653/5&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/#/c/fs/lustre-release/+/46653/5&lt;/a&gt; that were run on 2022-03-03 that hit the problem, &lt;b&gt;BUT&lt;/b&gt; were based on an earlier parent commit, so this might indicate that the issue is at least partly related to the test environment and not the patches themselves.&lt;/p&gt;</comment>
                            <comment id="365812" author="gerrit" created="Mon, 13 Mar 2023 22:29:17 +0000"  >&lt;p&gt;&quot;Andreas Dilger &amp;lt;adilger@whamcloud.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50283&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50283&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16639&quot; title=&quot;job_stat_exit() should not have any items&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16639&quot;&gt;&lt;del&gt;LU-16639&lt;/del&gt;&lt;/a&gt; misc: improve console error messages&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: c50c6467958f4287fbcc3abec84e345b9cbf4c24&lt;/p&gt;</comment>
                            <comment id="365814" author="adilger" created="Mon, 13 Mar 2023 23:09:55 +0000"  >&lt;p&gt;I&apos;ve pushed the above patch which improves the &lt;tt&gt;job_stat_exit()&lt;/tt&gt; message to print the &lt;tt&gt;jn_name&lt;/tt&gt; field.&lt;/p&gt;

&lt;p&gt;In theory, the &lt;tt&gt;lprocfs_job_cleanup()&lt;/tt&gt; function should have freed all of the job stats before the &lt;tt&gt;cfs_hash_putref()-&amp;gt;cfs_hash_destroy()-&amp;gt;job_stat_exit()&lt;/tt&gt; path cleans up the remaining stats.  It looks like the stats are freed by calling &lt;tt&gt;cfs_hash_put_locked()-&amp;gt;job_free()&lt;/tt&gt;.&lt;/p&gt;

&lt;p&gt;I&apos;m not sure if it is related, but there was an autotest patch landed just before the problem appeared that could potentially be related:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;commit dbf96c013a17af99b9968f7c79e49f949c33e881
Author:     Charlie Olmstead &amp;lt;charlie@whamcloud.com&amp;gt;
CommitDate: Tue Mar 1 17:58:17 2022 +0000

    ATM-2317 - Monitor test framework process
    
    Added logic to monitor the status of the test framework by the presence of
    &amp;lt;shared_log_dir/run_test.sh.pid. The presence of that file signals to AT
    that the test framework is running. This allows AT to monitor the status of
    the framework without logging into the test controller every X seconds. The
    pid file is deleted when TestStep.runtestscript exits. because of the newly
    created wrapper. The wrapper has 2 functions:
    - Execute TestStep.runtestscript in a process group so it and all
      sub-processes can be killed at once by AT
    - Remove &amp;lt;shared_log_dir&amp;gt;/run_test.sh.pid file when TestStep.runtestscript
      exits
    
    In addition to the monitoring I&apos;ve ported over the dynamic execute/rexec
    logic from loadjenkinsbuild. It allows callers to dynamically build
    execution commands by changing the method name called: e.g. rexec_no_retry,
    rexec_no_retry_data, rexec_status_no_retry!, etc.
    
    Also:
    - Removed the unnecessary reboot_without_install? check in the provisioner.
      This is a relic of old-school provisioning
    - Improved selinux/firewall enabling/disabling logic
    - Paritioner: removed logic that runs &apos;ls /dev&apos; if pvcreate fails. I looked
      through all of the logs and it has not happened nor have we had any
      sessions that have failed with &apos;pvcreate failed, unable to continue&apos;
      since I can remember. The rest of the changes to this file are shortening
      of long lines.
    
    Change-Id: I5f960bf5210e90a37aa66cdac7a3d260632a5279
    Reviewed-on: https://review.whamcloud.com/46528
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; </comment>
                            <comment id="365849" author="bzzz" created="Tue, 14 Mar 2023 08:33:16 +0000"  >&lt;p&gt;I did bisection:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
COMMIT          TESTED  PASSED  FAILED          COMMIT DESCRIPTION
9cf4dddd52      1       0       1       BAD     LU-14831 osd-ldiskfs: uninited osd_inode_id is used
3a9176d5bb      1       0       1       BAD     LU-14174 lfs: llapi_mirror_find() signed &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt;
3a83078628      1       0       1       BAD     LU-14409 ldiskfs: Add support &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; SUSE 5.3.18-24.46.1
c83304607a      8       0       8       BAD     LU-12391 tests: mdsrate tests improvements
ea2cd3af7b      8       0       8       BAD     LU-11407 obdclass: add start time to stats files
9a5bace55a      10      10      0       GOOD    LU-15115 ptlrpc: recalc timer on EINPROGRESS reply
3038917f12      10      10      0       GOOD    LU-2084 lnet: don&apos;t retry allocating router buffers
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="365892" author="adilger" created="Tue, 14 Mar 2023 15:56:13 +0000"  >&lt;p&gt;Thanks Alex. It looks like I messed up the final job stats cleanup in &lt;tt&gt;lprocfs_job_cleanup()&lt;/tt&gt; in that patch. &lt;/p&gt;</comment>
                            <comment id="367647" author="gerrit" created="Tue, 28 Mar 2023 22:20:30 +0000"  >&lt;p&gt;&quot;Oleg Drokin &amp;lt;green@whamcloud.com&amp;gt;&quot; merged in patch &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/50283/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/50283/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16639&quot; title=&quot;job_stat_exit() should not have any items&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16639&quot;&gt;&lt;del&gt;LU-16639&lt;/del&gt;&lt;/a&gt; misc: cleanup concole messages&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 8f40a3d7110da1af8e310a4b7f40b86f13080938&lt;/p&gt;</comment>
                            <comment id="367676" author="pjones" created="Wed, 29 Mar 2023 03:31:31 +0000"  >&lt;p&gt;Landed for 2.16&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="72646">LU-16205</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03g6f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>