<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:09:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-694] Job Stats</title>
                <link>https://jira.whamcloud.com/browse/LU-694</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This feature is to collect filesystem operation stats for the jobs running on Lustre.&lt;/p&gt;

&lt;p&gt;When some job scheduler (SLURM, for instance) is running on the Lustre client, the Lustre client will pack the job id into each request (open, unlink, write...), and the server will collect that information and then expose it via procfs.&lt;/p&gt;</description>
                <environment></environment>
        <key id="11864">LU-694</key>
            <summary>Job Stats</summary>
                <type id="4" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11310&amp;avatarType=issuetype">Improvement</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="niu">Niu Yawei</reporter>
                        <labels>
                    </labels>
                <created>Wed, 21 Sep 2011 00:31:48 +0000</created>
                <updated>Sat, 22 Nov 2014 20:04:16 +0000</updated>
                            <resolved>Mon, 4 Jun 2012 06:07:32 +0000</resolved>
                                    <version>Lustre 2.3.0</version>
                                    <fixVersion>Lustre 2.3.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>13</watches>
                                                                            <comments>
                            <comment id="20369" author="niu" created="Wed, 21 Sep 2011 02:37:06 +0000"  >&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Enable/Disable Jobstats feature&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  The Jobstats is disabled by default, that can be verified by checking the /proc/fs/lustre/jobid_var on client,&lt;br/&gt;
  the &apos;jobid_var&apos; should be &apos;disable&apos; by default.&lt;/p&gt;

  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl get_param jobid_var
  jobid_var=disable
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  To enable the Jobstats, one can specify the &apos;jobid_var&apos; for a certain job scheduler. &lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Configure &apos;jobid_var&apos; for specified job scheduler&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  To enable Jobstats for certain job scheduler, the &apos;jobid_var&apos; should be configured as proper value:&lt;br/&gt;
  SLURM:       jobid_var=SLURM_JOB_ID&lt;br/&gt;
  SGE:         jobid_var=JOB_ID&lt;br/&gt;
  LSF:         jobid_var=LSB_JOBID&lt;br/&gt;
  Loadleveler: jobid_var=LOADL_STEP_ID&lt;br/&gt;
  PBS:         jobid_var=PBS_JOBID&lt;br/&gt;
  Maui/MOAB:   jobid_var=PBS_JOBID&lt;/p&gt;

&lt;p&gt;  For example, to enable Jobstats for SLURM on a fs named &apos;testfs&apos;:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl conf_param testfs.sys.jobid_var=SLURM_JOB_ID
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  Disable Jobstats on a fs named &apos;testfs&apos;:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl conf_param testfs.sys.jobid_var=disable
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  If there isn&apos;t any job scheduler running on the system, or the user just wants to collect the stats for process &amp;amp; uid:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl conf_param testfs.sys.jobid_var=procname_uid
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Check Job stats&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  The metadata operation stats is collected on MDT, and one can access it via &lt;tt&gt;lctl get_param mdt.&amp;#42;.job_stats&lt;/tt&gt;:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl get_param mdt.lustre-MDT0000.job_stats
job_stats:
- job_id:          bash.0
  snapshot_time:   1352084992
  open:            { samples:           2, unit:  reqs }
  close:           { samples:           2, unit:  reqs }
  mknod:           { samples:           0, unit:  reqs }
  link:            { samples:           0, unit:  reqs }
  unlink:          { samples:           0, unit:  reqs }
  mkdir:           { samples:           0, unit:  reqs }
  rmdir:           { samples:           0, unit:  reqs }
  rename:          { samples:           0, unit:  reqs }
  getattr:         { samples:           3, unit:  reqs }
  setattr:         { samples:           0, unit:  reqs }
  getxattr:        { samples:           0, unit:  reqs }
  setxattr:        { samples:           0, unit:  reqs }
  statfs:          { samples:           0, unit:  reqs }
  sync:            { samples:           0, unit:  reqs }
  samedir_rename:  { samples:           0, unit:  reqs }
  crossdir_rename: { samples:           0, unit:  reqs }
- job_id:          dd.0
  snapshot_time:   1352085037
  open:            { samples:           1, unit:  reqs }
  close:           { samples:           1, unit:  reqs }
  mknod:           { samples:           0, unit:  reqs }
  link:            { samples:           0, unit:  reqs }
  unlink:          { samples:           0, unit:  reqs }
  mkdir:           { samples:           0, unit:  reqs }
  rmdir:           { samples:           0, unit:  reqs }
  rename:          { samples:           0, unit:  reqs }
  getattr:         { samples:           0, unit:  reqs }
  setattr:         { samples:           0, unit:  reqs }
  getxattr:        { samples:           0, unit:  reqs }
  setxattr:        { samples:           0, unit:  reqs }
  statfs:          { samples:           0, unit:  reqs }
  sync:            { samples:           2, unit:  reqs }
  samedir_rename:  { samples:           0, unit:  reqs }
  crossdir_rename: { samples:           0, unit:  reqs }
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  The data operation stats is collected on OST, and one can check it via &lt;tt&gt;lctl get_param obdfilter.&amp;#42;.job_stats&lt;/tt&gt;:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl get_param obdfilter.lustre-OST0000.job_stats
job_stats:
- job_id:          bash.0
  snapshot_time:   1352085025
  read:            { samples:           0, unit: bytes, min:       0, max:       0, sum:               0 }
  write:           { samples:           1, unit: bytes, min:       4, max:       4, sum:               4 }
  setattr:         { samples:           0, unit:  reqs }
  punch:           { samples:           0, unit:  reqs }
  sync:            { samples:           0, unit:  reqs }
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Clear job stats for specified job (or all jobs)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  One can clear the job stats for a certain MDT or OST by writing the proc file &apos;job_stats&apos;.&lt;/p&gt;

&lt;p&gt;  Clear stats for all jobs on testfs-OST0001:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl set_param obdfilter.testfs-OST0001.job_stats=clear
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  Clear stats for job &quot;dd.0&quot; on lustre-MDT0000:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl set_param mdt.lustre-MDT0000.job_stats=dd.0
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Configure cleanup interval&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;  By default, if some job doesn&apos;t have any activities for 600 seconds, its stats will be cleared; this expiration value&lt;br/&gt;
  is tunable via mdt.&amp;#42;.job_cleanup_interval and obdfilter.&amp;#42;.job_cleanup_interval.&lt;/p&gt;

&lt;p&gt;  for instance, change the cleanup interval to just over an hour (4000) seconds for MDT:&lt;/p&gt;
  &lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;  lctl conf_param lustre.mdt.job_cleanup_interval=4000
  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;  The &apos;job_cleanup_interval&apos; can be set as 0 to disable the auto-cleanup. &lt;/p&gt;</comment>
                            <comment id="20370" author="niu" created="Wed, 21 Sep 2011 02:54:23 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/1397&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/1397&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="22798" author="niu" created="Thu, 10 Nov 2011 02:41:30 +0000"  >&lt;p&gt;follow-up patch which moves &apos;jobid_var&apos; to global:  &lt;a href=&quot;http://review.whamcloud.com/1683&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/1683&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="27629" author="ihara" created="Mon, 30 Jan 2012 18:46:16 +0000"  >&lt;p&gt;Hello, Niu&lt;br/&gt;
new patches for this feature will be landed for 2.2?&lt;/p&gt;</comment>
                            <comment id="27634" author="niu" created="Mon, 30 Jan 2012 20:41:45 +0000"  >&lt;p&gt;Hi, Ihara&lt;/p&gt;

&lt;p&gt;The patch will not be landed for 2.2, which version it should be landed for is not decided yet.&lt;/p&gt;</comment>
                            <comment id="35618" author="rhenwood" created="Fri, 27 Apr 2012 21:18:34 +0000"  >&lt;p&gt;I have been advised that a filesystem name may not uniquely identify a lustre filesystem.&lt;/p&gt;

&lt;p&gt;I am not sure what a better choice for the command you have above is, but some thought as to an alternative to fs name would be valuable.&lt;/p&gt;</comment>
                            <comment id="35626" author="niu" created="Sun, 29 Apr 2012 01:37:44 +0000"  >&lt;blockquote&gt;
&lt;p&gt;I have been advised that a filesystem name may not uniquely identify a lustre filesystem.&lt;/p&gt;

&lt;p&gt;I am not sure what a better choice for the command you have above is, but some thought as to an alternative to fs name would be valuable.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Hi, Richard, fs name should be unique on a single MGS namespace, and most &apos;lctl conf_param&apos; uses fsname to identify a filesystem. Do you suggest that we&apos;d set jobstats parameters per target server but not per fs? I&apos;m not sure if I followed your comment correctly?&lt;/p&gt;</comment>
                            <comment id="35898" author="nrutman" created="Mon, 30 Apr 2012 14:04:00 +0000"  >&lt;p&gt;The intent of the MGS was to provide config info for all the filesystems at a site, so the fs name is unique.  If multiple MGS&apos;s are being used, on different nodes, the filesystem name could overlap &amp;#8211; but you&apos;d have to be masochistic to use the same filesystem name for two different filesystems at a single site.  &lt;br/&gt;
Masochistic to the point where I would say this should be disallowed by any and all configuration management systems, and if you do it by hand anyhow, you reap the unpleasant rewards.&lt;/p&gt;
</comment>
                            <comment id="35903" author="morrone" created="Mon, 30 Apr 2012 16:43:44 +0000"  >&lt;p&gt;Nathan, it boggles my mind as well.  But I know for a fact that folks out there have done it, because they complained about LMT not being able to handle two filesystem having exactly the same name.  They seemed to think it was Livermore&apos;s responsibility to factor in additional information like IP addresses to uniquely identify filesystems with the same name.  I of course declined.&lt;/p&gt;

&lt;p&gt;But one has to sympathize with the users.  Configuring lustre is so horribly bad that something like the filesystem name is completely non-obvious to most people.  You set it once in some cryptic way, and it none too clear from that point forward how it is used at all.&lt;/p&gt;

&lt;p&gt;Which I suppose is a long winded way of agreeing that filesystem names really need to be unique, and bending over backwards to differentiate filesystems with the same name is a path to madness.  But we also need to promote the name to a first-class object that is used in a sane way in the command-line tools and throughout lustre.  We also need to clearly document filesystem name usage.&lt;/p&gt;</comment>
                            <comment id="35961" author="rhenwood" created="Wed, 2 May 2012 11:33:30 +0000"  >&lt;p&gt;Thanks for the input. I agree that fs names are useful, for example:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;In the cases where there is only one MGS at a site, you need fs_name to distinguish the fs.&lt;/li&gt;
	&lt;li&gt;In the cases where you have not mounted the fs, you can still identify the fs.&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;So, how about supporting: &lt;tt&gt;&amp;lt;mount point|fsname&amp;gt;&lt;/tt&gt;?&lt;/p&gt;

&lt;p&gt;If the mount point is not valid, then &lt;tt&gt;error&lt;/tt&gt;&lt;/p&gt;</comment>
                            <comment id="99856" author="adilger" created="Sat, 22 Nov 2014 20:04:16 +0000"  >&lt;p&gt;Just updated examples in this bug to be more clear, since it showed up in a Google search.  I prefer not to use &quot;lustre&quot; as the fsname in examples, since it is very non-obvious that this needs to be replaced with the actual fsname and is not a fixed part of the parameter being specified (like the &quot;sys.jobid_var&quot; part is).&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="16191">LU-2058</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv4rr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4306</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                                                                                </customfields>
    </item>
</channel>
</rss>