<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:03:26 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-71] metabench failures</title>
                <link>https://jira.whamcloud.com/browse/LU-71</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Originally the bugzilla bug was about two separate issues as I understand it, but now it mostly revolves around a hash collision issue.&lt;br/&gt;
FanYong has a patch in the bug that needs to be tested at Hyperion.&lt;/p&gt;</description>
                <environment></environment>
        <key id="10349">LU-71</key>
            <summary>metabench failures</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="green">Oleg Drokin</reporter>
                        <labels>
                    </labels>
                <created>Wed, 9 Feb 2011 12:39:05 +0000</created>
                <updated>Sat, 19 Mar 2011 07:39:40 +0000</updated>
                            <resolved>Sat, 19 Mar 2011 07:39:40 +0000</resolved>
                                    <version>Lustre 2.0.0</version>
                    <version>Lustre 2.1.0</version>
                                    <fixVersion>Lustre 2.1.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="10804" author="yong.fan" created="Tue, 1 Mar 2011 00:15:48 +0000"  >&lt;p&gt;The new patch needs to be verified on Hyperion before landed to lustre-2.1:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/#change,281&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,281&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="10934" author="cliffw" created="Mon, 7 Mar 2011 16:17:47 +0000"  >&lt;p&gt;Patch is under test on Hyperion - hyperion-sanity results:&lt;br/&gt;
Full System MIB&lt;/p&gt;

&lt;p&gt;000: Table of 824 tasks with up to 4097 system calls&lt;br/&gt;
000:           date            tasks  xfer  call  time      write       read&lt;br/&gt;
000:                                       limit limit       MB/s       MB/s&lt;br/&gt;
000: ------------------------ ------ ----- ----- ----- ---------- ----------&lt;br/&gt;
000: Fri Mar  4 14:23:24 2011    824 1024k  4096   300    4904.61    4463.70&lt;br/&gt;
000: Fri Mar  4 23:04:08 2011    824 1024k  4096   300    4794.67    4370.48&lt;br/&gt;
000: Sat Mar  5 07:45:25 2011    824 1024k  4096   300    4774.72    4364.73&lt;br/&gt;
000: Sat Mar  5 16:26:57 2011    824 1024k  4096   300    4677.37    4291.81&lt;br/&gt;
AVERAGE                                                   4787.84    4372.68&lt;/p&gt;

&lt;p&gt;000: Table of 824 tasks with up to 65 system calls&lt;br/&gt;
000:           date            tasks  xfer  call  time      write       read&lt;br/&gt;
000:                                       limit limit       MB/s       MB/s&lt;br/&gt;
000: ------------------------ ------ ----- ----- ----- ---------- ----------&lt;br/&gt;
000: Fri Mar  4 14:24:01 2011    824 1024k    64   300    3221.95   13247.59&lt;br/&gt;
000: Fri Mar  4 23:04:49 2011    824 1024k    64   300    2837.28   10886.22&lt;br/&gt;
000: Sat Mar  5 07:46:01 2011    824 1024k    64   300    3392.38   13772.67&lt;br/&gt;
000: Sat Mar  5 16:27:33 2011    824 1024k    64   300    3175.49   13043.68&lt;br/&gt;
AVERAGE                                                   3156.78   12737.54&lt;/p&gt;

&lt;p&gt;IOR &lt;br/&gt;
file-per-process&lt;br/&gt;
Write AVERAGE         3198.88 Mib/sec&lt;br/&gt;
Read AVERAGE         2643.08&lt;/p&gt;

&lt;p&gt;file-per-process independent&lt;br/&gt;
Write AVERAGE         3242.98&lt;br/&gt;
Read AVERAGE         2647.37&lt;/p&gt;

&lt;p&gt;single-shared-file&lt;br/&gt;
Write AVERAGE         1979.96&lt;br/&gt;
Read AVERAGE         4094.98&lt;/p&gt;

&lt;p&gt;single-shared-file independent&lt;br/&gt;
Write AVERAGE         1982.10&lt;br/&gt;
Read Average           4213.00&lt;/p&gt;

&lt;p&gt;These results are comparable to previous 2.1 runs&lt;br/&gt;
One metabench test failed due to a single client eviction&lt;/p&gt;

&lt;p&gt;Mar  6 00:00:13 ehyperion571 mrshd&lt;span class=&quot;error&quot;&gt;&amp;#91;19987&amp;#93;&lt;/span&gt;: root@ehyperion0 as root: cmd=&apos;rdistd -S&apos;&lt;br/&gt;
Mar  6 00:03:09 ehyperion571 logger: fixperms: done running at 03/06/11 00:03:09 ...&lt;br/&gt;
Mar  6 00:16:55 ehyperion571 LustreError: 20162:0:(dir.c:316:ll_get_dir_page()) dir page locate: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2000007d6:0xaf0:0x0&amp;#93;&lt;/span&gt; at 13476436484223492: rc&lt;br/&gt;
-5&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 LustreError: 11-0: an error occurred while communicating with 192.168.117.1@o2ib. The ldlm_enqueue operation failed w&lt;br/&gt;
ith -107&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 Lustre: lustre-MDT0000-mdc-ffff810210198400: Connection to service lustre-MDT0000 via nid 192.168.117.1@o2ib was lost&lt;br/&gt;
; in progress operations using this service will wait for recovery to complete.&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 LustreError: 167-0: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 LustreError: 20131:0:(mdc_locks.c:711:mdc_enqueue()) ldlm_cli_enqueue: -4&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 LustreError: 20164:0:(ldlm_resource.c:746:ldlm_resource_complain()) Namespace lustre-MDT0000-mdc-ffff810210198400 res&lt;br/&gt;
ource refcount nonzero (1) after lock cleanup; forcing cleanup.&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 LustreError: 20164:0:(ldlm_resource.c:752:ldlm_resource_complain()) Resource: ffff8101f6c9f0c0 (8589936606/39266/0/0)&lt;br/&gt;
 (rc: 1)&lt;br/&gt;
Mar  6 00:18:36 ehyperion571 Lustre: lustre-MDT0000-mdc-ffff810210198400: Connection restored to service lustre-MDT0000 using nid 192.168.117.1@o2&lt;br/&gt;
ib.&lt;/p&gt;</comment>
                            <comment id="10942" author="cliffw" created="Tue, 8 Mar 2011 00:31:31 +0000"  >&lt;p&gt;Okay the second pass of the test has also failed metabench, with the same failure - this may be a cause for concern.&lt;/p&gt;


&lt;p&gt;metabench -w /p/l_wham/white215/hyperion.14374/metabench -k -c 16384 -C -z&lt;/p&gt;

&lt;p&gt; 2097152    785.6744       2669.24        152      0.0464       3272.76&lt;/p&gt;

&lt;p&gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;03/07/2011 22:28:48&amp;#93;&lt;/span&gt; Leaving time_file_creation with proc_id = 823&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;03/07/2011 22:28:48&amp;#93;&lt;/span&gt; Entering par_create_multidir to create 2549 files in 1 dirs&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;03/07/2011 22:30:29&amp;#93;&lt;/span&gt; FATAL error on process 823&lt;br/&gt;
 Proc 823: Unable to stat file &lt;span class=&quot;error&quot;&gt;&amp;#91;/p/l_wham/white215/hyperion.14374/metabench/TIME_CREATE_823.003/cspFCaRlAs4Dx&amp;#93;&lt;/span&gt; [144131814699691100/8881629&lt;br/&gt;
303163026]: Interrupted system call&lt;br/&gt;
srun: error: hyperion571: task 823: Exited with exit code 255&lt;br/&gt;
srun: First task exited 1800s ago&lt;/p&gt;

&lt;p&gt;Client errors: &lt;br/&gt;
Mar  7 22:28:49 ehyperion571 LustreError: 1303:0:(dir.c:316:ll_get_dir_page()) dir page locate: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2000efff0:0x1918:0x0&amp;#93;&lt;/span&gt; at 8879820204541995: rc -5&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 11-0: an error occurred while communicating with 192.168.117.1@o2ib. The ldlm_enqueue operation failed with -107&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 Lustre: lustre-MDT0000-mdc-ffff810210198400: Connection to service lustre-MDT0000 via nid 192.168.117.1@o2ib was lost; in progress opera&lt;br/&gt;
tions using this service will wait for recovery to complete.&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 167-0: This client was evicted by lustre-MDT0000; in progress operations using this service will fail.&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 1270:0:(mdc_locks.c:711:mdc_enqueue()) ldlm_cli_enqueue: -4&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 1306:0:(ldlm_resource.c:746:ldlm_resource_complain()) Namespace lustre-MDT0000-mdc-ffff810210198400 resource refcount nonze&lt;br/&gt;
ro (1) after lock cleanup; forcing cleanup.&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 1306:0:(ldlm_resource.c:746:ldlm_resource_complain()) Skipped 1 previous similar message&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 1306:0:(ldlm_resource.c:752:ldlm_resource_complain()) Resource: ffff81014b484a80 (8590920616/109475/0/0) (rc: 1)&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 LustreError: 1306:0:(ldlm_resource.c:752:ldlm_resource_complain()) Skipped 1 previous similar message&lt;br/&gt;
Mar  7 22:30:29 ehyperion571 Lustre: lustre-MDT0000-mdc-ffff810210198400: Connection restored to service lustre-MDT0000 using nid 192.168.117.1@o2ib.&lt;br/&gt;
Mar  7 23:00:13 ehyperion571 mrshd&lt;span class=&quot;error&quot;&gt;&amp;#91;1323&amp;#93;&lt;/span&gt;: root@ehyperion0 as root: cmd=&apos;rdistd -S&apos;&lt;br/&gt;
Mar  7 23:00:32 ehyperion571 logger: fixperms: done running at 03/07/11 23:00:32 ...&lt;br/&gt;
Mar  8 00:00:13 ehyperion571 mrshd&lt;span class=&quot;error&quot;&gt;&amp;#91;1487&amp;#93;&lt;/span&gt;: root@ehyperion0 as root: cmd=&apos;rdistd -S&apos;&lt;/p&gt;

&lt;p&gt;MDS hyperion720&lt;br/&gt;
Mar  7 22:30:29 ehyperion720 LustreError: 0:0:(ldlm_lockd.c:348:waiting_locks_callback()) ### lock callback timer expired after 100s: evicting client at 192.168.115.&lt;br/&gt;
142@o2ib  ns: mdt-ffff81012b222000 lock: ffff81017773e480/0x697433fb7bc82718 lrc: 3/0,0 mode: PR/PR res: 8590920616/109475 bits 0x3 rrc: 2 type: IBT flags: 0x4000020&lt;br/&gt;
 remote: 0xf00e0d9dee2b9e00 expref: 10 pid: 10361 timeout: 4586658929&lt;br/&gt;
Mar  7 22:30:29 ehyperion720 LustreError: 10349:0:(mdt_handler.c:2806:mdt_recovery()) operation 101 on unconnected MDS from 12345-192.168.115.142@o2ib&lt;br/&gt;
Mar  7 22:30:29 ehyperion720 LustreError: 10349:0:(ldlm_lib.c:2118:target_send_reply_msg()) @@@ processing error (-107)  req@ffff81005747ec00 x1362388536975602/t0(0)&lt;br/&gt;
 o-1-&amp;gt;&amp;lt;?&amp;gt;@&amp;lt;?&amp;gt;:0/0 lens 552/0 e 0 to 0 dl 1299565959 ref 1 fl Interpret:/ffffffff/ffffffff rc -107/-1&lt;br/&gt;
Mar  7 22:30:29 ehyperion720 Lustre: 10349:0:(ldlm_lib.c:871:target_handle_connect()) lustre-MDT0000: connection from 8a9522b4-1c87-2ee8-a571-ded5c0d66ba3@192.168.11&lt;br/&gt;
5.142@o2ib t4922514102 exp 0000000000000000 cur 1299565829 last 0&lt;br/&gt;
Mar  7 22:30:29 ehyperion720 Lustre: 10349:0:(sec.c:1474:sptlrpc_import_sec_adapt()) import lustre-MDT0000-&amp;gt;NET_0x50000c0a8738e_UUID netid 50000: select flavor null&lt;/p&gt;

&lt;p&gt;No errors on any OSTs.&lt;/p&gt;</comment>
                            <comment id="10943" author="yong.fan" created="Tue, 8 Mar 2011 01:56:31 +0000"  >&lt;p&gt;Thanks Cliff. From the test result we can say that the patch works as we expected. The failure case you attached is for test without my patch, right?&lt;/p&gt;

&lt;p&gt;&amp;gt; Client errors: &lt;br/&gt;
&amp;gt; Mar 7 22:28:49 ehyperion571 LustreError: 1303:0:(dir.c:316:ll_get_dir_page()) dir page locate: &lt;span class=&quot;error&quot;&gt;&amp;#91;0x2000efff0:0x1918:0x0&amp;#93;&lt;/span&gt; at 8879820204541995: rc -5&lt;/p&gt;

&lt;p&gt;It is corresponding to original Lustre code without my patch.&lt;/p&gt;</comment>
                            <comment id="10953" author="cliffw" created="Tue, 8 Mar 2011 09:59:20 +0000"  >&lt;p&gt;no, the failure case I attached is with your patch&lt;br/&gt;
cliffw&lt;/p&gt;





&lt;p&gt;&amp;#8211; &lt;br/&gt;
cliffw&lt;br/&gt;
Support Guy&lt;br/&gt;
WhamCloud, Inc.&lt;br/&gt;
www.whamcloud.com&lt;/p&gt;</comment>
                            <comment id="10954" author="cliffw" created="Tue, 8 Mar 2011 10:01:20 +0000"  >&lt;p&gt;I took the RPMs from build 347&lt;br/&gt;
lustre-2.0.59-2.6.18_194.17.1.el5_lustre.g94d9119_gd5e659b.x86_64.rpm&lt;br/&gt;
cliffw&lt;/p&gt;





&lt;p&gt;&amp;#8211; &lt;br/&gt;
cliffw&lt;br/&gt;
Support Guy&lt;br/&gt;
WhamCloud, Inc.&lt;br/&gt;
www.whamcloud.com&lt;/p&gt;</comment>
                            <comment id="10970" author="yong.fan" created="Tue, 8 Mar 2011 19:46:41 +0000"  >&lt;p&gt;Very strange, according to the error message, the line corresponding to &quot;(dir.c:316:ll_get_dir_page()) dir page locate:&quot; is that:&lt;/p&gt;

&lt;p&gt;        page = ll_dir_page_locate(dir, &amp;amp;lhash, &amp;amp;start, &amp;amp;end);&lt;br/&gt;
        if (IS_ERR(page)) &lt;/p&gt;
{
                CERROR(&quot;dir page locate: &quot;DFID&quot; at &quot;LPU64&quot;: rc %ld\n&quot;,
===&amp;gt;                       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
                GOTO(out_unlock, page);
        }

&lt;p&gt;Such code section is just line 316 for original master without patch. For patched master, it is line 340. I have checked the source code for build 347 you used.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;http://build.whamcloud.com/job/reviews-centos5/347/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/reviews-centos5/347/&lt;/a&gt;&lt;br/&gt;
lustre-source-2.0.59-2.6.18_194.17.1.el5_lustre.g94d9119_gd5e659b.x86_64.rpm&lt;/p&gt;

&lt;p&gt;So would you please to login ehyperion571 to check the kernel version for further confirm. Thanks!&lt;/p&gt;</comment>
                            <comment id="10998" author="cliffw" created="Thu, 10 Mar 2011 11:22:06 +0000"  >&lt;p&gt;There was a mistake in the kernel build. &lt;br/&gt;
Re-tested with the correct kernel, metabench passed 14 runs.&lt;/p&gt;


&lt;p&gt;  NERSC Time:    4735.13 &lt;br/&gt;
  NERSC Time:    4868.54 &lt;br/&gt;
  NERSC Time:    4920.77 &lt;br/&gt;
  NERSC Time:    4940.35 &lt;br/&gt;
  NERSC Time:    4934.38 &lt;br/&gt;
  NERSC Time:    4934.63 &lt;br/&gt;
  NERSC Time:    5030.45 &lt;br/&gt;
  NERSC Time:    5154.06 &lt;br/&gt;
  NERSC Time:    5052.45 &lt;br/&gt;
 NERSC Time:    5125.95 &lt;br/&gt;
  NERSC Time:    5095.29 &lt;br/&gt;
  NERSC Time:    5115.10 &lt;br/&gt;
  NERSC Time:    5138.36 &lt;br/&gt;
  NERSC Time:    5171.40 &lt;/p&gt;</comment>
                            <comment id="11012" author="yong.fan" created="Thu, 10 Mar 2011 20:00:24 +0000"  >&lt;p&gt;Thanks Cliff, this bug has blocked us for a long time. It is really helpful.&lt;/p&gt;</comment>
                            <comment id="11247" author="yong.fan" created="Sat, 19 Mar 2011 07:39:40 +0000"  >&lt;p&gt;patch has been merged into lustre-2.1 candidate.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                    <customfield id="customfield_10020" key="com.atlassian.jira.plugin.system.customfieldtypes:float">
                        <customfieldname>Bugzilla ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>20581.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv4en:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4247</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>