<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:04:19 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-157] metabench failed on parallel-scale test</title>
                <link>https://jira.whamcloud.com/browse/LU-157</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;The metabench test failed on a Lustre client; the failure is reproducible.&lt;/p&gt;

&lt;p&gt;test log&lt;br/&gt;
-----------&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;03/23/2011 23:15:53&amp;#93;&lt;/span&gt; Leaving time_file_creation with proc_id = 11&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;03/23/2011 23:15:53&amp;#93;&lt;/span&gt; Entering par_create_multidir to create 910 files in 1 dirs&lt;br/&gt;
Removed 10000 files in      8.325 seconds&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;client-5.lab.whamcloud.com:6909&amp;#93;&lt;/span&gt; *** An error occurred in MPI_Gather&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;client-5.lab.whamcloud.com:6909&amp;#93;&lt;/span&gt; *** on communicator MPI COMMUNICATOR 14 CREATE FROM 0&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;client-5.lab.whamcloud.com:6909&amp;#93;&lt;/span&gt; *** MPI_ERR_TRUNCATE: message truncated&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;client-5.lab.whamcloud.com:6909&amp;#93;&lt;/span&gt; *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)&lt;br/&gt;
--------------------------------------------------------------------------&lt;br/&gt;
mpirun has exited due to process rank 0 with PID 6909 on&lt;br/&gt;
node client-5.lab.whamcloud.com exiting without calling &quot;finalize&quot;. This may&lt;br/&gt;
have caused other processes in the application to be&lt;br/&gt;
terminated by signals sent by mpirun (as reported here).&lt;br/&gt;
--------------------------------------------------------------------------&lt;/p&gt;</description>
                <environment>separate MDS and OSS, 3 clients</environment>
        <key id="10493">LU-157</key>
            <summary>metabench failed on parallel-scale test</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="mjmac">Michael MacDonald</assignee>
                                    <reporter username="sarah">Sarah Liu</reporter>
                        <labels>
                    </labels>
                <created>Thu, 24 Mar 2011 12:55:14 +0000</created>
                <updated>Fri, 20 May 2011 16:14:00 +0000</updated>
                            <resolved>Fri, 20 May 2011 16:14:00 +0000</resolved>
                                    <version>Lustre 2.1.0</version>
                    <version>Lustre 1.8.6</version>
                                    <fixVersion>Lustre 2.1.0</fixVersion>
                    <fixVersion>Lustre 1.8.6</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="11366" author="pjones" created="Fri, 25 Mar 2011 05:22:56 +0000"  >&lt;p&gt;Ah sorry Yu Jian. This was the ticket that I meant to assign to you when I assigned LU158 to you. Could you please see what you can uncover about the failure? Thanks.&lt;/p&gt;</comment>
                            <comment id="11374" author="pjones" created="Fri, 25 Mar 2011 07:15:49 +0000"  >&lt;p&gt;Apparently Fan Yong is working on this one&lt;/p&gt;</comment>
                            <comment id="11395" author="sarah" created="Fri, 25 Mar 2011 11:28:37 +0000"  >&lt;p&gt;it seems this is a duplicate issue with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-142&quot; title=&quot;system hang when running replay-single or replay-dual with three clients&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-142&quot;&gt;&lt;del&gt;LU-142&lt;/del&gt;&lt;/a&gt;, so close it.&lt;/p&gt;</comment>
                            <comment id="11396" author="sarah" created="Fri, 25 Mar 2011 13:06:09 +0000"  >&lt;p&gt;miss closing, reopen.this is not a duplicated with 161/142&lt;/p&gt;</comment>
                            <comment id="11405" author="pjones" created="Sat, 26 Mar 2011 04:52:59 +0000"  >&lt;p&gt;Fan Yong&lt;/p&gt;

&lt;p&gt;Are you able to look into this one or should I reassign it?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="11409" author="yong.fan" created="Sun, 27 Mar 2011 03:18:45 +0000"  >&lt;p&gt;I will investigate it.&lt;/p&gt;</comment>
                            <comment id="11537" author="yong.fan" created="Tue, 29 Mar 2011 09:04:23 +0000"  >&lt;p&gt;After some painful debug (because of no familiar with MPI), I found some useful clue eventually. It is the incompatibility between openmpi and metabench installed on Toro nodes caused the failure. I compiled metabench from source code (tiny fix metabench code because of unknown parameter from MPI lib when start, not sure why) with openmpi-devel installed, then metabench test run successfully. So it is nothing related with Lustre. We need new MPI lib or metabench when deploying test nodes on Toro, but it is out of my control.&lt;/p&gt;

&lt;p&gt;Thanks yujian for the help of building test environment.&lt;/p&gt;</comment>
                            <comment id="11541" author="pjones" created="Tue, 29 Mar 2011 09:22:40 +0000"  >&lt;p&gt;thanks Fan Yong. Let&apos;s reassign this ticket to mjmac to sort out the Toro config&lt;/p&gt;</comment>
                            <comment id="11543" author="mjmac" created="Tue, 29 Mar 2011 09:25:07 +0000"  >&lt;p&gt;fanyong, can you please provide more detail as to what the problem was, and how you fixed it?  I would like to update the toolkit build so that future test installs work correctly.&lt;/p&gt;</comment>
                            <comment id="11559" author="yong.fan" created="Tue, 29 Mar 2011 20:18:46 +0000"  >&lt;p&gt;Currently, the openmpi installed on Toro nodes is openmpi-1.4-4.el5, the metabench is metabench-1.0-1.wc1, there are some incompatibility between them. I do not know the detailed reason, but you can try as following:&lt;/p&gt;

&lt;p&gt;1) install openmpi-devel on your test node&lt;br/&gt;
2) compile metabench from source code.&lt;br/&gt;
3) run parallel-scale with new metabench, it will report &quot;Invalid Arg ?&quot;&lt;br/&gt;
4) fix the metabench.c to ignore such unknown parameter &quot;?&quot;, and recompile&lt;br/&gt;
5) then run parallel-scale again, it can pass.&lt;/p&gt;

&lt;p&gt;I have put the workable metabench under /tmp/metabench on Brent node, which can run on 2.6.18-194.17.1.el5. I am not sure how to fix it easily, maybe use &quot;MPICH&quot; or fix metabench.&lt;/p&gt;</comment>
                            <comment id="13303" author="yujian" created="Mon, 25 Apr 2011 18:44:55 +0000"  >&lt;p&gt;Branch: b1_8 (Revision: c5c2986be490b2fbceb4b38d6c983d279f4bbcf8)&lt;br/&gt;
Distro/Arch: RHEL6/x86_64 (patchless client), RHEL5/x86_64 (server)&lt;br/&gt;
Network: tcp&lt;/p&gt;

&lt;p&gt;&amp;#35; rpm -qf /usr/lib64/openmpi/bin/mpirun&lt;br/&gt;
openmpi-1.4.1-4.3.el6.x86_64&lt;/p&gt;

&lt;p&gt;&amp;#35; rpm -qf /usr/bin/metabench&lt;br/&gt;
metabench-1.0-1.wc1.x86_64&lt;/p&gt;

&lt;p&gt;The same failure occurred while running the metabench test:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[client-13:9663] *** An error occurred in MPI_Gather
[client-13:9663] *** on communicator MPI COMMUNICATOR 10 CREATE FROM 0
[client-13:9663] *** MPI_ERR_TRUNCATE: message truncated
[client-13:9663] *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
--------------------------------------------------------------------------
mpirun has exited due to process rank 0 with PID 9663 on
node client-13 exiting without calling &quot;finalize&quot;. This may
have caused other processes in the application to be
terminated by signals sent by mpirun (as reported here).
--------------------------------------------------------------------------
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/4673d0c8-6cb3-11e0-b32b-52540025f9af&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/4673d0c8-6cb3-11e0-b32b-52540025f9af&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="14331" author="mjmac" created="Fri, 13 May 2011 09:53:58 +0000"  >&lt;p&gt;The real problem is that the receive buffer was defined as MPI_INT but the send buffer was defined as MPI_UNSIGNED_LONG.  When compiled with gcc on x86_64, longs (8 bytes) don&apos;t fit into ints (4 bytes), hence the MPI_ERR_TRUNCATE error.&lt;/p&gt;

&lt;p&gt;I&apos;ve committed a small patch which corrects this, and I&apos;m waiting for RPMs to build across all platforms.  I&apos;ve already verified this on EL6/x86_64; please resolve the ticket when other platforms are verified in the normal course of testing.  I&apos;m confident that this issue is fixed, though, as it was a simple problem with a simple solution, once I understood the problem!&lt;/p&gt;

&lt;p&gt;For reference, here is an excerpt from the patch:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;     MPI_SAFE(MPI_Gather(&amp;amp;count,1,MPI_UNSIGNED_LONG,
-           count_buf,1,MPI_INT,proc0,*my_comm));
+           count_buf,1,MPI_UNSIGNED_LONG,proc0,*my_comm));
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ve applied this fix to all MPI_Gather instances with mismatched send/receive datatypes.&lt;/p&gt;</comment>
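                            <!--
                                Illustrative sketch only, not from the ticket or the metabench source: a minimal,
                                self-contained C program using MPI_Gather with matching MPI_UNSIGNED_LONG send and
                                receive datatypes, i.e. the pattern the patch excerpt above applies. The per-rank
                                "count" value and any file/program names here are assumptions for illustration.

                                #include <mpi.h>
                                #include <stdio.h>
                                #include <stdlib.h>

                                int main(int argc, char **argv)
                                {
                                    int rank = 0, size = 0;

                                    MPI_Init(&argc, &argv);
                                    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
                                    MPI_Comm_size(MPI_COMM_WORLD, &size);

                                    /* Hypothetical per-rank value standing in for metabench's file count. */
                                    unsigned long count = (unsigned long)rank * 1000UL;
                                    unsigned long *count_buf = NULL;

                                    if (rank == 0)
                                        count_buf = malloc((size_t)size * sizeof(unsigned long));

                                    /* Send and receive datatypes must match; gathering MPI_UNSIGNED_LONG into an
                                     * MPI_INT buffer truncates 8-byte longs into 4-byte slots on x86_64, which is
                                     * what produced the MPI_ERR_TRUNCATE abort reported in this ticket. */
                                    MPI_Gather(&count, 1, MPI_UNSIGNED_LONG,
                                               count_buf, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);

                                    if (rank == 0) {
                                        for (int i = 0; i < size; i++)
                                            printf("rank %d reported %lu\n", i, count_buf[i]);
                                        free(count_buf);
                                    }

                                    MPI_Finalize();
                                    return 0;
                                }

                                Build and run with, e.g., "mpicc gather_counts.c -o gather_counts" and
                                "mpirun -np 4 ./gather_counts" (gather_counts.c is a hypothetical name).
                            -->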
                            <comment id="14381" author="sarah" created="Sun, 15 May 2011 22:48:27 +0000"  >&lt;p&gt;Hi Mike, I ran cascading_rw on build lustre-master/rhel6-x86_64/#118, it continued running for almost two days and hasn&apos;t finished yet. Does this build contain the latest openmpi? &lt;/p&gt;</comment>
                            <comment id="14644" author="sarah" created="Wed, 18 May 2011 23:39:41 +0000"  >&lt;p&gt;I verified this bug on the latest last-master for RHEL5-x86_84, metabench passes on NFS3 but failed on NFS4. I think the failure is not related to mpi, so open a new ticket for tracking,&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-344&quot; title=&quot;Test failure on test suite parallel-scale&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-344&quot;&gt;&lt;del&gt;LU-344&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Here are both results:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/0df23b78-81de-11e0-b4df-52540025f9af&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/0df23b78-81de-11e0-b4df-52540025f9af&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/3c243272-81e2-11e0-b4df-52540025f9af&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/3c243272-81e2-11e0-b4df-52540025f9af&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="14858" author="mjmac" created="Fri, 20 May 2011 16:14:00 +0000"  >&lt;p&gt;I&apos;m going to resolve this, as the original issue with bad code in metabench has been fixed.  Please open new tickets for the other problems (e.g. cascading_rw).&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw00v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>10083</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>