<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:22:07 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15883] Lustre 2.15 GPUDirect Testing fullperf crash</title>
                <link>https://jira.whamcloud.com/browse/LU-15883</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Kernel crash happens after running fullperf tests.&lt;/p&gt;

&lt;p&gt;Crash happens after:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
root@a100-01:/usr/local/gds/docker# ./gds_docker.sh -p /lustre/ai400x2/client -v 1.2.0 -c 11.7.0 -m -t fullperf
SKIP DRIVER INSTALL 0
Available space in /lustre/ai400x2/client = 268740285
CONFIG_MOFED_VERSION
Found MOFED version 5.6-1.0.3.3
using nvidia driver version 515.43.04 on kernel 5.4.0-109-&lt;span class=&quot;code-keyword&quot;&gt;generic&lt;/span&gt;
f87d047a1632feeb1bd51a5544ac541ea91fd58910ce5d358540cc2b7da08fc5
Started container fullperf_135939
check output: docker container logs --follow fullperf_135939
root@a100-01:/usr/local/gds/docker# docker container logs --follow fullperf_135939
UserSpace RDMA Support Ok
logs file in /results/build_7-20220523_2228.log, /results/gds_7-20220523_2228.log
downloading dependencies &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; nvidia-fs
^M0% [Working]^M &#160; &#160; &#160; &#160; &#160; &#160;^MGet:1 https:&lt;span class=&quot;code-comment&quot;&gt;//developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 &#160;InRelease [1581 B]
&lt;/span&gt;^M0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [1 I^M0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [Con^M &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ^MGet:2 http:&lt;span class=&quot;code-comment&quot;&gt;//security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
&lt;/span&gt;^M0% [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 12.3 kB/114 kB 11%] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers]^M0% [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 14.2 kB/114 kB 12%] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers]^M &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ^MHit:3 http:&lt;span class=&quot;code-comment&quot;&gt;//archive.ubuntu.com/ubuntu focal InRelease
&lt;/span&gt;^M0% [2 InRelease 14.2 kB/114 kB 12%] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers]^M &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ^MGet:4 https:&lt;span class=&quot;code-comment&quot;&gt;//developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 &#160;Packages [557 kB]
&lt;/span&gt;^M0% [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 14.2 kB/114 kB 12%] [4 Packages 4096 B/55^M &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ^MHit:5 https:&lt;span class=&quot;code-comment&quot;&gt;//repo.download.nvidia.com/baseos/ubuntu/focal/x86_64 focal InRelease
&lt;/span&gt;^M0% [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 14.2 kB/114 kB 12%] [4 Packages 557 kB/55^M0% [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 14.2 kB/114 kB 12%] [Connecting to repo.d^M0% [4 Packages store 0 B] [Waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; headers] [2 InRelease 14.2 kB/114 kB 12%^M &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; ^MGet:6 http:&lt;span class=&quot;code-comment&quot;&gt;//archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
&lt;/span&gt;^M0% [4 Packages store 0 B] [6 InRelease 14.2 kB/114 kB 12%] [2 InRelease 43.1 kB^M0% [6 InRelease 15.6 kB/114 kB 14%] [2 InRelease 43.1 kB/114 kB 38%] [Connected^M0% [6 InRelease 15.6 kB/114 kB 14%] [2 InRelease 43.1 kB/114 kB 38%] [Waiting f^M
...skipping...
g &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 98%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 98%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 98%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 99%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 99%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 99%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... 99%^MReading &lt;span class=&quot;code-keyword&quot;&gt;package&lt;/span&gt; lists... Done
W: Target Packages (Packages) is configured multiple times in /etc/apt/sources.list:50 and /etc/apt/sources.list.d/cuda-compute-repo.list:1
W: Target Packages (Packages) is configured multiple times in /etc/apt/sources.list:50 and /etc/apt/sources.list.d/cuda-compute-repo.list:1
nvidia-fs driver build success
cat: /sys/kernel/mm/memory_peers/nvidia-fs/version: No such file or directory
GDS verfication passed
GDS check passed
Max allowed GPUS: 8
path /data/0 is not a mount point
mount /data/0 not found, creating directory /data/GPU0
mount /data/0 not found, creating directory /data/GPU1
mount /data/0 not found, creating directory /data/GPU2
mount /data/0 not found, creating directory /data/GPU3
mount /data/0 not found, creating directory /data/GPU4
mount /data/0 not found, creating directory /data/GPU5
mount /data/0 not found, creating directory /data/GPU6
mount /data/0 not found, creating directory /data/GPU7
mount path: &#160;/data/GPU0 &#160;-&amp;gt; GPU device: 0
mount path: &#160;/data/GPU1 &#160;-&amp;gt; GPU device: 1
mount path: &#160;/data/GPU2 &#160;-&amp;gt; GPU device: 2
mount path: &#160;/data/GPU3 &#160;-&amp;gt; GPU device: 3
mount path: &#160;/data/GPU4 &#160;-&amp;gt; GPU device: 4
mount path: &#160;/data/GPU5 &#160;-&amp;gt; GPU device: 5
mount path: &#160;/data/GPU6 &#160;-&amp;gt; GPU device: 6
mount path: &#160;/data/GPU7 &#160;-&amp;gt; GPU device: 7
populating files:
/usr/local/gds/tools/gdsio -s 4096M -V -I 1 -x 0 -D /data/GPU0/gds -w 128 -d 0 -n 3 -D /data/GPU1/gds -w 128 -d 1 -n 3 -D /data/GPU2/gds -w 128 -d 2 -n 1 -D /data/GPU3/gds -w 128 -d 3 -n 1 -D /data/GPU4/gds -w 128 -d 4 -n 7 -D /data/GPU5/gds -w 128 -d 5 -n 7 -D /data/GPU6/gds -w 128 -d 6 -n 5 -D /data/GPU7/gds -w 128 -d 7 -n 5 -i 1M
Done populating
Running iter 1 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; IOTYPE: 0 &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; XFERTYPE: -x 0 IOSIZE: 4 kb with threads: 128
/usr/local/gds/tools/gdsio -T 45 -s 512M -I 0 -x 0 -D /data/GPU0/gds -w 128 -d 0 -n 3 -D /data/GPU1/gds -w 128 -d 1 -n 3 -D /data/GPU2/gds -w 128 -d 2 -n 1 -D /data/GPU3/gds -w 128 -d 3 -n 1 -D /data/GPU4/gds -w 128 -d 4 -n 7 -D /data/GPU5/gds -w 128 -d 5 -n 7 -D /data/GPU6/gds -w 128 -d 6 -n 5 -D /data/GPU7/gds -w 128 -d 7 -n 5 -i 4k&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;ddn@a100-01:~$ lctl get_param version&lt;br/&gt;
version=2.15.50_13_gc524079_dirty&lt;/p&gt;

&lt;p&gt;NVIDIA-SMI 515.43.04 &#160; &#160;&lt;/p&gt;

&lt;p&gt;Driver Version: 515.43.04 &#160; &#160;&lt;/p&gt;

&lt;p&gt;CUDA Version: 11.7&lt;/p&gt;

&lt;p&gt;Kernel: 5.4.0-109-generic&lt;/p&gt;</description>
                <environment>NVIDIA DGX A100&lt;br/&gt;
</environment>
        <key id="70437">LU-15883</key>
            <summary>Lustre 2.15 GPUDirect Testing fullperf crash</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ssmirnov">Serguei Smirnov</assignee>
                                    <reporter username="okulachenko">Oleg Kulachenko</reporter>
                        <labels>
                    </labels>
                <created>Tue, 24 May 2022 05:54:08 +0000</created>
                <updated>Fri, 3 Jun 2022 22:53:40 +0000</updated>
                                            <version>Lustre 2.15.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="335894" author="JIRAUSER17312" created="Tue, 24 May 2022 14:50:57 +0000"  >&lt;p&gt;Hi &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ssmirnov&quot; class=&quot;user-hover&quot; rel=&quot;ssmirnov&quot;&gt;ssmirnov&lt;/a&gt;&#160;&lt;/p&gt;

&lt;p&gt;I believe the GPU direct stuff was originally handled by &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ashehata&quot; class=&quot;user-hover&quot; rel=&quot;ashehata&quot;&gt;ashehata&lt;/a&gt;, can you please take a look and assign out to someone else on your team?&lt;/p&gt;

&lt;p&gt;Thank you!&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="335941" author="ssmirnov" created="Tue, 24 May 2022 22:55:35 +0000"  >&lt;p&gt;From the provided traces, it didn&apos;t look to me like any of the lustre code is having an issue.&#160;&lt;/p&gt;

&lt;p&gt;After checking with Amir, I&apos;d like to request that a manual gdsio run is used:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;Ensure that gds driver is installed.
	&lt;ul&gt;
		&lt;li&gt;You can verify by: lsmod | grep nvidia_fs&lt;/li&gt;
	&lt;/ul&gt;
	&lt;/li&gt;
	&lt;li&gt;Verify that nvidia_fs is working properly by using gdscheck -p.&lt;/li&gt;
	&lt;li&gt;Mount Lustre&lt;/li&gt;
	&lt;li&gt;Tune Lustre clients&#160;&lt;/li&gt;
	&lt;li&gt;Run a quick gdsio test to ensure it works properly, for example:&#160;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
 ./gdsio -f /mnt/ai400/test -d 4 -n 0 -w 8 -s 1G -i 4M -x 0 -I 0
IoType: READ XferType: GPUD Threads: 8&#160; DataSetSize: 809500672/1073741824 IOSize: 4096(KB),Throughput: 1.209208 GB/sec, Avg_Latency: 24341.183046 usecs ops: 193 total_time 623471.000000 usecs&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Do you see the same problem then?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="43767" name="Remote KVM [10.36.11.67] - [800 x 600 ] 2022-05-24 09-35-48.png" size="630474" author="okulachenko" created="Tue, 24 May 2022 05:54:25 +0000"/>
                            <attachment id="43766" name="Remote KVM [10.36.11.67] - [800 x 600 ] 2022-05-24 09-38-15.png" size="658847" author="okulachenko" created="Tue, 24 May 2022 05:54:25 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02qn3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>