<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:30:54 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3093] lstcon_dstnodes_prep()) ASSERTION( grp-&gt;grp_nnode &gt;= 1 )</title>
                <link>https://jira.whamcloud.com/browse/LU-3093</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;An lst session node crashed when I tried to run an lnet selftest.&lt;/p&gt;


&lt;p&gt;Backtrace from crash:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt
PID: 2961   TASK: ffff880bed544ae0  CPU: 8   COMMAND: &quot;lst&quot;
 #0 [ffff8808edeff980] machine_kexec at ffffffff81035b6b
 #1 [ffff8808edeff9e0] crash_kexec at ffffffff810c08d2
 #2 [ffff8808edeffab0] panic at ffffffff8150d3f3
 #3 [ffff8808edeffb30] lbug_with_loc at ffffffffa02c1e4b [libcfs]
 #4 [ffff8808edeffb50] lstcon_dstnodes_prep at ffffffffa059cc36 [lnet_selftest]
 #5 [ffff8808edeffbc0] lstcon_testrpc_prep at ffffffffa059eb9a [lnet_selftest]
 #6 [ffff8808edeffc20] lstcon_rpc_trans_ndlist at ffffffffa059f30f [lnet_selftest]
 #7 [ffff8808edeffc90] lstcon_test_add at ffffffffa059b2ce [lnet_selftest]
 #8 [ffff8808edeffd10] lst_test_add_ioctl at ffffffffa05a03e7 [lnet_selftest]
 #9 [ffff8808edeffda0] lstcon_ioctl_entry at ffffffffa05a3f95 [lnet_selftest]
#10 [ffff8808edeffdd0] libcfs_ioctl at ffffffffa02cac84 [libcfs]
#11 [ffff8808edeffe10] libcfs_ioctl at ffffffffa02c5ac4 [libcfs]
#12 [ffff8808edeffe60] vfs_ioctl at ffffffff81194acc
#13 [ffff8808edeffea0] do_vfs_ioctl at ffffffff81194c14
#14 [ffff8808edefff30] sys_ioctl at ffffffff81195191
#15 [ffff8808edefff80] system_call_fastpath at ffffffff8100b072
    RIP: 00002aaaaadada47  RSP: 00007fffffffe1b8  RFLAGS: 00010206
    RAX: 0000000000000010  RBX: ffffffff8100b072  RCX: 00002aaaaae1dac0
    RDX: 00007fffffffe0a0  RSI: 00000000c008653f  RDI: 0000000000000003
    RBP: 0000000000000000   R8: 0000000000000c26   R9: 0000000000000068
    R10: 00007fffffffdf40  R11: 0000000000000246  R12: 00000000c008653f
    R13: 00007fffffffe0a0  R14: 000000000061c2e0  R15: 0000000000000003
    ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console panic message:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 2961:0:(conrpc.c:738:lstcon_dstnodes_prep()) ASSERTION( grp-&amp;gt;grp_nnode &amp;gt;= 1 ) failed: 
LustreError: 2961:0:(conrpc.c:738:lstcon_dstnodes_prep()) LBUG
Pid: 2961, comm: lst

Call Trace:
 [&amp;lt;ffffffffa02c17e5&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
 [&amp;lt;ffffffffa02c1df7&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
 [&amp;lt;ffffffffa059cc36&amp;gt;] lstcon_dstnodes_prep+0x236/0x280 [lnet_selftest]
 [&amp;lt;ffffffff8116041a&amp;gt;] ? alloc_pages_current+0xaa/0x110
 [&amp;lt;ffffffffa059eb9a&amp;gt;] lstcon_testrpc_prep+0xfa/0x310 [lnet_selftest]
 [&amp;lt;ffffffffa0596a40&amp;gt;] ? lstcon_testrpc_condition+0x0/0x1c0 [lnet_selftest]
 [&amp;lt;ffffffffa059f30f&amp;gt;] lstcon_rpc_trans_ndlist+0x24f/0x300 [lnet_selftest]
 [&amp;lt;ffffffffa059b2ce&amp;gt;] lstcon_test_add+0x4ce/0x930 [lnet_selftest]
 [&amp;lt;ffffffffa05a03e7&amp;gt;] lst_test_add_ioctl+0xaf7/0xc20 [lnet_selftest]
 [&amp;lt;ffffffffa05a3f95&amp;gt;] lstcon_ioctl_entry+0x505/0x5f0 [lnet_selftest]
 [&amp;lt;ffffffffa02cac84&amp;gt;] libcfs_ioctl+0x354/0x900 [libcfs]
 [&amp;lt;ffffffffa02c5ac4&amp;gt;] libcfs_ioctl+0x84/0x180 [libcfs]
 [&amp;lt;ffffffff81194acc&amp;gt;] vfs_ioctl+0x7c/0xa0
 [&amp;lt;ffffffff81194c14&amp;gt;] do_vfs_ioctl+0x84/0x580
 [&amp;lt;ffffffff81195191&amp;gt;] sys_ioctl+0x81/0xa0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b

Kernel panic - not syncing: LBUG
Pid: 2961, comm: lst Tainted: G        W  ---------------    2.6.32-358.5chaos.ch5.1.x86_64 #1
Call Trace:
 [&amp;lt;ffffffff8150d3ec&amp;gt;] ? panic+0xa7/0x16f
 [&amp;lt;ffffffffa02c1e4b&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]
 [&amp;lt;ffffffffa059cc36&amp;gt;] ? lstcon_dstnodes_prep+0x236/0x280 [lnet_selftest]
 [&amp;lt;ffffffff8116041a&amp;gt;] ? alloc_pages_current+0xaa/0x110
 [&amp;lt;ffffffffa059eb9a&amp;gt;] ? lstcon_testrpc_prep+0xfa/0x310 [lnet_selftest]
 [&amp;lt;ffffffffa0596a40&amp;gt;] ? lstcon_testrpc_condition+0x0/0x1c0 [lnet_selftest]
 [&amp;lt;ffffffffa059f30f&amp;gt;] ? lstcon_rpc_trans_ndlist+0x24f/0x300 [lnet_selftest]
 [&amp;lt;ffffffffa059b2ce&amp;gt;] ? lstcon_test_add+0x4ce/0x930 [lnet_selftest]
 [&amp;lt;ffffffffa05a03e7&amp;gt;] ? lst_test_add_ioctl+0xaf7/0xc20 [lnet_selftest]
 [&amp;lt;ffffffffa05a3f95&amp;gt;] ? lstcon_ioctl_entry+0x505/0x5f0 [lnet_selftest]
 [&amp;lt;ffffffffa02cac84&amp;gt;] ? libcfs_ioctl+0x354/0x900 [libcfs]
 [&amp;lt;ffffffffa02c5ac4&amp;gt;] ? libcfs_ioctl+0x84/0x180 [libcfs]
 [&amp;lt;ffffffff81194acc&amp;gt;] ? vfs_ioctl+0x7c/0xa0
 [&amp;lt;ffffffff81194c14&amp;gt;] ? do_vfs_ioctl+0x84/0x580
 [&amp;lt;ffffffff81195191&amp;gt;] ? sys_ioctl+0x81/0xa0
 [&amp;lt;ffffffff8100b072&amp;gt;] ? system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;lst script (note the server&apos;s NID was misspelled)&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#!/bin/bash
export LST_SESSION=1234
lst new_session read/write
lst add_group servers 172.19.1.101@owib100
lst add_group readers 172.16.66.53@tcp
lst add_batch bulk_rw
lst add_test --batch bulk_rw --from readers --to servers     brw read check=simple size=1M
lst run bulk_rw
# display server stats &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; 30 seconds
lst stat servers &amp;amp; sleep 30; kill $!
# tear down
lst end_session
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>&lt;a href=&quot;https://github.com/chaos/lustre/commits/2.1.4-3chaos&quot;&gt;https://github.com/chaos/lustre/commits/2.1.4-3chaos&lt;/a&gt;</environment>
        <key id="18208">LU-3093</key>
            <summary>lstcon_dstnodes_prep()) ASSERTION( grp-&gt;grp_nnode &gt;= 1 )</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="6" iconUrl="https://jira.whamcloud.com/images/icons/statuses/closed.png" description="The issue is considered finished, the resolution is correct. Issues which are closed can be reopened.">Closed</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="ashehata">Amir Shehata</assignee>
                                    <reporter username="nedbass">Ned Bass</reporter>
                        <labels>
                    </labels>
                <created>Wed, 3 Apr 2013 00:50:25 +0000</created>
                <updated>Wed, 4 Sep 2013 16:53:32 +0000</updated>
                            <resolved>Wed, 4 Sep 2013 16:52:52 +0000</resolved>
                                    <version>Lustre 2.1.4</version>
                                    <fixVersion>Lustre 2.4.1</fixVersion>
                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="55403" author="pjones" created="Wed, 3 Apr 2013 17:21:32 +0000"  >&lt;p&gt;Amir&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="55656" author="ashehata" created="Fri, 5 Apr 2013 22:49:25 +0000"  >&lt;p&gt;In order to save time, I think the best approach is to agree first on the proposed solution with all interested parties, before going ahead with the implementation.&lt;/p&gt;

&lt;p&gt;As Ned pointed out in his comment, and I agree, I was aware that the fix doesn&apos;t resolve the vulnerability in the kernel, but I was under the false assumption that I should keep the fix as simple as possible. &lt;/p&gt;

&lt;p&gt;The approach to properly fix this issue and thus remove the assert completely, since it&apos;s not appropriate to assert on user input, is outlined below.&lt;/p&gt;

&lt;p&gt;Currently the APIs under question are (and their explanation):&lt;br/&gt;
===============================================================&lt;br/&gt;
Add group&lt;br/&gt;
---------&lt;br/&gt;
. An empty group is added&lt;/p&gt;

&lt;p&gt;Add nodes to a group&lt;br/&gt;
---------------------&lt;br/&gt;
. A node is added to the group after it has been RPCed&lt;br/&gt;
 . if the rpc is successful the node is moved to ACTIVE state&lt;br/&gt;
 . if the rpc is successful but the NID in the response is LNET_NID_ANY then state is set to UNKNOWN&lt;br/&gt;
 . if the rpc is successful but the reply session doesn&apos;t match console session then state is set to BUSY&lt;/p&gt;

&lt;p&gt;Furthermore:&lt;br/&gt;
. A node is created in UNKNOWN state and stays in UNKNOWN state until it&apos;s RPCed&lt;br/&gt;
. A node is set to DOWN when there is no response to an RPC message (this is currently buggy. It is set to UNKNOWN, but I&apos;ll fix it).&lt;br/&gt;
. A batch is only run if the nodes are in ACTIVE state&lt;/p&gt;

&lt;p&gt;update the group&lt;br/&gt;
----------------&lt;br/&gt;
 refresh&lt;br/&gt;
 clean &amp;lt;&lt;span class=&quot;error&quot;&gt;&amp;#91;active|busy|down|unknown|invalid&amp;#93;&lt;/span&gt;&amp;gt;&lt;br/&gt;
  . if all nodes which are in any of the above states are cleaned and the group now&lt;br/&gt;
    has 0 nodes, then the group itself is deleted implicitly (done in the kernel).&lt;br/&gt;
 remove &amp;lt;NID&amp;gt;&lt;br/&gt;
  . If the last nid is removed (IE no more nodes in the group) then the group is deleted (done in the kernel).&lt;/p&gt;

&lt;p&gt;The problematic cases are:&lt;br/&gt;
========================== &lt;br/&gt;
1. A group is added but no valid nids are added to the group (IE group is empty) &lt;br/&gt;
2. A group is added but no reachable nids are added to the group (IE group has no ACTIVE nodes)&lt;/p&gt;

&lt;p&gt;General Design Guidelines:&lt;br/&gt;
==========================&lt;br/&gt;
The selftest kernel module provides the following:&lt;br/&gt;
. A set of constructs which can be used to create an lnet test (EX: Group, node, batch)&lt;br/&gt;
. The selftest kernel modules ensures that the use of these constructs adhere to its internal rules:&lt;br/&gt;
  . EX: only send to reachable NIDs&lt;br/&gt;
. The application (user space) can instantiate these constructs and create test cases&lt;br/&gt;
. The kernel module shall guard against misuse of these constructs.&lt;/p&gt;

&lt;p&gt;The proposed solution is:&lt;br/&gt;
=========================&lt;br/&gt;
The selftest kernel module would check the source and destination groups when a test is added to a batch (with the add_test command) to ensure that the groups have at least one node in ACTIVE state. If not, then the test is not added to the batch.&lt;/p&gt;

&lt;p&gt;The groups remain (possibly empty) and can be updated at a later time. &lt;/p&gt;

&lt;p&gt;Kernel Selftest Behavior:&lt;br/&gt;
=========================&lt;br/&gt;
I propose the following modifications to the selftest kernel module behavior to make it consistent:&lt;br/&gt;
1. Groups are added independently (already done)&lt;br/&gt;
2. Nodes are created and added to the group (already done)&lt;br/&gt;
3. Nodes can be removed from the group and the group can remain empty&lt;br/&gt;
4. Groups referenced in a test which is to be added to a batch, must have at least one ACTIVE node; or the add_test command fails&lt;/p&gt;

&lt;p&gt;Justification:&lt;br/&gt;
==============&lt;br/&gt;
Groups are container entities that can exist independent of nodes; IE: they can be created and deleted explicitly and independently of other entities.  However, nodes can only exist within a group; IE you can not create a node independently and add it to a group later.  Thus to make the interface more consistent a group should not be deleted implicitly.&lt;/p&gt;

&lt;p&gt;This opens up the flexibility of a user space application creating groups that can be reused, by adding and removing nodes, without having to worry about the group implicitly being deleted.&lt;/p&gt;

&lt;p&gt;LST Utility behavior&lt;br/&gt;
====================&lt;br/&gt;
The lst utility can present its own behavioral model.  In this case the lst utility will not allow a group to exist without having at least one node in ACTIVE state.&lt;/p&gt;

&lt;p&gt;Other user space utilities can be implemented differently, but the kernel selftest module will ensure the sanity of the passed in parameters before carrying out a test case thus avoiding asserting and bringing down the entire OS.&lt;/p&gt;

&lt;p&gt;Is this solution acceptable?&lt;/p&gt;</comment>
                            <comment id="55657" author="nedbass" created="Fri, 5 Apr 2013 23:10:22 +0000"  >&lt;p&gt;Amir, thanks for the detailed analysis.  This solution looks acceptable to me.&lt;/p&gt;</comment>
                            <comment id="56335" author="ashehata" created="Mon, 15 Apr 2013 19:08:51 +0000"  >&lt;p&gt;We reached a consensus to make sure that the vulnerability in the Kernel module is addressed, as well as ensuring that lst utility catches this case in its checks as well.  &lt;/p&gt;

&lt;p&gt;The fix will be added to review soon.  My hard drive crashed, causing a delay in the resolution to this issue.&lt;/p&gt;</comment>
                            <comment id="56570" author="ashehata" created="Thu, 18 Apr 2013 19:51:13 +0000"  >&lt;p&gt;Fix is ready for review:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#change,6092,patchset=2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6092,patchset=2&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="56825" author="ashehata" created="Tue, 23 Apr 2013 16:22:53 +0000"  >&lt;p&gt;Just a reminder that the fix is ready for review: &lt;a href=&quot;http://review.whamcloud.com/#change,6092&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6092&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="63920" author="bogl" created="Thu, 8 Aug 2013 22:55:44 +0000"  >&lt;p&gt;back port to b2_4: &lt;a href=&quot;http://review.whamcloud.com/7277&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7277&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvmxz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7513</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>