<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:16:18 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8295] conf-sanity test_41c: test failed to respond and timed out</title>
                <link>https://jira.whamcloud.com/browse/LU-8295</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for nasf &amp;lt;fan.yong@intel.com&amp;gt;&lt;/p&gt;

&lt;p&gt;Please provide additional information about the failure here.&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/dc7b4098-3372-11e6-bbf5-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/dc7b4098-3372-11e6-bbf5-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The logs on the OST show that two mount threads were blocked:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[12811.430006] mount           S ffff88007b4e72f0     0 25441  25436 0x00000080
[12811.430006]  ffff8800465b7e70 0000000000000086 ffff88007b4e7300 ffff8800465b7fd8
[12811.430006]  ffff8800465b7fd8 ffff8800465b7fd8 ffff88007b4e7300 ffff8800465b7ef8
[12811.430006]  ffff88007b4e77f0 ffff88007b4e7300 ffff88007b4e7300 ffff88007b4e72f0
[12811.430006] Call Trace:
[12811.430006]  [&amp;lt;ffffffff8163b349&amp;gt;] schedule+0x29/0x70
[12811.430006]  [&amp;lt;ffffffff81080f83&amp;gt;] do_wait+0x203/0x260
[12811.430006]  [&amp;lt;ffffffff810820a0&amp;gt;] SyS_wait4+0x80/0x110
[12811.430006]  [&amp;lt;ffffffff8107fb20&amp;gt;] ? task_stopped_code+0x60/0x60
[12811.430006]  [&amp;lt;ffffffff816463c9&amp;gt;] system_call_fastpath+0x16/0x1b
[12811.430006] sh              S ffff88007b4e2df0     0 25442  25440 0x00000080
[12811.430006]  ffff88007a95fe70 0000000000000086 ffff88007b4e2e00 ffff88007a95ffd8
[12811.430006]  ffff88007a95ffd8 ffff88007a95ffd8 ffff88007b4e2e00 ffff88007a95fef8
[12811.430006]  ffff88007b4e32f0 ffff88007b4e2e00 ffff88007b4e2e00 ffff88007b4e2df0
[12811.430006] Call Trace:
[12811.430006]  [&amp;lt;ffffffff8163b349&amp;gt;] schedule+0x29/0x70
[12811.430006]  [&amp;lt;ffffffff81080f83&amp;gt;] do_wait+0x203/0x260
[12811.430006]  [&amp;lt;ffffffff810820a0&amp;gt;] SyS_wait4+0x80/0x110
[12811.430006]  [&amp;lt;ffffffff8107fb20&amp;gt;] ? task_stopped_code+0x60/0x60
[12811.430006]  [&amp;lt;ffffffff816463c9&amp;gt;] system_call_fastpath+0x16/0x1b
[12811.430006] mount.lustre    S 0000000000000000     0 25443  25441 0x00000080
[12811.430006]  ffff8800438d7ce0 0000000000000082 ffff8800438c5c00 ffff8800438d7fd8
[12811.430006]  ffff8800438d7fd8 ffff8800438d7fd8 ffff8800438c5c00 ffff880047f4c800
[12811.430006]  ffff8800438c5c00 ffff8800438d7e08 ffff880043836800 0000000000000000
[12811.430006] Call Trace:
[12811.430006]  [&amp;lt;ffffffff8163b349&amp;gt;] schedule+0x29/0x70
[12811.430006]  [&amp;lt;ffffffffa0abaed6&amp;gt;] server_fill_super+0x1656/0x184c [obdclass]
[12811.430006]  [&amp;lt;ffffffff810a6ac0&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[12811.430006]  [&amp;lt;ffffffffa0a8f9d8&amp;gt;] lustre_fill_super+0x328/0x950 [obdclass]
[12811.430006]  [&amp;lt;ffffffffa0a8f6b0&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
[12811.430006]  [&amp;lt;ffffffff811e1f2d&amp;gt;] mount_nodev+0x4d/0xb0
[12811.430006]  [&amp;lt;ffffffffa0a87908&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
[12811.430006]  [&amp;lt;ffffffff811e28d9&amp;gt;] mount_fs+0x39/0x1b0
[12811.430006]  [&amp;lt;ffffffff811fe1af&amp;gt;] vfs_kern_mount+0x5f/0xf0
[12811.430006]  [&amp;lt;ffffffff812006fe&amp;gt;] do_mount+0x24e/0xa40
[12811.430006]  [&amp;lt;ffffffff8116e15e&amp;gt;] ? __get_free_pages+0xe/0x50
[12811.430006]  [&amp;lt;ffffffff81200f86&amp;gt;] SyS_mount+0x96/0xf0
[12811.430006]  [&amp;lt;ffffffff816463c9&amp;gt;] system_call_fastpath+0x16/0x1b
[12811.430006] mount           S ffff88003dd65070     0 25446  25442 0x00000080
[12811.430006]  ffff8800434d3e70 0000000000000086 ffff88003dd65080 ffff8800434d3fd8
[12811.430006]  ffff8800434d3fd8 ffff8800434d3fd8 ffff88003dd65080 ffff8800434d3ef8
[12811.430006]  ffff88003dd65570 ffff88003dd65080 ffff88003dd65080 ffff88003dd65070
[12811.430006] Call Trace:
[12811.430006]  [&amp;lt;ffffffff8163b349&amp;gt;] schedule+0x29/0x70
[12811.430006]  [&amp;lt;ffffffff81080f83&amp;gt;] do_wait+0x203/0x260
[12811.430006]  [&amp;lt;ffffffff810820a0&amp;gt;] SyS_wait4+0x80/0x110
[12811.430006]  [&amp;lt;ffffffff8107fb20&amp;gt;] ? task_stopped_code+0x60/0x60
[12811.430006]  [&amp;lt;ffffffff816463c9&amp;gt;] system_call_fastpath+0x16/0x1b
[12811.430006] mount.lustre    S 0000000000000000     0 25449  25446 0x00000080
[12811.430006]  ffff88003db0bce0 0000000000000082 ffff88007c141700 ffff88003db0bfd8
[12811.430006]  ffff88003db0bfd8 ffff88003db0bfd8 ffff88007c141700 ffff88005146c000
[12811.430006]  ffff88007c141700 ffff88003db0be08 ffff88007a89c000 0000000000000000
[12811.430006] Call Trace:
[12811.430006]  [&amp;lt;ffffffff8163b349&amp;gt;] schedule+0x29/0x70
[12811.430006]  [&amp;lt;ffffffffa0abaed6&amp;gt;] server_fill_super+0x1656/0x184c [obdclass]
[12811.430006]  [&amp;lt;ffffffff810a6ac0&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[12811.430006]  [&amp;lt;ffffffffa0a8f9d8&amp;gt;] lustre_fill_super+0x328/0x950 [obdclass]
[12811.430006]  [&amp;lt;ffffffffa0a8f6b0&amp;gt;] ? lustre_common_put_super+0x270/0x270 [obdclass]
[12811.430006]  [&amp;lt;ffffffff811e1f2d&amp;gt;] mount_nodev+0x4d/0xb0
[12811.430006]  [&amp;lt;ffffffffa0a87908&amp;gt;] lustre_mount+0x38/0x60 [obdclass]
[12811.430006]  [&amp;lt;ffffffff811e28d9&amp;gt;] mount_fs+0x39/0x1b0
[12811.430006]  [&amp;lt;ffffffff811fe1af&amp;gt;] vfs_kern_mount+0x5f/0xf0
[12811.430006]  [&amp;lt;ffffffff812006fe&amp;gt;] do_mount+0x24e/0xa40
[12811.430006]  [&amp;lt;ffffffff8116e15e&amp;gt;] ? __get_free_pages+0xe/0x50
[12811.430006]  [&amp;lt;ffffffff81200f86&amp;gt;] SyS_mount+0x96/0xf0
[12811.430006]  [&amp;lt;ffffffff816463c9&amp;gt;] system_call_fastpath+0x16/0x1b
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="37638">LU-8295</key>
            <summary>conf-sanity test_41c: test failed to respond and timed out</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Fri, 17 Jun 2016 06:45:59 +0000</created>
                <updated>Sat, 17 Dec 2016 14:50:47 +0000</updated>
                            <resolved>Sat, 17 Dec 2016 14:50:47 +0000</resolved>
                                                    <fixVersion>Lustre 2.10.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="156238" author="green" created="Mon, 20 Jun 2016 17:37:31 +0000"  >&lt;p&gt;so OBD_RACE() is supposed to stop for the first caller and then continue when somebody else reaches there.&lt;/p&gt;

&lt;p&gt;Yet we see both threads stuck in the same place in OBD_RACE? Something strange is going on here. Also there&apos;s only one print in the logs about the OBD_RACE actually being hit&lt;/p&gt;</comment>
                            <comment id="156258" author="jhammond" created="Mon, 20 Jun 2016 19:34:17 +0000"  >&lt;p&gt;There&apos;s  a bug in the test:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        #define OBD_FAIL_TGT_MOUNT_RACE 0x716                                             
        do_facet ost1 &lt;span class=&quot;code-quote&quot;&gt;&quot;$LCTL set_param fail_loc=0x716&quot;&lt;/span&gt;
        start ost1 $(ostdevname 1) $OST_MOUNT_OPTS &amp;amp;
        pid=$!
        start ost1 $(ostdevname 1) $OST_MOUNT_OPTS &amp;amp;
        do_facet ost1 &lt;span class=&quot;code-quote&quot;&gt;&quot;$LCTL set_param fail_loc=0x0&quot;&lt;/span&gt;
	pid2=$!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Since both mount commands run asynchronously, we may clear &lt;tt&gt;fail_loc&lt;/tt&gt; between the two invocations of &lt;tt&gt;OBD_RACE()&lt;/tt&gt;. In this case both threads will hang.&lt;/p&gt;

&lt;p&gt;&amp;gt; Also there&apos;s only one print in the logs about the OBD_RACE actually being hit&lt;/p&gt;

&lt;p&gt;Shouldn&apos;t we expect that CDEBUG rate limiting will prevent the second message from being printed?&lt;/p&gt;</comment>
                            <comment id="156278" author="jhammond" created="Mon, 20 Jun 2016 21:22:15 +0000"  >&lt;p&gt;And we need &lt;tt&gt;CFS_FAIL_ONCE&lt;/tt&gt; along with &lt;tt&gt;OBD_FAIL_TGT_MOUNT_RACE&lt;/tt&gt; when setting &lt;tt&gt;fail_loc&lt;/tt&gt;.&lt;/p&gt;</comment>
                            <comment id="157408" author="bfaccini" created="Thu, 30 Jun 2016 15:41:54 +0000"  >&lt;p&gt;John, since I am the one &quot;responsible&quot; for this test and associated code, I am rather interested in this problem and ticket &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;My understanding was that OBD_RACE()/CFS_RACE()/cfs_race() only work safely with 2 threads: the 1st stops and waits for the 2nd to wake it.&lt;/p&gt;

&lt;p&gt;And I think, like you, that for this to work as expected, fail_loc must not be cleared in between, yet there is a potential window for this to happen because the second mount/thread is run asynchronously, with the clearing of fail_loc being launched simultaneously.&lt;/p&gt;

&lt;p&gt;On the other hand, I previously thought that CFS_FAIL_ONCE was unnecessary for OBD_RACE() to work, but after looking more closely at the code in the __cfs_fail_check_set() core routine, this may also, as you point out, be the reason for both threads being stuck if they are closely racing and have entered the routine at around the same time, particularly if the CFS_FAILED bit has not yet been set.&lt;/p&gt;

&lt;p&gt;Do you want me to push a patch to fix these 2 holes (4 in fact since fail_loc is wrongly set/cleared for MDSs and OSSs cases) in test ?&lt;/p&gt;
</comment>
                            <comment id="157421" author="jhammond" created="Thu, 30 Jun 2016 16:21:52 +0000"  >&lt;p&gt;&amp;gt; Do you want me to push a patch to fix these 2 holes (4 in fact since fail_loc is wrongly set/cleared for MDSs and OSSs cases) in test ?&lt;/p&gt;

&lt;p&gt;Yes, please.&lt;/p&gt;

&lt;p&gt;I&apos;m still not confident that I understand how this is working, but I can see several potential issues with this code: &lt;tt&gt;cfs_race_state&lt;/tt&gt; should be declared volatile. There is a race between the two threads setting &lt;tt&gt;cfs_race_state&lt;/tt&gt;, with one sleeping while the other is waking. &lt;tt&gt;proc_fail_loc()&lt;/tt&gt; calls &lt;tt&gt;wake_up(&amp;amp;cfs_race_waitq)&lt;/tt&gt; but it does not set &lt;tt&gt;cfs_race_state&lt;/tt&gt; to 1, so it seems like the sleeping task would just go back to sleep.&lt;/p&gt;

&lt;p&gt;My best guess is that usually &quot;do_facet ost1 $LCTL set_param fail_loc=0x0&quot; is running before either mount command so the &lt;tt&gt;CFS_RACE()&lt;/tt&gt; is never triggered.&lt;/p&gt;</comment>
                            <comment id="157475" author="gerrit" created="Thu, 30 Jun 2016 23:18:24 +0000"  >&lt;p&gt;Faccini Bruno (bruno.faccini@intel.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/21117&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/21117&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8295&quot; title=&quot;conf-sanity test_41c: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8295&quot;&gt;&lt;del&gt;LU-8295&lt;/del&gt;&lt;/a&gt; tests: comply with OBD_RACE() behavior&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: e3050d4414667a941c501728de2bfb0b78e8b097&lt;/p&gt;</comment>
                            <comment id="178206" author="gerrit" created="Sat, 17 Dec 2016 05:41:02 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/21117/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/21117/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-8295&quot; title=&quot;conf-sanity test_41c: test failed to respond and timed out&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-8295&quot;&gt;&lt;del&gt;LU-8295&lt;/del&gt;&lt;/a&gt; tests: comply with OBD_RACE() behavior&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: a3becc1092524366e13b3c72d61c6b0ea82709f7&lt;/p&gt;</comment>
                            <comment id="178262" author="pjones" created="Sat, 17 Dec 2016 14:50:47 +0000"  >&lt;p&gt;Landed for 2.10&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyewf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>