<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:15:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1279] failure trying to mount two targets at the same time after boot</title>
                <link>https://jira.whamcloud.com/browse/LU-1279</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After booting an OSS, two OSTs are mounted simultaneously.  The mounts fail due to module loading failure:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Lustre: OBD &lt;span class=&quot;code-keyword&quot;&gt;class &lt;/span&gt;driver, http:&lt;span class=&quot;code-comment&quot;&gt;//wiki.whamcloud.com/
&lt;/span&gt;Lustre:         Lustre Version: 2.1.1
Lustre:         Build Version: jenkins-gae03fc8-PRISTINE-2.6.32-220.4.2.el6_lustre.gcbb4fad.x86_64
Lustre: Lustre LU module (ffffffffa0578c60).
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol RQF_FLD_QUERY
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_server_pack
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_client_get
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol ptlrpc_queue_wait
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_fini
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_init
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_set
INFO: task hydra-agent:1590 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
&lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
hydra-agent   D 0000000000000000     0  1590      1 0x00000080
 ffff88003db09d68 0000000000000082 ffff88003d740a88 ffff88003bad0250
 ffff88003db09d68 ffffffff8113fb78 800000002c760065 0000000000000086
 ffff880037c1c678 ffff88003db09fd8 000000000000f4e8 ffff880037c1c678
Call Trace:
 [&amp;lt;ffffffff8113fb78&amp;gt;] ? vma_adjust+0x128/0x590
 [&amp;lt;ffffffff814ee35e&amp;gt;] __mutex_lock_slowpath+0x13e/0x180
 [&amp;lt;ffffffff814ee1fb&amp;gt;] mutex_lock+0x2b/0x50
 [&amp;lt;ffffffff810aaafd&amp;gt;] m_start+0x1d/0x40
 [&amp;lt;ffffffff81198cc0&amp;gt;] seq_read+0x90/0x3f0
 [&amp;lt;ffffffff811dae0e&amp;gt;] proc_reg_read+0x7e/0xc0
 [&amp;lt;ffffffff81176cb5&amp;gt;] vfs_read+0xb5/0x1a0
 [&amp;lt;ffffffff810d4582&amp;gt;] ? audit_syscall_entry+0x272/0x2a0
 [&amp;lt;ffffffff81176df1&amp;gt;] sys_read+0x51/0x90
 [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
INFO: task modprobe:1679 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
&lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
modprobe      D 0000000000000000     0  1679   1651 0x00000080
 ffff88002ed35aa8 0000000000000082 ffff88002ed35a58 ffffffff810097cc
 ffff88003ef260f8 0000000000000000 0000000000d35a68 ffff880002213b00
 ffff880037415a78 ffff88002ed35fd8 000000000000f4e8 ffff880037415a78
Call Trace:
 [&amp;lt;ffffffff810097cc&amp;gt;] ? __switch_to+0x1ac/0x320
 [&amp;lt;ffffffff814ecd0e&amp;gt;] ? thread_return+0x4e/0x760
 [&amp;lt;ffffffff814edb75&amp;gt;] schedule_timeout+0x215/0x2e0
 [&amp;lt;ffffffff8104c9e9&amp;gt;] ? __wake_up_common+0x59/0x90
 [&amp;lt;ffffffff814ed7f3&amp;gt;] wait_for_common+0x123/0x180
 [&amp;lt;ffffffff8105e7f0&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffff8108b741&amp;gt;] ? __queue_work+0x41/0x50
 [&amp;lt;ffffffff814ed90d&amp;gt;] wait_for_completion+0x1d/0x20
 [&amp;lt;ffffffff81089c90&amp;gt;] call_usermodehelper_exec+0xe0/0xf0
 [&amp;lt;ffffffffa04966d2&amp;gt;] ? lnet_startup_lndnis+0x262/0x6f0 [lnet]
 [&amp;lt;ffffffff81089feb&amp;gt;] __request_module+0x18b/0x210
 [&amp;lt;ffffffffa0498e00&amp;gt;] ? lnet_parse_networks+0x90/0x7e0 [lnet]
 [&amp;lt;ffffffffa041aa13&amp;gt;] ? cfs_alloc+0x63/0x90 [libcfs]
 [&amp;lt;ffffffffa04966d2&amp;gt;] lnet_startup_lndnis+0x262/0x6f0 [lnet]
 [&amp;lt;ffffffffa041aa13&amp;gt;] ? cfs_alloc+0x63/0x90 [libcfs]
 [&amp;lt;ffffffffa0496c85&amp;gt;] LNetNIInit+0x125/0x1f0 [lnet]
 [&amp;lt;ffffffffa06aa13a&amp;gt;] ? init_module+0x0/0x597 [ptlrpc]
 [&amp;lt;ffffffffa05f1c89&amp;gt;] ptlrpc_ni_init+0x29/0x170 [ptlrpc]
 [&amp;lt;ffffffff8105e7f0&amp;gt;] ? default_wake_function+0x0/0x20
 [&amp;lt;ffffffffa05f2053&amp;gt;] ptlrpc_init_portals+0x13/0xd0 [ptlrpc]
 [&amp;lt;ffffffffa06aa13a&amp;gt;] ? init_module+0x0/0x597 [ptlrpc]
 [&amp;lt;ffffffffa06aa21a&amp;gt;] init_module+0xe0/0x597 [ptlrpc]
 [&amp;lt;ffffffff81096d15&amp;gt;] ? __blocking_notifier_call_chain+0x65/0x80
 [&amp;lt;ffffffff8100204c&amp;gt;] do_one_initcall+0x3c/0x1d0
 [&amp;lt;ffffffff810af4e1&amp;gt;] sys_init_module+0xe1/0x250
 [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
INFO: task modprobe:1688 blocked &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; more than 120 seconds.
&lt;span class=&quot;code-quote&quot;&gt;&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot;&lt;/span&gt; disables &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; message.
modprobe      D 0000000000000000     0  1688   1687 0x00000080
 ffff88003d6d3eb8 0000000000000086 ffff88003d6d3e18 0000000000000082
 ffff88003d6d1ab8 ffff88003d6d3fd8 000000000000f4e8 ffff88003d6d1ac0
 ffff88003d6d1ab8 ffff88003d6d3fd8 000000000000f4e8 ffff88003d6d1ab8
Call Trace:
 [&amp;lt;ffffffff814f39dd&amp;gt;] ? kprobes_module_callback+0xdd/0x170
 [&amp;lt;ffffffff814ee35e&amp;gt;] __mutex_lock_slowpath+0x13e/0x180
 [&amp;lt;ffffffff81096d15&amp;gt;] ? __blocking_notifier_call_chain+0x65/0x80
 [&amp;lt;ffffffff814ee1fb&amp;gt;] mutex_lock+0x2b/0x50
 [&amp;lt;ffffffff810af533&amp;gt;] sys_init_module+0x133/0x250
 [&amp;lt;ffffffff8100b0f2&amp;gt;] system_call_fastpath+0x16/0x1b
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol req_capsule_server_get
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol ptlrpc_at_set_req_timeout
type=1305 audit(1333395888.750:31878): auid=4294967295 ses=4294967295 op=&lt;span class=&quot;code-quote&quot;&gt;&quot;remove rule&quot;&lt;/span&gt; key=(&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;) list=4 res=1
type=1305 audit(1333395888.750:31879): audit_enabled=0 old=1 auid=4294967295 ses=4294967295 res=1
readahead-collector: starting delayed service auditd
readahead-collector: sorting
readahead-collector: finished
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol ptlrpc_request_alloc_pack
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol RMF_FLD_OPC
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol ptlrpc_request_set_replen
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol RMF_FLD_MDFLD
fld: gave up waiting &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; init of module ptlrpc.
fld: Unknown symbol ptlrpc_req_finished
LustreError: 1679:0:(socklnd.c:2420:ksocknal_base_startup()) Can&apos;t spawn socknal scheduler[0]: -513
LustreError: 105-4: Error -100 starting up LNI tcp
LustreError: 1679:0:(events.c:728:ptlrpc_init_portals()) network initialisation failed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre-modules-2.1.1-2.6.32_220.4.2.el6_lustre.gcbb4fad.x86_64_gae03fc8.x86_64</environment>
        <key id="13847">LU-1279</key>
            <summary>failure trying to mount two targets at the same time after boot</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="hongchao.zhang">Hongchao Zhang</assignee>
                                    <reporter username="brian">Brian Murrell</reporter>
                        <labels>
                            <label>llnl</label>
                    </labels>
                <created>Mon, 2 Apr 2012 17:39:10 +0000</created>
                <updated>Mon, 27 Apr 2015 20:17:22 +0000</updated>
                            <resolved>Tue, 21 Oct 2014 19:36:32 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                    <version>Lustre 2.1.2</version>
                                    <fixVersion>Lustre 2.7.0</fixVersion>
                    <fixVersion>Lustre 2.5.4</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>24</watches>
                                                                            <comments>
                            <comment id="38732" author="apittman" created="Mon, 14 May 2012 11:43:46 +0000"  >&lt;p&gt;We are also seeing this issue in our 2.1.2 release testing, currently using the 6a9def181397a20c48b8f55ef4f6c29f2fc18ed6 commit on the b2_1 branch.&lt;/p&gt;</comment>
                            <comment id="39170" author="pjones" created="Tue, 22 May 2012 01:49:26 +0000"  >&lt;p&gt;Bob&lt;/p&gt;

&lt;p&gt;Oleg thinks that this is related to the port of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt;. Could you please look at it?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="39226" author="bogl" created="Tue, 22 May 2012 15:10:09 +0000"  >&lt;p&gt;Seems unlikely this is related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1166&quot; title=&quot;recovery never finished&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1166&quot;&gt;&lt;del&gt;LU-1166&lt;/del&gt;&lt;/a&gt; since it started happening before that change landed in b2_1.  Think the underlying cause is still not known.&lt;/p&gt;</comment>
                            <comment id="39234" author="apittman" created="Tue, 22 May 2012 15:53:27 +0000"  >&lt;p&gt;This can be provoked purely by loading the module.  If I just run modprobe lustre I see the following in the message file and the command returns very quickly.  Lustre version is a DDN build of b2_1.&lt;/p&gt;

&lt;p&gt;May 22 12:41:37 12k-3 kernel: Lustre: Lustre: Build Version: EXAScaler-build-ddn1.1--PRISTINE-2.6.32-220.13.1.el6_lustre20120521052707.x86_64&lt;br/&gt;
May 22 12:41:38 12k-3 kernel: Lustre: Added LNI 10.10.11.183@o2ib &lt;span class=&quot;error&quot;&gt;&amp;#91;8/64/0/180&amp;#93;&lt;/span&gt;&lt;br/&gt;
May 22 12:41:38 12k-3 kernel: Lustre: Lustre OSC module (ffffffffa0b56040).&lt;br/&gt;
May 22 12:41:38 12k-3 kernel: Lustre: Lustre LOV module (ffffffffa0bb6000).&lt;br/&gt;
May 22 12:41:38 12k-3 kernel: Lustre: Lustre client module (ffffffffa0c44ca0).&lt;/p&gt;

&lt;p&gt;If I try the following, using pdsh to spawn several local modprobe processes simultaneously however I see the following:&lt;br/&gt;
pdsh -R exec -w &lt;span class=&quot;error&quot;&gt;&amp;#91;0-10&amp;#93;&lt;/span&gt; modprobe lustre&lt;/p&gt;

&lt;p&gt;May 22 12:43:05 12k-3 kernel: Lustre: Lustre: Build Version: EXAScaler-build-ddn1.1--PRISTINE-2.6.32-220.13.1.el6_lustre20120521052707.x86_64&lt;br/&gt;
May 22 12:43:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:43:35 12k-3 kernel: fld: Unknown symbol RQF_FLD_QUERY&lt;br/&gt;
May 22 12:44:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:44:05 12k-3 kernel: fld: Unknown symbol req_capsule_server_pack&lt;br/&gt;
May 22 12:44:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:44:35 12k-3 kernel: fld: Unknown symbol req_capsule_client_get&lt;br/&gt;
May 22 12:45:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:45:05 12k-3 kernel: fld: Unknown symbol ptlrpc_queue_wait&lt;br/&gt;
May 22 12:45:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:45:35 12k-3 kernel: fld: Unknown symbol req_capsule_fini&lt;br/&gt;
May 22 12:46:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:46:05 12k-3 kernel: fld: Unknown symbol req_capsule_init&lt;br/&gt;
May 22 12:46:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:46:35 12k-3 kernel: fld: Unknown symbol req_capsule_set&lt;br/&gt;
May 22 12:47:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:47:05 12k-3 kernel: fld: Unknown symbol req_capsule_server_get&lt;br/&gt;
May 22 12:47:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:47:35 12k-3 kernel: fld: Unknown symbol ptlrpc_at_set_req_timeout&lt;br/&gt;
May 22 12:48:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:48:05 12k-3 kernel: fld: Unknown symbol ptlrpc_request_alloc_pack&lt;br/&gt;
May 22 12:48:35 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:48:35 12k-3 kernel: fld: Unknown symbol RMF_FLD_OPC&lt;br/&gt;
May 22 12:49:05 12k-3 kernel: fld: gave up waiting for init of module ptlrpc.&lt;br/&gt;
May 22 12:49:05 12k-3 kernel: fld: Unknown symbol ptlrpc_request_set_replen&lt;/p&gt;

&lt;p&gt;At this point I normally reboot the node however I left a system in this state overnight and it had resolved itself by this morning, I&apos;ve just triggered this bug again and will leave it in the bad state to see what happens and when.&lt;/p&gt;</comment>
                            <comment id="40582" author="bogl" created="Thu, 14 Jun 2012 13:14:48 +0000"  >&lt;p&gt;When I attempt to reproduce this I see many modprobe procs, all in the call sequence&lt;/p&gt;

&lt;p&gt;ptlrpc_ni_init -&amp;gt; LNetNIInit -&amp;gt; ... -&amp;gt; cfs_request_module -&amp;gt; call_usermodehelper_exec&lt;/p&gt;

&lt;p&gt;It seems like before the_lnet.ln_refcount is set non-zero later on in LNetNIInit() there&apos;s nothing to prevent lnet init from calling into the linux module load infrastructure multiple times for the same module load.   Seems to collide down in linux.&lt;/p&gt;

&lt;p&gt;Here&apos;s an example stack trace captured in /var/log/messages:&lt;/p&gt;


&lt;p&gt;Jun 14 09:32:47 centos24 kernel: Call Trace:&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8104da7c&amp;gt;&amp;#93;&lt;/span&gt; ? check_preempt_curr+0x7c/0x90&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff814ee255&amp;gt;&amp;#93;&lt;/span&gt; schedule_timeout+0x215/0x2e0&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81090d46&amp;gt;&amp;#93;&lt;/span&gt; ? autoremove_wake_function+0x16/0x40&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8104cab9&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up_common+0x59/0x90&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff814eded3&amp;gt;&amp;#93;&lt;/span&gt; wait_for_common+0x123/0x180&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105ea30&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8108b9e1&amp;gt;&amp;#93;&lt;/span&gt; ? __queue_work+0x41/0x50&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff814edfed&amp;gt;&amp;#93;&lt;/span&gt; wait_for_completion+0x1d/0x20&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81089f30&amp;gt;&amp;#93;&lt;/span&gt; call_usermodehelper_exec+0xe0/0xf0&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04d16d2&amp;gt;&amp;#93;&lt;/span&gt; ? lnet_startup_lndnis+0x262/0x6f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8108a28b&amp;gt;&amp;#93;&lt;/span&gt; __request_module+0x18b/0x210&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04d3e00&amp;gt;&amp;#93;&lt;/span&gt; ? lnet_parse_networks+0x90/0x7e0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0992a13&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_alloc+0x63/0x90 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04d16d2&amp;gt;&amp;#93;&lt;/span&gt; lnet_startup_lndnis+0x262/0x6f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0992a13&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_alloc+0x63/0x90 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04d1c85&amp;gt;&amp;#93;&lt;/span&gt; LNetNIInit+0x125/0x1f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;lnet&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa070713a&amp;gt;&amp;#93;&lt;/span&gt; ? init_module+0x0/0x597 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06501c9&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_ni_init+0x29/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8105ea30&amp;gt;&amp;#93;&lt;/span&gt; ? default_wake_function+0x0/0x20&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0650593&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_init_portals+0x13/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa070713a&amp;gt;&amp;#93;&lt;/span&gt; ? init_module+0x0/0x597 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa070721a&amp;gt;&amp;#93;&lt;/span&gt; init_module+0xe0/0x597 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81096fb5&amp;gt;&amp;#93;&lt;/span&gt; ? __blocking_notifier_call_chain+0x65/0x80&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100204c&amp;gt;&amp;#93;&lt;/span&gt; do_one_initcall+0x3c/0x1d0&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff810af891&amp;gt;&amp;#93;&lt;/span&gt; sys_init_module+0xe1/0x250&lt;br/&gt;
Jun 14 09:32:47 centos24 kernel: &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100b0f2&amp;gt;&amp;#93;&lt;/span&gt; system_call_fastpath+0x16/0x1b&lt;/p&gt;</comment>
                            <comment id="41803" author="bogl" created="Fri, 13 Jul 2012 11:56:15 +0000"  >&lt;p&gt;I&apos;m not sure there&apos;s much we can do about this problem.  The root cause is the linux kernel mutex used to serialize module loads, module_mutex.  The deadlock situation is this:&lt;/p&gt;

&lt;p&gt;a) modprobe lustre -&amp;gt; .... -&amp;gt; init_module(ptlrpc) -&amp;gt; LNetNIInit() -&amp;gt; modprobe ksocklnd&lt;/p&gt;

&lt;p&gt;The &apos;modprobe ksocklnd&apos; is blocked trying to get module_mutex in sys_init_module() -&amp;gt; mutex_lock_interruptible(module_mutex)&lt;/p&gt;

&lt;p&gt;ptlrpc init routine is blocked from finishing because ksocklnd load is blocked from starting by b) sequence trying to run concurrently.&lt;/p&gt;

&lt;p&gt;b) modprobe lustre -&amp;gt; load_module(?&lt;cite&gt;) (this thread owns module_mutex, obtained before entering load_module()) -&amp;gt; ... -&amp;gt; use_module(&lt;/cite&gt;?, ptlrpc) -&amp;gt; find_symbol_in_setup&lt;/p&gt;

&lt;p&gt;The b) sequence of modprobe loads holds the mutex, tries to lookup a ptlrpc symbol, can&apos;t due to ptlrpc never finishing init.  This will timeout after 30s, but the pattern will repeat multiple times apparently due to multiple symbol lookups.  If you wait long enough through all the multiple timeouts, eventually the a) sequence will finally finish.&lt;/p&gt;

&lt;p&gt;Without tampering inside linux and maybe causing more problems than we solve, I don&apos;t see any easy way around this.&lt;/p&gt;

&lt;p&gt;Perhaps there can be a workaround solution.  If lustre modules get preloaded in a guaranteed single stream fashion before trying to do mounts the problem could be avoided.  I&apos;m thinking of something like a .rc script executing &apos;modprobe lustre&apos; that would run before any mounts could happen, deployed only on lustre servers.&lt;/p&gt;</comment>
                            <comment id="41804" author="apittman" created="Fri, 13 Jul 2012 12:06:17 +0000"  >&lt;p&gt;Right now we have &quot;modprobe lustre&quot; in /etc/sysconfig/modules/exascaler.modules however the problems with this are two-fold, firstly we still hit the issue on install because we have to unload the module to set the lnet parameters, when we do this and then try and start the filesystem we hit the bug.  Secondly and more importantly IB networks tend not to be up that early in the boot process so the module loading fails anyway.  We could potentially add this to rc.local which stands a greater chance of working however I believe it would still be racy and could potentially fail, in which case we&apos;d hit this bug on filesystem startup.&lt;/p&gt;</comment>
                            <comment id="41815" author="bogl" created="Fri, 13 Jul 2012 13:30:23 +0000"  >&lt;p&gt;/etc/sysconfig/modules files execute out of /etc/rc.sysinit and are too early in the boot sequence, as you&apos;ve pointed out.  I&apos;m not sure I understand your objection to rc.local.  That does seem late enough in the boot sequence so that all the necessary devices &amp;amp; networks are started up.  Everything needed by lustre would be present by then.  Is that before you mount OSTs or start up a lustre fs?  If so that sounds to be a good place.&lt;/p&gt;

&lt;p&gt;On your other objection you say you need to manually stop &amp;amp; teardown lustre modules after an install in order to mod lnet setup.  If a manual command is used to stop it (lustre_rmmod?) can&apos;t you just do a manual cmdline &apos;modprobe lustre&apos; after changing the lnet params to bring it back up?  Then your node should be in a good state to start a filesystem.&lt;/p&gt;</comment>
                            <comment id="41816" author="bogl" created="Fri, 13 Jul 2012 13:39:56 +0000"  >&lt;p&gt;I almost hesitate to ask such a dumb question, but if this problem is triggered by mounting multiple OSTs at once why do that?  Just mount them one at a time and avoid the whole issue.&lt;/p&gt;</comment>
                            <comment id="41817" author="apittman" created="Fri, 13 Jul 2012 13:45:53 +0000"  >&lt;p&gt;rc.local might be OK but I have seen IB networks taking several minutes to startup properly after boot, especially if the switch is booting at the same time as the node.  I&apos;ll do a test.&lt;/p&gt;

&lt;p&gt;The filesystem is started by corosync normally which does it in parallel, we have little control over that.&lt;/p&gt;</comment>
                            <comment id="41819" author="bogl" created="Fri, 13 Jul 2012 13:59:03 +0000"  >&lt;p&gt;If your filesystems are started by corosync, then rc.local may be too late in the boot sequence.  S99local comes very late.  When does corosync service start?&lt;/p&gt;</comment>
                            <comment id="41820" author="apittman" created="Fri, 13 Jul 2012 14:04:55 +0000"  >&lt;p&gt;Corosync is configured to not attempt to start the filesystem until all network interfaces are up.&lt;/p&gt;</comment>
                            <comment id="41839" author="rread" created="Fri, 13 Jul 2012 20:09:41 +0000"  >&lt;p&gt;This is also an issue for Chroma (HYD-1263).  Is this a module dependency issue? Shouldn&apos;t the LNDs be loaded before ptlrpc and higher layers?&lt;/p&gt;</comment>
                            <comment id="41842" author="bogl" created="Fri, 13 Jul 2012 20:54:05 +0000"  >&lt;p&gt;To the best of my understanding this isn&apos;t strictly a module dependency issue.  For one thing as long as there is only a single set of modloads going they all complete and don&apos;t block each other.  Only when they are multiple and concurrent does this problem appear.  For another lnet doesn&apos;t load lnd modules based on dependencies.  It explicitly loads each needed lnd module with cfs_request_module() inside lnet_startup_lndnis().  That maps to kernel API request_module(), which forks off another userspace modprobe to load the lnd.&lt;/p&gt;

&lt;p&gt;That&apos;s exactly what&apos;s happening in the deadlock.  The module load of ptlrpc has completed the load_module() portion of its operation and has given up module_mutex.  In the course of running init_module() it calls LNetNIInit -&amp;gt; lnet_startup_lndnis.  lnet_startup_lndnis has forked a child to modprobe the lnd module (ksocklnd in my example, could be o2iblnd in the customer&apos;s config).  The lnd is blocked from loading due to concurrent modprobe holding module_mutex and waiting for ptlrpc init_module to complete.  Without a concurrent modprobe racing in &amp;amp; grabbing the mutex the lnd module load would complete, the ptlrpc init_module would complete, and later module loads that need globals in any of ptlrpc, lnet, and any lnd find them all present and available.&lt;/p&gt;</comment>
                            <comment id="41855" author="apittman" created="Sat, 14 Jul 2012 04:33:53 +0000"  >&lt;p&gt;One thing I could do is to make the filesystem agent attempt to modprobe ksocklnd and o2iblnd directly before loading the lustre module, it sounds like that might avoid the deadlock?&lt;/p&gt;</comment>
                            <comment id="41857" author="bogl" created="Sat, 14 Jul 2012 18:11:22 +0000"  >&lt;p&gt;yes, preloading ksocklnd and ko2iblnd might help.  As long as you only do one modprobe at a time you could even try having the filesystem agent do modprobe lustre and preload everything.  Only doing more than one or starting lustre mounts before modprobe completes is likely to trigger this problem as far as I can see.&lt;/p&gt;
</comment>
                            <comment id="42404" author="pjones" created="Fri, 27 Jul 2012 14:28:22 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="42427" author="bobijam" created="Mon, 30 Jul 2012 02:15:14 +0000"  >&lt;p&gt;would you please try &lt;a href=&quot;http://review.whamcloud.com/3471&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3471&lt;/a&gt; to see if that could be of any help?&lt;/p&gt;</comment>
                            <comment id="42516" author="bogl" created="Tue, 31 Jul 2012 16:57:16 +0000"  >&lt;p&gt;Tried the suggested patch from &lt;a href=&quot;http://review.whamcloud.com/#change,3471&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,3471&lt;/a&gt; in current b2_1.  Doesn&apos;t help at all.  Still fails exactly the same.&lt;/p&gt;</comment>
                            <comment id="42605" author="bobijam" created="Thu, 2 Aug 2012 12:44:37 +0000"  >&lt;p&gt;Bob, &lt;/p&gt;

&lt;p&gt;I have a question here, in the scenario you described&lt;/p&gt;

&lt;p&gt;a) modprobe lustre -&amp;gt; .... -&amp;gt; init_module(ptlrpc) -&amp;gt; LNetNIInit() -&amp;gt; modprobe ksocklnd&lt;/p&gt;

&lt;p&gt;init_module(ptlrpc) which will take module_mutex lock and calls load_module(ptlrpc) -&amp;gt; ... -&amp;gt; LNetNIInit() -&amp;gt; modprobe ksocklnd&lt;/p&gt;

&lt;p&gt;will itself hinder ksocklnd module&apos;s loading since init_module(ptlrpc) already hold the module_mutex?&lt;/p&gt;

&lt;p&gt;&amp;lt;edit&amp;gt; I got the answer here, when ptlrpc module get to LNetNIInit(), the module_mutex has already been unlocked, and it will not hinder ksocklnd module&apos;s loading. &lt;/p&gt;</comment>
                            <comment id="42669" author="pjones" created="Fri, 3 Aug 2012 12:04:34 +0000"  >&lt;p&gt;Andreas&lt;/p&gt;

&lt;p&gt;Any advice to offer here?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="42670" author="bogl" created="Fri, 3 Aug 2012 12:07:46 +0000"  >&lt;p&gt;Bobijam,&lt;br/&gt;
  Yes, that&apos;s what I saw in the linux code.  sys_init_module() holds module_mutex while doing load_module() but drops it before going to the init code in the loaded module.&lt;/p&gt;</comment>
                            <comment id="42681" author="adilger" created="Fri, 3 Aug 2012 14:13:55 +0000"  >&lt;p&gt;We used to do some modprobes from mount.lustre in the past, and this seems like a better place to do it than the system init scripts (which may not even have any Lustre configs in them if the RPMs were just installed for the first time).&lt;/p&gt;

&lt;p&gt;I believe the reason that the LNET code is calling up to userspace to load the LND modules is because kernel-side module loading was removed at some point. I don&apos;t like that as a general solution, and would prefer if the LND module loading was also done by mount.lustre, but that would require also parsing /etc/modprobe.d/{something} to find out the LNET routing config, but this file does not necessarily have a single name - we&apos;d have to try multiple names and may still get it incorrect. &lt;/p&gt;

&lt;p&gt;For now, until the LNET config project is done, let&apos;s ignore the LNET modules, and just do a single &quot;modprobe lustre&quot; from within mount.lustre. It would be useful to have some sort of exclusion in userspace (e.g. checking if the module is loaded yet) but it is still racy, and I don&apos;t want to make this more complex than it needs to be, since exclusion in userspace is much harder to do.&lt;/p&gt;

&lt;p&gt;Failing that, we could add serialization inside the lustre module init to ensure that there is no conflicts down at the lower level. That might even be less complex than trying to do it in userspace?&lt;/p&gt;</comment>
                            <comment id="42721" author="bobijam" created="Mon, 6 Aug 2012 06:08:13 +0000"  >&lt;p&gt;Bob, &lt;/p&gt;

&lt;p&gt;The original ptlrpc module initializer failed at loading ksocklnd module init, which should eventually get the module mutex, and the original log shows that it failed for another reason: the kernel thread start failed with -513 (-ERESTARTNOINTR). What could the signal be, is it a timeout signal?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LustreError: 1679:0:(socklnd.c:2420:ksocknal_base_startup()) Can&apos;t spawn socknal scheduler[0]: -513
LustreError: 105-4: Error -100 starting up LNI tcp
LustreError: 1679:0:(events.c:728:ptlrpc_init_portals()) network initialisation failed
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="42741" author="bogl" created="Mon, 6 Aug 2012 12:46:31 +0000"  >&lt;p&gt;Bobijam,&lt;/p&gt;

&lt;p&gt;I don&apos;t know the exact mechanism of the failure, but I do think it is probably a timeout generated signal.  My guess is that it&apos;s a timeout in user level code around the sys_init_module(&quot;ksocklnd&quot;) syscall in the user level modprobe command.  If the syscall takes too long the timeout fires and interrupts the syscall.   That&apos;s just a guess as I don&apos;t have a view of the modprobe command code.  There may be some kernel level mechanism limiting the max time spent in a given syscall that I&apos;m just not aware of.&lt;/p&gt;

&lt;p&gt;In the other leg of the deadlock the thread holding module_mutex is failing lookups of global symbols from ptlrpc with a 30s timeout on each one.  This means it won&apos;t give up module_mutex for a very long time.&lt;/p&gt;</comment>
                            <comment id="42742" author="bogl" created="Mon, 6 Aug 2012 12:56:54 +0000"  >&lt;p&gt;Sorry, I see I misread your previous comment.  In all the deadlocks I took a close look at ksocklnd never got loaded and so never got as far as ksocknal_startup() -&amp;gt; ksocknal_base_startup().  Still think it&apos;s probably a timeout based signal, but have less idea of where it&apos;s coming from.&lt;/p&gt;</comment>
                            <comment id="46687" author="npearl" created="Wed, 17 Oct 2012 16:59:18 +0000"  >&lt;p&gt;This also has unpleasant results for HA clusters.  I have found that putting modprobe ptlrpc in rc.local&lt;br/&gt;
effectively works around the problem.  &lt;/p&gt;</comment>
                            <comment id="46698" author="adilger" created="Wed, 17 Oct 2012 21:54:22 +0000"  >&lt;p&gt;Bobijam, can you please add a call to mount_lustre.c like:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        system(&lt;span class=&quot;code-quote&quot;&gt;&quot;modprobe ptlrpc 2&amp;gt;&amp;amp;1 /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;&quot;&lt;/span&gt;);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;or something that will just run silently in the background and not print any error message.&lt;/p&gt;</comment>
                            <comment id="46718" author="brian" created="Thu, 18 Oct 2012 07:03:10 +0000"  >&lt;p&gt;Will adding this to mount_lustre.c actually resolve the problem though given that the original description of this bug says that it&apos;s multiple calls to mount that are causing the problem?  IOW, isn&apos;t this just moving the race/deadlock to somewhere else in the stack?&lt;/p&gt;</comment>
                            <comment id="46720" author="apittman" created="Thu, 18 Oct 2012 07:15:29 +0000"  >&lt;p&gt;&quot;modprobe lustre&quot; wouldn&apos;t solve the problem but &quot;modprobe ptlrpc&quot; should as it avoids the case where the module loading tries to pull in another module further down the chain.&lt;/p&gt;

&lt;p&gt;We added modprobe ksocklnd ; modprobe ko2iblnd to our HA agent and do not see this issue any more.&lt;/p&gt;</comment>
                            <comment id="47510" author="bobijam" created="Wed, 7 Nov 2012 01:59:33 +0000"  >&lt;p&gt;master patch tracking at  &lt;a href=&quot;http://review.whamcloud.com/4292&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4292&lt;/a&gt; + &lt;a href=&quot;http://review.whamcloud.com/4449&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4449&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="47516" author="bobijam" created="Wed, 7 Nov 2012 03:10:53 +0000"  >&lt;p&gt;b2_1 port at &lt;a href=&quot;http://review.whamcloud.com/4488&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4488&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="47544" author="green" created="Wed, 7 Nov 2012 17:28:06 +0000"  >&lt;p&gt;Please note that this patch introduces a problem now with lustre clients.&lt;br/&gt;
If you have no lustre modules loaded at all and do mount.lustre before the patch, lustre modules get loaded as needed from mount syscall.&lt;/p&gt;

&lt;p&gt;Now with the patch in, the mount loads ptlrpc.ko before the mount syscall, which loads parts of the lustre stack, but not all the way to the client, and then when it comes to mount syscall time, mounting actually fails. (real complaint from ORNL)&lt;/p&gt;

&lt;p&gt;So we need to address this too either by improving the way we detect client mounts and loading lustre from obd_mount.c somewhere or other means.&lt;/p&gt;</comment>
                            <comment id="47555" author="adilger" created="Wed, 7 Nov 2012 21:00:09 +0000"  >&lt;p&gt;One simple option might be to add another line:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        rc = system(&lt;span class=&quot;code-quote&quot;&gt;&quot;/sbin/modprobe lustre &amp;gt;/dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; 2&amp;gt;&amp;amp;1&quot;&lt;/span&gt;);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;after the current ptlrpc line?  That should avoid the missing Lustre filesystem code on the client.  The root of the problem is that the &quot;shared&quot; mount handling is done in obdclass/mount.c or similar, which provides the &quot;lustre&quot; filesystem type, but this is not enough to mount the client.  If no modules are loaded, the missing &quot;lustre&quot; filesystem type would otherwise force the &quot;lustre&quot; module to be loaded and properly set up the stack.&lt;/p&gt;

&lt;p&gt;It may be with 2.4 that we no longer need this &quot;shared&quot; mount handling in obdclass, and can move it back to llite, but I&apos;m not sure.&lt;/p&gt;</comment>
                            <comment id="47576" author="simmonsja" created="Thu, 8 Nov 2012 08:28:49 +0000"  >&lt;p&gt;Oleg is describing the problems I was experiencing testing master on our cray systems. So our compute nodes were depending on the mount syscall loading all the proper modules for us. We attempted to manually do a modprobe lustre but only some of the modules were loaded. For the busted setup here are the modules loaded:&lt;/p&gt;

&lt;p&gt;mgc                    76893  0 &lt;br/&gt;
lustre                913092  0 &lt;br/&gt;
lov                   648104  1 lustre&lt;br/&gt;
osc                   466313  1 lov&lt;br/&gt;
mdc                   202643  1 lustre&lt;br/&gt;
fid                    70712  1 mdc&lt;br/&gt;
fld                    89516  1 fid&lt;br/&gt;
xpmem                  28383  0 &lt;br/&gt;
kgnilnd               522013  1 &lt;br/&gt;
ptlrpc               1307091  7 mgc,lustre,lov,osc,mdc,fid,fld&lt;br/&gt;
obdclass             1304282  83 mgc,lustre,lov,osc,mdc,fid,fld,ptlrpc&lt;br/&gt;
lnet                  384132  4 lustre,kgnilnd,ptlrpc,obdclass&lt;br/&gt;
lvfs                   37066  9 mgc,lustre,lov,osc,mdc,fid,fld,ptlrpc,obdclass&lt;br/&gt;
sha1_generic            2367  0 &lt;br/&gt;
md5                     2181  0 &lt;br/&gt;
libcfs                391846  12 mgc,lustre,lov,osc,mdc,fid,fld,kgnilnd,ptlrpc,obdclass,lnet,lvfs&lt;br/&gt;
ib_core                  759  0 &lt;span class=&quot;error&quot;&gt;&amp;#91;permanent&amp;#93;&lt;/span&gt;&lt;br/&gt;
kdreg                  17949  0 &lt;br/&gt;
gpcd_gem               10275  0 &lt;br/&gt;
ipogif_gem             13077  0 &lt;br/&gt;
kgni_gem              274252  2 kgnilnd,gpcd_gem&lt;br/&gt;
hwerr                  60686  0 &lt;br/&gt;
rca                   362621  5 &lt;br/&gt;
hss_os                  4592  2 hwerr,rca&lt;br/&gt;
heartbeat               4230  1 rca&lt;br/&gt;
simplex                 2636  2 hwerr,rca&lt;br/&gt;
ghal_gem               84908  6 gpcd_gem,ipogif_gem,kgni_gem,hwerr,rca,hss_os&lt;br/&gt;
cgm                     8376  2 kgni_gem,ghal_gem&lt;br/&gt;
craytrace               6476  0 &lt;/p&gt;




</comment>
                            <comment id="47586" author="simmonsja" created="Thu, 8 Nov 2012 11:05:42 +0000"  >&lt;p&gt;With patch reverted I get this list of loaded modules:&lt;/p&gt;

&lt;p&gt;xpmem                  28383  0 &lt;br/&gt;
lmv                   314649  1 &lt;br/&gt;
mgc                    76893  1 &lt;br/&gt;
lustre                913092  29 &lt;br/&gt;
lov                   648104  16 lustre&lt;br/&gt;
osc                   466313  43 lov&lt;br/&gt;
mdc                   202643  2 lustre&lt;br/&gt;
fid                    70712  1 mdc&lt;br/&gt;
fld                    89516  2 lmv,fid&lt;br/&gt;
kgnilnd               522013  1 &lt;br/&gt;
ptlrpc               1307091  8 lmv,mgc,lustre,lov,osc,mdc,fid,fld&lt;br/&gt;
obdclass             1304353  106 lmv,mgc,lustre,lov,osc,mdc,fid,fld,ptlrpc&lt;br/&gt;
lnet                  384132  4 lustre,kgnilnd,ptlrpc,obdclass&lt;br/&gt;
lvfs                   37066  10 lmv,mgc,lustre,lov,osc,mdc,fid,fld,ptlrpc,obdclass&lt;br/&gt;
sha1_generic            2367  0 &lt;br/&gt;
md5                     2181  0 &lt;br/&gt;
libcfs                391846  13 lmv,mgc,lustre,lov,osc,mdc,fid,fld,kgnilnd,ptlrpc,obdclass,lnet,lvfs&lt;br/&gt;
ib_core                  759  0 &lt;span class=&quot;error&quot;&gt;&amp;#91;permanent&amp;#93;&lt;/span&gt;&lt;br/&gt;
kdreg                  17949  0 &lt;br/&gt;
gpcd_gem               10275  0 &lt;br/&gt;
ipogif_gem             13077  0 &lt;br/&gt;
kgni_gem              274252  2 kgnilnd,gpcd_gem&lt;br/&gt;
hwerr                  60686  0 &lt;br/&gt;
rca                   362621  5 &lt;br/&gt;
hss_os                  4592  2 hwerr,rca&lt;br/&gt;
heartbeat               4230  1 rca&lt;br/&gt;
simplex                 2636  2 hwerr,rca&lt;br/&gt;
ghal_gem               84908  6 gpcd_gem,ipogif_gem,kgni_gem,hwerr,rca,hss_os&lt;br/&gt;
cgm                     8376  2 kgni_gem,ghal_gem&lt;br/&gt;
craytrace               6476  0&lt;/p&gt;</comment>
                            <comment id="47594" author="green" created="Thu, 8 Nov 2012 12:18:52 +0000"  >&lt;p&gt;Andreas, actually modprobe lustre might be a wrong way to do it. After all this might be a server mount and there&apos;s no need for lustre clients modules to be present then.&lt;/p&gt;</comment>
                            <comment id="47595" author="simmonsja" created="Thu, 8 Nov 2012 12:30:10 +0000"  >&lt;p&gt;Agree. On cray computes the modules are loaded at boot time only. After the boot is complete the image then removes all the modules and turns off modprobe. You can&apos;t modprobe lustre during run time. In that case a compute node could only mount lustre at boot and not during run time.&lt;/p&gt;</comment>
                            <comment id="48829" author="adilger" created="Wed, 5 Dec 2012 17:04:45 +0000"  >&lt;p&gt;I&apos;m planning to revert the &quot;modprobe ptlrpc&quot; line to avoid the &quot;mount&quot; problem.&lt;/p&gt;

&lt;p&gt;It might be possible to fix this by adding explicit modprobe lines for the LNDs based on the MGS NID type as it is parsed by mount_lustre.c, like:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;convert_hostnames()
:
:
        (void)system(&lt;span class=&quot;code-quote&quot;&gt;&quot;modprobe %s &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; 2&amp;gt;&amp;amp;1&quot;&lt;/span&gt;,
                     libcfs_lnd2modname(libcfs_str2lnd(s1));
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This isn&apos;t a perfect solution if there is routing involved since only the MGS LND module would be loaded and that may not match the LND type used by the client, but that is somewhat a corner case.  It is likely to fix the problem for many environments, and will at least not break things like the current patch does.&lt;/p&gt;

&lt;p&gt;The issue blocking a proper fix is that the lnet modules are loaded based on modprobe options, but the module options are not in a specific file, so mount would have to hunt and peck to fix this.  It definitely makes sense to have the simplified LNET config fix this properly by loading the actual modules needed on the client.&lt;/p&gt;</comment>
                            <comment id="48831" author="adilger" created="Wed, 5 Dec 2012 17:14:52 +0000"  >&lt;p&gt;I&apos;ve pushed &lt;a href=&quot;http://review.whamcloud.com/4750&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4750&lt;/a&gt; to revert the patch &lt;a href=&quot;http://review.whamcloud.com/4292&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4292&lt;/a&gt; and &lt;a href=&quot;http://review.whamcloud.com/4449&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4449&lt;/a&gt; fixup.  Hopefully that will allow &lt;a href=&quot;http://review.whamcloud.com/4727&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4727&lt;/a&gt; to pass testing.&lt;/p&gt;</comment>
                            <comment id="48999" author="adilger" created="Mon, 10 Dec 2012 13:26:35 +0000"  >&lt;p&gt;The patch to revert ptlrpc loading is underway, but someone else needs to make the patch to load the individual LND modules at mount time. &lt;/p&gt;</comment>
                            <comment id="62303" author="hongchao.zhang" created="Mon, 15 Jul 2013 16:00:56 +0000"  >&lt;p&gt;the patch which checks and load the specified LNDs in mount.lustre is underway.&lt;/p&gt;</comment>
                            <comment id="62483" author="hongchao.zhang" created="Wed, 17 Jul 2013 14:39:29 +0000"  >&lt;p&gt;the patch to load individual LND modules at mount time is tracked at &lt;a href=&quot;http://review.whamcloud.com/#/c/7024/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7024/&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="62486" author="hongchao.zhang" created="Wed, 17 Jul 2013 14:54:45 +0000"  >&lt;p&gt;if modules can&apos;t be loaded after boot time(see above), it can be fixed by adding &quot;modprobe lustre; lctl net load&quot; to rc.local with the patch&lt;/p&gt;</comment>
                            <comment id="62491" author="brian" created="Wed, 17 Jul 2013 15:29:42 +0000"  >&lt;p&gt;If loading modules at boot time is the solution, it should be done in its own initscript (or integrating into the distro&apos;s native boot-time module loading infrastructure) but not rc.local.  rc.local should be left alone for the sysadmin to use, remove, etc. as he wishes.&lt;/p&gt;
                            <comment id="62520" author="hongchao.zhang" created="Thu, 18 Jul 2013 00:33:06 +0000"  >&lt;p&gt;yes, it can be put into other initscripts, rc.local is just one of the possible ones.&lt;/p&gt;</comment>
                            <comment id="62613" author="bryon" created="Fri, 19 Jul 2013 14:36:46 +0000"  >&lt;p&gt;This is needed for &lt;a href=&quot;https://jira.hpdd.intel.com/browse/HYD-2311&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://jira.hpdd.intel.com/browse/HYD-2311&lt;/a&gt;.  Could we try to get this landed for 2.5 please?&lt;/p&gt;</comment>
                            <comment id="64942" author="hongchao.zhang" created="Fri, 23 Aug 2013 08:28:50 +0000"  >&lt;p&gt;the patch is updated&lt;/p&gt;</comment>
                            <comment id="69266" author="paf" created="Fri, 18 Oct 2013 00:03:36 +0000"  >&lt;p&gt;I think you may find that this patch is not sufficient to allow parallel mounting of multiple targets.  We&apos;ve recently observed other issues with that same operation.&lt;/p&gt;

&lt;p&gt;Cray uses a script to mount all OSTs at the same time - The standard mount -t lustre commands are issued in parallel.&lt;/p&gt;

&lt;p&gt;Recently, we&apos;ve started occasionally noticing that the fsfilt_ldiskfs module sometimes fails to load.&lt;br/&gt;
The error comes from fsfilt_get_ops, which loads the module like this:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!(rc = cfs_request_module(&lt;span class=&quot;code-quote&quot;&gt;&quot;%s&quot;&lt;/span&gt;, name))) {
                        fs_ops = fsfilt_search_type(type);
                        CDEBUG(D_INFO, &lt;span class=&quot;code-quote&quot;&gt;&quot;Loaded module &lt;span class=&quot;code-quote&quot;&gt;&apos;%s&apos;&lt;/span&gt;\n&quot;&lt;/span&gt;, name);
                        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!fs_ops)
                                rc = -ENOENT;
                }


                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (rc) {
                        CERROR(&lt;span class=&quot;code-quote&quot;&gt;&quot;Can&apos;t find %s &lt;span class=&quot;code-keyword&quot;&gt;interface&lt;/span&gt;\n&quot;&lt;/span&gt;, name);
                        RETURN(ERR_PTR(rc &amp;lt; 0 ? rc : -rc));
                        /* u
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So when the module load returns a zero RC (i.e. reports success), it checks to see if the module was really loaded with fsfilt_search_type (it checks for a pointer to something from the module).&lt;br/&gt;
If that check fails, ENOENT is set.  It&apos;s converted to something else and ends up back at userspace as something like &quot;ENOTSUPPORTED&quot;, but this is the source of it.&lt;/p&gt;

&lt;p&gt;We get the &quot;Can&apos;t find fsfilt_ldiskfs interface&quot; message, and the mount fails.&lt;/p&gt;

&lt;p&gt;In investigating this, I switched to using an instrumented version of modprobe to see what step was failing.&lt;/p&gt;

&lt;p&gt;I&apos;ve logged a number of mount attempts with a four OST OSS, some of which failed with the error above but most of which succeeded.&lt;/p&gt;

&lt;p&gt;I&apos;ll attach the log from it, but here&apos;s an example of what I found, when logging a (successful, in this case) Lustre mount:&lt;br/&gt;
pid 28287 Modprobe started.&lt;br/&gt;
pid 28287: modulearg is: osd-ldiskfs&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/net/lustre/libcfs.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/lvfs.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/net/lustre/lnet.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/obdclass.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/ptlrpc.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/fld.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/fid.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/mdd.ko result: 1&lt;br/&gt;
pid 28287: init_module failed, error: -1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/kernel/fs/jbd2/jbd2.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/lquota.ko result: 1&lt;br/&gt;
pid 28287: init_module failed, error: -1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/kernel/fs/mbcache.ko result: 1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre-ldiskfs/ldiskfs.ko result: 1&lt;br/&gt;
pid 28287: init_module failed, error: -1&lt;br/&gt;
pid 28287: File open of file /lib/modules/2.6.32-358.6.1.el6.x86_64/updates/kernel/fs/lustre/osd_ldiskfs.ko result: 1&lt;br/&gt;
pid 28287: init_module failed, error: -1&lt;/p&gt;

&lt;p&gt;Modulearg is the module name modprobe was called with.  It then attempts to load any dependencies, and finally the module given in the argument.&lt;br/&gt;
The pid is the pid of the particular instance of modprobe, then the file being opened is the module it is trying to load.&lt;br/&gt;
When you see &quot;init_module&quot; failed, that is a failure to init the module listed on the &quot;File open of file&quot; line above.&lt;br/&gt;
In the attached log file, mounts of Lustre are separated by &quot;Mount starting&quot; and &quot;Mount complete&quot;.&lt;/p&gt;

&lt;p&gt;There is a separate modprobe of osd-ldiskfs which succeeded as part of that mount.&lt;/p&gt;

&lt;p&gt;If you look through the attached file, you will see many modules failing to init on and off on various mounts, and various module inits being attempted more than once.&lt;br/&gt;
Often, fsfilt_ldiskfs will fail to init one or more times, but the mount will still succeed.&lt;br/&gt;
It&apos;s also worth noting that even when the mount fails, fsfilt_ldiskfs is init&apos;ed successfully at least once.&lt;br/&gt;
This suggests that possibly the check that the module is loaded is racing with the module load itself.&lt;/p&gt;

&lt;p&gt;modprobe fsfilt_ldiskfs before doing the mount avoids the mount failure.&lt;/p&gt;</comment>
                            <comment id="69286" author="hongchao.zhang" created="Fri, 18 Oct 2013 15:13:53 +0000"  >&lt;p&gt;is the error -1 (-EPERM) the actual error returned by &quot;init_module&quot;?&lt;/p&gt;

&lt;p&gt;the modules could be loaded more than once in &quot;class_get_type&quot; (obdclass/genops.c) if the previous loads failed.&lt;/p&gt;

&lt;p&gt;btw, which mount is the failed one in the attached log?&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="69290" author="paf" created="Fri, 18 Oct 2013 15:36:49 +0000"  >&lt;p&gt;I&apos;m not sure it necessarily represents -EPERM, but -1 is the return value from init_module:&lt;br/&gt;
This is the code from modprobe (inside the function &apos;insmod&apos;, around line 690 in modprobe.c) , with the debug line I added:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        ret = init_module(module-&amp;gt;data, module-&amp;gt;len, optstring);
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ret != 0) {
                fprintf(debug_fp,&lt;span class=&quot;code-quote&quot;&gt;&quot;pid %d: init_module failed, error: %d\n&quot;&lt;/span&gt;,(&lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt;) pid, ret);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The fourth mount in the file is the one which failed (line 596 is one of the fsfilt_ldiskfs failures from that mount, note there are three failures and one success for fsfilt_ldiskfs in that mount).&lt;/p&gt;</comment>
                            <comment id="69292" author="paf" created="Fri, 18 Oct 2013 15:46:04 +0000"  >&lt;p&gt;Full source of module-init-tools-3.9.tar.gz, with lightly instrumented modprobe.c.&lt;/p&gt;

&lt;p&gt;This version of modprobe (built just by doing configure, then make, and then found in the /build/ directory) creates and appends debug output to a file called /tmp/modprobe_output&lt;/p&gt;

&lt;p&gt;It&apos;s made from a version of module-init-tools very close to that used with CentOS 6.4 and can be dropped in in place of the existing modprobe in /sbin/modprobe&lt;/p&gt;</comment>
                            <comment id="69293" author="paf" created="Fri, 18 Oct 2013 15:46:42 +0000"  >&lt;p&gt;I&apos;ve also attached the module-init-tools source with my instrumented modprobe.  See the note on the attachment for more details.&lt;/p&gt;

&lt;p&gt;One further note - This ticket gives the affected version as 2.1.  We&apos;re seeing the related issue I described above in 2.4.1.&lt;/p&gt;</comment>
                            <comment id="69319" author="adilger" created="Fri, 18 Oct 2013 18:47:57 +0000"  >&lt;p&gt;It is important to note that the fsfilt code that is loading the fsfilt_ldiskfs module will be deleted in Lustre 2.6.  The code is almost obsolete in Lustre 2.5, but the patch to remove it (&lt;a href=&quot;http://review.whamcloud.com/5512&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5512&lt;/a&gt;) didn&apos;t get landed before the feature freeze.  Any fixes made in this area (e.g. having mount.lustre load fsfilt_ldiskfs for ldiskfs servers, or putting a mutex around the probe/load/check code) does not need to be landed for Lustre 2.5 or later.&lt;/p&gt;

&lt;p&gt;I&apos;d prefer to just land 5512 to b2_5 to delete the affected code to fix this problem instead.  The patch may be large, but is mostly just moving code around so that lvfs and fsfilt can be deleted.&lt;/p&gt;</comment>
                            <comment id="69322" author="simmonsja" created="Fri, 18 Oct 2013 19:11:11 +0000"  >&lt;p&gt;Patrick can you try &lt;a href=&quot;http://review.whamcloud.com/5512&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5512&lt;/a&gt; to see if the fsfilt issues go away for you?&lt;/p&gt;</comment>
                            <comment id="69323" author="paf" created="Fri, 18 Oct 2013 19:14:23 +0000"  >&lt;p&gt;James:&lt;/p&gt;

&lt;p&gt;Possibly.  I have yet to replicate those on a system where I can install arbitrary Lustre code &lt;span class=&quot;error&quot;&gt;&amp;#91;it&amp;#39;s only happening on some of our systems, seems very timing dependent&amp;#93;&lt;/span&gt;, but I&apos;ll give it a try.  (Replication first of all...)&lt;/p&gt;</comment>
                            <comment id="69327" author="spitzcor" created="Fri, 18 Oct 2013 19:58:32 +0000"  >&lt;p&gt;I&apos;m sure that #5512 will workaround this problem, but isn&apos;t there a larger issue to resolve?  Patrick noted that modules like mdd and lquota also fail in the same manner.  Should we investigate fixing racy module load for all modules?&lt;/p&gt;</comment>
                            <comment id="69339" author="paf" created="Fri, 18 Oct 2013 21:36:40 +0000"  >&lt;p&gt;James - I just tried briefly and I&apos;m not able to replicate the issue on a test system with &amp;gt;2 OSTs/OSS (we&apos;re seeing it on our &apos;production&apos; development systems, rather than our dedicated Lustre testing systems).&lt;/p&gt;

&lt;p&gt;But as Cory said, it seems obvious that #5512 would fix the problem.  If the module does not exist, loading it can hardly be a problem. &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/wink.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;Still, I&apos;m worried we might just be dodging this issue and leaving others waiting to be discovered, as this is not the only module load failure in parallel mounts.  It&apos;s just the only one causing a noticeable problem now.  Unless we can point to a clear difference in how the other modules are loaded that shows why they&apos;re not failing the mount when one of their inits fails, I think we may see problems there in the future.&lt;/p&gt;</comment>
                            <comment id="69385" author="hongchao.zhang" created="Mon, 21 Oct 2013 09:47:05 +0000"  >&lt;p&gt;the &quot;-1&quot; should be the error returned by the syscall &quot;sys_init_module&quot;, and is -EPERM&lt;br/&gt;
there are two conditions to return -EPERM, &lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;SYSCALL_DEFINE3(init_module, void __user *, umod,
                unsigned &lt;span class=&quot;code-object&quot;&gt;long&lt;/span&gt;, len, &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;char&lt;/span&gt; __user *, uargs)
{
        struct module *mod;
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; ret = 0;

        &lt;span class=&quot;code-comment&quot;&gt;/* Must have permission */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!capable(CAP_SYS_MODULE) || modules_disabled)
                &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -EPERM;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the &quot;modules_disabled&quot; is controlled by &quot;/proc/sys/kernel/modules_disabled&quot;, I remembered that you mentioned Cray will disable module load after boot completes,&lt;br/&gt;
is it implemented this way? and is there any chance &quot;/proc/sys/kernel/modules_disabled&quot; is set while issuing the parallel mount commands?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;</comment>
                            <comment id="69398" author="paf" created="Mon, 21 Oct 2013 14:34:07 +0000"  >&lt;p&gt;Hongchao,&lt;/p&gt;

&lt;p&gt;A very good guess.  In this case, no, module loading is definitely not disabled.  We disable module load on the compute nodes of our mainframes, but this is not those nodes (which are heavily stripped down because their OS runs exclusively in memory).  This problem is seen on an external CentOS 6.4 system running Lustre.  In essence, it&apos;s just vanilla CentOS 6.4 with Lustre installed on it - No significant changes to the OS or settings.&lt;/p&gt;

&lt;p&gt;I&apos;d encourage you to test target mounts in parallel with the modified modprobe with multiple OSTs on a test system of your own - You may not see the outright mount failure we observe, but I suspect you will see a number of -1s returned.&lt;/p&gt;</comment>
                            <comment id="70038" author="paf" created="Mon, 28 Oct 2013 17:34:43 +0000"  >&lt;p&gt;Further thoughts...  There are various other possible sources for a -1.  I don&apos;t think it&apos;s EPERM here.&lt;/p&gt;

&lt;p&gt;From the init_module syscall in the kernel:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        /* Do all the hard work */
        mod = load_module(umod, len, uargs);
        if (IS_ERR(mod)) {
                mutex_unlock(&amp;amp;module_mutex);
                return PTR_ERR(mod);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We also have:&lt;br/&gt;
ret = do_one_initcall(mod-&amp;gt;init);&lt;/p&gt;

&lt;p&gt;And I suspect it&apos;s coming from one of those.&lt;/p&gt;</comment>
                            <comment id="70593" author="hongchao.zhang" created="Mon, 4 Nov 2013 10:18:41 +0000"  >&lt;p&gt;There is an issue in the kernel related to loading modules in parallel: the loads that follow the first one do not wait for the module to be initialized,&lt;br/&gt;
which causes problems such as the filesystem type not being found because the corresponding module has not yet been initialized. (Please see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3975&quot; title=&quot;Race loading ldiskfs with parallel mounts&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3975&quot;&gt;&lt;del&gt;LU-3975&lt;/del&gt;&lt;/a&gt; for details.)&lt;/p&gt;

&lt;p&gt;This issue could be a duplicate of it; could you please try with it?&lt;/p&gt;

&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="72426" author="adilger" created="Wed, 27 Nov 2013 19:17:19 +0000"  >&lt;p&gt;If there is already a fix in the upstream kernel (&lt;a href=&quot;http://thread.gmane.org/gmane.linux.kernel/1358707/focus=1358709&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://thread.gmane.org/gmane.linux.kernel/1358707/focus=1358709&lt;/a&gt;), we should just apply this patch to the RHEL/SLES kernels that we support, until such a time that they backport the fix themselves.  It doesn&apos;t make sense to do anything at the Lustre level if this is not a Lustre bug.&lt;/p&gt;</comment>
                            <comment id="88584" author="hongchao.zhang" created="Wed, 9 Jul 2014 13:47:21 +0000"  >&lt;p&gt;The patch &lt;a href=&quot;http://review.whamcloud.com/#/c/7024/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/7024/&lt;/a&gt; has been updated, but it conflicts with the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/9832/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9832/&lt;/a&gt;&lt;br/&gt;
in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2456&quot; title=&quot;Dynamic LNet Config Main Development Work&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2456&quot;&gt;&lt;del&gt;LU-2456&lt;/del&gt;&lt;/a&gt;; I will update it again once the patch &lt;a href=&quot;http://review.whamcloud.com/#/c/9832/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9832/&lt;/a&gt; has landed.&lt;/p&gt;
                            <comment id="90132" author="liwei" created="Mon, 28 Jul 2014 02:32:13 +0000"  >&lt;p&gt;Nowadays, libcfs suffers from the problem as well, since it has become yet another module that results in additional request_module() calls in its init callback.  (Ideally, we should avoid this kind of init callbacks in Lustre.)  RHEL 7 &lt;em&gt;should&lt;/em&gt; (to be tested) have enough fixes so that these deadlocks will no longer happen.  IMHO, it is better to just carry a kernel fix in the RHEL 6 patch series than to go down the hacky road of changing mount.lustre.  Here is a patch I&apos;m waiting to test on a larger scale: &lt;a href=&quot;http://review.whamcloud.com/11229&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11229&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="91160" author="liwei" created="Fri, 8 Aug 2014 01:38:57 +0000"  >&lt;p&gt;As discussed with Bob when reviewing the RHEL 6 patch, we might also want to check if a separate patch is needed for SLES kernels as well.&lt;/p&gt;</comment>
                            <comment id="96903" author="pjones" created="Tue, 21 Oct 2014 19:36:32 +0000"  >&lt;p&gt;This has never (to my knowledge) been reported on SLES, but reported from multiple sources on RHEL 6.x, so I think it is reasonable to mark this as resolved for 2.5.4 and 2.7 based on Li Wei&apos;s fix having landed. If this is ever seen on SLES then we can track that issue under a new ticket.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10120">
                    <name>Blocker</name>
                                            <outwardlinks description="is blocking">
                                                        </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="22237">LU-4311</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="21040">LU-3975</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="27740">LU-5961</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="21040">LU-3975</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="25063">LU-5159</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="15616">LU-2456</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="13653" name="modprobe_log.txt" size="94534" author="paf" created="Fri, 18 Oct 2013 00:05:38 +0000"/>
                            <attachment id="13654" name="module-init-tools-3.9.tar.gz" size="1119549" author="paf" created="Fri, 18 Oct 2013 15:46:04 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv38v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4025</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>