<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:17:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1484] Test failure on test suite recovery-small, subtest test_57</title>
                <link>https://jira.whamcloud.com/browse/LU-1484</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for yujian &amp;lt;yujian@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/743bea58-af48-11e1-a585-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/743bea58-af48-11e1-a585-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_57 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;== recovery-small test 57: read procfs entries causes kernel crash =================================== 05:43:48 (1338900228)&lt;br/&gt;
fail_loc=0x80000B00&lt;br/&gt;
Stopping client client-28vm6.lab.whamcloud.com /mnt/lustre (opts&lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;

&lt;p&gt;test failed to respond and timed out&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: recovery-small 57&lt;/p&gt;</description>
                <environment>&lt;br/&gt;
Lustre Tag: v2_1_2_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/87/&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/87/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64&lt;br/&gt;
Network: TCP (1GigE)&lt;br/&gt;
</environment>
        <key id="14749">LU-1484</key>
            <summary>Test failure on test suite recovery-small, subtest test_57</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="utopiabound">Nathaniel Clark</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 5 Jun 2012 23:40:02 +0000</created>
                <updated>Thu, 18 Apr 2013 18:14:29 +0000</updated>
                            <resolved>Thu, 21 Feb 2013 15:04:05 +0000</resolved>
                                    <version>Lustre 2.3.0</version>
                    <version>Lustre 2.1.2</version>
                    <version>Lustre 2.1.3</version>
                    <version>Lustre 2.1.4</version>
                    <version>Lustre 1.8.8</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                    <fixVersion>Lustre 2.1.5</fixVersion>
                    <fixVersion>Lustre 1.8.9</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>14</watches>
                                                                            <comments>
                            <comment id="40069" author="yujian" created="Tue, 5 Jun 2012 23:47:57 +0000"  >&lt;p&gt;Console log on Client 4 (client-28vm6) showed that:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;05:43:48:Lustre: DEBUG MARKER: == recovery-small test 57: read procfs entries causes kernel crash =================================== 05:43:48 (1338900228)
05:43:53:LustreError: 28843:0:(fail.c:126:__cfs_fail_timeout_set()) cfs_fail_timeout id b00 sleeping for 10000ms
05:44:01:LustreError: 28843:0:(fail.c:130:__cfs_fail_timeout_set()) cfs_fail_timeout id b00 awake
05:46:17:INFO: task lctl:28843 blocked for more than 120 seconds.
05:46:17:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
05:46:18:lctl          D 0000000000001000     0 28843  28631                     (NOTLB)
05:46:18: ffff8100517dfe38 0000000000000086 ffffffff800cfa4c ffff810037d56d40
05:46:18: 0000000000000282 0000000000000007 ffff810067000080 ffff8100668c47e0
05:46:18: 000005e137f5c0e1 000000000001b643 ffff810067000268 0000000000000001
05:46:18:Call Trace:
05:46:18: [&amp;lt;ffffffff800cfa4c&amp;gt;] zone_statistics+0x3e/0x6d
05:46:18: [&amp;lt;ffffffff8000f40b&amp;gt;] __alloc_pages+0x78/0x308
05:46:19: [&amp;lt;ffffffff8006468c&amp;gt;] __down_read+0x7a/0x92
05:46:19: [&amp;lt;ffffffff888d90e2&amp;gt;] :obdclass:lprocfs_fops_read+0x82/0x1e0
05:46:19: [&amp;lt;ffffffff8010ab77&amp;gt;] proc_reg_read+0x7e/0x99
05:46:19: [&amp;lt;ffffffff8000b721&amp;gt;] vfs_read+0xcb/0x171
05:46:19: [&amp;lt;ffffffff80011d15&amp;gt;] sys_read+0x45/0x6e
05:46:19: [&amp;lt;ffffffff8005d28d&amp;gt;] tracesys+0xd5/0xe0
05:46:19:
05:46:19:INFO: task umount:28863 blocked for more than 120 seconds.
05:46:22:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
05:46:24:umount        D ffff810002536420     0 28863  28862                     (NOTLB)
05:46:24: ffff810057f81828 0000000000000082 0000000000000000 0000000100000002
05:46:24: 0000000000000000 0000000000000007 ffff8100668c47e0 ffffffff80319b60
05:46:24: 000005e137f5e08c 0000000000001fab ffff8100668c49c8 0000000000000000
05:46:24:Call Trace:
05:46:24: [&amp;lt;ffffffff80064cb5&amp;gt;] __reacquire_kernel_lock+0x2e/0x47
05:46:24: [&amp;lt;ffffffff80063171&amp;gt;] wait_for_completion+0x79/0xa2
05:46:24: [&amp;lt;ffffffff8008ee74&amp;gt;] default_wake_function+0x0/0xe
05:46:24: [&amp;lt;ffffffff8010dcdb&amp;gt;] remove_proc_entry+0xfb/0x1c7
05:46:25: [&amp;lt;ffffffff888d7603&amp;gt;] :obdclass:lprocfs_remove+0x103/0x130
05:46:25: [&amp;lt;ffffffff888d6a46&amp;gt;] :obdclass:lprocfs_free_stats+0x1e6/0x230
05:46:25: [&amp;lt;ffffffff888d7a1f&amp;gt;] :obdclass:lprocfs_obd_cleanup+0x6f/0x80
05:46:28: [&amp;lt;ffffffff88b7ca32&amp;gt;] :osc:osc_precleanup+0x292/0x370
05:46:28: [&amp;lt;ffffffff888ff13c&amp;gt;] :obdclass:lu_context_fini+0x1c/0x50
05:46:28: [&amp;lt;ffffffff888e303f&amp;gt;] :obdclass:class_cleanup+0xc6f/0xe30
05:46:28: [&amp;lt;ffffffff888e6d8c&amp;gt;] :obdclass:class_process_config+0x1e5c/0x3200
05:46:28: [&amp;lt;ffffffff888e97f7&amp;gt;] :obdclass:class_manual_cleanup+0xad7/0xe80
05:46:28: [&amp;lt;ffffffff8002ff6f&amp;gt;] __up_write+0x27/0xf2
05:46:30: [&amp;lt;ffffffff88bba26c&amp;gt;] :lov:lov_putref+0xb0c/0xb90
05:46:30: [&amp;lt;ffffffff88bc2b98&amp;gt;] :lov:lov_disconnect+0x308/0x3e0
05:46:30: [&amp;lt;ffffffff88c66d94&amp;gt;] :lustre:client_common_put_super+0x894/0xed0
05:46:30: [&amp;lt;ffffffff88c676e5&amp;gt;] :lustre:ll_put_super+0x195/0x310
05:46:30: [&amp;lt;ffffffff800f079e&amp;gt;] invalidate_inodes+0xce/0xe0
05:46:30: [&amp;lt;ffffffff800e77ab&amp;gt;] generic_shutdown_super+0x79/0xfb
05:46:30: [&amp;lt;ffffffff800e787b&amp;gt;] kill_anon_super+0x9/0x35
05:46:30: [&amp;lt;ffffffff800e792c&amp;gt;] deactivate_super+0x6a/0x82
05:46:31: [&amp;lt;ffffffff800f1e8b&amp;gt;] sys_umount+0x245/0x27b
05:46:31: [&amp;lt;ffffffff800ba767&amp;gt;] audit_syscall_entry+0x1a8/0x1d3
05:46:31: [&amp;lt;ffffffff8005d28d&amp;gt;] tracesys+0xd5/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;For Lustre 2.1.1, we also hit recovery-small test 57 hanging, but it&apos;s another failure on client: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1097&quot; title=&quot;Oops: EIP is at osc_rd_lockless_truncate+0xd/0x30 [osc]&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1097&quot;&gt;&lt;del&gt;LU-1097&lt;/del&gt;&lt;/a&gt;. &lt;/p&gt;</comment>
                            <comment id="40071" author="yujian" created="Wed, 6 Jun 2012 01:32:58 +0000"  >&lt;p&gt;Another instance:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/18995f38-aafb-11e1-b191-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/18995f38-aafb-11e1-b191-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;And here are the historical reports with status &quot;TIMEOUT&quot; on Maloo:&lt;br/&gt;
&lt;a href=&quot;http://tinyurl.com/cym3do3&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://tinyurl.com/cym3do3&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="42138" author="pjones" created="Mon, 23 Jul 2012 15:32:32 +0000"  >&lt;p&gt;Bobijam&lt;/p&gt;

&lt;p&gt;Could you please look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="42182" author="bobijam" created="Tue, 24 Jul 2012 03:24:38 +0000"  >&lt;p&gt;there&apos;s a deadlock here: (!HAVE_PROCFS_USER &amp;amp;&amp;amp; HAVE_PROCFS_DELETED)&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;proc reader                                    proc remover
proc_reg_read()                                LPROCFS_WRITE_ENTRY() // down_write _lprocfs_lock semaphore  -------&amp;gt; (1)
   pdeaux-&amp;gt;pde_users++
   lprocfs_fops_read()
      LPROCFS_ENTRY_AND_CHECK() // down_read  _lprocfs_lock semaphore, wait here        ---------------------------&amp;gt; (2)
                                               remove_proc_entry()
                                                  if (pdeaux-&amp;gt;pde_users &amp;gt; 0)
                                                      wait_for_completion()    ------------------------------------&amp;gt; (3)
...
  pde_users_dec() // pdeaux-&amp;gt;pde_users--, complete()      ---------------------------------------------------------- (4)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the issue is when remover get to (3), the proc reader will wait at (2), while proc remover cannot move on until proc reader reachs (4), a deadlock ensues.&lt;/p&gt;</comment>
                            <comment id="42187" author="bobijam" created="Tue, 24 Jul 2012 05:10:37 +0000"  >&lt;p&gt;patch tracking at &lt;a href=&quot;http://review.whamcloud.com/3455&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3455&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;lprocfs: fix a deadlock&lt;/p&gt;

&lt;p&gt;There is a deadlock between proc reader and proc remover.&lt;/p&gt;

&lt;p&gt;  &lt;b&gt;proc reader&lt;/b&gt;                  &lt;b&gt;proc remover&lt;/b&gt;&lt;br/&gt;
                              LPROCFS_WRITE_ENTRY()  -----&amp;gt; (1)&lt;br/&gt;
proc_reg_read()&lt;br/&gt;
  pdeaux-&amp;gt;pde_users++&lt;br/&gt;
  lprocfs_fops_read()&lt;br/&gt;
    LPROCFS_ENTRY_AND_CHECK() // wait semaphore      &amp;lt;----- (2)&lt;br/&gt;
                              remove_proc_entry()&lt;br/&gt;
                                if (pdeaux-&amp;gt;pde_users &amp;gt; 0)&lt;br/&gt;
                                   wait_for_completion() -&amp;gt; (3)&lt;br/&gt;
...&lt;br/&gt;
  pde_users_dec() // pdeaux-&amp;gt;pde_users-&lt;del&gt;, complete() &amp;lt;&lt;/del&gt;---- (4)&lt;/p&gt;

&lt;p&gt;when remover gets to (3), the proc reader will wait at (2), while&lt;br/&gt;
proc remover cannot move on until proc reader reaches (4), a deadlock&lt;br/&gt;
ensues.&lt;/p&gt;</comment>
                            <comment id="42229" author="bobijam" created="Tue, 24 Jul 2012 23:01:32 +0000"  >&lt;p&gt;update patch&lt;/p&gt;

&lt;p&gt;lprocfs: refine LC_PROCFS_USERS check&lt;/p&gt;

&lt;p&gt;In some RHEL patched 2.6.18 kernels, pde_users member is added in&lt;br/&gt;
another struct proc_dir_entry_aux instead of in struct proc_dir_entry&lt;br/&gt;
in later kernel version of 2.6.23.&lt;/p&gt;</comment>
                            <comment id="42331" author="pjones" created="Thu, 26 Jul 2012 10:26:31 +0000"  >&lt;p&gt;Landed for 2.3&lt;/p&gt;</comment>
                            <comment id="43072" author="yujian" created="Mon, 13 Aug 2012 02:20:16 +0000"  >&lt;p&gt;Lustre Tag: v2_1_3_RC1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/113/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/113/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64 (kernel version: 2.6.18-308.11.1.el5)&lt;br/&gt;
Network: TCP (1GigE)&lt;/p&gt;

&lt;p&gt;The same issue exists in Lustre 2.1.3: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/89731b4c-e415-11e1-b6d3-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/89731b4c-e415-11e1-b6d3-52540035b04c&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Will the patch for this ticket be cherry-picked/ported to b2_1 branch?&lt;/p&gt;</comment>
                            <comment id="43079" author="bobijam" created="Mon, 13 Aug 2012 03:10:23 +0000"  >&lt;p&gt;b2_1 patch tracking at &lt;a href=&quot;http://review.whamcloud.com/3471&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/3471&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="47774" author="bergwolf" created="Wed, 14 Nov 2012 03:22:25 +0000"  >&lt;p&gt;With commit 76bf16d1e12cd3c2d2f48a31e3e6c1ad66523638 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-1484&quot; title=&quot;Test failure on test suite recovery-small, subtest test_57&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-1484&quot;&gt;&lt;del&gt;LU-1484&lt;/del&gt;&lt;/a&gt; lprocfs: refine LC_PROCFS_USERS check) for the ticket, I got following build errors with RHEL 2.6.18-274.12.1.el5.i686 kernel.&lt;/p&gt;

&lt;p&gt;  CC &lt;span class=&quot;error&quot;&gt;&amp;#91;M&amp;#93;&lt;/span&gt;  /home/bergwolf/src/lustre-testing/libcfs/libcfs/linux/linux-tracefile.o&lt;br/&gt;
In file included from /home/bergwolf/src/lustre-testing/libcfs/include/libcfs/libcfs.h:322,&lt;br/&gt;
                 from /home/bergwolf/src/lustre-testing/libcfs/libcfs/linux/linux-tracefile.c:38:&lt;br/&gt;
/home/bergwolf/src/lustre-testing/libcfs/include/libcfs/params_tree.h:107:2: error: #error proc_dir_entry-&amp;gt;deleted is conflicted with proc_dir_entry-&amp;gt;pde_users&lt;br/&gt;
make&lt;span class=&quot;error&quot;&gt;&amp;#91;6&amp;#93;&lt;/span&gt;: *** &lt;span class=&quot;error&quot;&gt;&amp;#91;/home/bergwolf/src/lustre-testing/libcfs/libcfs/linux/linux-tracefile.o&amp;#93;&lt;/span&gt; Error 1&lt;br/&gt;
make&lt;span class=&quot;error&quot;&gt;&amp;#91;5&amp;#93;&lt;/span&gt;: *** &lt;span class=&quot;error&quot;&gt;&amp;#91;/home/bergwolf/src/lustre-testing/libcfs/libcfs&amp;#93;&lt;/span&gt; Error 2&lt;br/&gt;
make&lt;span class=&quot;error&quot;&gt;&amp;#91;4&amp;#93;&lt;/span&gt;: *** &lt;span class=&quot;error&quot;&gt;&amp;#91;/home/bergwolf/src/lustre-testing/libcfs&amp;#93;&lt;/span&gt; Error 2&lt;/p&gt;</comment>
                            <comment id="47775" author="bergwolf" created="Wed, 14 Nov 2012 03:23:29 +0000"  >&lt;p&gt;config.h shows that both HAVE_PROCFS_DELETED and HAVE_PROCFS_USERS are defined.&lt;/p&gt;

&lt;p&gt;460 /* kernel has deleted member in procfs entry struct */&lt;br/&gt;
461 #define HAVE_PROCFS_DELETED 1&lt;br/&gt;
462&lt;br/&gt;
463 /* kernel has pde_users member in proc_dir_entry_aux */&lt;br/&gt;
464 #define HAVE_PROCFS_USERS 1&lt;/p&gt;</comment>
                            <comment id="47887" author="schamp" created="Thu, 15 Nov 2012 16:52:44 +0000"  >&lt;p&gt;Building b2_1, I have verified that the RHEL 5.8 updates 2.6.18-308.11.1.el5 and 2.6.18-308.16.1.el5 include both proc_dir_entry.deleted and proc_dir_entry_aux.pde_users.  As Peng reported, this leads to&lt;/p&gt;

&lt;p&gt;  libcfs/include/libcfs/params_tree.h:107:2: error: #error proc_dir_entry-&amp;gt;deleted is conflicted with proc_dir_entry-&amp;gt;pde_user&lt;/p&gt;

&lt;p&gt;anytime the source for one of these RHEL 5.8 updates is used for --with-linux=.&lt;/p&gt;</comment>
                            <comment id="47888" author="schamp" created="Thu, 15 Nov 2012 17:23:05 +0000"  >&lt;p&gt;Follow up in &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2334&quot; title=&quot;build errors with 2.6.18 kernel&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2334&quot;&gt;&lt;del&gt;LU-2334&lt;/del&gt;&lt;/a&gt; build errors with 2.6.18 kernel&lt;/p&gt;</comment>
                            <comment id="48975" author="yujian" created="Mon, 10 Dec 2012 09:19:16 +0000"  >&lt;p&gt;Lustre Branch: b2_1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/148&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/148&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64 (kernel version: 2.6.18-308.20.1.el5)&lt;/p&gt;

&lt;p&gt;The same issue occurred again on recovery-small test 57:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== recovery-small test 57: read procfs entries causes kernel crash =================================== 19:13:57 (1354936437)
fail_loc=0x80000B00
CMD: fat-intel-3vm6.lab.whamcloud.com grep -c /mnt/lustre&apos; &apos; /proc/mounts
Stopping client fat-intel-3vm6.lab.whamcloud.com /mnt/lustre (opts:)
CMD: fat-intel-3vm6.lab.whamcloud.com lsof -t /mnt/lustre
CMD: fat-intel-3vm6.lab.whamcloud.com umount  /mnt/lustre 2&amp;gt;&amp;amp;1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Console log on client (fat-intel-3vm6):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;19:14:02:Lustre: DEBUG MARKER: == recovery-small test 57: read procfs entries causes kernel crash =================================== 19:13:57 (1354936437)
19:14:02:Lustre: DEBUG MARKER: grep -c /mnt/lustre&apos; &apos; /proc/mounts
19:14:02:LustreError: 9173:0:(fail.c:133:__cfs_fail_timeout_set()) cfs_fail_timeout id b00 sleeping for 10000ms
19:14:02:Lustre: DEBUG MARKER: lsof -t /mnt/lustre
19:14:02:Lustre: DEBUG MARKER: umount /mnt/lustre 2&amp;gt;&amp;amp;1
19:14:13:LustreError: 9173:0:(fail.c:137:__cfs_fail_timeout_set()) cfs_fail_timeout id b00 awake
19:16:57:INFO: task lctl:9173 blocked for more than 120 seconds.
19:16:57:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
19:16:57:lctl          D 0000000000001000     0  9173   8913                     (NOTLB)
19:16:57: ffff810044841e38 0000000000000086 ffffffff800cfa80 ffff810037d56d40
19:16:57: 0000000000000282 0000000000000007 ffff81004de4f860 ffff81005f2120c0
19:16:57: 00002a16f848196c 0000000000020344 ffff81004de4fa48 0000000000000001
19:16:57:Call Trace:
19:16:57: [&amp;lt;ffffffff800cfa80&amp;gt;] zone_statistics+0x3e/0x6d
19:16:58: [&amp;lt;ffffffff8000f47b&amp;gt;] __alloc_pages+0x78/0x308
19:16:58: [&amp;lt;ffffffff8006468c&amp;gt;] __down_read+0x7a/0x92
19:16:58: [&amp;lt;ffffffff889170c2&amp;gt;] :obdclass:lprocfs_fops_read+0x82/0x200
19:16:58: [&amp;lt;ffffffff8010b73e&amp;gt;] proc_reg_read+0x7e/0x99
19:16:58: [&amp;lt;ffffffff8000b72f&amp;gt;] vfs_read+0xcb/0x171
19:16:58: [&amp;lt;ffffffff80011d85&amp;gt;] sys_read+0x45/0x6e
19:16:58: [&amp;lt;ffffffff8005d28d&amp;gt;] tracesys+0xd5/0xe0
19:16:58:
19:16:58:INFO: task umount:9195 blocked for more than 120 seconds.
19:16:58:&quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
19:16:58:umount        D ffff810002536420     0  9195   9194                     (NOTLB)
19:16:58: ffff810058613a08 0000000000000086 00000000ffffffff 0000000000000020
19:16:58: 00000000ffffffff 0000000000000007 ffff81005f2120c0 ffffffff8031ab60
19:16:58: 00002a16f8483cb6 000000000000234a ffff81005f2122a8 0000000000000000
19:16:58:Call Trace:
19:16:58: [&amp;lt;ffffffff80064cb5&amp;gt;] __reacquire_kernel_lock+0x2e/0x47
19:16:58: [&amp;lt;ffffffff80063171&amp;gt;] wait_for_completion+0x79/0xa2
19:16:58: [&amp;lt;ffffffff8008ee97&amp;gt;] default_wake_function+0x0/0xe
19:16:58: [&amp;lt;ffffffff8010e8a2&amp;gt;] remove_proc_entry+0xfb/0x1c7
19:16:58: [&amp;lt;ffffffff88915573&amp;gt;] :obdclass:lprocfs_remove+0x103/0x130
19:16:58: [&amp;lt;ffffffff889159d0&amp;gt;] :obdclass:lprocfs_obd_cleanup+0x90/0xa0
19:16:58: [&amp;lt;ffffffff88caf665&amp;gt;] :osc:osc_precleanup+0x2e5/0x3a0
19:16:58: [&amp;lt;ffffffff88920c35&amp;gt;] :obdclass:class_cleanup+0xc55/0xda0
19:16:58: [&amp;lt;ffffffff889241f6&amp;gt;] :obdclass:class_process_config+0x1b46/0x2cc0
19:16:58: [&amp;lt;ffffffff88926a0b&amp;gt;] :obdclass:class_manual_cleanup+0x9bb/0xd70
19:16:58: [&amp;lt;ffffffff88d03c5d&amp;gt;] :lov:lov_putref+0xa7d/0xaf0
19:16:58: [&amp;lt;ffffffff88cfe623&amp;gt;] :lov:lov_del_target+0x6d3/0x720
19:16:58: [&amp;lt;ffffffff88d0b78b&amp;gt;] :lov:lov_disconnect+0x39b/0x440
19:16:58: [&amp;lt;ffffffff88de73ea&amp;gt;] :lustre:client_common_put_super+0x83a/0xe10
19:16:58: [&amp;lt;ffffffff88de7d15&amp;gt;] :lustre:ll_put_super+0x1a5/0x330
19:16:58: [&amp;lt;ffffffff800f120a&amp;gt;] invalidate_inodes+0xce/0xe0
19:16:58: [&amp;lt;ffffffff800e78ae&amp;gt;] generic_shutdown_super+0x79/0xfb
19:16:58: [&amp;lt;ffffffff800e797e&amp;gt;] kill_anon_super+0x9/0x35
19:16:58: [&amp;lt;ffffffff800e7a2f&amp;gt;] deactivate_super+0x6a/0x82
19:16:58: [&amp;lt;ffffffff800f28f7&amp;gt;] sys_umount+0x245/0x27b
19:16:58: [&amp;lt;ffffffff800ba78a&amp;gt;] audit_syscall_entry+0x1a8/0x1d3
19:16:58: [&amp;lt;ffffffff8005d28d&amp;gt;] tracesys+0xd5/0xe0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Maloo report: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/a59d9126-41bc-11e2-a653-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/a59d9126-41bc-11e2-a653-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="49020" author="bobijam" created="Mon, 10 Dec 2012 21:27:51 +0000"  >&lt;p&gt;In Lustre Branch: b2_1&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/148&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/148&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64 (kernel version: 2.6.18-308.20.1.el5)&lt;/p&gt;

&lt;p&gt;In rhel5 kernel build log&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;checking if kernel has pde_users member in procfs entry struct... no&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;this test result does not match the supposed result from 2.6.18-308.20.1.el5 kernel source, my local test shows&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;checking if kernel has pde_users member in procfs entry struct... yes&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;and build failed as Stephen pointed out:&lt;/p&gt;

&lt;blockquote&gt;&lt;p&gt;/root/work/lustre.clone/libcfs/include/libcfs/params_tree.h:113:2: error: #error proc_dir_entry-&amp;gt;deleted is conflicted with proc_dir_entry-&amp;gt;pde_users&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;This means there is something wrong about the RHEL5 kernel build.&lt;/p&gt;

&lt;p&gt;Also the hung stack also reveals that proc_dir_entry_aux::pre_users is used.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;remove_proc_entry() in 2.6.18-308.20.1.el5 source&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                &lt;span class=&quot;code-comment&quot;&gt;/* Wait until all existing callers into module are done. */&lt;/span&gt;
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (pdeaux-&amp;gt;pde_users &amp;gt; 0) {
                        DECLARE_COMPLETION_ONSTACK(c);
                        pdeaux = to_pde_aux(de);
                        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (!pdeaux-&amp;gt;pde_unload_completion)
                                pdeaux-&amp;gt;pde_unload_completion = &amp;amp;c;

                        spin_unlock(&amp;amp;pdeaux-&amp;gt;pde_unload_lock);
                        spin_unlock(&amp;amp;proc_subdir_lock);

                        wait_for_completion(pdeaux-&amp;gt;pde_unload_completion);

                        spin_lock(&amp;amp;proc_subdir_lock);
                        &lt;span class=&quot;code-keyword&quot;&gt;goto&lt;/span&gt; continue_removing;
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I&apos;ll update a patch to make 2.6.18-308.20.1.el5 build pass.&lt;/p&gt;</comment>
                            <comment id="49021" author="bobijam" created="Mon, 10 Dec 2012 22:08:46 +0000"  >&lt;p&gt;b2_1 patch handling build error for 2.6.18-308 RHEL5 kernel is tracked at &lt;a href=&quot;http://review.whamcloud.com/4794&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4794&lt;/a&gt; &lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LU-1484 kernel: pass RHEL5 build for 2.6.18-308

For vanilla kernel, proc_dir_entry::deleted and ::pde_users co-exists
from 2.6.23 to 2.6.23.17.

For some RHEL5 kernels, it defines co-existings
proc_dir_entry::deleted and proc_dir_entry_aux::pde_users.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="49028" author="bobijam" created="Tue, 11 Dec 2012 01:42:20 +0000"  >&lt;p&gt;strangely the RHEL5 build in &lt;a href=&quot;http://build.whamcloud.com/job/lustre-reviews/11128/arch=x86_64,build_type=server,distro=el5,ib_stack=inkernel/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-reviews/11128/arch=x86_64,build_type=server,distro=el5,ib_stack=inkernel/&lt;/a&gt; still shows that pde_users test failed. &lt;/p&gt;

&lt;p&gt;Chris Gearing, could you help me check out why RHEL5 (b2_1) build does not detect pde_users members? Thanks.&lt;/p&gt;

&lt;p&gt;Check items:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;it is defined in fs/proc/internal.h, in proc_dir_entry_aux structure.&lt;/li&gt;
	&lt;li&gt;lustre/autoconf/lusre-core.m4 checked this member in LC_PROCFS_USERS&lt;/li&gt;
	&lt;li&gt;after configure-ed, the config.h contains &quot;#define HAVE_PROCFS_DELETED 1&quot;&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="49454" author="pjones" created="Wed, 19 Dec 2012 11:40:33 +0000"  >&lt;p&gt;Landed for 2.1.4. RHEL5 is not supported in 2.4 so this latest change is not needed to master&lt;/p&gt;</comment>
                            <comment id="49600" author="yujian" created="Sat, 22 Dec 2012 09:33:17 +0000"  >&lt;p&gt;Lustre Tag: v2_1_4_RC2&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b2_1/164&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b2_1/164&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64&lt;/p&gt;

&lt;p&gt;The issue occurred again: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/baaad7ac-4c1d-11e2-875d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/baaad7ac-4c1d-11e2-875d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="49811" author="yujian" created="Mon, 31 Dec 2012 02:42:32 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/236/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/236/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64&lt;br/&gt;
Test Group: failover&lt;/p&gt;

&lt;p&gt;The same issue occurred: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/e6be996c-51b5-11e2-a904-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/e6be996c-51b5-11e2-a904-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="49872" author="chris" created="Thu, 3 Jan 2013 10:36:34 +0000"  >&lt;p&gt;Zhenyu Xu:&lt;/p&gt;

&lt;p&gt;I&apos;m not sure what you are asking of me, but perhaps you could tell me what this line indicates?&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;after configure-ed, the config.h contains &lt;span class=&quot;code-quote&quot;&gt;&quot;#define HAVE_PROCFS_DELETED 1&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;which config.h file is this?&lt;/p&gt;</comment>
                            <comment id="49916" author="bobijam" created="Thu, 3 Jan 2013 20:42:53 +0000"  >&lt;ul&gt;
	&lt;li&gt;pde_users should be defined in fs/proc/internal.h, in proc_dir_entry_aux structure.&lt;/li&gt;
	&lt;li&gt;lustre/autoconf/lusre-core.m4 should has checked this member in LC_PROCFS_USERS&lt;/li&gt;
	&lt;li&gt;after configure-ed, the config.h generated under lustre build root directory should contains &quot;#define HAVE_PROCFS_DELETED 1&quot; and &quot;#define HAVE_PROCFS_USERS 1&quot;&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="50039" author="yujian" created="Sun, 6 Jan 2013 07:26:12 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/236/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/236/&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64&lt;/p&gt;

&lt;p&gt;The same issue occurred again: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/6ed434e6-57ca-11e2-9cc9-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/6ed434e6-57ca-11e2-9cc9-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50117" author="chris" created="Tue, 8 Jan 2013 08:53:49 +0000"  >&lt;p&gt;I presume this is a build issue,&lt;/p&gt;

&lt;p&gt;If I look at the latest b1_8 head build on server then &lt;/p&gt;

&lt;p&gt;BUILD/BUILD/lustre-1.8.8.60/config.h&lt;/p&gt;

&lt;p&gt;does have &lt;/p&gt;

&lt;p&gt;#define HAVE_PROCFS_DELETED 1&lt;/p&gt;

&lt;p&gt;but&lt;/p&gt;

&lt;p&gt;/* kernel has pde_users member in procfs entry struct */&lt;br/&gt;
/* #undef HAVE_PROCFS_USERS */&lt;/p&gt;

&lt;p&gt;I&apos;ve attached the config file and the config.log&lt;/p&gt;</comment>
                            <comment id="50177" author="bobijam" created="Tue, 8 Jan 2013 20:11:31 +0000"  >&lt;p&gt;I need port relevant patches to b1_8 branch then. it&apos;s tracked at &lt;a href=&quot;http://review.whamcloud.com/4976&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4976&lt;/a&gt;&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
LU-1484 lprocfs: refine LC_PROCFS_USERS check

In some RHEL patched 2.6.18 kernels, pde_users member is added in
another struct proc_dir_entry_aux instead of in struct proc_dir_entry
in later kernel version of 2.6.23.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="50412" author="pjones" created="Mon, 14 Jan 2013 10:28:08 +0000"  >&lt;p&gt;Landed to b1_8&lt;/p&gt;</comment>
                            <comment id="50854" author="yujian" created="Sat, 19 Jan 2013 10:13:16 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/248&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/248&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.8/x86_64 (kernel: 2.6.18-308.11.1.el5)&lt;/p&gt;

&lt;p&gt;The issue still occurred:&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/2b536b04-623a-11e2-b20c-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/2b536b04-623a-11e2-b20c-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50867" author="yujian" created="Sun, 20 Jan 2013 22:05:59 +0000"  >&lt;p&gt;Another instance on Lustre build &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/249&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/249&lt;/a&gt; :&lt;br/&gt;
&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/3155439e-6355-11e2-ae8b-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/3155439e-6355-11e2-ae8b-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="50870" author="bobijam" created="Sun, 20 Jan 2013 22:33:28 +0000"  >&lt;p&gt;b1_8 still need land another patch &lt;a href=&quot;http://review.whamcloud.com/5129&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5129&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedHeader panelHeader&quot; style=&quot;border-bottom-width: 1px;&quot;&gt;&lt;b&gt;commit message&lt;/b&gt;&lt;/div&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;LU-1484 kernel: pass RHEL5 build for 2.6.18-308

For vanilla kernel, proc_dir_entry::deleted and ::pde_users co-exists
from 2.6.23 to 2.6.23.17.

For some RHEL5 kernels, it defines co-existings
proc_dir_entry::deleted and proc_dir_entry_aux::pde_users.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51396" author="yujian" created="Tue, 29 Jan 2013 10:36:51 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/252&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/252&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.9 (kernel version: 2.6.18-348.1.1.el5)&lt;/p&gt;

&lt;p&gt;recovery-small test 57 failed again: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/68c48694-6a28-11e2-85d4-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/68c48694-6a28-11e2-85d4-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="51433" author="bobijam" created="Tue, 29 Jan 2013 21:15:10 +0000"  >&lt;p&gt;Chris,&lt;/p&gt;

&lt;p&gt;Would you mind checking the build system again for the following info of the latest b1_8 build (&lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/252&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/252&lt;/a&gt;)?&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;pde_users should be defined in fs/proc/internal.h, in proc_dir_entry_aux structure.&lt;/li&gt;
	&lt;li&gt;lustre/autoconf/lustre-core.m4 should have checked this member in LC_PROCFS_USERS&lt;/li&gt;
	&lt;li&gt;after configure-ed, the config.h generated under lustre build root directory should contain &quot;#define HAVE_PROCFS_DELETED 1&quot; and &quot;#define HAVE_PROCFS_USERS 1&quot;&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="51447" author="chris" created="Wed, 30 Jan 2013 05:14:38 +0000"  >&lt;p&gt;Xu,&lt;/p&gt;

&lt;p&gt;Can you provide me a lot more info please, I really do not know what you are asking me to check.&lt;/p&gt;

&lt;p&gt;fs/proc/internal.h comes as part of the lustre source? If so how does the build system affect the presence of pde_users, and if not where does fs/proc/internal.h come from.&lt;/p&gt;

&lt;p&gt;I guess I&apos;m just not understanding how the build system affects these things.&lt;/p&gt;

&lt;p&gt;What do I need to provide to help you debug this?&lt;/p&gt;</comment>
                            <comment id="51448" author="bobijam" created="Wed, 30 Jan 2013 05:30:48 +0000"  >&lt;p&gt;sorry, fs/proc/internal.h is kernel&apos;s file, like linux-2.6.32-xxx/fs/proc/internal.h&lt;/p&gt;

&lt;p&gt;I&apos;ll also check my local 2.6.18-348.1.1.el5 kernel.&lt;/p&gt;</comment>
                            <comment id="51453" author="bobijam" created="Wed, 30 Jan 2013 07:42:31 +0000"  >&lt;p&gt;Chris,&lt;/p&gt;

&lt;p&gt;This is my local VM test environment test confirm procedure (CentOS 5.9, 2.6.18-348.1.1.el5 kernel), can you confirm them on the build node?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;$ tail -n 15 linux-2.6.18-308.11.1.el5-b18/fs/proc/internal.h 
/*
 * RHEL internal wrapper to extend struct proc_dir_entry
 */
struct proc_dir_entry_aux {
	struct proc_dir_entry pde;
	int pde_users;  /* number of callers into module in progress */
	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
	struct completion *pde_unload_completion;
	char name[]; /* PDE name */
};

static inline struct proc_dir_entry_aux *to_pde_aux(struct proc_dir_entry *d)
{
	return container_of(d, struct proc_dir_entry_aux, pde);
}



$ ./configure --with-linux=/path-to/linux-2.6.18-308.11.1.el5-b18
...
...
checking if kernel has pde_users member in procfs entry struct... yes
...
checking if kernel has deleted member in procfs entry struct... yes
...



$ grep PROCFS ~/work/lustre-b18/config.h
440:#define HAVE_PROCFS_DELETED 1
443:#define HAVE_PROCFS_USERS 1



$ONLY=57 bash recovery-small.sh
...
...
== test 57: read procfs entries causes kernel crash == 20:41:25
fail_loc=0x80000B00
Stopping client test3 /mnt/lustre (opts:)
fail_loc=0x80000B00
Stopping /mnt/mds (opts:)
Failover mds to test3
20:41:49 (1359549709) waiting for test3 network 900 secs ...
20:41:49 (1359549709) network interface is UP
Starting mds: -o loop -o abort_recovery /tmp/lustre-mdt /mnt/mds
lnet.debug=0x33f1504
lnet.subsystem_debug=0xffb7e3ff
lnet.debug_mb=32
Started lustre-MDT0000
recovery-small.sh: line 992: kill: (1143) - No such process
fail_loc=0
Starting client: test3: -o user_xattr,acl,flock test3@tcp:/lustre /mnt/lustre
lnet.debug=0x33f1504
lnet.subsystem_debug=0xffb7e3ff
lnet.debug_mb=32
Filesystem           1K-blocks      Used Available Use% Mounted on
test3@tcp:/lustre       562408     53656    478752  11% /mnt/lustre
Resetting fail_loc on all nodes...done.
PASS 57 (26s)
...===== recovery-small.sh test complete, duration 28 sec ======================
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="51590" author="bobijam" created="Thu, 31 Jan 2013 22:40:43 +0000"  >&lt;p&gt;in client build log &quot;config.log&quot; I found this&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
 #include &amp;lt;linux/kernel.h&amp;gt;
|
|               #include &quot;/var/lib/jenkins/workspace/lustre-b1_8/arch/x86_64/build_type/client/distro/el5/ib_stack/inkernel/BUILD/reused/usr/src/kernels/2.6.18-348.1.1.el5-x86_64/fs/proc/internal.h&quot;
|
| int
| main (void)
| {
|
|               struct proc_dir_entry_aux pde_aux;
|
|               pde_aux.pde_users = 0;
|
|   ;
|   return 0;
| }
configure:14056: result: no
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And I checked the client build environment Chris copied for debugging&lt;/p&gt;

&lt;p&gt;bobijam@brent:/scratch/help-bob-jam/client/BUILD/reused/usr/src/kernels/2.6.18-3&lt;br/&gt;
48.1.1.el5-x86_64$ ll fs/proc/&lt;br/&gt;
total 12&lt;br/&gt;
drwxr-xr-x  2 533 503 4096 Jan 31 09:50 ./&lt;br/&gt;
drwxr-xr-x 66 533 503 4096 Jan 31 09:50 ../&lt;br/&gt;
-rw-r--r--  1 533 503  378 Jan 31 09:50 Makefile&lt;/p&gt;

&lt;p&gt;there&apos;s no files under fs/proc/, while this RHEL kernel (vanilla kernel + RHEL patches) should have C/H files under fs/proc/&lt;/p&gt;

&lt;p&gt;Brian, does the build process in this stage only use vanilla kernel (i.e. hasn&apos;t applied RHEL patches)?  Given that the RHEL kernel src rpm only provides vanilla kernel source plus RHEL&apos;s patches, and the patches only get applied when rpmbuild executes its &quot;%prep&quot; stage.&lt;/p&gt;</comment>
                            <comment id="51615" author="brian" created="Fri, 1 Feb 2013 07:04:57 +0000"  >&lt;p&gt;Does the client build even use full kernel source at all?  It shouldn&apos;t need it I don&apos;t think, just kernel-devel.  Yes, looking in lbuild in build_with_srpm() where $PATCHLESS == true:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ! kernelrpm=$(find_linux_rpm &lt;span class=&quot;code-quote&quot;&gt;&quot;-$DEVEL_KERNEL_TYPE&quot;&lt;/span&gt;); then
            fatal 1 &lt;span class=&quot;code-quote&quot;&gt;&quot;Could not find the kernel-$DEVEL_KERNEL_TYPE RPM in ${KERNELRPMSBASE}/${lnxmaj}/${DISTRO}&quot;&lt;/span&gt;
        fi
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; ! lnxrel=&lt;span class=&quot;code-quote&quot;&gt;&quot;$lnxrel&quot;&lt;/span&gt; unpack_linux_devel_rpm &lt;span class=&quot;code-quote&quot;&gt;&quot;$kernelrpm&quot;&lt;/span&gt; &lt;span class=&quot;code-quote&quot;&gt;&quot;-&quot;&lt;/span&gt;; then
            fatal 1 &lt;span class=&quot;code-quote&quot;&gt;&quot;Could not find the Linux tree in $kernelrpm&quot;&lt;/span&gt;
        fi
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;and we can see in the build log in:&lt;/p&gt;

&lt;p&gt;&lt;tt&gt;&lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/252/arch=x86_64,build_type=client,distro=el5,ib_stack=inkernel/consoleText&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/252/arch=x86_64,build_type=client,distro=el5,ib_stack=inkernel/consoleText&lt;/a&gt;&lt;/tt&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;+ kernelrpm=/&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/lbuild-data/kernelrpm/2.6.18/rhel5/x86_64/kernel-devel-2.6.18-348.1.1.el5.x86_64.rpm
...
+ unpack_linux_devel_rpm /&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/lbuild-data/kernelrpm/2.6.18/rhel5/x86_64/kernel-devel-2.6.18-348.1.1.el5.x86_64.rpm -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So indeed, it&apos;s kernel-devel that lustre&apos;s configure is pointed at in build_with_srpm()-&amp;gt;build_lustre():&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;++ ./configure --build=x86_64-redhat-linux-gnu --host=x86_64-redhat-linux-gnu --target=x86_64-redhat-linux-gnu --program-prefix= --prefix=/usr --exec-prefix=/usr --bindir=/usr/bin --sbindir=/usr/sbin --sysconfdir=/etc --datadir=/usr/share --includedir=/usr/include --libdir=/usr/lib64 --libexecdir=/usr/libexec --localstatedir=/&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt; --sharedstatedir=/usr/com --mandir=/usr/share/man --infodir=/usr/share/info --with-linux=/&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/workspace/lustre-b1_8/arch/x86_64/build_type/client/distro/el5/ib_stack/inkernel/BUILD/reused/usr/src/kernels/2.6.18-348.1.1.el5-x86_64 --with-linux-obj=/&lt;span class=&quot;code-keyword&quot;&gt;var&lt;/span&gt;/lib/jenkins/workspace/lustre-b1_8/arch/x86_64/build_type/client/distro/el5/ib_stack/inkernel/BUILD/reused/usr/src/kernels/2.6.18-348.1.1.el5-x86_64 --disable-server --enable-liblustre --enable-liblustre-tests --with-release=wc1_2.6.18_348.1.1.el5_g3480bb0 --enable-tests --enable-liblustre-tests
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;If you look in kernel-devel you will find that fs/proc/ is empty because kernel-devel is &quot;kernel headers&quot; not full kernel source.&lt;/p&gt;

&lt;p&gt;Ultimately, what this means is that you need to re-cook your test so as to not need kernel source but kernel headers only.  This is the standard for external kernel modules: they should be buildable with kernel headers only since kernel headers represent the API and reaching behind the API is &quot;cheating&quot;.&lt;/p&gt;</comment>
                            <comment id="51633" author="bobijam" created="Fri, 1 Feb 2013 13:18:09 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;RHEL 5.9 hasn&apos;t revealed pde_users in its devel package, I find no other way to detect proc_dir_entry_aux::pde_users, and since the pde_users are all used in later kernels, is it ok to change the lprocfs_status.[c|h] code to assume HAVE_PROCFS_USERS is always defined?&lt;/p&gt;</comment>
                            <comment id="51657" author="adilger" created="Fri, 1 Feb 2013 18:46:34 +0000"  >&lt;p&gt;I can&apos;t find any way to check for proc_dir_entry_aux, so we can&apos;t depend on checking it for patchless clients.&lt;/p&gt;

&lt;p&gt;I think what needs to change here is two things:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the code in lprocfs_status.h (1.8) and param_tree.h (master) should be changed to check for HAVE_PROCFS_USERS first, then HAVE_PROCFS_DELETED secondly, so that if both are available it uses the HAVE_PROCFS_USERS method&lt;/li&gt;
	&lt;li&gt;always check for pde_fops == NULL, regardless of whether we detect HAVE_PROCFS_USERS&lt;/li&gt;
	&lt;li&gt;always check for deleted, if HAVE_PROCFS_DELETED is set, even if HAVE_PROCFS_USERS is also present&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;At worst this causes some small race where a /proc entry will not be shown when it is just loaded or unloaded, but should be safe against crashing.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; inline &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; LPROCFS_ENTRY_AND_CHECK(struct proc_dir_entry *dp)
{
        &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; deleted = 0;

#ifdef HAVE_PROCFS_USERS
        spin_lock(&amp;amp;dp-&amp;gt;pde_unload_lock);
#endif
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(dp-&amp;gt;proc_fops == NULL)) 
                deleted = 1;
#ifdef HAVE_PROCFS_USERS
        spin_unlock(&amp;amp;dp-&amp;gt;pde_unload_lock);
#endif

        LPROCFS_ENTRY();
#&lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; defined(HAVE_PROCFS_DELETED)
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (unlikely(dp-&amp;gt;deleted)) {
                LPROCFS_EXIT();
                deleted = 1;
        }
#endif

        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; deleted ? -ENODEV : 0;
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I haven&apos;t tested this at all, nor even compiled it yet.&lt;/p&gt;</comment>
                            <comment id="51658" author="adilger" created="Fri, 1 Feb 2013 19:32:06 +0000"  >&lt;p&gt;Patch at &lt;a href=&quot;http://review.whamcloud.com/5253&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5253&lt;/a&gt;, let&apos;s hope it builds and tests OK.&lt;/p&gt;</comment>
                            <comment id="51841" author="yujian" created="Tue, 5 Feb 2013 23:27:34 +0000"  >&lt;p&gt;Lustre Branch: b1_8&lt;br/&gt;
Lustre Build: &lt;a href=&quot;http://build.whamcloud.com/job/lustre-b1_8/253&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-b1_8/253&lt;/a&gt;&lt;br/&gt;
Distro/Arch: RHEL5.9/x86_64&lt;/p&gt;

&lt;p&gt;The issue still occurred: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/583b7710-7009-11e2-a955-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/583b7710-7009-11e2-a955-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="51845" author="bobijam" created="Wed, 6 Feb 2013 00:15:27 +0000"  >&lt;p&gt;Since recovery-small test_57 is intended to test proc removing while reading it, so the patch (review#5253) cannot avoid the hung of the test w/ patchless client build upon the hidden proc_dir_entry users kernels.&lt;/p&gt;

&lt;p&gt;Since later kernels all use proc_dir_entry users, I think we can presume it and define LPROCFS_{ENTRY,END} empty ops.&lt;/p&gt;</comment>
                            <comment id="52403" author="utopiabound" created="Thu, 14 Feb 2013 16:28:38 +0000"  >&lt;p&gt;Patch to assume proc_dir_entry for rhel kernels: &lt;a href=&quot;http://review.whamcloud.com/5439&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5439&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52621" author="pjones" created="Mon, 18 Feb 2013 08:46:09 +0000"  >&lt;p&gt;Nathaniel,&lt;/p&gt;

&lt;p&gt;Is this patch needed for b2_1 also?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="52626" author="yujian" created="Mon, 18 Feb 2013 09:43:58 +0000"  >&lt;p&gt;Per &lt;a href=&quot;http://wiki.whamcloud.com/display/ENG/Lustre+2.1.4+release+testing+tracker&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://wiki.whamcloud.com/display/ENG/Lustre+2.1.4+release+testing+tracker&lt;/a&gt;, the issue still exists in Lustre 2.1.4, so we need the patch on the current b2_1 branch for Lustre 2.1.5.&lt;/p&gt;</comment>
                            <comment id="52680" author="utopiabound" created="Tue, 19 Feb 2013 09:03:43 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;Yes. This patch can cleanly apply to b2_1 (all the way through master).  It should be applied to anything we want to support rhel 5 on.  Should I submit additional patches?&lt;/p&gt;</comment>
                            <comment id="52681" author="utopiabound" created="Tue, 19 Feb 2013 09:13:31 +0000"  >&lt;p&gt;b2_1 patch:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/5468&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5468&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52832" author="pjones" created="Thu, 21 Feb 2013 15:04:05 +0000"  >&lt;p&gt;Landed for 1.8.9 and 2.1.5&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="12149" name="config.h" size="18111" author="chris" created="Tue, 8 Jan 2013 08:54:11 +0000"/>
                            <attachment id="12150" name="config.log" size="603278" author="chris" created="Tue, 8 Jan 2013 08:54:11 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzv653:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>4529</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>