<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:41:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-4249] exception RIP: lqe64_hash_keycmp+12</title>
                <link>https://jira.whamcloud.com/browse/LU-4249</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;OSS crashed with Projection Fault at lqe64_hash_keycmp&lt;/p&gt;

&lt;p&gt;See attached files. &lt;br/&gt;
Looks like we are crashing here&lt;/p&gt;

&lt;p&gt;libcfs/libcfs/hash.c&lt;br/&gt;
static cfs_hlist_node_t *&lt;br/&gt;
cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t  *bd,                        &lt;br/&gt;
.&lt;br/&gt;
.&lt;br/&gt;
.&lt;br/&gt;
        cfs_hlist_for_each(ehnode, hhead) { &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt; CRASH &lt;br/&gt;
                if (!cfs_hash_keycmp(hs, key, ehnode))&lt;br/&gt;
                        continue;&lt;/p&gt;

&lt;p&gt;cfs_hlist_for_each defined as hlist_for_each but hlist_for_each doesn&apos;t seem to be defined any where.&lt;/p&gt;

&lt;p&gt;See attached backtrace info.&lt;/p&gt;
</description>
                <environment>our source tree is at &lt;a href=&quot;https://github.com/jlan/lustre-nas&quot;&gt;https://github.com/jlan/lustre-nas&lt;/a&gt;&lt;br/&gt;
Server running 2.4.0-4nasS</environment>
        <key id="22005">LU-4249</key>
            <summary>exception RIP: lqe64_hash_keycmp+12</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Wed, 13 Nov 2013 22:19:53 +0000</created>
                <updated>Thu, 26 Feb 2015 22:08:48 +0000</updated>
                            <resolved>Fri, 8 Aug 2014 13:13:04 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.5.3</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>16</watches>
                                                                            <comments>
                            <comment id="71496" author="niu" created="Thu, 14 Nov 2013 02:10:35 +0000"  >&lt;p&gt;hlist_for_each() is a kernel function. Mahmoud, do you know if this happened when shutding down OST? Thanks.&lt;/p&gt;</comment>
                            <comment id="71497" author="mhanafi" created="Thu, 14 Nov 2013 02:24:37 +0000"  >&lt;p&gt;This was not during shutdown&lt;/p&gt;</comment>
                            <comment id="71671" author="mhanafi" created="Fri, 15 Nov 2013 20:17:46 +0000"  >&lt;p&gt;We hit this bug again today. Is there any progress coming up with a cause/fix?&lt;/p&gt;</comment>
                            <comment id="71700" author="adilger" created="Fri, 15 Nov 2013 23:48:51 +0000"  >&lt;p&gt;Mahmoud, could you please attach the console log from before the crash and the oops message itself.&lt;/p&gt;</comment>
                            <comment id="71703" author="mhanafi" created="Sat, 16 Nov 2013 00:00:27 +0000"  >&lt;p&gt;Here 1 Hour before the oops and the oops message itself.&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;-- MARK -- Fri Nov 15 11:00:00 2013&amp;#93;&lt;/span&gt;^M&lt;br/&gt;
LustreError: 13805:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:nbp7-OST004e qtype:usr id:12157 enforced:1 granted:3639632 pending:0 waiting:0 req:1 usage:3449480 qunit:1048576 qtune:262144 edquot:0^M&lt;br/&gt;
LustreError: 13805:0:(qsd_handler.c:344:qsd_req_completion()) Skipped 52 previous similar messages^M&lt;br/&gt;
LustreError: 13891:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:nbp7-OST002a qtype:usr id:12157 enforced:1 granted:4194304 pending:0 waiting:0 req:1 usage:4009436 qunit:1048576 qtune:262144 edquot:0^M&lt;br/&gt;
LustreError: 13891:0:(qsd_handler.c:344:qsd_req_completion()) Skipped 58 previous similar messages^M&lt;br/&gt;
LustreError: 14039:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:nbp7-OST002e qtype:usr id:12157 enforced:1 granted:4157724 pending:0 waiting:0 req:1 usage:3963432 qunit:1048576 qtune:262144 edquot:0^M&lt;br/&gt;
LustreError: 14039:0:(qsd_handler.c:344:qsd_req_completion()) Skipped 48 previous similar messages^M&lt;br/&gt;
Lustre: nbp7-OST004e: haven&apos;t heard from client fefdd2bf-1bc8-70b6-3aeb-de9d92f342d8 (at 10.151.1.109@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff88155de95400, cur 1384543303 expire 1384543153 last 1384543076^M&lt;br/&gt;
Lustre: Skipped 10 previous similar messages^M&lt;br/&gt;
Lustre: nbp7-OST0042: haven&apos;t heard from client fefdd2bf-1bc8-70b6-3aeb-de9d92f342d8 (at 10.151.1.109@o2ib) in 227 seconds. I think it&apos;s dead, and I am evicting it. exp ffff881a0cb53000, cur 1384543303 expire 1384543153 last 1384543076^M&lt;br/&gt;
LNet: 2974:0:(o2iblnd_cb.c:2348:kiblnd_passive_connect()) Conn stale 10.151.17.83@o2ib &lt;span class=&quot;error&quot;&gt;&amp;#91;old ver: 12, new ver: 12&amp;#93;&lt;/span&gt;^M&lt;br/&gt;
LustreError: 14039:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:nbp7-OST002e qtype:usr id:12157 enforced:1 granted:4157724 pending:0 waiting:0 req:1 usage:3963432 qunit:1048576 qtune:262144 edquot:0^M&lt;br/&gt;
LustreError: 14039:0:(qsd_handler.c:344:qsd_req_completion()) Skipped 562 previous similar messages^M&lt;br/&gt;
general protection fault: 0000 &lt;a href=&quot;#1&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;1&lt;/a&gt; SMP ^M&lt;br/&gt;
last sysfs file: /sys/devices/pci0000:80/0000:80:03.0/0000:86:00.0/host15/rport-15:0-0/target15:0:0/15:0:0:39/state^M&lt;br/&gt;
^M&lt;br/&gt;
Entering kdb (current=0xffff8810259ed500, pid 306) on processor 19 Oops: (null)^M&lt;br/&gt;
due to oops @ 0xffffffffa0c72f2c^M&lt;br/&gt;
     r15 = 0x5a5a5a5a5a5a5a5a      r14 = 0x0000000000000000 ^M&lt;br/&gt;
     r13 = 0xffff880e9509ef28      r12 = 0xffff8810259ef9d0 ^M&lt;br/&gt;
      bp = 0xffff8810259ef940       bx = 0xffff881f55c5f900 ^M&lt;br/&gt;
     r11 = 0x6000000000000000      r10 = 0x0000000000001e70 ^M&lt;br/&gt;
      r9 = 0x000000000000e56f       r8 = 0x0000000000000003 ^M&lt;br/&gt;
      ax = 0x0000000000002f7d       cx = 0x0000000000000000 ^M&lt;br/&gt;
      dx = 0x0000000000000000       si = 0x5a5a5a5a5a5a5a5a ^M&lt;br/&gt;
      di = 0xffff880e9509ef28  orig_ax = 0xffffffffffffffff ^M&lt;br/&gt;
      ip = 0xffffffffa0c72f2c       cs = 0x0000000000000010 ^M&lt;br/&gt;
   flags = 0x0000000000010206       sp = 0xffff8810259ef930 ^M&lt;br/&gt;
      ss = 0x0000000000000018 &amp;amp;regs = 0xffff8810259ef898^M&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;19&amp;#93;&lt;/span&gt;kdb&amp;gt; &lt;span class=&quot;error&quot;&gt;&amp;#91;-- MARK -- Fri Nov 15 12:00:00 2013&amp;#93;&lt;/span&gt;^M&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;-- root@localhost attached -- Fri Nov 15 12:01:54 2013&amp;#93;&lt;/span&gt;^M&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;-- root@localhost detached -- Fri Nov 15 12:08:44 2013&amp;#93;&lt;/span&gt;^M&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;-- root@localhost attached -- Fri Nov 15 12:10:07 2013&amp;#93;&lt;/span&gt;^M&lt;/p&gt;</comment>
                            <comment id="71754" author="niu" created="Mon, 18 Nov 2013 07:06:43 +0000"  >&lt;p&gt;Looks like the lqe is freed unexpectedly, but I didn&apos;t see from the code how it will be freed in such case. Anyway, the message:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 13891:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:nbp7-OST002a qtype:usr id:12157 enforced:1 granted:4194304 pending:0 waiting:0 req:1 usage:4009436 qunit:1048576 qtune:262144 edquot:0^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;indicating something was wrong. If such error message can be reproduced, could you reproduce it and capture the logs on both OST and MDT with D_QUOTA &amp;amp; D_TRACE enabled? I&apos;d like to figure out why the acquire request always failed with -ENOLCK.&lt;/p&gt;</comment>
                            <comment id="71821" author="mhanafi" created="Mon, 18 Nov 2013 19:53:10 +0000"  >&lt;p&gt;Uploaded debug logs:&lt;/p&gt;

&lt;p&gt; &lt;a href=&quot;ftp://ftp.whamcloud.com/uploads/LU-4249.debuglogs.tgz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ftp://ftp.whamcloud.com/uploads/LU-4249.debuglogs.tgz&lt;/a&gt;&lt;/p&gt;
</comment>
                            <comment id="71845" author="niu" created="Tue, 19 Nov 2013 03:15:37 +0000"  >&lt;p&gt;The numerous ENOLCK errors is possibly caused by a race:&lt;/p&gt;

&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;The per-ID quota lock is being canceled on quota slave, and the corresponding lqe_lockh is cleared, see qsd_id_blocking_ast():
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;                lqe_write_lock(lqe);
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lustre_handle_equal(&amp;amp;lockh, &amp;amp;lqe-&amp;gt;lqe_lockh)) {
                        &lt;span class=&quot;code-comment&quot;&gt;/* Clear lqe_lockh &amp;amp; reset qunit to 0 */&lt;/span&gt;
                        qsd_set_qunit(lqe, 0);
                        memset(&amp;amp;lqe-&amp;gt;lqe_lockh, 0, sizeof(lqe-&amp;gt;lqe_lockh));
                        lqe-&amp;gt;lqe_edquot = &lt;span class=&quot;code-keyword&quot;&gt;false&lt;/span&gt;;
                        rel = &lt;span class=&quot;code-keyword&quot;&gt;true&lt;/span&gt;;
                }
                lqe_write_unlock(lqe);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;While just after the lqe_lockh is cleared, reply of pending DQACQ is received, the the lqe_lockh is set back to old lockh again, see qsd_req_completion():
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        &lt;span class=&quot;code-comment&quot;&gt;/* Set the lqe_lockh */&lt;/span&gt;
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (lustre_handle_is_used(lockh) &amp;amp;&amp;amp;
            !lustre_handle_equal(lockh, &amp;amp;lqe-&amp;gt;lqe_lockh))
                lustre_handle_copy(&amp;amp;lqe-&amp;gt;lqe_lockh, lockh);

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;Then pre-acquire will always try to get lock with stale lockh, and fail with -ENLOCK.&lt;/p&gt;

&lt;p&gt;But I didn&apos;t see why it cause crash in lqe64_hash_keycmp(), anyway, I&apos;ll cooke a patch to fix the race first.&lt;/p&gt;</comment>
                            <comment id="71847" author="niu" created="Tue, 19 Nov 2013 04:21:10 +0000"  >&lt;p&gt;patch to fix the -ENOLCK error:  &lt;a href=&quot;http://review.whamcloud.com/8322&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/8322&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="72100" author="niu" created="Fri, 22 Nov 2013 07:11:47 +0000"  >&lt;p&gt;Looks my analysis about the ENOLCK isn&apos;t quite right: the DQACQ should have held lock reference, so lock can&apos;t be canceled when there is pending DQACQ. Mahmoud, do you know what kind of operations could cause the problem? Is your source tree same as our source tree?&lt;/p&gt;</comment>
                            <comment id="72220" author="pjones" created="Mon, 25 Nov 2013 14:16:34 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;NASA do include some patches. There is a link to their tree in the Environment field above&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="72253" author="mhanafi" created="Mon, 25 Nov 2013 21:00:35 +0000"  >&lt;p&gt;We had another crash. We do have a crash dump. But needs to be only view by US Citizen.&lt;/p&gt;

&lt;p&gt;I tried to get more info on the DQACQ error. Got a list of users that are generating the error. And then looked at there quota. Found some odd things. Some users are have used files count much less than what the are using. for example &lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;find  /nobackupp7/ppritche/ -type f -user ppritche| wc -l&lt;br/&gt;
22986&lt;/li&gt;
&lt;/ol&gt;



&lt;ol&gt;
	&lt;li&gt;lfs quota -u ppritche /nobackupp7&lt;br/&gt;
Disk quotas for user ppritche (uid 11563):&lt;br/&gt;
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace&lt;br/&gt;
    /nobackupp7 2040023252  2300000000 2600000000       -      30   75000  150000       -&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;lfs quota says he is using 30 files but find show &amp;gt;22000 files. there are other examples like this.&lt;/p&gt;

</comment>
                            <comment id="72272" author="keith" created="Mon, 25 Nov 2013 22:29:10 +0000"  >&lt;p&gt;I am able to look at this Crashdump as I have done before. I send an email. &lt;/p&gt;</comment>
                            <comment id="72276" author="niu" created="Tue, 26 Nov 2013 02:01:07 +0000"  >&lt;p&gt;Thank you, Keith, let&apos;s see if there is anything new in the new crash.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Some users are have used files count much less than what the are using&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Is this a new system or upgraded from old system? Will &apos;lfs quota&apos; show the newly created files? Can quotacheck fix the inconsistence problem? (&apos;tune2fs -O ^quota&apos; to disable quota then &apos;tune2fs -O quota&apos; to enable quota, with uptodate e2fsprogs, see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3861&quot; title=&quot;Quota issues after upgrade from 2.1.4 to 2.4&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3861&quot;&gt;&lt;del&gt;LU-3861&lt;/del&gt;&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="72332" author="mhanafi" created="Tue, 26 Nov 2013 18:57:59 +0000"  >&lt;p&gt;This is a filesystem that was upgraded from 2.1.5 to 2.4. We initially ran into this file undercount and did the tune2fs command. One thing to note about these users. Their files are initially copied as root to this filesystem. They their ownership is changed. Here is an example&lt;/p&gt;

&lt;p&gt;/nobackupp7                            &amp;lt;-Top level directory&lt;br/&gt;
/nobackupp7/ARRIVALS.anudelma          &amp;lt;-- Owned by user:anudelma&lt;br/&gt;
/nobackupp7/ARRIVALS.anudelma/jliu7    &amp;lt;-- owned by jlui7 and all files under this dir.&lt;/p&gt;

&lt;ol&gt;
	&lt;li&gt;lfs quota -u jliu7 /nobackupp7&lt;br/&gt;
Disk quotas for user jliu7 (uid 11925):&lt;br/&gt;
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace&lt;br/&gt;
    /nobackupp7 4340116920       0       0       -       0       0       0       -&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;So when the owner ship is changed the file are not accounted for.&lt;/p&gt;


&lt;p&gt;Question Using tune2fs:&lt;br/&gt;
Once we unmount the ost and run &apos;tune2fs -O ^quota&apos;, do we need to mount and umount it before running &apos;tune2fs -O quota&apos;&lt;/p&gt;


&lt;p&gt;Keith,&lt;br/&gt;
I uploaded the core file see you email for info&lt;/p&gt;

&lt;p&gt;Mahmoud&lt;/p&gt;</comment>
                            <comment id="72334" author="keith" created="Tue, 26 Nov 2013 20:11:07 +0000"  >&lt;p&gt;I will take a good look this crash and report back. &lt;/p&gt;</comment>
                            <comment id="72368" author="niu" created="Wed, 27 Nov 2013 03:47:25 +0000"  >&lt;blockquote&gt;
&lt;p&gt;So when the owner ship is changed the file are not accounted for.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Is this reproduceable? Any error messages in the log?&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Question Using tune2fs:&lt;br/&gt;
Once we unmount the ost and run &apos;tune2fs -O ^quota&apos;, do we need to mount and umount it before running &apos;tune2fs -O quota&apos;&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;No, no need to do mount and umount.&lt;/p&gt;</comment>
                            <comment id="73976" author="mhanafi" created="Sat, 21 Dec 2013 00:35:01 +0000"  >&lt;p&gt;Keith,&lt;/p&gt;

&lt;p&gt;We need some update on this? Any luck extracting info from the core files?&lt;/p&gt;
</comment>
                            <comment id="74826" author="cliffw" created="Mon, 13 Jan 2014 16:13:20 +0000"  >&lt;p&gt;I have taken over this from Kieth, very sorry about the delay. I have the dump unpacked, and should be able to get you some information this week. &lt;/p&gt;</comment>
                            <comment id="74882" author="cliffw" created="Tue, 14 Jan 2014 01:21:19 +0000"  >&lt;p&gt;I see you are using a non-standard build - would it be possible for you to unload your vmlinux for this crash dump? &lt;/p&gt;</comment>
                            <comment id="74883" author="mhanafi" created="Tue, 14 Jan 2014 01:50:01 +0000"  >&lt;p&gt;I upload the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4249&quot; title=&quot;exception RIP: lqe64_hash_keycmp+12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4249&quot;&gt;&lt;del&gt;LU-4249&lt;/del&gt;&lt;/a&gt;RPMS.tgz to ftp.whamcloud.com which will have the needed file you need.&lt;/p&gt;
</comment>
                            <comment id="74970" author="mhanafi" created="Tue, 14 Jan 2014 23:02:17 +0000"  >&lt;p&gt;Cliff,&lt;br/&gt;
Have you had a chance to look at the crashdump? We hit this bug 3 time this morning.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Mahmoud&lt;/p&gt;</comment>
                            <comment id="74985" author="niu" created="Wed, 15 Jan 2014 03:54:42 +0000"  >&lt;p&gt;Mahmoud, Cliff has already checked the crashdump, and the crashdump shows that it crashed in same place as the first crash you posted in this ticket.&lt;/p&gt;

&lt;p&gt;I&apos;m not sure if it&apos;s related to the cfs_hash bug, but you&apos;d apply e44670a0301d3edf13ebd1bc728ad4c797369de2 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4362&quot; title=&quot;cfs_hash_rehash_key() passed wrong parameters to cfs_hash_keycpy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4362&quot;&gt;&lt;del&gt;LU-4362&lt;/del&gt;&lt;/a&gt;)&lt;/p&gt;</comment>
                            <comment id="75008" author="mhanafi" created="Wed, 15 Jan 2014 18:05:05 +0000"  >&lt;p&gt;We will apply &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4362&quot; title=&quot;cfs_hash_rehash_key() passed wrong parameters to cfs_hash_keycpy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4362&quot;&gt;&lt;del&gt;LU-4362&lt;/del&gt;&lt;/a&gt; and test.&lt;/p&gt;

&lt;p&gt;Thanks.&lt;/p&gt;</comment>
                            <comment id="75908" author="mhanafi" created="Thu, 30 Jan 2014 01:39:45 +0000"  >&lt;p&gt;Looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4362&quot; title=&quot;cfs_hash_rehash_key() passed wrong parameters to cfs_hash_keycpy&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4362&quot;&gt;&lt;del&gt;LU-4362&lt;/del&gt;&lt;/a&gt; DID not fix this crash. We just had 3 oss crash at the same time. Looks like this is very sensitive to mds recovery.&lt;/p&gt;

&lt;p&gt;ustreError: 13476:0:(qsd_handler.c:344:qsd_req_completion()) Skipped 9 previous similar messages^M&lt;br/&gt;
LustreError: 167-0: nbp7-MDT0000-lwp-OST0002: This client was evicted by nbp7-MDT0000; in progress operations using this service will fail.^M&lt;br/&gt;
LustreError: Skipped 1 previous similar message^M&lt;br/&gt;
Lustre: Evicted from MGS (at 10.151.27.38@o2ib) after server handle changed from 0xc262b1e69b1d10fc to 0xe69d9b7903f78200^M&lt;br/&gt;
Lustre: MGC10.151.27.38@o2ib: Connection restored to MGS (at 10.151.27.38@o2ib)^M&lt;br/&gt;
Lustre: Skipped 2 previous similar messages^M&lt;br/&gt;
Lustre: 82396:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST000e: failed to report quota. &lt;span class=&quot;error&quot;&gt;&amp;#91;0x200000006:0x20000:0x0&amp;#93;&lt;/span&gt;, -37^M&lt;br/&gt;
Lustre: 82396:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST000e: reconciliation failed. &lt;span class=&quot;error&quot;&gt;&amp;#91;0x0:0x0:0x0&amp;#93;&lt;/span&gt;, -37^M&lt;br/&gt;
Lustre: nbp7-OST0002: deleting orphan objects from 0x0:7609774 to 0x0:7610145^M&lt;br/&gt;
Lustre: nbp7-OST000e: deleting orphan objects from 0x0:7567992 to 0x0:7568097^M&lt;br/&gt;
Lustre: nbp7-OST001a: deleting orphan objects from 0x0:7599891 to 0x0:7600225^M&lt;br/&gt;
general protection fault: 0000 &lt;a href=&quot;#1&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;1&lt;/a&gt; SMP ^M&lt;br/&gt;
last sysfs file: /sys/devices/pci0000:40/0000:40:03.2/0000:44:00.1/host4/rport-4:0-0/target4:0:0/4:0:0:27/state^M&lt;br/&gt;
  15 out of 16 cpus in kdb, waiting for the rest, timeout in 10 second(s)^M&lt;br/&gt;
...1 cpu is not in kdb, its state is unknown^M&lt;br/&gt;
^M&lt;br/&gt;
Entering kdb (current=0xffff880bdf4a3540, pid 18116) on processor 0 Oops: (null)^M&lt;br/&gt;
due to oops @ 0xffffffffa0c59f2c^M&lt;br/&gt;
     r15 = 0x5a5a5a5a5a5a5a5a      r14 = 0x0000000000000000 ^M&lt;br/&gt;
     r13 = 0xffff880950324f60      r12 = 0xffff880be47436c0 ^M&lt;br/&gt;
      bp = 0xffff880be4743630       bx = 0xffff8806048505c0 ^M&lt;br/&gt;
     r11 = 0x0000000000000000      r10 = 0x00000000000007ca ^M&lt;br/&gt;
      r9 = 0x0000000000000001       r8 = 0x0000000000000003 ^M&lt;br/&gt;
      ax = 0x0000000000002ccb       cx = 0x0000000000000000 ^M&lt;br/&gt;
      dx = 0x0000000000000000       si = 0x5a5a5a5a5a5a5a5a ^M&lt;br/&gt;
      di = 0xffff880950324f60  orig_ax = 0xffffffffffffffff ^M&lt;br/&gt;
      ip = 0xffffffffa0c59f2c       cs = 0x0000000000000010 ^M&lt;br/&gt;
   flags = 0x0000000000010206       sp = 0xffff880be4743620 ^M&lt;br/&gt;
      ss = 0x0000000000000018 &amp;amp;regs = 0xffff880be4743588^M&lt;/p&gt;</comment>
                            <comment id="75922" author="hongchao.zhang" created="Thu, 30 Jan 2014 08:09:46 +0000"  >&lt;p&gt;from the assembly code of lqe64_hask_keycmp, the problem should be the lquota_entry managed by cfs_hash is destroyed,&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;0xffffffffa0bf8f2c &amp;lt;lqe64_hash_keycmp+12&amp;gt;:	cmp    %rax,0x10(%rsi)   &amp;lt;-- 0x10(%rsi) is the &lt;span class=&quot;code-quote&quot;&gt;&quot;lqe-&amp;gt;lqe_id.qid_uid&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;    [exception RIP: lqe64_hash_keycmp+12]
    RIP: ffffffffa0bf8f2c  RSP: ffff88058e5736d0  RFLAGS: 00010206
    RAX: 0000000000002f7d  RBX: ffff880fd6450e00  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: 5a5a5a5a5a5a5a5a  RDI: ffff88058f1b2f60
    RBP: ffff88058e5736d0   R8: 0000000000000003   R9: 0000000000000000
    R10: ffff88058f1b2f28  R11: ffffffffa0cac2f0  R12: ffff88058e573760
    R13: ffff88058f1b2f60  R14: 0000000000000000  R15: 5a5a5a5a5a5a5a5a
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;the RSI is 0x5a5a5a5a5a5a5a5a, which means the corresponding lquota_entry is freed.&lt;/p&gt;
</comment>
                            <comment id="75940" author="mhanafi" created="Thu, 30 Jan 2014 18:56:27 +0000"  >&lt;p&gt;Can we expect a patch soon. This is our most critical issue.&lt;/p&gt;</comment>
                            <comment id="75977" author="johann" created="Fri, 31 Jan 2014 06:18:55 +0000"  >&lt;p&gt;Hi Mahmoud, &lt;/p&gt;

&lt;p&gt;One possible scenario is that the quota code screws up the refcount on the lquota_entry structure, causing a lqe to be freed while still referenced in the hash table. We are working on a diagnostic patch under &lt;a href=&quot;http://review.whamcloud.com/#/c/9070&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9070&lt;/a&gt;. We would like to make sure it passes our regression tests before you apply it. Stay tuned.&lt;/p&gt;</comment>
                            <comment id="79193" author="mhanafi" created="Wed, 12 Mar 2014 21:35:48 +0000"  >&lt;p&gt;We got a crash and here is the debug output.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;LustreError: 167-0: nbp7-MDT0000-lwp-OST004a: This client was evicted by nbp7-MDT0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.
Lustre: nbp7-MDT0000-lwp-OST003e: Connection restored to nbp7-MDT0000 (at 10.151.27.38@o2ib)
Lustre: Evicted from MGS (at 10.151.27.38@o2ib) after server handle changed from 0x77507a47a4c36f84 to 0x5127d6849d7f5d1c
Lustre: nbp7-MDT0000-lwp-OST004a: Connection restored to nbp7-MDT0000 (at 10.151.27.38@o2ib)
Lustre: 45157:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST0032: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45157:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST0032: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 45187:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST0026: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45187:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST0026: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 45188:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST0042: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45188:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST0042: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 45189:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST002a: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45189:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST002a: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: nbp7-OST0026: deleting orphan objects from 0x0:16242960 to 0x0:16242977
Lustre: nbp7-OST002a: deleting orphan objects from 0x0:16088539 to 0x0:16088557
Lustre: nbp7-OST002e: deleting orphan objects from 0x0:15914638 to 0x0:15914671
Lustre: nbp7-OST003a: deleting orphan objects from 0x0:15238739 to 0x0:15238764
Lustre: nbp7-OST0042: deleting orphan objects from 0x0:15604504 to 0x0:15604524
Lustre: nbp7-OST003e: deleting orphan objects from 0x0:16407770 to 0x0:16407841
Lustre: nbp7-OST004a: deleting orphan objects from 0x0:15822277 to 0x0:15822318
Lustre: nbp7-OST004e: deleting orphan objects from 0x0:15730335 to 0x0:15730375
Lustre: nbp7-OST0036: deleting orphan objects from 0x0:14913994 to 0x0:14914030
Lustre: nbp7-OST0046: deleting orphan objects from 0x0:15378324 to 0x0:15378342
Lustre: nbp7-OST0032: deleting orphan objects from 0x0:15902313 to 0x0:15902368
Lustre: nbp7-OST0052: deleting orphan objects from 0x0:15714908 to 0x0:15714926
Lustre: 45191:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST0046: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45191:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST0046: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 45209:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST003e: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 45209:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST003e: reconciliation failed. [0x0:0x0:0x0], -37
LustreError: 11901:0:(lquota_internal.h:272:lqe_putref()) $$$ Freeing quota entry &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; it is still referenced in the hash!!!
 qsd:nbp7-OST002a qtype:usr id:11920 enforced:1 granted:31459296 pending:0 waiting:0 req:0 usage:31396440 qunit:4194304 qtune:524288 edquot:0
LustreError: 11901:0:(lquota_internal.h:273:lqe_putref()) LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79194" author="johann" created="Wed, 12 Mar 2014 21:40:40 +0000"  >&lt;p&gt;There should be a stack trace dumped after the LBUG. Could you please paste it here?&lt;/p&gt;</comment>
                            <comment id="79195" author="mhanafi" created="Wed, 12 Mar 2014 21:45:57 +0000"  >&lt;p&gt;This is from a second host that crashed at the same time. It has the call trace&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;ustre: nbp7-MDT0000-lwp-OST003b: Connection restored to nbp7-MDT0000 (at 10.151.27.38@o2ib)
Lustre: Evicted from MGS (at 10.151.27.38@o2ib) after server handle changed from 0x77507a47a4c430e4 to 0x5127d6849d7f8850
Lustre: nbp7-MDT0000-lwp-OST0033: Connection restored to nbp7-MDT0000 (at 10.151.27.38@o2ib)
Lustre: 47116:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST003f: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 47116:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST003f: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 47117:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST002f: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 47117:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST002f: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: 47122:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST004f: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 47122:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST004f: reconciliation failed. [0x0:0x0:0x0], -37
Lustre: nbp7-OST002b: deleting orphan objects from 0x0:16291625 to 0x0:16291649
Lustre: nbp7-OST003b: deleting orphan objects from 0x0:16000564 to 0x0:16000656
Lustre: nbp7-OST0027: deleting orphan objects from 0x0:16520477 to 0x0:16520523
Lustre: nbp7-OST0033: deleting orphan objects from 0x0:16359404 to 0x0:16359437
Lustre: nbp7-OST0047: deleting orphan objects from 0x0:16321198 to 0x0:16321227
Lustre: nbp7-OST0043: deleting orphan objects from 0x0:16268525 to 0x0:16268561
Lustre: nbp7-OST002f: deleting orphan objects from 0x0:15751870 to 0x0:15751911
Lustre: nbp7-OST003f: deleting orphan objects from 0x0:16136079 to 0x0:16136110
Lustre: nbp7-OST004b: deleting orphan objects from 0x0:15683379 to 0x0:15683400
Lustre: nbp7-OST004f: deleting orphan objects from 0x0:16583505 to 0x0:16583534
Lustre: nbp7-OST0037: deleting orphan objects from 0x0:16228767 to 0x0:16228810
Lustre: nbp7-OST0053: deleting orphan objects from 0x0:16334906 to 0x0:16334966
Lustre: 47127:0:(qsd_reint.c:365:qsd_reconciliation()) nbp7-OST004b: failed to report quota. [0x200000006:0x20000:0x0], -37
Lustre: 47127:0:(qsd_reint.c:525:qsd_reint_main()) nbp7-OST004b: reconciliation failed. [0x0:0x0:0x0], -37
LustreError: 12423:0:(lquota_internal.h:272:lqe_putref()) $$$ Freeing quota entry &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; it is still referenced in the hash!!!
 qsd:nbp7-OST003f qtype:usr id:9637 enforced:1 granted:2111482660 pending:0 waiting:0 req:0 usage:2111481984 qunit:4096 qtune:1024 edquot:0
LustreError: 12423:0:(lquota_internal.h:273:lqe_putref()) LBUG
Pid: 12423, comm: lquota_wb_nbp7-

Call Trace:
 [&amp;lt;ffffffffa0486895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
 [&amp;lt;ffffffffa0486e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
 [&amp;lt;ffffffffa0c761ef&amp;gt;] qsd_upd_thread+0xc8f/0xdd0 [lquota]
 [&amp;lt;ffffffff81063be0&amp;gt;] ? default_wake_function+0x0/0x20

Entering kdb (current=0xffff881023ab4040, pid 12423) on processor 24 Oops: (&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;)
due to oops @ 0x0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79324" author="niu" created="Fri, 14 Mar 2014 09:16:46 +0000"  >&lt;p&gt;I didn&apos;t see how quota code screwed the refcount so far, will continue to investigate it further. Johann, do you have any idea? Thanks.&lt;/p&gt;</comment>
                            <comment id="79490" author="niu" created="Mon, 17 Mar 2014 03:05:00 +0000"  >&lt;p&gt;One thing confused me is that: qsd_reconciliation() reported &quot;failed to report quota for -ENOLCK&quot;, which means qsd_reconciliation() called qsd_adjust() and returned -ENOLCK, however I didn&apos;t see the corresponding error message from qsd_req_completion() in the log:&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;        /* despite -EDQUOT &amp;amp; -EINPROGRESS errors, the master might still
         * grant us back quota space to adjust quota overrun */
        &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ret != 0 &amp;amp;&amp;amp; ret != -EDQUOT &amp;amp;&amp;amp; ret != -EINPROGRESS) {
                &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; (ret != -ETIMEDOUT &amp;amp;&amp;amp; ret != -ENOTCONN &amp;amp;&amp;amp;
                   ret != -ESHUTDOWN &amp;amp;&amp;amp; ret != -EAGAIN)
                        &lt;span class=&quot;code-comment&quot;&gt;/* print errors only &lt;span class=&quot;code-keyword&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; code is unexpected */&lt;/span&gt;
                        LQUOTA_ERROR(lqe, &lt;span class=&quot;code-quote&quot;&gt;&quot;DQACQ failed with %d, flags:0x%x&quot;&lt;/span&gt;,
                                     ret, reqbody-&amp;gt;qb_flags);
                GOTO(out, ret);
        }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="79816" author="niu" created="Thu, 20 Mar 2014 07:12:32 +0000"  >&lt;p&gt;Hi, Mahmoud&lt;/p&gt;

&lt;p&gt;I didn&apos;t see why &quot;DQACQ failed with -37&quot; wasn&apos;t printed in the log, is it possible that the running code isn&apos;t the same as 2.4.0-4nasS? Do you have other 2.4 systems that are also suffering this problem? Thanks.&lt;/p&gt;

&lt;p&gt;As Johann pointed, the error message could probably be flooded. Could you enable quota debug (D_QUOTA) on all OSTs and provide lustre logs as well when it&apos;s reproduced next time? Thank you.&lt;/p&gt;</comment>
                            <comment id="79927" author="mhanafi" created="Thu, 20 Mar 2014 19:53:44 +0000"  >&lt;p&gt;We have a second filesystem running the same code (2.4.1-5.2nasS). It doesn&apos;t have any of the ENOLCK errors. The filesystem with the errors was a 2.1.5 that was upgraded to 2.4. The other started out with 2.4. We do want to track down the ENOLCK errors.&lt;/p&gt;

&lt;p&gt;We have D_QUOTA enabled. But when we see a crash the host panics and we are not able to get a debug dump.&lt;/p&gt;

&lt;p&gt;Additional info: These past few crashes have been after the MDS crashed and finished recovery. Then we would see 1 or more OSS crash. But we have seen this crash without the MDS crash.&lt;/p&gt;

&lt;p&gt;I took a current debug dump of all OSS and MDS and uploaded to the ftp site. (nbp7.debug.tgz)&lt;/p&gt;
</comment>
                            <comment id="80198" author="ihara" created="Tue, 25 Mar 2014 06:19:57 +0000"  >&lt;p&gt;Hi, we hit the same issue (filed as &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4807&quot; title=&quot;repeating DQACQ failed with -37&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4807&quot;&gt;&lt;del&gt;LU-4807&lt;/del&gt;&lt;/a&gt;) of this problem. The following error messages have been showing up every 10 minutes, even now. turning on quota debug on OSS and collecting lustre debug might help?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 25 00:29:51 ddnoss4 kernel: LustreError: 4503:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:home2-OST0026 qtype:grp id:3303 enforced:1 granted:7364460 pending:0 waiting:0 req:1 usage:7165364 qunit:4194304 qtune:524288 edquot:0
Mar 25 00:39:51 ddnoss4 kernel: LustreError: 4503:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:home2-OST0026 qtype:grp id:3303 enforced:1 granted:7364460 pending:0 waiting:0 req:1 usage:7165364 qunit:4194304 qtune:524288 edquot:0
Mar 25 00:49:51 ddnoss4 kernel: LustreError: 4503:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:home2-OST0026 qtype:grp id:3303 enforced:1 granted:7364460 pending:0 waiting:0 req:1 usage:7165364 qunit:4194304 qtune:524288 edquot:0
Mar 25 00:59:51 ddnoss4 kernel: LustreError: 4503:0:(qsd_handler.c:344:qsd_req_completion()) $$$ DQACQ failed with -37, flags:0x2 qsd:home2-OST0026 qtype:gr
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="80201" author="niu" created="Tue, 25 Mar 2014 07:16:37 +0000"  >&lt;p&gt;Hi, Ihara&lt;/p&gt;

&lt;p&gt;If the OSS is still showing such error message every 10 minutes, collecting debug logs (it&apos;s better to have both MDT and OST log with D_QUOTA enabled) can help us to figure out why dqacq failed with -ENOLCK. Thank you.&lt;/p&gt;</comment>
                            <comment id="80280" author="niu" created="Wed, 26 Mar 2014 06:59:24 +0000"  >&lt;p&gt;Seems there is a small race window in qsd_intent_lock(), I updated the patch.&lt;/p&gt;</comment>
                            <comment id="80301" author="ihara" created="Wed, 26 Mar 2014 15:51:47 +0000"  >&lt;p&gt;Hi Niu, &lt;/p&gt;

&lt;p&gt;uploaded debug log on ftp.whamcloud.com/uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4249&quot; title=&quot;exception RIP: lqe64_hash_keycmp+12&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4249&quot;&gt;&lt;del&gt;LU-4249&lt;/del&gt;&lt;/a&gt;&lt;br/&gt;
Hope this helps for analysis.&lt;/p&gt;</comment>
                            <comment id="80371" author="niu" created="Thu, 27 Mar 2014 13:29:26 +0000"  >&lt;p&gt;Hi, Ihara/Mahmoud&lt;br/&gt;
Could you apply the debug patch (&lt;a href=&quot;http://review.whamcloud.com/#/c/9808/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9808/&lt;/a&gt; ) to try to collect more information when the bug is hit again? (please make sure the autotest passed before apply the patch, and the previous debug patch needs be reverted before apply this one). Thank you.&lt;/p&gt;</comment>
                            <comment id="80412" author="jaylan" created="Thu, 27 Mar 2014 20:36:43 +0000"  >&lt;p&gt;Niu, do you want me to undo patch at &lt;a href=&quot;http://review.whamcloud.com/#/c/9070&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/9070&lt;/a&gt; or to keep it, in addition to #9808? Thanks!&lt;/p&gt;</comment>
                            <comment id="80422" author="jaylan" created="Thu, 27 Mar 2014 22:20:30 +0000"  >&lt;p&gt;The #9070 needs to be reversed before applying #9808.&lt;/p&gt;

&lt;p&gt;However, there are still conflicts in three files. I believe your patch was not generated against the 2.4 branch. The three files with conflicts were:&lt;/p&gt;

&lt;p&gt;lustre/quota/qmt_pool.c&lt;br/&gt;
lustre/quota/qsd_lock.c&lt;br/&gt;
lustre/quota/qsd_writeback.c&lt;/p&gt;


&lt;p&gt;One conflict in qmt_pool.c at qmt_pool_lqe_lookup():&lt;/p&gt;

&lt;p&gt;        if (qid-&amp;gt;qid_uid == 0) &lt;/p&gt;
{
                /* caller wants to access grace time, no need to look up the
                 * entry since we keep a reference on ID 0 all the time */
                lqe = pool-&amp;gt;qpi_grace_lqe[qtype];
&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;lt; HEAD
                lqe_getref(lqe);
                GOTO(out, 0);
=======
                lqe_getref(lqe, LQE_REF_IDX_MAX);
                GOTO(out, lqe);
&amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt;&amp;gt; 6ec5b50... LU-4249 quota: lqe reference debug patch
        }

&lt;p&gt;        /* now that we have the pool, let&apos;s look-up the quota entry in the&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;right quota site */&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;The top of trunk of the 2.4 branch should have a GOTO line like this:&lt;br/&gt;
    GOTO(out, 0);&lt;br/&gt;
but yours is&lt;br/&gt;
    GOTO(out,lqe);&lt;/p&gt;

&lt;p&gt;I think one example is enough for you to see the difference.&lt;/p&gt;</comment>
                            <comment id="80428" author="niu" created="Fri, 28 Mar 2014 02:16:18 +0000"  >&lt;p&gt;Sorry, I made the patch against master mistakenly, here is the patch for b2_4: &lt;a href=&quot;http://review.whamcloud.com/9833&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9833&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This new patch doesn&apos;t address the conflict in lustre/quota/qsd_writeback.c, because there is a quota fix introduced in 2.4.2 (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3460&quot; title=&quot;recovery-small test_51 timeout: lqe_iter_cb(): Inuse quota entry&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3460&quot;&gt;&lt;del&gt;LU-3460&lt;/del&gt;&lt;/a&gt; &lt;a href=&quot;http://review.whamcloud.com/#/c/8169/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8169/&lt;/a&gt;), I suggest you apply this fix first.&lt;/p&gt;</comment>
                            <comment id="80506" author="jaylan" created="Fri, 28 Mar 2014 18:02:36 +0000"  >&lt;p&gt;Hi Niu, I cherry-picked &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3460&quot; title=&quot;recovery-small test_51 timeout: lqe_iter_cb(): Inuse quota entry&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3460&quot;&gt;&lt;del&gt;LU-3460&lt;/del&gt;&lt;/a&gt; and then #9833. It applied clean now. Thanks!&lt;/p&gt;</comment>
                            <comment id="80920" author="niu" created="Thu, 3 Apr 2014 06:08:48 +0000"  >&lt;p&gt;Hi Javed&lt;/p&gt;

&lt;p&gt;Looks your post isn&apos;t related to this bug, could you open another ticket to track it? Thanks.&lt;/p&gt;</comment>
                            <comment id="80921" author="javed" created="Thu, 3 Apr 2014 06:53:00 +0000"  >&lt;p&gt;thanks niu. deleted from here.&lt;/p&gt;</comment>
                            <comment id="80923" author="ihara" created="Thu, 3 Apr 2014 09:14:26 +0000"  >&lt;p&gt;Hi Niu, so, finally, what exact patches should we apply against b2_4 if we want to enable debug patches?&lt;/p&gt;</comment>
                            <comment id="80924" author="niu" created="Thu, 3 Apr 2014 09:21:36 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Hi Niu, so, finally, what exact patches should we apply against b2_4 if we want to enable debug patches?&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;fix of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3460&quot; title=&quot;recovery-small test_51 timeout: lqe_iter_cb(): Inuse quota entry&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3460&quot;&gt;&lt;del&gt;LU-3460&lt;/del&gt;&lt;/a&gt; &lt;a href=&quot;http://review.whamcloud.com/#/c/8169/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/8169/&lt;/a&gt; and debug patch &lt;a href=&quot;http://review.whamcloud.com/9833&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/9833&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="81917" author="ihara" created="Fri, 18 Apr 2014 04:53:46 +0000"  >&lt;p&gt;Niu, we applied the patches, and then the same error messages have been showing up. Attached are a recent debug log and syslog messages after applying the patches.&lt;/p&gt;</comment>
                            <comment id="81919" author="niu" created="Fri, 18 Apr 2014 05:23:53 +0000"  >&lt;p&gt;Hi, Ihara, the ENOLCK error message problem is addressed in another ticket, please see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4920&quot; title=&quot;lqe_lockh should always be cleared on id lock cancel&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4920&quot;&gt;&lt;del&gt;LU-4920&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;This debug patch is to collect information on lqe refcount when system crash on &quot;exception RIP: ...&quot;, please keep the debug patch applied until the crash problem reproduced. Thanks.&lt;/p&gt;</comment>
                            <comment id="88173" author="niu" created="Fri, 4 Jul 2014 03:58:32 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/10988&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/10988&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="88569" author="niu" created="Wed, 9 Jul 2014 05:35:44 +0000"  >&lt;p&gt;b2_4: &lt;a href=&quot;http://review.whamcloud.com/11019&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11019&lt;/a&gt;&lt;br/&gt;
b2_5: &lt;a href=&quot;http://review.whamcloud.com/11020&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/11020&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="91185" author="pjones" created="Fri, 8 Aug 2014 13:13:04 +0000"  >&lt;p&gt;Landed for 2.7&lt;/p&gt;</comment>
                            <comment id="91230" author="jaylan" created="Fri, 8 Aug 2014 22:06:33 +0000"  >&lt;p&gt;Do we need to undo #9833 (the debug patch) and apply #11019?&lt;br/&gt;
Please advise.&lt;/p&gt;

&lt;p&gt;Also please land #11019 and #11020.&lt;/p&gt;</comment>
                            <comment id="91231" author="pjones" created="Fri, 8 Aug 2014 22:20:13 +0000"  >&lt;p&gt;I will leave it to Niu to comment definitively about whether the debug patch needs to be removed but my understanding is that #11019 is intended to be the fix for b2_4. The fixes for maintenance branches will be landed when a release is scheduled for those branches.&lt;/p&gt;</comment>
                            <comment id="91233" author="jaylan" created="Fri, 8 Aug 2014 22:31:47 +0000"  >&lt;p&gt;But wouldn&apos;t that cause your engineers who work in future patches have different code base from ours with accumulated b2_4/b2_5 patches?&lt;/p&gt;</comment>
                            <comment id="91238" author="pjones" created="Fri, 8 Aug 2014 22:59:55 +0000"  >&lt;p&gt;Well, even without this happening it would still be quite possible that the release you baseline on lags behind the tip of the maintenance branch. If such a situation arises where there are multiple &quot;floating&quot; patches that affect the same area then we can always assist in resolving the conflict. I would think that introducing patch dependencies should make this easier though. It&apos;s probably simpler to talk about this on the next call than back and forth in a ticket &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/p&gt;</comment>
                            <comment id="91256" author="niu" created="Mon, 11 Aug 2014 01:36:20 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Do we need to undo #9833 (the debug patch) and apply #11019?&lt;br/&gt;
Please advise.&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Yes, that debug patch is a little bit heavier than the former one (but it provides more detailed information); it wasn&apos;t supposed to be carried in a production system all the time. Thanks.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                                                <inwardlinks description="is duplicated by">
                                        <issuelink>
            <issuekey id="23847">LU-4807</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="23169">LU-4633</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="14734" name="debug.tgz" size="351863" author="ihara" created="Fri, 18 Apr 2014 04:53:46 +0000"/>
                            <attachment id="13832" name="service188.nov13.2013.tgz" size="11651" author="mhanafi" created="Wed, 13 Nov 2013 22:19:53 +0000"/>
                            <attachment id="14735" name="syslog.tgz" size="54949" author="ihara" created="Fri, 18 Apr 2014 04:53:46 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzw8t3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>11583</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>