<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:51:45 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5469] Intermittent Clients hang during IO </title>
                <link>https://jira.whamcloud.com/browse/LU-5469</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Clients hang during IO this is a typical trace for the hung process&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
[287554.530174] ld              S ffff880989104a30     0 17536      1 0x00000000^M
[287554.551603]  ffff88096e0759a8 0000000000000082 ffff88096e074010 0000000000011800^M
[287554.574176]  0000000000011800 0000000000011800 0000000000011800 ffff88096e075fd8^M
[287554.596747]  ffff88096e075fd8 0000000000011800 ffff8809de16c640 ffff8802733aa040^M
[287554.619319] Call Trace:^M
[287554.626926]  [&amp;lt;ffffffffa08916a5&amp;gt;] cl_sync_io_wait+0x365/0x450 [obdclass]^M
[287554.647277]  [&amp;lt;ffffffffa0cc9169&amp;gt;] vvp_page_sync_io+0x59/0x120 [lustre]^M
[287554.667083]  [&amp;lt;ffffffffa0cc9731&amp;gt;] vvp_io_commit_write+0x501/0x640 [lustre]^M
[287554.687953]  [&amp;lt;ffffffffa0891ebc&amp;gt;] cl_io_commit_write+0x9c/0x1d0 [obdclass]^M
[287554.708816]  [&amp;lt;ffffffffa0c9df14&amp;gt;] ll_commit_write+0x104/0x1f0 [lustre]^M
[287554.728608]  [&amp;lt;ffffffffa0cb6fda&amp;gt;] ll_write_end+0x2a/0x60 [lustre]^M
[287554.747098]  [&amp;lt;ffffffff810f83e2&amp;gt;] generic_perform_write+0x122/0x1c0^M
[287554.766095]  [&amp;lt;ffffffff810f84e1&amp;gt;] generic_file_buffered_write+0x61/0xa0^M
[287554.786125]  [&amp;lt;ffffffff810fb476&amp;gt;] __generic_file_aio_write+0x296/0x490^M
[287554.805895]  [&amp;lt;ffffffff810fb6bc&amp;gt;] generic_file_aio_write+0x4c/0xb0^M
[287554.824646]  [&amp;lt;ffffffffa0ccc111&amp;gt;] vvp_io_write_start+0xc1/0x2e0 [lustre]^M
[287554.844973]  [&amp;lt;ffffffffa088ea09&amp;gt;] cl_io_start+0x69/0x140 [obdclass]^M
[287554.864028]  [&amp;lt;ffffffffa0892dc3&amp;gt;] cl_io_loop+0xa3/0x190 [obdclass]^M
[287554.882804]  [&amp;lt;ffffffffa0c71d91&amp;gt;] ll_file_io_generic+0x461/0x600 [lustre]^M
[287554.903365]  [&amp;lt;ffffffffa0c72166&amp;gt;] ll_file_aio_write+0x236/0x290 [lustre]^M
[287554.923680]  [&amp;lt;ffffffffa0c73373&amp;gt;] ll_file_write+0x203/0x290 [lustre]^M
[287554.942941]  [&amp;lt;ffffffff8115b03e&amp;gt;] vfs_write+0xce/0x140^M
[287554.958555]  [&amp;lt;ffffffff8115b1b3&amp;gt;] sys_write+0x53/0xa0^M
[287554.973915]  [&amp;lt;ffffffff81479c92&amp;gt;] system_call_fastpath+0x16/0x1b^M
[287554.992131]  [&amp;lt;00007ffe3f20e0b0&amp;gt;] 0x7ffe3f20e0af^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We don&apos;t see any errors. &lt;/p&gt;

&lt;p&gt;OSS trace is attached.&lt;/p&gt;</description>
                <environment>clients: sles11SP3 2.4.3&lt;br/&gt;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;br/&gt;
Server: centos 2.4.3</environment>
        <key id="25938">LU-5469</key>
            <summary>Intermittent Clients hang during IO </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="niu">Niu Yawei</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Sat, 9 Aug 2014 07:21:36 +0000</created>
                <updated>Thu, 10 Mar 2016 22:00:42 +0000</updated>
                            <resolved>Thu, 10 Mar 2016 22:00:42 +0000</resolved>
                                    <version>Lustre 2.4.3</version>
                                                        <due/>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="91249" author="pjones" created="Sat, 9 Aug 2014 14:19:32 +0000"  >&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please assist with this issue?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="91312" author="mhanafi" created="Mon, 11 Aug 2014 17:55:06 +0000"  >&lt;p&gt;I think I have captured debug info during a client hang.&lt;br/&gt;
I was untaring file and the client hung on following file &lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;bridge4 ~ # lsof /nobackupp7
COMMAND   PID     USER   FD   TYPE      DEVICE   SIZE/OFF               NODE NAME
csh     11017 jahousma  cwd    DIR 1170,297908       4096 144124584776438323 /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300
tar     11212 jahousma  cwd    DIR 1170,297908       4096 144124584776438323 /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300
tar     11212 jahousma    3r   REG 1170,297908 8905963520 144124584776438324 /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep.tar
tar     11212 jahousma    4w   REG 1170,297908    2072576 144124785633263686 /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep/GRID_DIR/grid.0441 (deleted)

bridge4 ~ # lfs getstripe /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep/GRID_DIR/grid.0441
/nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep/GRID_DIR/grid.0441
lmm_stripe_count:   1
lmm_stripe_size:    4194304
lmm_layout_gen:     0
lmm_stripe_offset:  72
	obdidx		 objid		 objid		 group
	    72	      25921684	    0x18b8894	             0

bridge4 ~ # lfs path2fid /nobackupp7/jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep/GRID_DIR/grid.0441
[0x20008ba9b:0x46:0x0]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[0]kdb&amp;gt; btp 11212
Stack traceback &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; pid 11212
0xffff88276ff3a4c0    11212    11017  0    8   S  0xffff88276ff3ab30  tar
 [&amp;lt;ffffffff8146fb6b&amp;gt;] thread_return+0x0/0x295
 [&amp;lt;ffffffffa09046a5&amp;gt;] cl_sync_io_wait+0x365/0x450 [obdclass]
 [&amp;lt;ffffffffa0d3c169&amp;gt;] vvp_page_sync_io+0x59/0x120 [lustre]
 [&amp;lt;ffffffffa0d3c731&amp;gt;] vvp_io_commit_write+0x501/0x640 [lustre]
 [&amp;lt;ffffffffa0904ebc&amp;gt;] cl_io_commit_write+0x9c/0x1d0 [obdclass]
 [&amp;lt;ffffffffa0d10f14&amp;gt;] ll_commit_write+0x104/0x1f0 [lustre]
 [&amp;lt;ffffffffa0d29fda&amp;gt;] ll_write_end+0x2a/0x60 [lustre]
 [&amp;lt;ffffffff810f83e2&amp;gt;] generic_perform_write+0x122/0x1c0
 [&amp;lt;ffffffff810f84e1&amp;gt;] generic_file_buffered_write+0x61/0xa0
 [&amp;lt;ffffffff810fb476&amp;gt;] __generic_file_aio_write+0x296/0x490
 [&amp;lt;ffffffff810fb6bc&amp;gt;] generic_file_aio_write+0x4c/0xb0
 [&amp;lt;ffffffffa0d3f111&amp;gt;] vvp_io_write_start+0xc1/0x2e0 [lustre]
 [&amp;lt;ffffffffa0901a09&amp;gt;] cl_io_start+0x69/0x140 [obdclass]
 [&amp;lt;ffffffffa0905dc3&amp;gt;] cl_io_loop+0xa3/0x190 [obdclass]
 [&amp;lt;ffffffffa0ce4d91&amp;gt;] ll_file_io_generic+0x461/0x600 [lustre]
 [&amp;lt;ffffffffa0ce5166&amp;gt;] ll_file_aio_write+0x236/0x290 [lustre]
 [&amp;lt;ffffffffa0ce6373&amp;gt;] ll_file_write+0x203/0x290 [lustre]
 [&amp;lt;ffffffff8115b03e&amp;gt;] vfs_write+0xce/0x140
 [&amp;lt;ffffffff8115b1b3&amp;gt;] sys_write+0x53/0xa0
 [&amp;lt;ffffffff81479c92&amp;gt;] system_call_fastpath+0x16/0x1b
 [&amp;lt;00007fffcebea0b0&amp;gt;] 0x7fffcebea0b0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;p&gt;obdidx 72 is nbp7-OST0048 on service194.&lt;/p&gt;

&lt;p&gt;Client is service64. See uploaded tar file &lt;a href=&quot;ftp://.../uploads/LU-5469/LU-5469.tgz&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;ftp://.../uploads/LU-5469/LU-5469.tgz&lt;/a&gt; &lt;/p&gt;

</comment>
                            <comment id="91313" author="green" created="Mon, 11 Aug 2014 17:59:33 +0000"  >&lt;p&gt;Hm, I wonder what led you to think it&apos;s the problem on this OSS, is there anything in the logs (I see you said no errors)? All threads appear to be pretty much idle.&lt;br/&gt;
The client trace on the other hand is waiting on a page which might be held by some other thread (having traces from other threads on this client would help).&lt;/p&gt;

&lt;p&gt;Does the client ever emerge from this state, or once it&apos;s hit it&apos;s always sit there and never completes?&lt;/p&gt;</comment>
                            <comment id="91315" author="mhanafi" created="Mon, 11 Aug 2014 18:09:07 +0000"  >&lt;p&gt;I am not Not sure if the problem is on the oss. But some time evicting the client from the OSS will free up the hung commands. &lt;/p&gt;


&lt;p&gt;See attached file (service64.trace.gz) for all threads from the client. This client was rebooted and with out any other lustre activity the untar was ran which caused it to hang. &lt;/p&gt;</comment>
                            <comment id="91316" author="mhanafi" created="Mon, 11 Aug 2014 18:24:00 +0000"  >&lt;p&gt;Some additional info that may help. I had service64 in kdb and then return to normal state it reconnected to lustre and was evicted due to time outs. Here is the errors on the console for fid &lt;span class=&quot;error&quot;&gt;&amp;#91;0x20008ba9b:0x46:0x0&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 3554.774736] Lustre: Evicted from MGS (at 10.151.27.38@o2ib) after server handle changed from 0xd98dcc9c3a960a33 to 0xd98dcc9c3ab9e594^M
[ 3554.814302] LustreError: 167-0: nbp7-MDT0000-mdc-ffff881757f23000: This client was evicted by nbp7-MDT0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.^M
[ 3554.814397] Lustre: MGC10.151.27.38@o2ib: Connection restored to MGS (at 10.151.27.38@o2ib)^M
[ 3554.888956] LustreError: 167-0: nbp7-OST0000-osc-ffff881757f23000: This client was evicted by nbp7-OST0000; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.^M
[ 3554.890014] LustreError: 14444:0:(file.c:163:ll_close_inode_openhandle()) inode 144124584776433707 mdc close failed: rc = -108^M
[ 3554.892398] LustreError: 14444:0:(file.c:163:ll_close_inode_openhandle()) inode 144124584776438323 mdc close failed: rc = -108^M
[ 3554.898727] Lustre: nbp7-MDT0000-mdc-ffff881757f23000: Connection restored to nbp7-MDT0000 (at 10.151.27.38@o2ib)^M
[ 3578.892107] Lustre: Evicted from nbp7-OST0024_UUID (at 10.151.27.52@o2ib) after server handle changed from 0x7995b8509a5a26d9 to 0x7995b8509a5b49bb^M
[ 3578.935531] LustreError: 167-0: nbp7-OST0024-osc-ffff881757f23000: This client was evicted by nbp7-OST0024; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.^M
[ 3578.982721] LustreError: Skipped 35 previous similar messages^M
[ 3578.982962] LustreError: 11-0: nbp7-OST0034-osc-ffff881757f23000: Communicating with 10.151.27.52@o2ib, operation ost_connect failed with -114.^M
[ 3578.983268] Lustre: nbp7-OST0038-osc-ffff881757f23000: Connection restored to nbp7-OST0038 (at 10.151.27.52@o2ib)^M
[ 3578.983272] Lustre: Skipped 36 previous similar messages^M
[ 3578.983348] LustreError: 11212:0:(vvp_io.c:1088:vvp_io_commit_write()) Write page 506 of inode ffff88100f4946f8 failed -5^M
[ 3578.983578] Lustre: 8742:0:(llite_lib.c:2501:ll_dirty_page_discard_warn()) nbp7: dirty page discard: 10.151.27.38@o2ib:/nbp7/fid: [0x20008ba9b:0x46:0x0]&lt;span class=&quot;code-comment&quot;&gt;//jahousma/work/LAVA-FA/windTunnel_9x7/lava3/anaysis_data20140801/M1.6_x300/prep/GRID_DIR/grid.0441 may get corrupted (rc -108)^M
&lt;/span&gt;[ 3578.983702] LustreError: 14516:0:(osc_lock.c:817:osc_ldlm_completion_ast()) lock@ffff881048c364d8[2 3 0 1 1 00000000] W(2):[0, 18446744073709551615]@[0x100480000:0x18b8894:0x0] {^M
[ 3578.983712] LustreError: 14516:0:(osc_lock.c:817:osc_ldlm_completion_ast())     lovsub@ffff88376d8c3720: [0 ffff8810258aaa88 W(2):[0, 18446744073709551615]@[0x20008ba9b:0x46:0x0]] ^M
[ 3578.983722] LustreError: 14516:0:(osc_lock.c:817:osc_ldlm_completion_ast())     osc@ffff881048cb0b00: ffff881049baa6c0    0x20040040001 0x17ccc6540b4b0a6d 3 ffff88073bb50c70 size: 0 mtime: 0 atime: 0 ctime: 0 blocks: 0^M
[ 3578.983727] LustreError: 14516:0:(osc_lock.c:817:osc_ldlm_completion_ast()) } lock@ffff881048c364d8^M
[ 3578.983731] LustreError: 14516:0:(osc_lock.c:817:osc_ldlm_completion_ast()) dlmlock returned -5^M
[ 3578.983800] LustreError: 14516:0:(ldlm_resource.c:804:ldlm_resource_complain()) nbp7-OST0048-osc-ffff881757f23000: namespace resource [0x18b8894:0x0:0x0].0 (ffff8807195475c0) refcount nonzero (1) after lock cleanup; forcing cleanup.^M
[ 3578.983807] LustreError: 14516:0:(ldlm_resource.c:1415:ldlm_resource_dump()) --- Resource: [0x18b8894:0x0:0x0].0 (ffff8807195475c0) refcount = 2^M
[ 3578.983811] LustreError: 14516:0:(ldlm_resource.c:1418:ldlm_resource_dump()) Granted locks (in reverse order):^M
[ 3578.983821] LustreError: 14516:0:(ldlm_resource.c:1421:ldlm_resource_dump()) ### ### ns: nbp7-OST0048-osc-ffff881757f23000 lock: ffff881049baa6c0/0x17ccc6540b4b0a6d lrc: 3/0,0 mode: PW/PW res: [0x18b8894:0x0:0x0].0 rrc: 2 type: EXT [0-&amp;gt;18446744073709551615] (req 0-&amp;gt;8191) flags: 0x126400000000 nid: local remote: 0x7995b8509a5a93bb expref: -99 pid: 11212 timeout: 0 lvb_type: 1^M
[ 3578.991462] LustreError: 11212:0:(cl_lock.c:1420:cl_unuse_try()) result = -5, &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; is unlikely!^M
[ 3578.991477] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked()) lock@ffff881048c36858[1 0 0 1 0 00000000] W(2):[0, 18446744073709551615]@[0x20008ba9b:0x46:0x0] {^M
[ 3578.991485] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked())     vvp@ffff881048ccf5d8: ^M
[ 3578.991491] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked())     lov@ffff8810258aaa88: 1^M
[ 3578.991497] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked())     0 0: ---^M
[ 3578.991501] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked()) ^M
[ 3578.991506] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked()) } lock@ffff881048c36858^M
[ 3578.991512] LustreError: 11212:0:(cl_lock.c:1435:cl_unuse_locked()) unuse &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; -5^M
[ 3579.409090] Lustre: Evicted from nbp7-OST0026_UUID (at 10.151.27.54@o2ib) after server handle changed from 0x8b8ade63867f09b8 to 0x8b8ade63867fe14c^M
[ 3579.409094] Lustre: Skipped 10 previous similar messages^M
[ 3579.409273] LustreError: 11-0: nbp7-OST0036-osc-ffff881757f23000: Communicating with 10.151.27.54@o2ib, operation ost_connect failed with -114.^M
[ 3580.026860] LustreError: 167-0: nbp7-OST0025-osc-ffff881757f23000: This client was evicted by nbp7-OST0025; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.^M
[ 3580.073983] LustreError: Skipped 32 previous similar messages^M
[ 3580.093257] Lustre: nbp7-OST0035-osc-ffff881757f23000: Connection restored to nbp7-OST0035 (at 10.151.27.53@o2ib)^M
[ 3580.127000] Lustre: Skipped 43 previous similar messages^M
[ 3604.564439] Lustre: Evicted from nbp7-OST0034_UUID (at 10.151.27.52@o2ib) after server handle changed from 0x7995b8509a5a26fc to 0x7995b8509a5b49a6^M
[ 3604.607860] Lustre: Skipped 33 previous similar messages^M
[ 3604.625298] LustreError: 167-0: nbp7-OST0034-osc-ffff881757f23000: This client was evicted by nbp7-OST0034; in progress operations using &lt;span class=&quot;code-keyword&quot;&gt;this&lt;/span&gt; service will fail.^M
[ 3604.672415] LustreError: Skipped 11 previous similar messages^M
[ 3604.677874] Lustre: nbp7-OST0034-osc-ffff881757f23000: Connection restored to nbp7-OST0034 (at 10.151.27.52@o2ib)^M
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="91317" author="green" created="Mon, 11 Aug 2014 18:26:28 +0000"  >&lt;p&gt;So can you please elaborate on the &quot;intermittent hang&quot;? Does the client recover all by itself after some time?&lt;br/&gt;
Forceful eviction is understandably fixes things up because it discards the page from the cache with IO error.&lt;br/&gt;
But I see there&apos;s some stuff running on the OSS line read-inodes, and by the name of it, that sounds like some heavy IO generating program? Also I noticed that your oss had an oops trying to print the backtrace for this thread.&lt;/p&gt;</comment>
                            <comment id="91319" author="mhanafi" created="Mon, 11 Aug 2014 18:46:25 +0000"  >&lt;p&gt;The issue  is isolated to some users and some directories. The client doesn&apos;t recover unless it is rebooted. But after a reboot it can hang again with the same directory. on the effected client/directory a ls will hang, but it will work from a second host. &lt;/p&gt;
</comment>
                            <comment id="91321" author="mhanafi" created="Mon, 11 Aug 2014 18:58:26 +0000"  >&lt;p&gt;&apos;read-inodes&apos; is a tasks reads the inodes and  runs every hour. We have been running that for a very long time. I can turn that tasks off.&lt;/p&gt;</comment>
                            <comment id="91325" author="mhanafi" created="Mon, 11 Aug 2014 19:29:18 +0000"  >&lt;p&gt;list of recently applied patches&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5040&quot; title=&quot;kernel BUG at fs/jbd2/transaction.c:1033&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5040&quot;&gt;&lt;del&gt;LU-5040&lt;/del&gt;&lt;/a&gt; osd: fix osd declare credit for quota &lt;br/&gt;
Revert &quot;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5040&quot; title=&quot;kernel BUG at fs/jbd2/transaction.c:1033&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5040&quot;&gt;&lt;del&gt;LU-5040&lt;/del&gt;&lt;/a&gt; osd: fix osd declare credit for quota&quot; &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3768&quot; title=&quot;&amp;quot;tunefs.lustre: &amp;#39;----index&amp;#39; only valid for MDT,OST&amp;quot; on a stand-alone MGS&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3768&quot;&gt;&lt;del&gt;LU-3768&lt;/del&gt;&lt;/a&gt; tunefs: make tunefs.lustre work correctly for MGS &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5040&quot; title=&quot;kernel BUG at fs/jbd2/transaction.c:1033&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5040&quot;&gt;&lt;del&gt;LU-5040&lt;/del&gt;&lt;/a&gt; osd: fix osd declare credit for quota&lt;br/&gt;
&lt;b&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3671&quot; title=&quot;why are permission changes synchronous?&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3671&quot;&gt;&lt;del&gt;LU-3671&lt;/del&gt;&lt;/a&gt; mdd: sync perm for dir and perm reduction only&lt;/b&gt;&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5179&quot; title=&quot;Reading files from lustre results in stuck anonymous memory when JOBID is enabled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5179&quot;&gt;&lt;del&gt;LU-5179&lt;/del&gt;&lt;/a&gt; libcfs: do not leak mm_struct &lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5188&quot; title=&quot;nbp6-OST002f-osc-MDT0000: invalid setattr record, lsr_valid:0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5188&quot;&gt;&lt;del&gt;LU-5188&lt;/del&gt;&lt;/a&gt; osp: Correctly check for invalid setattr record&lt;br/&gt;
&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4791&quot; title=&quot;lod_ah_init() ASSERTION( lc-&amp;gt;ldo_stripenr == 0 ) failed:&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4791&quot;&gt;&lt;del&gt;LU-4791&lt;/del&gt;&lt;/a&gt; lod: subtract xattr overhead in max EA size&lt;/p&gt;</comment>
                            <comment id="91357" author="mhanafi" created="Tue, 12 Aug 2014 01:12:38 +0000"  >&lt;p&gt;It looks this is a quota issue. Please see file uploaded &quot;uploads/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5469&quot; title=&quot;Intermittent Clients hang during IO &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5469&quot;&gt;&lt;del&gt;LU-5469&lt;/del&gt;&lt;/a&gt;/s194.dump.gz&quot; grep user &apos;9567&apos;&lt;/p&gt;</comment>
                            <comment id="91359" author="laisiyao" created="Tue, 12 Aug 2014 02:08:25 +0000"  >&lt;p&gt;Mahmoud, I encountered error when downloading s194.dump.gz:&lt;br/&gt;
The Finder can&#8217;t complete the operation because some data in &#8220;s194.dump.gz&#8221; can&#8217;t be read or written.&lt;br/&gt;
(Error code -36)&lt;/p&gt;

&lt;p&gt;Could you re-upload it or just paste related logs here?&lt;/p&gt;</comment>
                            <comment id="91395" author="green" created="Tue, 12 Aug 2014 15:11:13 +0000"  >&lt;p&gt;Lai - the log appears to be fine for me. Note it&apos;s not a tar file, just gzipped text&lt;/p&gt;</comment>
                            <comment id="91418" author="mhanafi" created="Tue, 12 Aug 2014 16:30:25 +0000"  >&lt;p&gt;There are 2 quota issues that was causing these hangs.&lt;/p&gt;

&lt;p&gt;1. The quota slaves had not connected to the master. We have seen this issue before and can only recover if the whole filesystem is unmounted and remounted. Is there a way to force the slave to retry connecting to the master without unmounting.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;target name:    nbp7-OST0048
pool ID:        0
type:           dt
quota enabled:  none
conn to master: not setup yet
space acct:     ug
user uptodate:  glb[0],slv[0],reint[1]
group uptodate: glb[0],slv[0],reint[1]
osd-ldiskfs.nbp7-OST004c.quota_slave.info=
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;2. If the salve is not connect to the master and enforcent is enable it will delay adjustment. There should be a max tries/timeout for this and return ENQUOTA.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;00040000:04000000:24.0:1407806066.559492:0:10442:0:(qsd_handler.c:111:qsd_ready()) $$$ connection to master not ready qsd:nbp7-OST0024 qtype:usr id:9567 enforced:1 granted:717643756 pending:0 waiting:0 req:0 usage:24808 qunit:0 qtune:0 edquot:0
00040000:04000000:24.0:1407806066.559494:0:10442:0:(qsd_handler.c:917:qsd_adjust()) $$$ delaying adjustment since qsd isn&apos;t ready qsd:nbp7-OST0024 qtype:usr id:9567 enforced:1 granted:717643756 pending:0 waiting:0 req:0 usage:24808 qunit:0 qtune:0 edquot:0
00040000:04000000:29.0:1407806067.111494:0:10463:0:(qsd_entry.c:219:qsd_refresh_usage()) $$$ disk usage: 11524 qsd:nbp7-OST003c qtype:usr id:9567 enforced:1 granted:671088640 pending:0 waiting:0 req:0 usage:11524 qunit:0 qtune:0 edquot:0
00040000:04000000:29.0:1407806067.111496:0:10463:0:(qsd_handler.c:111:qsd_ready()) $$$ connection to master not ready qsd:nbp7-OST003c qtype:usr id:9567 enforced:1 granted:671088640 pending:0 waiting:0 req:0 usage:11524 qunit:0 qtune:0 edquot:0
00040000:04000000:29.0:1407806067.111498:0:10463:0:(qsd_handler.c:917:qsd_adjust()) $$$ delaying adjustment since qsd isn&apos;t ready qsd:nbp7-OST003c qtype:usr id:9567 enforced:1 granted:671088640 pending:0 waiting:0 req:0 usage:11524 qunit:0 qtune:0 edquot:0
00040000:04000000:28.0:1407806067.879498:0:10663:0:(qsd_entry.c:219:qsd_refresh_usage()) $$$ disk usage: 30092 qsd:nbp7-OST0034 qtype:usr id:9567 enforced:1 granted:711605332 pending:0 waiting:0 req:0 usage:30092 qunit:0 qtune:0 edquot:0
00040000:04000000:28.0:1407806067.879500:0:10663:0:(qsd_handler.c:111:qsd_ready()) $$$ connection to master not ready qsd:nbp7-OST0034 qtype:usr id:9567 enforced:1 granted:711605332 pending:0 waiting:0 req:0 usage:30092 qunit:0 qtune:0 edquot:0
00040000:04000000:28.0:1407806067.879502:0:10663:0:(qsd_handler.c:917:qsd_adjust()) $$$ delaying adjustment since qsd isn&apos;t ready qsd:nbp7-OST0034 qtype:usr id:9567 enforced:1 granted:711605332 pending:0 waiting:0 req:0 usage:30092 qunit:0 qtune:0 edquot:0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Once I disabled enforcement all hung tasks freed up.&lt;/p&gt;
</comment>
                            <comment id="91440" author="pjones" created="Tue, 12 Aug 2014 18:26:46 +0000"  >&lt;p&gt;Niu&lt;/p&gt;

&lt;p&gt;Could you please comment on these quotas errors?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="91503" author="niu" created="Wed, 13 Aug 2014 01:59:51 +0000"  >&lt;blockquote&gt;
&lt;p&gt;Lai - the log appears to be fine for me. Note it&apos;s not a tar file, just gzipped text&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I can&apos;t download it neither, probably we can&apos;t download it as anonymous now? but I don&apos;t know which user should be used for the ftp.whamcloud.com.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;1. The quota slaves had not connected to the master. We have seen this issue before and can only recover if the whole filesystem is unmounted and remounted. Is there a way to force the slave to retry connecting to the master without unmounting.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;The pinger will always retry the connecting on all imports, so I don&apos;t think we need any way to force that. The problem is that we need to figure out why it can&apos;t connect to master without umount? I&apos;ll check the log to see if there is anything useful once I get the log.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;2. If the salve is not connect to the master and enforcent is enable it will delay adjustment. There should be a max tries/timeout for this and return ENQUOTA.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;I don&apos;t think it&apos;s correct to return EDQUOT in such case. The old quota design is to retry the dqacq on OST infinitely, however, that&apos;ll hold a OST service thread until the connection to master established. Our new quota design is to retry the write from client side infinitely,&lt;/p&gt;</comment>
                            <comment id="91527" author="mhanafi" created="Wed, 13 Aug 2014 15:11:21 +0000"  >&lt;p&gt;The wait  from client side has the side effect of all commands, ls for example, hanging from the effective directory. So if the user has the effected path in their path logins will hang. &lt;/p&gt;

&lt;p&gt;At the minimum an error should be logged. &lt;/p&gt;
</comment>
                            <comment id="91596" author="niu" created="Thu, 14 Aug 2014 05:48:19 +0000"  >&lt;p&gt;I can&apos;t see why slave can&apos;t connect to master from the log. Could you explain how did you resolve the problem in detail? (just umount then mount all MDT &amp;amp; OSTs, or writeconf is required?) I&apos;m wondering if it&apos;s an upgraded system? (because of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5298&quot; title=&quot;The lwp device cannot be started when we migrate from Lustre 2.1 to Lustre 2.4&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5298&quot;&gt;&lt;del&gt;LU-5298&lt;/del&gt;&lt;/a&gt;, writeconf is required for the upgraded system, otherwise, slave won&apos;t connect to master).&lt;/p&gt;</comment>
                            <comment id="91635" author="mhanafi" created="Thu, 14 Aug 2014 17:42:55 +0000"  >&lt;p&gt;Yes the system was upgraded. We didn&apos;t the writeconf. I checked the MDT0000-mdc in the client long and it does show up as 2.1.5&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;#15 (224)marker   5 (flags=0x02, v2.1.5.0) nbp7-MDT0000    &lt;span class=&quot;code-quote&quot;&gt;&apos;add mdc&apos;&lt;/span&gt; Sun Jun 16 20:10:35 2013-
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;To do a writeconf is not ideal, because it requires taking the filsystem offline.&lt;/p&gt;</comment>
                            <comment id="91699" author="niu" created="Fri, 15 Aug 2014 04:14:36 +0000"  >&lt;blockquote&gt;
&lt;p&gt;To do a writeconf is not ideal, because it requires taking the filsystem offline.&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;Then what I can think of is:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;dump out the client log to a local file, then modify the version of &quot;add mdc&quot; to current lustre version; (by some hex editor?)&lt;/li&gt;
	&lt;li&gt;umount the mdt (or mgs, if it&apos;s not combined with mdt), mount it as ldiskfs and replace the client log with your modified local file;&lt;/li&gt;
	&lt;li&gt;mount mdt;&lt;/li&gt;
	&lt;li&gt;umount/mount all OSTs to reprocess client log, and that&apos;ll trigger slave connect to master;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;But I strongly suggest you use writeconf on next maintaining down time instead of using such dangerous method.&lt;/p&gt;</comment>
                            <comment id="145176" author="mhanafi" created="Thu, 10 Mar 2016 20:07:38 +0000"  >&lt;p&gt;Please close. no longer an issue.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="15502" name="service194.gz" size="104277" author="mhanafi" created="Sat, 9 Aug 2014 07:21:36 +0000"/>
                            <attachment id="15507" name="service64.trace.gz" size="88959" author="mhanafi" created="Mon, 11 Aug 2014 18:09:34 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwtb3:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15236</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>