<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:19:46 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8696] &quot;ls&quot; hangs on a particular directory on production system </title>
                <link>https://jira.whamcloud.com/browse/LU-8696</link>
                <project id="10000" key="LU">Lustre</project>
<description>&lt;p&gt;On the atlas2 file system, we have a particular directory where any operation such as &quot;ls&quot; or &quot;stat&quot; completely hangs the process. This produces no OS error or Lustre error on the client side. On the server side, we did observe OI scrub messages a few times, which may suggest there is some MDS data inconsistency that the MDS is &quot;trying&quot; to fix, but to no avail. We can&apos;t correlate the two yet.&lt;/p&gt;

&lt;p&gt;The ops team collected traces on the client side as follows. Setup:&lt;/p&gt;

&lt;p&gt;mount -t lustre 10.36.226.77@o2ib:/atlas2 /lustre/atlas2 -o rw,flock,nosuid,nodev&lt;br/&gt;
lctl set_param osc/*/checksums 0&lt;br/&gt;
echo &quot;all&quot; &amp;gt; /proc/sys/lnet/debug&lt;br/&gt;
echo &quot;1024&quot; &amp;gt; /proc/sys/lnet/debug_mb&lt;/p&gt;

&lt;p&gt;Step 1: lctl dk &amp;gt; /dev/null&lt;br/&gt;
Step 2: cd /lustre/atlas2/path/to/offending_directory/&lt;br/&gt;
Step 3: ls&lt;br/&gt;
Step 4: Wait 30 seconds&lt;br/&gt;
Step 5: lctl dk &amp;gt; atlas2-mds3_ls_for_fprof.out&lt;/p&gt;
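
&lt;p&gt;For convenience, a minimal consolidated sketch of the steps above (the directory path is the placeholder from this report):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;#!/bin/sh
# Sketch only: client-side debug trace collection around the hanging &quot;ls&quot;.
mount -t lustre 10.36.226.77@o2ib:/atlas2 /lustre/atlas2 -o rw,flock,nosuid,nodev
lctl set_param osc/*/checksums 0
echo &quot;all&quot; &amp;gt; /proc/sys/lnet/debug      # enable all debug flags
echo &quot;1024&quot; &amp;gt; /proc/sys/lnet/debug_mb  # 1024 MB debug buffer
lctl dk &amp;gt; /dev/null                    # drain old debug messages
cd /lustre/atlas2/path/to/offending_directory/
ls &amp;amp;                                   # expected to hang, so run in background
sleep 30                               # let the hang accumulate trace data
lctl dk &amp;gt; atlas2-mds3_ls_for_fprof.out # dump the captured trace
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;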

&lt;p&gt;the log is attached.&lt;/p&gt;</description>
                <environment>OLCF Atlas production system: clients running 2.8.0+ (with patches), server running 2.5.5+ (with patches)</environment>
        <key id="40527">LU-8696</key>
            <summary>&quot;ls&quot; hangs on a particular directory on production system </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="fwang2">Feiyi Wang</reporter>
                        <labels>
                    </labels>
                <created>Wed, 12 Oct 2016 15:58:37 +0000</created>
                <updated>Mon, 13 Nov 2017 23:08:18 +0000</updated>
                            <resolved>Thu, 9 Mar 2017 18:19:05 +0000</resolved>
                                    <version>Lustre 2.5.3</version>
                    <version>Lustre 2.8.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="169300" author="fwang2" created="Wed, 12 Oct 2016 15:59:39 +0000"  >&lt;p&gt;from client side&lt;/p&gt;</comment>
                            <comment id="169681" author="jgmitter" created="Fri, 14 Oct 2016 17:15:26 +0000"  >&lt;p&gt;Hi Fan Yong,&lt;/p&gt;

&lt;p&gt;Could you please have a look at this issue?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="170684" author="yong.fan" created="Sun, 23 Oct 2016 12:26:04 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;...
00000002:00100000:1.0:1476224347.810367:0:14314:0:(mdc_locks.c:612:mdc_finish_enqueue()) @@@ op: 8 disposition: b, status: -115  req@ffff883f9d89a0c0 x1547933307679040/t0(0) o101-&amp;gt;atlas2-MDT0000-mdc-ffff883fe0706c00@10.36.226.121@o2ib:12/10 lens 696/536 e 0 to 0 dl 1476224665 ref 1 fl Interpret:R/0/0 rc 301/301
00010000:00000001:1.0:1476224347.810373:0:14314:0:(ldlm_lock.c:598:__ldlm_handle2lock()) Process entered
00000020:00000001:1.0:1476224347.810373:0:14314:0:(lustre_handles.c:160:class_handle2object()) Process entered
00000020:00000001:1.0:1476224347.810374:0:14314:0:(lustre_handles.c:183:class_handle2object()) Process leaving (rc=0 : 0 : 0)
00010000:00000001:1.0:1476224347.810376:0:14314:0:(ldlm_lock.c:604:__ldlm_handle2lock()) Process leaving (rc=0 : 0 : 0)
00000002:00000001:1.0:1476224347.810377:0:14314:0:(mdc_locks.c:741:mdc_finish_enqueue()) Process leaving (rc=0 : 0 : 0)
00000002:00000001:1.0:1476224347.810378:0:14314:0:(mdc_locks.c:922:mdc_finish_intent_lock()) Process entered
00000002:00000001:1.0:1476224347.810379:0:14314:0:(mdc_locks.c:946:mdc_finish_intent_lock()) Process leaving (rc=18446744073709551501 : -115 : ffffffffffffff8d)
00000002:00000001:1.0:1476224347.810380:0:14314:0:(mdc_locks.c:1195:mdc_intent_getattr_async_interpret()) Process leaving
00000080:00000001:1.0:1476224347.810382:0:14314:0:(statahead.c:663:ll_statahead_interpret()) Process entered
00000080:00400000:1.0:1476224347.810382:0:14314:0:(statahead.c:675:ll_statahead_interpret()) sa_entry DESIGN_3D_MAR2013 rc -115
00000080:00000001:1.0:1476224347.810384:0:14314:0:(dcache.c:269:ll_intent_release()) Process entered
00000080:00000040:1.0:1476224347.810385:0:14314:0:(dcache.c:271:ll_intent_release()) intent ffff883f928a0d38 released
00000080:00000001:1.0:1476224347.810386:0:14314:0:(dcache.c:282:ll_intent_release()) Process leaving
00000080:00000010:1.0:1476224347.810387:0:14314:0:(statahead.c:680:ll_statahead_interpret()) kfreed &apos;minfo&apos;: 480 at ffff883f928a0c00.
00000080:00000001:1.0:1476224347.810391:0:14314:0:(statahead.c:709:ll_statahead_interpret()) Process leaving (rc=18446744073709551501 : -115 : ffffffffffffff8d)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As shown in the client-side log, when the client wants to stat the object DESIGN_3D_MAR2013, it hits a -115 (-EINPROGRESS) failure. That means the MDT is running OI scrub, and the target OI mapping for DESIGN_3D_MAR2013 is not ready yet. So the client has to retry again and again until the related OI mapping is rebuilt, which makes it look hung. As for what caused the OI scrub, we need the MDS-side kernel debug log (-1 level).&lt;/p&gt;

&lt;p&gt;So there should be some OI inconsistency. I would suggest you run OI scrub on the MDT with the kernel debug log collected (please enable lfsck debug: &quot;lctl set_param debug=+lfsck&quot;). Please show me the output of &quot;lctl get_param -n osd-ldiskfs.$fsname-MDT0000.oi_scrub&quot; before and after the OI scrub. Thanks!&lt;/p&gt;
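
&lt;p&gt;A minimal sketch of the suggested collection on the MDS, using atlas2 for $fsname (the log file names are placeholders):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl set_param debug=+lfsck                            # enable lfsck debug flag
lctl get_param -n osd-ldiskfs.atlas2-MDT0000.oi_scrub &amp;gt; oi_scrub.before
# ... run the OI scrub here, then afterwards:
lctl get_param -n osd-ldiskfs.atlas2-MDT0000.oi_scrub &amp;gt; oi_scrub.after
lctl dk &amp;gt; mds_scrub_debug.log                          # dump the kernel debug log
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>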
                            <comment id="170926" author="dustb100" created="Tue, 25 Oct 2016 11:53:27 +0000"  >&lt;p&gt;Nasf,  &lt;br/&gt;
      Per Intel&apos;s recommendation, we ran an e2fsck during our last test shot to see if the problem gets fixed (despite the OI scrubber messages that we were seeing in the logs). We did find some non-critical issues, but we are still seeing the same hanging behavior with this directory. We have to take a downtime to temporarily upgrade to lustre-2.8 to use a functional LFSCK. I&apos;m not 100% sure when we will get this opportunity, but I will keep it on our radar. For your reference, this is the IO scrub lfs get_param info you were wanting:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@atlas2-mds1 mdt]# lctl get_param -n osd-ldiskfs.atlas2-MDT0000.oi_scrub
name: OI_scrub
magic: 0x4c5fd252
oi_files: 64
status: completed
flags:
param:
time_since_last_completed: 559 seconds
time_since_latest_start: 5295 seconds
time_since_last_checkpoint: 559 seconds
latest_start_position: 12
last_checkpoint_position: 1073741825
first_failure_position: N/A
checked: 406401957
updated: 0
failed: 0
prior_updated: 0
noscrub: 192023
igif: 158502
success_count: 1140
run_time: 4736 seconds
average_speed: 85811 objects/sec
real-time_speed: N/A
current_position: N/A
lf_scanned: 0
lf_reparied: 0
lf_failed: 0&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Dustin &lt;/p&gt;</comment>
                            <comment id="170954" author="yong.fan" created="Tue, 25 Oct 2016 14:39:34 +0000"  >&lt;p&gt;It is strange that the OI scrub has not found inconsistency. It should be some OI scrub issue.&lt;br/&gt;
Do you have the MDS side -1 level Lustre kernel debug logs when the &quot;ls&quot; hung? On the other hand, would you please to use &quot;debugfs&quot; to dump the directory and its sub-items that caused the system hung when &quot;ls&quot;? Thanks!&lt;/p&gt;</comment>
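
&lt;p&gt;A minimal sketch of such a dump, assuming a placeholder MDT device /dev/mdt_dev and directory path (neither is given in this ticket):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# Read-only (catastrophic mode) inspection of the ldiskfs MDT backend:
debugfs -c -R &apos;stat ROOT/path/to/offending_directory&apos; /dev/mdt_dev
debugfs -c -R &apos;ls -l ROOT/path/to/offending_directory&apos; /dev/mdt_dev
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>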
                            <comment id="171295" author="yong.fan" created="Thu, 27 Oct 2016 02:09:13 +0000"  >&lt;p&gt;Another possible reason is that the FID-in-dirent is corrupted, that can explain why the OI scrub was triggered but no inconsistent OI mapping was found. It can be verified via &quot;lctl set_param fail_loc=0x1505&quot; on the MDS and try &quot;ls&quot; again after the setting. If it still hung there, then it is NOT the case; otherwise, we found the reason.&lt;/p&gt;</comment>
                            <comment id="175080" author="yong.fan" created="Sat, 26 Nov 2016 08:21:29 +0000"  >&lt;p&gt;Dustin, do you have more logs or any feedback about trying &quot;lctl set_param fail_loc=0x1505&quot; on the MDS? Thanks!&lt;/p&gt;</comment>
                            <comment id="181831" author="yong.fan" created="Tue, 24 Jan 2017 02:57:25 +0000"  >&lt;p&gt;Ping.&lt;/p&gt;</comment>
                            <comment id="181929" author="dustb100" created="Tue, 24 Jan 2017 15:42:49 +0000"  >&lt;p&gt;Sorry for the delays Nasf! We will be having an outage on Feb. 07 to test lustre-2.8 servers and will hopefully leave it in production. After this outage we can run an online lfsck to see if this probably gets resolved. &lt;/p&gt;</comment>
                            <comment id="186433" author="simmonsja" created="Tue, 28 Feb 2017 15:48:16 +0000"  >&lt;p&gt;The command &quot;lctl set_param fail_loc=0x1505&quot; was run on the MDS and it fixed the problem. Thanks nasf&lt;/p&gt;</comment>
                            <comment id="186438" author="yong.fan" created="Tue, 28 Feb 2017 16:13:22 +0000"  >&lt;p&gt;Sorry, some misguide, &quot;lctl set_param fail_loc=0x1505&quot; is used for bypass FID-in-dirent when it is broken. So if your system hang before, but works well with &quot;lctl set_param fail_loc=0x1505&quot;, that means it is quite possible that some FID-in-dirent is broken. Under such case, you need to run namespace LFSCK (with &quot;lctl set_param fail_loc=0&quot;) to repair the FID-in-dirent. Otherwise, bypass FID-in-dirent will slowdown the lookup() performance.&lt;/p&gt;</comment>
                            <comment id="186452" author="simmonsja" created="Tue, 28 Feb 2017 16:55:09 +0000"  >&lt;p&gt;Since this is the case we will leave this open until the lfsck run.&lt;/p&gt;</comment>
                            <comment id="187678" author="simmonsja" created="Thu, 9 Mar 2017 18:19:05 +0000"  >&lt;p&gt;lfsck run seems to have fixed this.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10324">
                    <name>Cloners</name>
                                                                <inwardlinks description="is cloned by">
                                        <issuelink>
            <issuekey id="49305">LU-10237</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="23403" name="atlas2-mds3_ls_for_fprof.out.gz" size="2525153" author="fwang2" created="Wed, 12 Oct 2016 15:59:39 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyrjj:</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>