<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:31:51 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3202] SWL Causes OST hang</title>
                <link>https://jira.whamcloud.com/browse/LU-3202</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Started SWL. Had immediate watchdog timeouts on OSTs, clients are stuck in &apos;comp&apos; state, are evicted by servers and unable to reconnect, as servers are returning EBUSY. Appears to be some sort of watchdog storm on OSTs, stack dump attached. &lt;/p&gt;</description>
                <environment>LLNL/Hyperion</environment>
        <key id="18480">LU-3202</key>
            <summary>SWL Causes OST hang</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="1" iconUrl="https://jira.whamcloud.com/images/icons/priorities/blocker.svg">Blocker</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="yong.fan">nasf</assignee>
                                    <reporter username="cliffw">Cliff White</reporter>
                        <labels>
                            <label>HB</label>
                    </labels>
                <created>Mon, 22 Apr 2013 18:33:18 +0000</created>
                <updated>Fri, 26 Apr 2013 13:00:14 +0000</updated>
                            <resolved>Fri, 26 Apr 2013 12:29:35 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="56716" author="cliffw" created="Mon, 22 Apr 2013 18:34:12 +0000"  >&lt;p&gt;sysrq -p from one OST&lt;/p&gt;</comment>
                            <comment id="56742" author="cliffw" created="Mon, 22 Apr 2013 21:16:55 +0000"  >&lt;p&gt;Reformatted the filesystem, restarted the test, had failure immediately. attached are consoles from all OSTs, and console from the first client to be evicted.&lt;/p&gt;</comment>
                            <comment id="56746" author="pjones" created="Mon, 22 Apr 2013 22:12:37 +0000"  >&lt;p&gt;Oleg is looking into this&lt;/p&gt;</comment>
                            <comment id="56752" author="green" created="Mon, 22 Apr 2013 23:13:24 +0000"  >&lt;p&gt;This seems to be a pretty clear deadlock:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-04-22 11:21:58 ll_ost01_001  D 000000000000000a     0  5490      2 0x000000
00
2013-04-22 11:21:58  ffff8803b0c4f9b0 0000000000000046 0000000000000000 ffff8803
2f4e8950
2013-04-22 11:21:58  ffff8803b0c4fa10 ffff8802ad507020 ffff8803b0c4f9a0 ffff8802
0a13c518
2013-04-22 11:21:58  ffff8803b0c49058 ffff8803b0c4ffd8 000000000000fb88 ffff8803
b0c49058
2013-04-22 11:21:58 Call Trace:
2013-04-22 11:21:58  [&amp;lt;ffffffff8150f3be&amp;gt;] __mutex_lock_slowpath+0x13e/0x180
2013-04-22 11:21:58  [&amp;lt;ffffffff8150f25b&amp;gt;] mutex_lock+0x2b/0x50
2013-04-22 11:21:58  [&amp;lt;ffffffffa100265e&amp;gt;] osd_object_destroy+0x1be/0x580 [osd_ldiskfs]
2013-04-22 11:21:58  [&amp;lt;ffffffffa089ebcf&amp;gt;] ? ldiskfs_dirty_inode+0x4f/0x60 [ldiskfs]
2013-04-22 11:21:58  [&amp;lt;ffffffffa10dcd20&amp;gt;] ofd_object_destroy+0x380/0x680 [ofd]
2013-04-22 11:21:58  [&amp;lt;ffffffffa10d0056&amp;gt;] ofd_destroy_by_fid+0x266/0x610 [ofd]
2013-04-22 11:21:58  [&amp;lt;ffffffffa0aca7a0&amp;gt;] ? ldlm_blocking_ast+0x0/0x180 [ptlrpc]
2013-04-22 11:21:58  [&amp;lt;ffffffffa0acbe00&amp;gt;] ? ldlm_completion_ast+0x0/0x960 [ptlrpc]
2013-04-22 11:21:58  [&amp;lt;ffffffffa0af4345&amp;gt;] ? lustre_msg_buf+0x55/0x60 [ptlrpc]
2013-04-22 11:21:58  [&amp;lt;ffffffffa10d3bb7&amp;gt;] ofd_destroy+0x1a7/0x8b0 [ofd]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;vs&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-04-22 11:21:56  [&amp;lt;ffffffffa0873ce5&amp;gt;] jbd2_log_wait_commit+0xc5/0x140 [jbd2]
2013-04-22 11:21:56  [&amp;lt;ffffffff81096ca0&amp;gt;] ? autoremove_wake_function+0x0/0x40
2013-04-22 11:21:56  [&amp;lt;ffffffffa089714e&amp;gt;] ldiskfs_sync_file+0x15e/0x260 [ldiskfs
]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0fff854&amp;gt;] osd_object_sync+0x144/0x190 [osd_ldisk
fs]
2013-04-22 11:21:56  [&amp;lt;ffffffffa10cfadb&amp;gt;] ofd_sync+0x36b/0x680 [ofd]
2013-04-22 11:21:56  [&amp;lt;ffffffffa109eb30&amp;gt;] ost_blocking_ast+0x640/0x10f0 [ost]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0964baf&amp;gt;] ? lu_context_fini+0x2f/0xc0 [obdclass]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0aab16c&amp;gt;] ldlm_cancel_callback+0x6c/0x1a0 [ptlrp
c]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0aab2fa&amp;gt;] ldlm_lock_cancel+0x5a/0x1e0 [ptlrpc]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0aceee4&amp;gt;] ldlm_request_cancel+0x254/0x410 [ptlrp
c]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0acf1dd&amp;gt;] ldlm_handle_cancel+0x13d/0x240 [ptlrpc
]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0ad1218&amp;gt;] ldlm_cancel_handler+0x3f8/0x600 [ptlrp
c]
2013-04-22 11:21:56  [&amp;lt;ffffffffa0b0539c&amp;gt;] ptlrpc_server_handle_request+0x40c/0xd
90 [ptlrpc]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In fact osd_object_destroy trying to take i_mutex while having a transaction open is a big no-no as it could lead to this exact lock inversion - usually i_mutex is taken before having a transaction open.&lt;br/&gt;
The ldiskfs_sync_file is called with i_mutex_held.&lt;/p&gt;

&lt;p&gt;What&apos;s unclear is how come we did not hit this any earlier, this code has been present in our tree since mid last year, it seems.&lt;/p&gt;</comment>
                            <comment id="56772" author="green" created="Tue, 23 Apr 2013 05:50:34 +0000"  >&lt;p&gt;Reassigned to FanYong as he&apos;s going to take it further with a fix.&lt;/p&gt;</comment>
                            <comment id="56776" author="yong.fan" created="Tue, 23 Apr 2013 06:43:23 +0000"  >&lt;p&gt;Originally, to control the race between the OI scrub inserting&lt;br/&gt;
OI mapping and the osd_object_destroy() removing OI mapping on&lt;br/&gt;
the same target, the inode::i_mutex was used.&lt;/p&gt;

&lt;p&gt;But the unlink thread which called osd_object_destroy() already&lt;br/&gt;
started a transaction handle. Such ordering is different from the others&lt;br/&gt;
and may cause a deadlock between starting the transaction and&lt;br/&gt;
obtaining inode::i_mutex.&lt;/p&gt;

&lt;p&gt;So now, the osd_object_destroy() will not obtain inode::i_mutex,&lt;br/&gt;
instead, the OI scrub will check whether someone unlinked the&lt;br/&gt;
inode or not during the OI scrub rebuilding the OI mapping, and&lt;br/&gt;
remove the new-inserted OI mapping if the race happened.&lt;/p&gt;

&lt;p&gt;This is the patch:&lt;br/&gt;
&lt;a href=&quot;http://review.whamcloud.com/#change,6124&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,6124&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="57118" author="jlevi" created="Fri, 26 Apr 2013 12:29:35 +0000"  >&lt;p&gt;Patch landed. Let me know if more work is needed and I will reopen the ticket.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="12554" name="dit32.console.gz" size="105256" author="cliffw" created="Mon, 22 Apr 2013 18:34:12 +0000"/>
                            <attachment id="12555" name="srv.tar.gz" size="55596" author="cliffw" created="Mon, 22 Apr 2013 21:16:55 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvoo7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7822</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>