<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:08:57 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-634] LBUG in Kerberos sec.c::sptlrpc_req_ctx_switch()</title>
                <link>https://jira.whamcloud.com/browse/LU-634</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Aug 18 14:14:36 extenci kernel: LustreError: 3532:0:(sec.c:468:sptlrpc_req_ctx_switch()) ASSERTION(req-&amp;gt;rq_reqmsg) failed&lt;br/&gt;
Aug 18 14:14:36 extenci kernel: LustreError: 3532:0:(sec.c:468:sptlrpc_req_ctx_switch()) LBUG&lt;/p&gt;</description>
                <environment>Lustre 2.0.63 clients&lt;br/&gt;
Kernel 2.6.18-238.12.1.el5xen&lt;br/&gt;
&lt;br/&gt;
&lt;br/&gt;
Crashes the client only, even with a simple ls.  Apparently no pattern, and not easily reproducible.&lt;br/&gt;
Patched/fixed in later versions?</environment>
        <key id="11565">LU-634</key>
            <summary>LBUG in Kerberos sec.c::sptlrpc_req_ctx_switch()</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="4">Incomplete</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="josephin">Josephine Palencia</reporter>
                        <labels>
                    </labels>
                <created>Thu, 25 Aug 2011 06:36:41 +0000</created>
                <updated>Wed, 17 Feb 2021 19:35:03 +0000</updated>
                            <resolved>Sun, 28 May 2017 07:04:59 +0000</resolved>
                                    <version>Lustre 2.1.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>11</watches>
                                                                            <comments>
                            <comment id="20359" author="green" created="Tue, 20 Sep 2011 16:06:06 +0000"  >&lt;p&gt;Can we at least get some logs, it&apos;s impossible to evaluate this otherwise.&lt;/p&gt;</comment>
                            <comment id="26884" author="dwd" created="Thu, 19 Jan 2012 11:41:04 +0000"  >&lt;p&gt;Here&apos;s a much more detailed console log of the crash that Josephine reported.  This is with the latest lustre tag from which I built lustre-client-2.1.54-2.6.18_274.12.1.el5.x86_64.  In case it matters, the configure options were --disable-server --enable-dependency-tracking --enable-posix-osd --enable-panic_dumplog --enable-health_write --enable-lru-resize --enable-gss --enable-quota --enable-ext4 --enable-mindf.&lt;/p&gt;</comment>
                            <comment id="27271" author="josephin" created="Tue, 24 Jan 2012 18:56:49 +0000"  >&lt;p&gt;Also for kernel-2.6.18-274.17.1.el5xen, &lt;br/&gt;
         lustre: 2.1.54&lt;/p&gt;


&lt;p&gt;Jan 24 10:13:09 extenci kernel: Lustre: 1775:0:(gss_keyring.c:970:gss_sec_gc_ctx_kr()) running gc&lt;br/&gt;
Jan 24 10:25:23 extenci kernel: Lustre: 3695:0:(sec_gss.c:405:gss_cli_ctx_uptodate()) client refreshed ctx ffff88030bfa9c80 idx 0xbf75804b3ef9f4af (77513-&amp;gt;extenci-MDT0000_UUID), expiry 1327471191(+52468s)&lt;br/&gt;
Jan 24 10:26:40 extenci kernel: Lustre: 3740:0:(sec_gss.c:405:gss_cli_ctx_uptodate()) client refreshed ctx ffff88030bfa9880 idx 0xbf75804b3ef9f4b0 (77513-&amp;gt;extenci-MDT0000_UUID), expiry 1327471191(+52391s)&lt;br/&gt;
Jan 24 10:26:43 extenci kernel: Lustre: 3784:0:(sec_gss.c:405:gss_cli_ctx_uptodate()) client refreshed ctx ffff8802640873c0 idx 0xbf75804b3ef9f4b1 (77513-&amp;gt;extenci-MDT0000_UUID), expiry 1327471191(+52388s)&lt;br/&gt;
Jan 24 11:13:09 extenci kernel: Lustre: 1775:0:(gss_keyring.c:970:gss_sec_gc_ctx_kr()) running gc&lt;br/&gt;
Jan 24 11:57:16 extenci kernel: Lustre: 31454:0:(sec_gss.c:345:cli_ctx_expire()) ctx ffff880236f42280(77602-&amp;gt;extenci-MDT0000_UUID) get expired: 1327422052(-2184s)&lt;br/&gt;
Jan 24 11:57:16 extenci kernel: LustreError: 31454:0:(sec.c:468:sptlrpc_req_ctx_switch()) ASSERTION(req-&amp;gt;rq_reqmsg) failed&lt;br/&gt;
Jan 24 11:57:16 extenci kernel: LustreError: 31454:0:(sec.c:468:sptlrpc_req_ctx_switch()) LBUG&lt;br/&gt;
Jan 24 11:57:16 extenci kernel: Pid: 31454, comm: bash&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:&lt;br/&gt;
Jan 24 11:57:16 extenci kernel: Call Trace:&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88425641&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x51/0x60 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88425b7a&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x7a/0xd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff88430cf0&amp;gt;&amp;#93;&lt;/span&gt; cfs_tracefile_init+0x0/0x10a &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8860f81c&amp;gt;&amp;#93;&lt;/span&gt; sptlrpc_req_replace_dead_ctx+0x24c/0xad0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8031b3d1&amp;gt;&amp;#93;&lt;/span&gt; request_key_and_link+0x41/0x4e9&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8860c3bc&amp;gt;&amp;#93;&lt;/span&gt; sptlrpc_import_sec_ref+0x1c/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 11:57:16 extenci kernel:  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8860f1f8&amp;gt;&amp;#93;&lt;/span&gt; import_sec_validate_get+0xd8/0x1f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
Jan 24 17:43:10 extenci syslogd 1.4.1: restart.&lt;/p&gt;</comment>
                            <comment id="27713" author="dwd" created="Wed, 1 Feb 2012 10:06:17 +0000"  >&lt;p&gt;On kernel-2.6.18-274.17.1.el5 (with lustre 2.1.54) it seems to take a lot longer to reproduce the problem, but it still happens.  Another crash log is attached.&lt;/p&gt;</comment>
                            <comment id="28122" author="green" created="Wed, 8 Feb 2012 01:34:42 +0000"  >&lt;p&gt;Ah, so it&apos;s some sort of kerberos deployment?&lt;br/&gt;
This is not really supported at the moment, sorry.&lt;/p&gt;</comment>
                            <comment id="28173" author="dwd" created="Wed, 8 Feb 2012 12:07:22 +0000"  >&lt;p&gt;That&apos;s disappointing news to me.  Can you please tell us who contributed the Kerberos code then?  Perhaps they would be interested in improving its quality, or could at least advise us on how to debug it.  We are part of an NSF-funded project (extenci.org) that is in part evaluating wide-area lustre for use by Large Hadron Collider experiments.  Working, non-crashing Kerberos functionality is a vital part of that.&lt;/p&gt;</comment>
                            <comment id="28277" author="pjones" created="Thu, 9 Feb 2012 09:12:22 +0000"  >&lt;p&gt;Dave&lt;/p&gt;

&lt;p&gt;I&apos;ll reach out to you directly to discuss this&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="46897" author="santoshkr" created="Thu, 25 Oct 2012 02:33:31 +0000"  >&lt;p&gt;Details regarding the issues and the patch.&lt;/p&gt;

&lt;p&gt;Fixes are specific to lustre-2.1.54 code for providing kerberos support.&lt;br/&gt;
LU-634 was not specific to the ASSERT crashes alone but had other issues too. The attached patch and the write-up below describe the issues and the corresponding fixes.&lt;/p&gt;

&lt;p&gt;1. ASSERTION Crashes&lt;/p&gt;

&lt;p&gt;After a TGT expires, the ticket is no longer refreshed automatically. The user must authenticate with Kerberos again to get a new TGT.&lt;/p&gt;

&lt;p&gt;To fix the assertion failure that currently results in a crash, we are removing the ASSERT check carried out in sptlrpc_req_ctx_switch(). Request buffers are normally allocated before sptlrpc_req_ctx_switch() is called, but in the case of an import context check a fake request is created which does not have a request message buffer allocated. The check is therefore not required, as the code further down the line takes care of it.&lt;/p&gt;


&lt;p&gt;NOTE: the root user will not encounter this problem, because root uses a pre-installed keytable service credential and hence can refresh its tickets automatically.&lt;/p&gt;


&lt;p&gt;2. ldlm_cli_cancel_local LBUG&lt;/p&gt;

&lt;p&gt;When a new lock is created by ldlm_cli_enqueue and it later fails to allocate memory for a new request issued by ptlrpc_request, control reaches ldlm_cli_cancel_local().&lt;/p&gt;

&lt;p&gt;This is fixed as part of the latest lustre code. The relevant structures are not filled in (lock-&amp;gt;l_conn_export is NULL), so control enters the else-statement because lock-&amp;gt;l_conn_export has not been set so far, and as a result the LBUG catches it, as it is a client side lock. Part of the fix is to fill in some lock fields before the memory allocation for a new request.&lt;/p&gt;


&lt;p&gt;3. mdc_lock NULL pointer dereference&lt;/p&gt;

&lt;p&gt;While reserving memory in mdc_enqueue, when a call from ldlm_cli_enqueue() fails, control enters mdc_clear_replay_flag(), which tries to clear the flags so as not to hold any error requests for replay. Because ptlrpc_request is still NULL, this results in a NULL pointer dereference that the code fails to handle. Code has been added to handle NULL pointers.&lt;/p&gt;</comment>
                            <comment id="46898" author="santoshkr" created="Thu, 25 Oct 2012 02:36:36 +0000"  >&lt;p&gt;&amp;gt; cd srcdir&lt;br/&gt;
&amp;gt; patch -p1 &amp;lt; LU634-lfs-2.1.54.patch&lt;/p&gt;</comment>
                            <comment id="52117" author="adilger" created="Mon, 11 Feb 2013 12:25:16 +0000"  >&lt;p&gt;Please submit this patch to Gerrit for inspection, testing, and landing. The patch submission process is described at &lt;a href=&quot;http://wiki.whamcloud.com/display/PUB/Patch+Landing+Process+Summary&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://wiki.whamcloud.com/display/PUB/Patch+Landing+Process+Summary&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="57862" author="adilger" created="Tue, 7 May 2013 21:36:21 +0000"  >&lt;p&gt;Alex, Josephine, Santosh, Dave,&lt;br/&gt;
is someone with an interest in functioning Kerberos able to update these patches to match the Lustre Coding Guidelines (&lt;a href=&quot;https://wiki.hpdd.intel.com/display/PUB/Coding+Guidelines&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/PUB/Coding+Guidelines&lt;/a&gt;), test them locally against 2.1 and/or 2.4, and submit them to Gerrit for review and regression testing?&lt;/p&gt;

&lt;p&gt;The patches themselves need a bit of work, and cannot be landed as-is, since they would spew messages onto the console under normal operation, but that should be apparent during your local testing.  Unfortunately, we do not have any facilities or expertise to test Kerberos-enabled Lustre ourselves, and no funding to hire someone to do this.&lt;/p&gt;

&lt;p&gt;That said, I&apos;m always interested to get bug fixes into the released versions of Lustre, so if these patches are important to you it is in your own best interest to move them through the process for landing.&lt;/p&gt;</comment>
                            <comment id="66535" author="pjones" created="Thu, 12 Sep 2013 18:06:09 +0000"  >&lt;p&gt;As per Xyratex on the CDWG call, this patch was only a prototype and is not being upstreamed - &lt;a href=&quot;http://wiki.opensfs.org/CDWG_Minutes_2013-09-11&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://wiki.opensfs.org/CDWG_Minutes_2013-09-11&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="197270" author="adilger" created="Sun, 28 May 2017 07:04:59 +0000"  >&lt;p&gt;Close old issue.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="10748" name="LU-634-crash.log" size="6545" author="dwd" created="Thu, 19 Jan 2012 11:41:04 +0000"/>
                            <attachment id="10803" name="LU-634-crash2.log" size="3626" author="dwd" created="Wed, 1 Feb 2012 10:06:17 +0000"/>
                            <attachment id="11977" name="LU634-lfs-2.1.54.patch" size="3618" author="santoshkr" created="Thu, 25 Oct 2012 02:36:36 +0000"/>
                            <attachment id="11830" name="console-07.09.12" size="7136" author="josephin" created="Sat, 8 Sep 2012 09:18:06 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvqdr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>8152</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>