<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:36:09 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-3698] sec.c:1060:sptlrpc_cli_unwrap_reply() ASSERTION( req-&gt;rq_repdata == ((void *)0) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-3698</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;As seen on a Hyperion client iwc1&lt;/p&gt;

&lt;p&gt;There was serious memory pressure; lots of allocations were failing. &lt;/p&gt;

&lt;p&gt;Build:&lt;br/&gt;
&lt;a href=&quot;http://build.whamcloud.com/job/lustre-master/1594/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://build.whamcloud.com/job/lustre-master/1594/&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Lustre: Lustre: Build Version: jenkins-arch=x86_64,build_type=client,distro=el6,ib_stack=inkernel-1594-gbdf591f-PRISTINE-2.6.32-358.11.1.el6.x86_64
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2013-08-05 10:27:53 LustreError: 84566:0:(osc_request.c:2161:osc_build_rpc()) prep_req failed: -12
2013-08-05 10:27:53 LustreError: 84566:0:(osc_request.c:2161:osc_build_rpc()) Skipped 4 previous similar messages
2013-08-05 10:27:53 LustreError: 84566:0:(osc_cache.c:2091:osc_check_rpcs()) Write request failed with -12
2013-08-05 10:27:53 LustreError: 84566:0:(osc_cache.c:2091:osc_check_rpcs()) Skipped 5 previous similar messages
2013-08-05 10:27:53 LustreError: 84566:0:(sec.c:1060:sptlrpc_cli_unwrap_reply()) ASSERTION( req-&amp;gt;rq_repdata == ((void *)0) ) failed:
2013-08-05 10:27:53 LustreError: 84566:0:(sec.c:1060:sptlrpc_cli_unwrap_reply()) LBUG
2013-08-05 10:27:53 Pid: 84566, comm: ptlrpcd_7
2013-08-05 10:27:53
2013-08-05 10:27:53 Call Trace:
2013-08-05 10:27:53  [&amp;lt;ffffffffa056e895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
2013-08-05 10:27:53  [&amp;lt;ffffffffa056ee97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c52256&amp;gt;] sptlrpc_cli_unwrap_reply+0x1d6/0x240 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c1735f&amp;gt;] after_reply+0x6f/0xd90 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffffa057efb1&amp;gt;] ? libcfs_debug_msg+0x41/0x50 [libcfs]
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c1c941&amp;gt;] ptlrpc_check_set+0xfd1/0x1b20 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c4752b&amp;gt;] ptlrpcd_check+0x53b/0x560 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c47a43&amp;gt;] ptlrpcd+0x223/0x380 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffff81063310&amp;gt;] ? default_wake_function+0x0/0x20
2013-08-05 10:27:53  [&amp;lt;ffffffffa0c47820&amp;gt;] ? ptlrpcd+0x0/0x380 [ptlrpc]
2013-08-05 10:27:53  [&amp;lt;ffffffff81096936&amp;gt;] kthread+0x96/0xa0
2013-08-05 10:27:53  [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
2013-08-05 10:27:53  [&amp;lt;ffffffff810968a0&amp;gt;] ? kthread+0x0/0xa0
2013-08-05 10:27:53  [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;There should be a crashdump for this, which I will take a look at if needed. &lt;/p&gt;

&lt;p&gt;I will do a little more digging and update this LU further; it looks like we are not handling the ENOMEM (-12) quite right in this code path.    &lt;/p&gt;</description>
                <environment></environment>
        <key id="20186">LU-3698</key>
            <summary>sec.c:1060:sptlrpc_cli_unwrap_reply() ASSERTION( req-&gt;rq_repdata == ((void *)0) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="keith">Keith Mannthey</assignee>
                                    <reporter username="keith">Keith Mannthey</reporter>
                        <labels>
                    </labels>
                <created>Mon, 5 Aug 2013 18:53:17 +0000</created>
                <updated>Tue, 24 Sep 2013 04:58:36 +0000</updated>
                            <resolved>Tue, 24 Sep 2013 04:58:36 +0000</resolved>
                                    <version>Lustre 2.5.0</version>
                                    <fixVersion>Lustre 2.5.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="63690" author="keith" created="Tue, 6 Aug 2013 02:35:04 +0000"  >&lt;p&gt;The kernel log from the boot. &lt;/p&gt;</comment>
                            <comment id="63692" author="keith" created="Tue, 6 Aug 2013 02:42:21 +0000"  >&lt;p&gt;Well I took a quick look at the code and I think this may be a little more complicated than a usual out of memory issue.&lt;/p&gt;

&lt;p&gt;osc_check_rpcs has the following code:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;                osc_object_lock(osc);
                if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
                        rc = osc_send_write_rpc(env, cli, osc, pol);
                        if (rc &amp;lt; 0) {
                                CERROR(&quot;Write request failed with %d\n&quot;, rc);

                                /* osc_send_write_rpc failed, mostly because of
                                 * memory pressure.
                                 *
                                 * It can&apos;t break here, because if:
                                 *  - a page was submitted by osc_io_submit, so
                                 *    page locked;
                                 *  - no request in flight
                                 *  - no subsequent request
                                 * The system will be in live-lock state,
                                 * because there is no chance to call
                                 * osc_io_unplug() and osc_check_rpcs() any
                                 * more. pdflush can&apos;t help in this case,
                                 * because it might be blocked at grabbing
                                 * the page lock as we mentioned.
                                 *
                                 * Anyway, continue to drain pages. */
                                /* break; */
                        }
                }
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;So it seems we don&apos;t handle the ENOMEM on write? &lt;/p&gt;

&lt;p&gt;It seems like this might be the outcome of this decision? &lt;/p&gt;
</comment>
                            <comment id="63898" author="doug" created="Thu, 8 Aug 2013 17:34:16 +0000"  >&lt;p&gt;Jinshan: can you comment on Keith&apos;s question?&lt;/p&gt;</comment>
                            <comment id="64159" author="bergwolf" created="Tue, 13 Aug 2013 08:17:05 +0000"  >&lt;p&gt;We just saw this on one of our test machines with b2_4 HEAD as well.&lt;/p&gt;

&lt;p&gt;I took a look at the related code and it seems the problem is that in ptl_send_rpc() the req is not sent and the req flags are not reset, so the reply flag checking in ptlrpc_check_set() thinks the req has got a reply, goes on to process the reply, and thus hits the LBUG.&lt;/p&gt;

&lt;p&gt;I &lt;em&gt;guess&lt;/em&gt; something like the following should fix the LBUG, but I&apos;m not familiar with the ptlrpc code so I am not entirely sure.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 8b96c5d..9008e62 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1737,6 +1737,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                                        spin_lock(&amp;amp;req-&amp;gt;rq_lock);
                                        req-&amp;gt;rq_net_err = 1;
                                        spin_unlock(&amp;amp;req-&amp;gt;rq_lock);
+                                       continue;
                                }
                                /* need to reset the timeout */
                                force_timer_recalc = 1;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; </comment>
                            <comment id="64709" author="keith" created="Wed, 21 Aug 2013 04:39:04 +0000"  >&lt;p&gt;Peng Tao,  I agree with your idea and have created a patch for it. &lt;a href=&quot;http://review.whamcloud.com/7411&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/7411&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;An RPC expert will need to take a good look at this idea. &lt;/p&gt;</comment>
                            <comment id="67314" author="pjones" created="Tue, 24 Sep 2013 04:58:36 +0000"  >&lt;p&gt;Landed for 2.5.0&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="13329" name="iwc1-console" size="234740" author="keith" created="Tue, 6 Aug 2013 02:35:04 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvx5z:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9544</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>