<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:15:13 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-1278] Client Panic - Lustre 1.8.6 and RHEL 6.1</title>
                <link>https://jira.whamcloud.com/browse/LU-1278</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;Customer reported two system panics while trying to gain system acceptance.  They provided the attached tracebacks.&lt;/p&gt;

&lt;p&gt;In addition, they provided the following information:&lt;/p&gt;

&lt;p&gt;crash&amp;gt; bt&lt;br/&gt;
PID: 20417  TASK: ffff8803fe75a040  CPU: 0   COMMAND: &quot;flush-lustre-1&quot;&lt;br/&gt;
#0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1270&amp;#93;&lt;/span&gt; machine_kexec at ffffffff810310db&lt;br/&gt;
#1 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad12d0&amp;#93;&lt;/span&gt; crash_kexec at ffffffff810b63b2&lt;br/&gt;
#2 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad13a0&amp;#93;&lt;/span&gt; oops_end at ffffffff814dec50&lt;br/&gt;
#3 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad13d0&amp;#93;&lt;/span&gt; no_context at ffffffff81040cdb&lt;br/&gt;
#4 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1420&amp;#93;&lt;/span&gt; __bad_area_nosemaphore at ffffffff81040f65&lt;br/&gt;
#5 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1470&amp;#93;&lt;/span&gt; bad_area_nosemaphore at ffffffff81041033&lt;br/&gt;
#6 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1480&amp;#93;&lt;/span&gt; __do_page_fault at ffffffff8104170d&lt;br/&gt;
#7 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad15a0&amp;#93;&lt;/span&gt; do_page_fault at ffffffff814e0c3e&lt;br/&gt;
#8 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad15d0&amp;#93;&lt;/span&gt; page_fault at ffffffff814ddfe5&lt;br/&gt;
    &lt;span class=&quot;error&quot;&gt;&amp;#91;exception RIP: lov_ap_refresh_count+22&amp;#93;&lt;/span&gt;&lt;br/&gt;
    RIP: ffffffffa07c3cc6  RSP: ffff880400ad1680  RFLAGS: 00010282&lt;br/&gt;
    RAX: 0000000000000000  RBX: ffff8803ebe15230  RCX: ffff8803e9668360&lt;br/&gt;
    RDX: ffff8803e9668360  RSI: 0000000000000002  RDI: 0000000000000000&lt;br/&gt;
    RBP: ffff880400ad1690   R8: 0000000000000002   R9: 0000000000000000&lt;br/&gt;
    R10: ffffffffa0806ee0  R11: 0000000000000003  R12: ffff8803e9668348&lt;br/&gt;
    R13: 0000000000000000  R14: ffff8803e9668350  R15: ffff8803fa7c6ad0&lt;br/&gt;
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018&lt;br/&gt;
#9 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1698&amp;#93;&lt;/span&gt; osc_send_oap_rpc at ffffffffa074feb9 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#10 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad17e8&amp;#93;&lt;/span&gt; osc_check_rpcs at ffffffffa0751121 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#11 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1878&amp;#93;&lt;/span&gt; osc_set_async_flags at ffffffffa0751545 &lt;span class=&quot;error&quot;&gt;&amp;#91;osc&amp;#93;&lt;/span&gt;&lt;br/&gt;
#12 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1938&amp;#93;&lt;/span&gt; lov_set_async_flags at ffffffffa07c6273 &lt;span class=&quot;error&quot;&gt;&amp;#91;lov&amp;#93;&lt;/span&gt;&lt;br/&gt;
#13 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad19b8&amp;#93;&lt;/span&gt; ll_writepage at ffffffffa085b2b4 &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#14 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1a58&amp;#93;&lt;/span&gt; ll_writepage_26 at ffffffffa0876d6e &lt;span class=&quot;error&quot;&gt;&amp;#91;lustre&amp;#93;&lt;/span&gt;&lt;br/&gt;
#15 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1a68&amp;#93;&lt;/span&gt; __writepage at ffffffff81120e67&lt;br/&gt;
#16 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1a88&amp;#93;&lt;/span&gt; write_cache_pages at ffffffff811221f9&lt;br/&gt;
#17 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1bb8&amp;#93;&lt;/span&gt; generic_writepages at ffffffff811224f4&lt;br/&gt;
#18 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1bc8&amp;#93;&lt;/span&gt; do_writepages at ffffffff81122521&lt;br/&gt;
#19 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1bd8&amp;#93;&lt;/span&gt; writeback_single_inode at ffffffff8119bbbd&lt;br/&gt;
#20 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1c18&amp;#93;&lt;/span&gt; writeback_sb_inodes at ffffffff8119bfbe&lt;br/&gt;
#21 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1c78&amp;#93;&lt;/span&gt; writeback_inodes_wb at ffffffff8119c11b&lt;br/&gt;
#22 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1cd8&amp;#93;&lt;/span&gt; wb_writeback at ffffffff8119c4bb&lt;br/&gt;
#23 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1dd8&amp;#93;&lt;/span&gt; wb_do_writeback at ffffffff8119c7a9&lt;br/&gt;
#24 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1e68&amp;#93;&lt;/span&gt; bdi_writeback_task at ffffffff8119c8b3&lt;br/&gt;
#25 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1eb8&amp;#93;&lt;/span&gt; bdi_start_fn at ffffffff81130c16&lt;br/&gt;
#26 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1ee8&amp;#93;&lt;/span&gt; kthread at ffffffff8108ddf6&lt;br/&gt;
#27 &lt;span class=&quot;error&quot;&gt;&amp;#91;ffff880400ad1f48&amp;#93;&lt;/span&gt; kernel_thread at ffffffff8100c1ca&lt;/p&gt;

&lt;p&gt;We crash in lov_ap_refresh_count which was called by osc_send_oap_rpc.&lt;/p&gt;

&lt;p&gt;For whatever crazy reason &quot;osc/osc_request.c&quot; line 2564 :&lt;/p&gt;

&lt;p&gt;               /* ask the caller for the size of the io as the rpc&lt;br/&gt;
leaves. */&lt;br/&gt;
                if (!(oap-&amp;gt;oap_async_flags &amp;amp; ASYNC_COUNT_STABLE))&lt;br/&gt;
                        oap-&amp;gt;oap_count =&lt;/p&gt;

&lt;p&gt;ops-&amp;gt;ap_refresh_count(oap-&amp;gt;oap_caller_data,cmd); &amp;lt;--------------&lt;br/&gt;
                if (oap-&amp;gt;oap_count &amp;lt;= 0) {&lt;br/&gt;
                        CDEBUG(D_CACHE, &quot;oap %p count %d, completing\n&quot;,&lt;br/&gt;
oap,&lt;br/&gt;
                               oap-&amp;gt;oap_count);&lt;/p&gt;

&lt;p&gt;where the &amp;lt;-------------- is above we send ap_refresh_count&lt;br/&gt;
oap-&amp;gt;oap_caller_data&lt;/p&gt;

&lt;p&gt;from the dump:&lt;/p&gt;

&lt;p&gt;crash&amp;gt; print  *(struct osc_async_page *)0xffff8803ebe15230&lt;br/&gt;
$89 = {&lt;br/&gt;
  oap_magic = 8675309,&lt;br/&gt;
  oap_cmd = 2,&lt;br/&gt;
  oap_interrupted = 0,&lt;br/&gt;
  oap_pending_item = &lt;/p&gt;
{
    next = 0xffff8803ebe15238,
    prev = 0xffff8803ebe15238
  }
&lt;p&gt;,&lt;br/&gt;
  oap_urgent_item = &lt;/p&gt;
{
    next = 0xffff8803ebe15248,
    prev = 0xffff8803ebe15248
  }
&lt;p&gt;,&lt;br/&gt;
  oap_rpc_item = &lt;/p&gt;
{
    next = 0xffff8803ebe15258,
    prev = 0xffff8803ebe15258
  }
&lt;p&gt;,&lt;br/&gt;
  oap_obj_off = 0,&lt;br/&gt;
  oap_page_off = 0,&lt;br/&gt;
  oap_async_flags = 3,&lt;br/&gt;
  oap_brw_page = &lt;/p&gt;
{
    off = 0,
    pg = 0xffffea00297d5fb0,
    count = 0,
    flag = 1056
  }
&lt;p&gt;,&lt;br/&gt;
  oap_occ = {&lt;br/&gt;
    occ_oig_item = &lt;/p&gt;
{
      next = 0x0,
      prev = 0x0
    }
&lt;p&gt;,&lt;br/&gt;
    occ_interrupted = 0xffffffffa074b490 &amp;lt;osc_occ_interrupted&amp;gt;,&lt;br/&gt;
    interrupted = 0&lt;br/&gt;
  },&lt;br/&gt;
  oap_oig = 0x0,&lt;br/&gt;
  oap_request = 0x0,&lt;br/&gt;
  oap_cli = 0xffff8803fa7c6ad0,&lt;br/&gt;
  oap_loi = 0xffff8803e9668300,&lt;br/&gt;
  oap_caller_ops = 0xffffffffa0806ee0,&lt;br/&gt;
  oap_caller_data = 0x0,&lt;br/&gt;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^&lt;br/&gt;
  oap_page_list = &lt;/p&gt;
{
    next = 0xffff8803eb850d70,
    prev = 0xffff8803eb850d70
  }
&lt;p&gt;,&lt;br/&gt;
  oap_ldlm_lock = 0xffff8803eb850c00,&lt;br/&gt;
  oap_lock = {&lt;br/&gt;
    raw_lock = &lt;/p&gt;
{
      slock = 131074
    }
&lt;p&gt;  }&lt;br/&gt;
}&lt;/p&gt;



&lt;p&gt;ops-&amp;gt;ap_refresh_count(oap-&amp;gt;oap_caller_data,cmd); is the below:&lt;/p&gt;

&lt;p&gt;static int lov_ap_refresh_count(void *data, int cmd)&lt;br/&gt;
{&lt;br/&gt;
        struct lov_async_page *lap = LAP_FROM_COOKIE(data);&lt;/p&gt;

&lt;p&gt;        return&lt;br/&gt;
lap-&amp;gt;lap_caller_ops-&amp;gt;ap_refresh_count(lap-&amp;gt;lap_caller_data,&lt;br/&gt;
                                                     cmd);&lt;br/&gt;
}&lt;/p&gt;





&lt;p&gt;#define LAP_FROM_COOKIE(c)&lt;br/&gt;
       \&lt;br/&gt;
        (LASSERT(((struct lov_async_page *)(c))-&amp;gt;lap_magic ==&lt;br/&gt;
LOV_AP_MAGIC),   \&lt;br/&gt;
         (struct lov_async_page *)(c))&lt;/p&gt;


&lt;p&gt;sending this a NULL is probably not a great idea......&lt;/p&gt;


&lt;p&gt;As to how  oap_caller_data = 0x0 came to be I have no idea.&lt;/p&gt;


&lt;p&gt;I was able to pull the lustre debug ring out of the dump but it doesn&apos;t&lt;br/&gt;
seem to have anything useful, the last entry appears to be a couple of&lt;br/&gt;
days before the dump....&lt;/p&gt;</description>
                <environment>Servers running Lustre 1.8.6 - jenkins-wc1--PRISTINE-2.6.18-238.12.1.el5_lustre.gce5e033 - RHEL 5.x&lt;br/&gt;
Clients running patchless client on RHEL 6.1 - 2.6.32-131.0.15.el6.x86_64&lt;br/&gt;
lustre: 1.8.6&lt;br/&gt;
kernel: patchless_client&lt;br/&gt;
build:  jenkins-wc1-ga73a0cf-PRISTINE-2.6.32-131.0.15.el6.x86_64 &lt;br/&gt;
</environment>
        <key id="13841">LU-1278</key>
            <summary>Client Panic - Lustre 1.8.6 and RHEL 6.1</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="green">Oleg Drokin</assignee>
                                    <reporter username="dnelson@ddn.com">Dennis Nelson</reporter>
                        <labels>
                    </labels>
                <created>Mon, 2 Apr 2012 08:30:22 +0000</created>
                <updated>Tue, 3 Apr 2012 12:40:04 +0000</updated>
                            <resolved>Tue, 3 Apr 2012 12:40:04 +0000</resolved>
                                    <version>Lustre 1.8.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="33300" author="pjones" created="Mon, 2 Apr 2012 11:24:21 +0000"  >&lt;p&gt;Oleg will help with this one&lt;/p&gt;</comment>
                            <comment id="33303" author="green" created="Mon, 2 Apr 2012 12:01:52 +0000"  >&lt;p&gt;Hm, the oap_caller_data cannot be 0, it almost looks like there was a stray write to random memory from somewhere.&lt;/p&gt;

&lt;p&gt;Can you please tell me where the other panic happened? I see it&apos;s in a different place than the first.&lt;/p&gt;</comment>
                            <comment id="33364" author="dnelson@ddn.com" created="Tue, 3 Apr 2012 12:29:34 +0000"  >&lt;p&gt;The issues seem to be related to the fact that the customer enabled IB bonding which is not currently a supported feature.  They have disabled IB bonding and the problems have ceased.  We can close this ticket.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;</comment>
                            <comment id="33366" author="pjones" created="Tue, 3 Apr 2012 12:40:04 +0000"  >&lt;p&gt;ok thanks Dennis&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                            <attachment id="11044" name="panic1.txt" size="5550" author="dnelson@ddn.com" created="Mon, 2 Apr 2012 08:30:22 +0000"/>
                            <attachment id="11045" name="panic2.txt" size="4937" author="dnelson@ddn.com" created="Mon, 2 Apr 2012 08:30:22 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvh4f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6423</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10020"><![CDATA[1]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>