<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:25:18 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16246] NULL pointer at lod_lookup+0x24/0x38</title>
                <link>https://jira.whamcloud.com/browse/LU-16246</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[32261.214407] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
[32261.223858] Mem abort info:
[32261.227340] &#160; ESR = 0x96000004
[32261.231077] &#160; EC = 0x25: DABT (current EL), IL = 32 bits
[32261.237060] &#160; SET = 0, FnV = 0
[32261.240797] &#160; EA = 0, S1PTW = 0
[32261.244621] Data abort info:
[32261.248185] &#160; ISV = 0, ISS = 0x00000004
[32261.252702] &#160; CM = 0, WnR = 0
[32261.256354] user pgtable: 4k pages, 48-bit VAs, pgdp=0000202681405000
[32261.263462] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000
[32261.270918] Internal error: Oops: 96000004 [#1] SMP
[32261.276466] Modules linked in: ofd(OE) ost(OE) osd_zfs(POE) osp(OE) mdd(OE) lod(OE) mdt(OE) lfsck(OE) mgs(OE) osd_ldiskfs(OE) lquota(OE) ldiskfs(OE) mbcache jbd2 lustre(OE) obdecho(OE) mgc(OE) mdc(OE) lov(OE) osc(OE) lmv(OE) fid(OE) fld(OE) ptlrpc_gss(OE) ptlrpc(OE) obdclass(OE) ko2iblnd(OE) lnet(OE) crc32_generic libcfs(OE) dm_flakey dm_mod vfio_pci vfio_virqfd vfio_iommu_type1 vfio cuse rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) rfkill sunrpc nls_cp437 vfat fat zfs(POE) zunicode(POE) zzstd(OE) zlua(OE) aes_ce_blk zcommon(POE) znvpair(POE) crypto_simd zavl(POE) ipmi_ssif cryptd icp(POE) aes_ce_cipher ghash_ce spl(OE) sha1_ce acpi_ipmi sbsa_gwdt ipmi_si ipmi_devintf ipmi_msghandler hisi_uncore_hha_pmu hisi_uncore_ddrc_pmu hisi_uncore_l3c_pmu hisi_uncore_pmu sch_fq_codel binfmt_misc knem(OE) xfs libcrc32c sd_mod sg hclge mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) mlx5_core(OE) mlxfw(OE) hisi_sas_v3_hw tls hisi_sas_main psample sha2_ce libsas nvme ahci
[32261.276555] &#160;hibmc_drm mlxdevm(OE) sha256_arm64 nvme_core hns3 libahci scsi_transport_sas drm_vram_helper auxiliary(OE) t10_pi mlx_compat(OE) drm_ttm_helper libata hnae3 ttm megaraid_sas host_edma_drv i2c_designware_platform i2c_designware_core xpmem(OE) fuse
[32261.386429] CPU: 49 PID: 52372 Comm: mdt02_000 Kdump: loaded Tainted: P &#160; &#160; &#160; &#160; &#160; OE &#160; &#160; 5.10.0-60.18.0.50.aarch64 #1
[32261.397678] Hardware name: Huawei TaiShan 200 (Model 2280)/BC82AMDDA, BIOS 1.35 04/30/2020
[32261.406595] pstate: 60400009 (nZCv daif +PAN -UAO -TCO BTYPE=--)
[32261.413307] pc : lod_lookup+0x24/0x38 [lod]
[32261.418192] lr : __mdd_lookup.isra.3+0x314/0x5b8 [mdd]
[32261.423997] sp : ffff8000650ab4d0
[32261.427987] x29: ffff8000650ab4d0 x28: ffff2042c84c8820
[32261.433966] x27: ffff80000912d000 x26: 00000000000034e0
[32261.439945] x25: ffff8000650ab6e0 x24: ffff2023d2d16c50
[32261.445924] x23: ffff2023d1ae0080 x22: ffff0045aaf12e60
[32261.451904] x21: ffff2023d1ae0080 x20: ffff80000912d000
[32261.457882] x19: 0000000000000000 x18: 0000000000000001
[32261.463861] x17: 0000000000000000 x16: ffff80000a7df920
[32261.469841] x15: ffffffffffffffff x14: ffffffffffffffff
[32261.475819] x13: 0000000000000018 x12: ffffffffffffffff
[32261.481798] x11: 0000000000000040 x10: 7f7f7f7f7f7f7f7f
[32261.487777] x9 : ffff80000ac39fc4 x8 : 0000000000000001
[32261.493757] x7 : 0000000000000b20 x6 : 0000000000004000
[32261.499737] x5 : ffff80000912d000 x4 : 0000000000000000
[32261.505716] x3 : ffff2023d2d16c50 x2 : ffff8000650ab6e0
[32261.511695] x1 : ffff2042d17dff00 x0 : ffff2023d1ae0080
[32261.517675] Call trace:
[32261.520818] &#160;lod_lookup+0x24/0x38 [lod]
[32261.525337] &#160;__mdd_lookup.isra.3+0x314/0x5b8 [mdd]
[32261.530806] &#160;mdd_lookup+0x108/0x208 [mdd]
[32261.535524] &#160;mdt_reint_open+0xffc/0x3810 [mdt]
[32261.540656] &#160;mdt_reint_rec+0x170/0x390 [mdt]
[32261.545614] &#160;mdt_reint_internal+0x6fc/0xf98 [mdt]
[32261.551004] &#160;mdt_intent_open+0x17c/0x470 [mdt]
[32261.556134] &#160;mdt_intent_opc+0x194/0x1040 [mdt]
[32261.561265] &#160;mdt_intent_policy+0x23c/0x438 [mdt]
[32261.566662] &#160;ldlm_lock_enqueue+0x5f0/0xbc0 [ptlrpc]
[32261.572276] &#160;ldlm_handle_enqueue0+0x6ec/0x23e0 [ptlrpc]
[32261.578230] &#160;tgt_enqueue+0xd4/0x2f0 [ptlrpc]
[32261.583232] &#160;tgt_handle_request0+0xd4/0x9b0 [ptlrpc]
[32261.588922] &#160;tgt_request_handle+0x7cc/0x1a30 [ptlrpc]
[32261.594701] &#160;ptlrpc_server_handle_request+0x3bc/0x1218 [ptlrpc]
[32261.601342] &#160;ptlrpc_main+0xdfc/0x16c8 [ptlrpc]
[32261.606462] &#160;kthread+0x130/0x138
[32261.610369] &#160;ret_from_fork+0x10/0x18
[32261.614621] Code: f9400c24 d1006084 aa0403e1 f9401c84 (f9400084)
[32261.621429] SMP: stopping secondary CPUs
[32261.628375] Starting crashdump kernel...
[32261.632977] Bye!
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lustre servers: &lt;br/&gt;
10 nodes, each node has kunpeng920 96core *2, memory 512GB, nvme 3.2T*4&lt;br/&gt;
centos 8.4.2105&lt;br/&gt;
kernel 5.10.0-60.18.0.50.aarch64 (openEuler 22.03 kernel)&lt;br/&gt;
lustre 0c68b13a5eeb408862bad795aaf9a24a11a14b6a&lt;br/&gt;
&lt;br/&gt;
lustre clients:&lt;br/&gt;
10 nodes intel 6266C*2, memory 372GB&lt;br/&gt;
centos 8.4.2105&lt;br/&gt;
kernel 4.18.0-372.9.1.el8.x86_64&lt;br/&gt;
&lt;br/&gt;
IO500 tag:io500-sc21&lt;br/&gt;
</environment>
        <key id="72836">LU-16246</key>
            <summary>NULL pointer at lod_lookup+0x24/0x38</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="fengchunsong">Jason Feng</reporter>
                        <labels>
                    </labels>
                <created>Tue, 18 Oct 2022 04:35:56 +0000</created>
                <updated>Tue, 15 Nov 2022 21:50:10 +0000</updated>
                                                                                <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="350036" author="adilger" created="Tue, 18 Oct 2022 18:02:08 +0000"  >&lt;p&gt;I may not be able to help much here, since I suspect this issue relates somehow to ARM server (what is PAGE_SIZE and endianness?), but some things of note:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;Lustre version &quot;&lt;tt&gt;0c68b13a5eeb408862bad795aaf9a24a11a14b6a&lt;/tt&gt;&quot;  is &lt;tt&gt;v2_15_52&lt;/tt&gt;, which is a development branch that is landing new features and has not been tested extensively.  You are better off to run the b2_15 branch which is the Long Term Support (LTS) maintenance branch and is only getting bug fixes.&lt;/li&gt;
	&lt;li&gt;IO500 tag: &lt;tt&gt;io500-sc21&lt;/tt&gt; is old, you should be using &lt;tt&gt;io500-sc22&lt;/tt&gt; if you are planning to submit a result for the upcoming IO500 list at SC&apos;22.&lt;/li&gt;
	&lt;li&gt;&lt;tt&gt;lod_lookup+0x24/0x38&lt;/tt&gt; are you able to decode this address in GDB and/or add &lt;tt&gt;printk()&lt;/tt&gt; to this function to see which pointer is NULL?&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="350076" author="JIRAUSER18423" created="Wed, 19 Oct 2022 00:58:39 +0000"  >&lt;p&gt;Thanks for the comment.&lt;/p&gt;

&lt;p&gt;I will try b2_15 and new IO500 sc22.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; lod_lookup(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct dt_object *dt,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; struct dt_rec *rec, &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct dt_key *key)
{
&#160; &#160; &#160; &#160; struct dt_object *next = dt_object_child(dt);
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160; &#160; &#160; &#160; This shows that next == NULL. If next == NULL, would it be OK to return -1 to avoid the NULL pointer crash?&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&#160; &#160; &#160; &#160; &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; next-&amp;gt;do_index_ops-&amp;gt;dio_lookup(env, next, rec, key);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="350084" author="adilger" created="Wed, 19 Oct 2022 02:22:41 +0000"  >&lt;blockquote&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;
&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;code-object&quot;&gt;int&lt;/span&gt; lod_lookup(&lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct lu_env *env, struct dt_object *dt,
                      struct dt_rec *rec, &lt;span class=&quot;code-keyword&quot;&gt;const&lt;/span&gt; struct dt_key *key)
{
        struct dt_object *next = dt_object_child(dt);
        &lt;span class=&quot;code-keyword&quot;&gt;return&lt;/span&gt; next-&amp;gt;do_index_ops-&amp;gt;dio_lookup(env, next, rec, key);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This shows that next == NULL. If next == NULL, would it be OK to return -1 to avoid the NULL pointer crash?&lt;/p&gt;&lt;/blockquote&gt;
&lt;p&gt;That might be OK for debugging (I would suggest to return something like &lt;tt&gt;-ENOENT&lt;/tt&gt; or &lt;tt&gt;-EINVAL&lt;/tt&gt;), but I suspect it will still not work properly because there is likely a problem elsewhere in the code.&lt;/p&gt;

&lt;p&gt;The &quot;&lt;tt&gt;dt&lt;/tt&gt;&quot; object is a directory, and the &lt;tt&gt;mdd_lookup()&lt;/tt&gt; caller should have initialized the object correctly before calling &lt;tt&gt;lod_lookup()&lt;/tt&gt;.  I suspect some larger problem here, like the locking being broken or similar.&lt;/p&gt;</comment>
                            <comment id="350087" author="JIRAUSER18423" created="Wed, 19 Oct 2022 02:53:03 +0000"  >&lt;p&gt;The directory is not deleted during the test, so this may be caused by a memory problem. I will try to reproduce the problem and capture the complete vmcore file for further analysis.&lt;/p&gt;</comment>
                            <comment id="352279" author="degremoa" created="Wed, 9 Nov 2022 10:43:38 +0000"  >&lt;p&gt;For the record, AWS reproduced a very similar crash in `&lt;tt&gt;lod_lookup()&lt;/tt&gt;` on AWS-specific Graviton ARM processors (4K pages), but in that case it was &apos;&lt;tt&gt;do_index_ops&lt;/tt&gt;&apos; and not &apos;&lt;tt&gt;next&lt;/tt&gt;&apos; which was NULL. The crash dump shows that the &lt;tt&gt;dt_object&lt;/tt&gt; memory structure is correct and &lt;tt&gt;do_index_ops&lt;/tt&gt; has the correct value, but the register was NULL and the system crashed. This happened several times. This is running 2.12.9 + backports.&lt;/p&gt;</comment>
                            <comment id="352280" author="JIRAUSER18423" created="Wed, 9 Nov 2022 10:58:33 +0000"  >&lt;p&gt;Because a timing/ordering problem cannot be identified from the kdump, can we add logs to further locate the problem?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="72835">LU-16245</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03347:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>