<?xml version="1.0" encoding="UTF-8"?>
<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:31:11 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16930] BUG: nid_keycmp+0x6</title>
                <link>https://jira.whamcloud.com/browse/LU-16930</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ &#160;117.007016] BUG: unable to handle kernel NULL pointer dereference at 0000000000000102
[ &#160;117.008158] IP: [&amp;lt;ffffffffc0ab0356&amp;gt;] nid_keycmp+0x6/0x30 [obdclass]
[ &#160;117.009044] PGD 0&#160;
[ &#160;117.009334] Oops: 0000 [#1] SMP&#160;
[ &#160;117.009796] Modules linked in: mdd(OE) lod(OE) mdt(OE) osp(OE) ofd(OE) lfsck(OE) ost(OE) mgs(OE) mgc(OE) osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) lustre(OE) lmv(OE) mdc(OE) lov(OE) osc(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) sunrpc iTCO_wdt iTCO_vendor_support ppdev nfit libnvdimm iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd i2c_i801 lpc_ich joydev pcspkr parport_pc sg i6300esb parport ip_tables ext4 mbcache jbd2 sr_mod sd_mod cdrom crc_t10dif crct10dif_generic mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) bochs_drm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_net net_failover ahci virtio_blk failover
[ &#160;117.020496] &#160;virtio_scsi(OE) mlx5_core(OE) libahci drm bnxt_en libata mlxfw(OE) psample ptp crct10dif_pclmul crct10dif_common pps_core crc32c_intel auxiliary(OE) mlx_compat(OE) virtio_pci serio_raw devlink virtio_ring virtio drm_panel_orientation_quirks dm_mirror dm_region_hash dm_log dm_mod
[ &#160;117.024227] CPU: 6 PID: 22239 Comm: ll_ost01_001 Kdump: loaded Tainted: G &#160; &#160; &#160; &#160; &#160; OE &#160;------------ T 3.10.0-1160.71.1.el7_lustre.ddn17.x86_64 #1
[ &#160;117.025941] Hardware name: DDN SFA400NVX2E, BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ &#160;117.028520] task: ffff888cb9f08000 ti: ffff888c00d88000 task.ti: ffff888c00d88000
[ &#160;117.030935] RIP: 0010:[&amp;lt;ffffffffc0ab0356&amp;gt;] &#160;[&amp;lt;ffffffffc0ab0356&amp;gt;] nid_keycmp+0x6/0x30 [obdclass]
[ &#160;117.033549] RSP: 0018:ffff888c00d8bb10 &#160;EFLAGS: 00010202
[ &#160;117.035649] RAX: 00000000000000b8 RBX: ffff886a7532ae50 RCX: 00000000b94fb39a
[ &#160;117.037936] RDX: 00000000000000ba RSI: 0000000000000002 RDI: ffff888c00d8bb48
[ &#160;117.040214] RBP: ffff888c00d8bb88 R08: ffff886a7532ae50 R09: 0000000000000038
[ &#160;117.042573] R10: ffffffffc0a96efd R11: fffff1404d579e00 R12: ffff888baa120cb8
[ &#160;117.045048] R13: ffff888b783ac400 R14: ffff888bda00f800 R15: ffff888bda5f0000
[ &#160;117.047322] FS: &#160;0000000000000000(0000) GS:ffff888db1180000(0000) knlGS:0000000000000000
[ &#160;117.049650] CS: &#160;0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ &#160;117.051704] CR2: 0000000000000102 CR3: 0000002250072000 CR4: 0000000000760fe0
[ &#160;117.053922] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ &#160;117.056107] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ &#160;117.058390] PKRU: 00000000
[ &#160;117.059938] Call Trace:
[ &#160;117.063537] &#160;[&amp;lt;ffffffffc0d8165a&amp;gt;] target_handle_connect+0x22ba/0x2c80 [ptlrpc]
[ &#160;117.065718] &#160;[&amp;lt;ffffffffc0e2bf4a&amp;gt;] tgt_request_handle+0x72a/0x18c0 [ptlrpc]
ptlrpc_nrs_req_get_nolock0+0xd1/0x170 [ptlrpc]
[ &#160;117.071850] &#160;[&amp;lt;ffffffffc0dcdf83&amp;gt;] ptlrpc_server_handle_request+0x253/0xc40 [ptlrpc]
[ &#160;117.075869] &#160;[&amp;lt;ffffffffc0dd2d7a&amp;gt;] ptlrpc_main+0xc4a/0x1cb0 [ptlrpc]
[ &#160;117.083399] &#160;[&amp;lt;ffffffff8a6c5f91&amp;gt;] kthread+0xd1/0xe0
[ &#160;117.087007] &#160;[&amp;lt;ffffffff8ad99ddd&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[ &#160;117.090630] Code: c1 ce 10 29 f0 89 c6 31 c2 c1 ce 1c 29 f2 31 d1 c1 ca 12 29 d1 31 c8 c1 c9 08 29 c8 5b 5d c3 66 0f 1f 44 00 00 0f 1f 44 00 00 55 &amp;lt;48&amp;gt; 8b 96 00 01 00 00 48 8b 47 08 48 89 e5 5d 48 8b 00 48 39 42
[ &#160;117.096140] RIP &#160;[&amp;lt;ffffffffc0ab0356&amp;gt;] nid_keycmp+0x6/0x30 [obdclass]
[ &#160;117.097932] &#160;RSP &amp;lt;ffff888c00d8bb10&amp;gt;
[ &#160;117.099278] CR2: 0000000000000102 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="76757">LU-16930</key>
            <summary>BUG: nid_keycmp+0x6</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="scherementsev">Sergey Cheremencev</assignee>
                                    <reporter username="scherementsev">Sergey Cheremencev</reporter>
                        <labels>
                    </labels>
                <created>Wed, 28 Jun 2023 11:28:20 +0000</created>
                <updated>Wed, 25 Oct 2023 13:16:47 +0000</updated>
                            <resolved>Wed, 25 Oct 2023 13:16:24 +0000</resolved>
                                    <version>Lustre 2.14.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="376714" author="sergey" created="Wed, 28 Jun 2023 11:31:29 +0000"  >&lt;p&gt;This ticket will also address the following panic, as I believe it has the same root cause.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[7010366.394636] BUG: unable to handle kernel NULL pointer dereference at 000000000000062f
[7010366.397009] IP: [&amp;lt;ffffffffc0a34dc8&amp;gt;] obd_nid_del+0xd8/0x210 [obdclass]
[7010366.398933] PGD 0&#160;
[7010366.399976] Oops: 0000 [#1] SMP&#160;
[7010366.401254] Modules linked in: mgs(OE) binfmt_misc mdd(OE) lod(OE) mdt(OE) osp(OE) ofd(OE) lfsck(OE) ost(OE) mgc(OE) osd_ldiskfs(OE) ldiskfs(OE) lquota(OE) lustre(OE) lmv(OE) mdc(OE) lov(OE) osc(OE) fid(OE) fld(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) libcfs(OE) rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) sunrpc iTCO_wdt iTCO_vendor_support ppdev nfit libnvdimm iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd joydev i2c_i801 lpc_ich pcspkr parport_pc sg i6300esb parport ip_tables ext4 mbcache jbd2 sd_mod sr_mod cdrom crc_t10dif crct10dif_generic mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) bochs_drm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ahci virtio_net drm net_failover virtio_blk&#160;
[7010366.414585] &#160;libahci failover virtio_scsi(OE) mlx5_core(OE) libata bnxt_en mlxfw(OE) psample ptp crct10dif_pclmul pps_core crct10dif_common crc32c_intel auxiliary(OE) virtio_pci mlx_compat(OE) serio_raw virtio_ring devlink virtio drm_panel_orientation_quirks dm_mirror dm_region_hash dm_log dm_mod
[7010366.419410] CPU: 1 PID: 690 Comm: ll_ost00_051 Kdump: loaded Tainted: G &#160; &#160; &#160; &#160; &#160; OE &#160;------------ T 3.10.0-1160.71.1.el7_lustre.ddn17.x86_64 #1
[7010366.421649] Hardware name: DDN SFA400NVX2E, BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014&#160;
[7010366.423450] task: ffff88679ed7b180 ti: ffff8871784c4000 task.ti: ffff8871784c4000
[7010366.425010] RIP: 0010:[&amp;lt;ffffffffc0a34dc8&amp;gt;] &#160;[&amp;lt;ffffffffc0a34dc8&amp;gt;] obd_nid_del+0xd8/0x210 [obdclass]
[7010366.426822] RSP: 0018:ffff8871784c7b50 &#160;EFLAGS: 00010282
[7010366.428136] RAX: 0000000000000627 RBX: ffff886d611e10b8 RCX: ffff885846e614b0
[7010366.429733] RDX: 0000000000000627 RSI: 0000000000000002 RDI: ffff8870afe4aa70
[7010366.431316] RBP: ffff8871784c7b80 R08: ffff885f5636e910 R09: 0000000000000001
[7010366.432893] R10: ffff8853ffc03600 R11: 0000000000000400 R12: ffff88753c6a1178
[7010366.434551] R13: ffff8870afe4aa70 R14: ffff885bfcabe000 R15: 000000000000009c
[7010366.436157] FS: &#160;0000000000000000(0000) GS:ffff8875f1040000(0000) knlGS:0000000000000000
[7010366.437890] CS: &#160;0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[7010366.439347] CR2: 000000000000062f CR3: 000000230a8f4000 CR4: 0000000000760fe0
[7010366.441049] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[7010366.442695] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[7010366.444342] PKRU: 00000000
[7010366.445435] Call Trace:&#160;
[7010366.446525] &#160;[&amp;lt;ffffffffc0a1deb7&amp;gt;] class_disconnect+0x67/0x410 [obdclass]
[7010366.448183] &#160;[&amp;lt;ffffffffc0cfc7d7&amp;gt;] server_disconnect_export+0x37/0x1a0 [ptlrpc]
[7010366.449880] &#160;[&amp;lt;ffffffffc13ec279&amp;gt;] ofd_obd_disconnect+0x69/0x220 [ofd]
[7010366.453129] &#160;[&amp;lt;ffffffffc0d041c4&amp;gt;] target_handle_disconnect+0x1a4/0x470 [ptlrpc]
[7010366.455091] &#160;[&amp;lt;ffffffffc0da8b28&amp;gt;] tgt_disconnect+0x58/0x170 [ptlrpc]
[7010366.457249] &#160;[&amp;lt;ffffffffc0dae0df&amp;gt;] tgt_request_handle+0x8bf/0x18c0 [ptlrpc]
[7010366.462396] &#160;[&amp;lt;ffffffffc0d4ff83&amp;gt;] ptlrpc_server_handle_request+0x253/0xc40 [ptlrpc]
[7010366.465700] &#160;[&amp;lt;ffffffffc0d54d7a&amp;gt;] ptlrpc_main+0xc4a/0x1cb0 [ptlrpc]
[7010366.470616] &#160;[&amp;lt;ffffffff8dec5f91&amp;gt;] kthread+0xd1/0xe0
[7010366.473660] &#160;[&amp;lt;ffffffff8e599ddd&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[7010366.477355] Code: 00 00 00 48 8b 08 f6 c1 01 0f 85 ea 00 00 00 48 39 cb 0f 84 93 00 00 00 48 89 ca eb 0e 66 0f 1f 44 00 00 48 39 c3 74 2b 48 89 c2 &amp;lt;48&amp;gt; 8b 42 08 48 85 c0 75 ef 48 8b 01 a8 01 0f 85 ba 00 00 00 48
[7010366.482836] RIP &#160;[&amp;lt;ffffffffc0a34dc8&amp;gt;] obd_nid_del+0xd8/0x210 [obdclass]
[7010366.484655] &#160;RSP &amp;lt;ffff8871784c7b50&amp;gt;
[7010366.485997] CR2: 000000000000062f &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="376715" author="bzzz" created="Wed, 28 Jun 2023 11:40:39 +0000"  >&lt;p&gt;branch?&lt;/p&gt;</comment>
                            <comment id="376719" author="sergey" created="Wed, 28 Jun 2023 11:59:24 +0000"  >&lt;p&gt;I&apos;ve got a vmcore from the 1st panic. Some details.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;void obd_nid_del(struct obd_device *obd, struct obd_export *exp)
{
&#160; &#160; &#160; &#160; int rc;&#160; &#160; &#160; &#160; 

        if (exp == exp-&amp;gt;exp_obd-&amp;gt;obd_self_export || !exp-&amp;gt;exp_hashed)
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; return;
...&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It fails because exp is NULL, or to be clear it is 0x2. Thus 0x102 comes from 0x2 + the exp_obd offset (256):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; dis nid_keycmp+0x6
0xffffffffc0ab0356 &amp;lt;nid_keycmp+6&amp;gt;: &#160; &#160; &#160;mov &#160; &#160;0x100(%rsi),%rdx &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="380171" author="sergey" created="Wed, 26 Jul 2023 11:29:00 +0000"  >&lt;p&gt;Another one panic related to rhashtable. This time it fails trying to destroy obd_uuid_hash.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[ 1441.209414] LustreError: 25025:0:(qsd_reint.c:56:qsd_reint_completion()) Skipped 158 previous similar messages
[ 1441.222517] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002
[ 1441.225026] IP: [&amp;lt;ffffffff8539fdff&amp;gt;] rhashtable_free_and_destroy+0x8f/0x150
...
[ 1441.289736] &#160;[&amp;lt;ffffffffc0d58538&amp;gt;] class_cleanup+0x308/0xc50 [obdclass]
[ 1441.293711] &#160;[&amp;lt;ffffffffc0d599a7&amp;gt;] class_process_config+0x4f7/0x2a90 [obdclass]
[ 1441.297506] &#160;[&amp;lt;ffffffffc0d5c106&amp;gt;] class_manual_cleanup+0x1c6/0x720 [obdclass]
[ 1441.299446] &#160;[&amp;lt;ffffffffc173a138&amp;gt;] osp_obd_disconnect+0x178/0x210 [osp]
[ 1441.301294] &#160;[&amp;lt;ffffffffc18bb9cd&amp;gt;] lod_putref+0x25d/0x7c0 [lod]
[ 1441.303030] &#160;[&amp;lt;ffffffffc18bd641&amp;gt;] lod_fini_tgt+0x111/0x130 [lod]
[ 1441.304769] &#160;[&amp;lt;ffffffffc18b0b2b&amp;gt;] lod_device_fini+0x5b/0x1f0 [lod]
[ 1441.306521] &#160;[&amp;lt;ffffffffc0d58a99&amp;gt;] class_cleanup+0x869/0xc50 [obdclass]
[ 1441.310133] &#160;[&amp;lt;ffffffffc0d599a7&amp;gt;] class_process_config+0x4f7/0x2a90 [obdclass]
[ 1441.313608] &#160;[&amp;lt;ffffffffc0d5c106&amp;gt;] class_manual_cleanup+0x1c6/0x720 [obdclass]
[ 1441.315405] &#160;[&amp;lt;ffffffffc18b09a3&amp;gt;] lod_obd_disconnect+0x93/0x1c0 [lod]
[ 1441.317100] &#160;[&amp;lt;ffffffffc1946d88&amp;gt;] mdd_process_config+0x3a8/0x5f0 [mdd]
[ 1441.318798] &#160;[&amp;lt;ffffffffc1795db2&amp;gt;] mdt_stack_fini+0x2c2/0xca0 [mdt]
[ 1441.320430] &#160;[&amp;lt;ffffffffc1796adb&amp;gt;] mdt_device_fini+0x34b/0x930 [mdt]
[ 1441.322074] &#160;[&amp;lt;ffffffffc0d58be8&amp;gt;] class_cleanup+0x9b8/0xc50 [obdclass]
[ 1441.323708] &#160;[&amp;lt;ffffffffc0d599a7&amp;gt;] class_process_config+0x4f7/0x2a90 [obdclass]
[ 1441.325416] &#160;[&amp;lt;ffffffffc0d5c106&amp;gt;] class_manual_cleanup+0x1c6/0x720 [obdclass]
[ 1441.327140] &#160;[&amp;lt;ffffffffc0d96ce5&amp;gt;] server_put_super+0xa05/0xf40 [obdclass]
[ 1441.330380] &#160;[&amp;lt;ffffffff85250ded&amp;gt;] generic_shutdown_super+0x6d/0x100
[ 1441.331937] &#160;[&amp;lt;ffffffff852511f2&amp;gt;] kill_anon_super+0x12/0x20
[ 1441.333423] &#160;[&amp;lt;ffffffffc0d5ef32&amp;gt;] lustre_kill_super+0x32/0x50 [obdclass]
[ 1441.335017] &#160;[&amp;lt;ffffffff852515ce&amp;gt;] deactivate_locked_super+0x4e/0x70
[ 1441.336535] &#160;[&amp;lt;ffffffff85251d56&amp;gt;] deactivate_super+0x46/0x60
[ 1441.337954] &#160;[&amp;lt;ffffffff852713df&amp;gt;] cleanup_mnt+0x3f/0x80
[ 1441.339291] &#160;[&amp;lt;ffffffff85271472&amp;gt;] __cleanup_mnt+0x12/0x20
[ 1441.340623] &#160;[&amp;lt;ffffffff850c2acb&amp;gt;] task_work_run+0xbb/0xe0
[ 1441.341927] &#160;[&amp;lt;ffffffff8502cc65&amp;gt;] do_notify_resume+0xa5/0xc0
[ 1441.343232] &#160;[&amp;lt;ffffffff8579a2ef&amp;gt;] int_signal+0x12/0x17
[ 1441.344451] Code: 00 48 8b 45 d0 8b 50 04 85 d2 0f 85 c0 00 00 00 8b 45 cc 48 8b 4d d0 48 8d 84 c1 80 00 00 00 4c 8b 38 45 31 f6 41 f6 c7 01 75 03 &amp;lt;4d&amp;gt; 8b 37 41 f6 c7 01 75 4a 0f 1f 84 00 00 00 00 00 80 7b 44 00&#160;
[ 1441.349037] RIP &#160;[&amp;lt;ffffffff8539fdff&amp;gt;] rhashtable_free_and_destroy+0x8f/0x150
[ 1441.350511] &#160;RSP &amp;lt;ffffa062564d7618&amp;gt;
[ 1441.351533] CR2: 0000000000000002  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;bucket_table is corrupted:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; rd 0xffffa062e79ca480 64
ffffa062e79ca480: &#160;0000000000000001 0000000000000002 &#160; ................
ffffa062e79ca490: &#160;0000000000000005 000000000000000e &#160; ................
ffffa062e79ca4a0: &#160;0000000000000009 000000000000000b &#160; ................
ffffa062e79ca4b0: &#160;000000000000000d 000000000000000f &#160; ................
ffffa062e79ca4c0: &#160;0000000000000011 0000000000000013 &#160; ................  &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;In a normal case there should be addresses or odd numbers (1,3,5,7,9,b,d,f) in the lowest byte. As can be seen from above, there are &quot;0x2&quot; and &quot;0xe&quot; instead of 0x3 and 0x7. The code considers even numbers to be addresses, causing a kernel panic.&lt;/p&gt;

&lt;p&gt;I have 2 same vmcores and in each case bucket_tables look similar - there are 0x2 and 0xe instead of 0x3 and 0x7.&lt;/p&gt;</comment>
                            <comment id="380189" author="sergey" created="Wed, 26 Jul 2023 13:21:56 +0000"  >&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt
PID: 28008 &#160;TASK: ffff93b86363e300 &#160;CPU: 1 &#160; COMMAND: &quot;ll_ost00_020&quot;
&#160;#0 [ffff93b84af77770] machine_kexec at ffffffffb2a662f4
&#160;#1 [ffff93b84af777d0] __crash_kexec at ffffffffb2b22b62
&#160;#2 [ffff93b84af778a0] crash_kexec at ffffffffb2b22c50
&#160;#3 [ffff93b84af778b8] oops_end at ffffffffb3191798
&#160;#4 [ffff93b84af778e0] no_context at ffffffffb2a75d14
&#160;#5 [ffff93b84af77930] __bad_area_nosemaphore at ffffffffb2a75fe2
&#160;#6 [ffff93b84af77980] bad_area_nosemaphore at ffffffffb2a76104
&#160;#7 [ffff93b84af77990] __do_page_fault at ffffffffb3194750
&#160;#8 [ffff93b84af77a00] trace_do_page_fault at ffffffffb3194a26
&#160;#9 [ffff93b84af77a40] do_async_page_fault at ffffffffb3193fa2
#10 [ffff93b84af77a60] async_page_fault at ffffffffb31907a8
&#160; &#160; [exception RIP: nid_keycmp+6]
&#160; &#160; RIP: ffffffffc0dda356 &#160;RSP: ffff93b84af77b10 &#160;RFLAGS: 00010206
&#160; &#160; RAX: 00000000000000b8 &#160;RBX: ffff93b8ce97ca90 &#160;RCX: 00000000a3653dad
&#160; &#160; RDX: 00000000000001be &#160;RSI: 0000000000000106 &#160;RDI: ffff93b84af77b48
&#160; &#160; RBP: ffff93b84af77b88 &#160; R8: ffff93b8ce97ca90 &#160; R9: 0000000000000038
&#160; &#160; R10: ffffffffc0dc0efd &#160;R11: ffffee1c2ccbf740 &#160;R12: ffff93b42710a0b8 (ffff93b42710a0b8-0xb8 == obd_export)
&#160; &#160; R13: ffff93a752426c00 &#160;R14: ffff93b2f0372000 &#160;R15: ffff93b89d081178
&#160; &#160; ORIG_RAX: ffffffffffffffff &#160;CS: 0010 &#160;SS: 0018
#11 [ffff93b84af77b18] obd_nid_add at ffffffffc0ddcb8e [obdclass]
#12 [ffff93b84af77b90] target_handle_connect at ffffffffc134465a [ptlrpc]
#13 [ffff93b84af77ca0] tgt_request_handle at ffffffffc13eef4a [ptlrpc]
#14 [ffff93b84af77d30] ptlrpc_server_handle_request at ffffffffc1390f83 [ptlrpc]
#15 [ffff93b84af77de8] ptlrpc_main at ffffffffc1395d7a [ptlrpc]
#16 [ffff93b84af77ec8] kthread at ffffffffb2ac5f91
#17 [ffff93b84af77f50] ret_from_fork_nospec_begin at ffffffffb3199dddcrash&amp;gt;

obd_export ffff93a752426c00 // export that is adding (from the stack) 

crash&amp;gt; obd_device ffff93b89d081178
struct obd_device {
struct obd_device {
&#160; obd_type = 0xffff93ba6cb0fd80,
&#160; obd_magic = 2874988271,
&#160; obd_minor = 40,
&#160; obd_lu_dev = 0xffff939e25704000,
&#160; obd_uuid = {
&#160; &#160; uuid = &quot;scratch-OST0002_UUID\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000&quot;
&#160; },
...
&#160; obd_nid_hash = {
&#160; &#160; ht = {
&#160; &#160; &#160; tbl = 0xffff93b2f0372000,
&#160; &#160; &#160; nelems = {
&#160; &#160; &#160; &#160; counter = 183
&#160; &#160; &#160; },
&#160; &#160; &#160; key_len = 2,
&#160; &#160; &#160; p = {
&#160; &#160; &#160; &#160; nelem_hint = 0,
&#160; &#160; &#160; &#160; key_len = 8,
&#160; &#160; &#160; &#160; key_offset = 0,
&#160; &#160; &#160; &#160; head_offset = 184,
&#160; &#160; &#160; &#160; max_size = 0,
&#160; &#160; &#160; &#160; min_size = 4,
&#160; &#160; &#160; &#160; automatic_shrinking = true,

crash&amp;gt; bucket_table 0xffff93b2f0372000
struct bucket_table {
&#160; size = 512,
&#160; nest = 0,
&#160; rehash = 0,
&#160; hash_rnd = 1439736779,
&#160; locks_mask = 255,
&#160; locks = 0xffff93b88842b000,
&#160; walkers = {
&#160; &#160; next = 0xffff93b2f0372020,
&#160; &#160; prev = 0xffff93b2f0372020
&#160; },
&#160; rcu = {
&#160; &#160; next = 0x0,
&#160; &#160; func = 0x0
&#160; },
&#160; future_tbl = 0x0,
&#160; buckets = 0xffff93b2f0372080
}

crash&amp;gt; obd_export.exp_connection ffff93a752426c00
&#160; exp_connection = 0xffff93b8ce97ca80,
crash&amp;gt; ptlrpc_connection.c_peer 0xffff93b8ce97ca80
&#160; c_peer = {
&#160; &#160; nid = 1407377771021085,
&#160; &#160; pid = 12345
&#160; },


hash for 1407377771021085 is 2092047221 // nid for export ffff93a752426c00

find index in a bucket_table:
&amp;gt;&amp;gt;&amp;gt; hex((2092047221&amp;gt;&amp;gt;5)&amp;amp;511) // table_size is 512. see rht_bucket_index
&apos;0xdb&apos;

crash&amp;gt; p/x 0xffff93b2f0372080+(0xdb*8)
$15 = 0xffff93b2f0372758
crash&amp;gt; rd 0xffff93b2f0372758
ffff93b2f0372758: &#160;ffff93b42710a0b8 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;...&apos;....

/* have to minus 0xb8 from 0xffff93b42710a0b8 as it is an offset in obd_export */
crash&amp;gt; p ((struct obd_export *)0xffff93b42710a000)-&amp;gt;exp_connection.c_peer
$16 = {
&#160; nid = 1407377771021781,
&#160; pid = 12345
}

crash&amp;gt; obd_export.exp_nid_hash ffff93b42710a000
&#160; exp_nid_hash = {
&#160; &#160; rhead = {
&#160; &#160; &#160; next = 0x1be // 0x1be looks wrong - should be 0x1b7 instead
&#160; &#160; },
&#160; &#160; next = 0x0
&#160; }, &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;As can be seen from the above analysis, it fails because rhead.next again points at an even value. rhashtable_insert_fast calls nid_keycmp in a cycle for each bucket entry until it gets an odd (end-of-list) marker. In the 1st cycle nid_keycmp returned -ESRCH as the nids are different. Then it did not recognize that there is no further object in this bucket and considered 0x1be (minus 0xb8, the head offset) as an address, passing it into nid_keycmp.&lt;/p&gt;</comment>
                            <comment id="380194" author="bzzz" created="Wed, 26 Jul 2023 13:28:52 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=scherementsev&quot; class=&quot;user-hover&quot; rel=&quot;scherementsev&quot;&gt;scherementsev&lt;/a&gt; can you please attach full set of backtrackes for the last case? lustre log?&lt;/p&gt;</comment>
                            <comment id="380202" author="sergey" created="Wed, 26 Jul 2023 14:10:19 +0000"  >&lt;p&gt;A more complicated case, but with the same symptoms.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;crash&amp;gt; bt
PID: 22239 &#160;TASK: ffff888cb9f08000 &#160;CPU: 6 &#160; COMMAND: &quot;ll_ost01_001&quot;
&#160;#0 [ffff888c00d8b770] machine_kexec at ffffffff8a6662f4
&#160;#1 [ffff888c00d8b7d0] __crash_kexec at ffffffff8a722b62
&#160;#2 [ffff888c00d8b8a0] crash_kexec at ffffffff8a722c50
&#160;#3 [ffff888c00d8b8b8] oops_end at ffffffff8ad91798
&#160;#4 [ffff888c00d8b8e0] no_context at ffffffff8a675d14
&#160;#5 [ffff888c00d8b930] __bad_area_nosemaphore at ffffffff8a675fe2
&#160;#6 [ffff888c00d8b980] bad_area_nosemaphore at ffffffff8a676104
&#160;#7 [ffff888c00d8b990] __do_page_fault at ffffffff8ad94750
&#160;#8 [ffff888c00d8ba00] trace_do_page_fault at ffffffff8ad94a26
&#160;#9 [ffff888c00d8ba40] do_async_page_fault at ffffffff8ad93fa2
#10 [ffff888c00d8ba60] async_page_fault at ffffffff8ad907a8
&#160; &#160; [exception RIP: nid_keycmp+6]
&#160; &#160; RIP: ffffffffc0ab0356 &#160;RSP: ffff888c00d8bb10 &#160;RFLAGS: 00010202
&#160; &#160; RAX: 00000000000000b8 &#160;RBX: ffff886a7532ae50 &#160;RCX: 00000000b94fb39a
&#160; &#160; RDX: 00000000000000ba &#160;RSI: 0000000000000002 &#160;RDI: ffff888c00d8bb48
&#160; &#160; RBP: ffff888c00d8bb88 &#160; R8: ffff886a7532ae50 &#160; R9: 0000000000000038
&#160; &#160; R10: ffffffffc0a96efd &#160;R11: fffff1404d579e00 &#160;R12: ffff888baa120cb8 // obd_export
&#160; &#160; R13: ffff888b783ac400 &#160;R14: ffff888bda00f800 &#160;R15: ffff888bda5f0000
&#160; &#160; ORIG_RAX: ffffffffffffffff &#160;CS: 0010 &#160;SS: 0018
#11 [ffff888c00d8bb18] obd_nid_add at ffffffffc0ab2b8e [obdclass]
#12 [ffff888c00d8bb90] target_handle_connect at ffffffffc0d8165a [ptlrpc]
#13 [ffff888c00d8bca0] tgt_request_handle at ffffffffc0e2bf4a [ptlrpc]
#14 [ffff888c00d8bd30] ptlrpc_server_handle_request at ffffffffc0dcdf83 [ptlrpc]
#15 [ffff888c00d8bde8] ptlrpc_main at ffffffffc0dd2d7a [ptlrpc]
#16 [ffff888c00d8bec8] kthread at ffffffff8a6c5f91
#17 [ffff888c00d8bf50] ret_from_fork_nospec_begin at ffffffff8ad99ddd

target_handle_connect adds following export:
crash&amp;gt; obd_export.exp_client_uuid ffff888b783ac400 // R13
&#160; exp_client_uuid = {
&#160; &#160; uuid = &quot;694aac0a-4105-5656-4ce4-51df4a388c1a\000\000\000&quot;
&#160; },
crash&amp;gt; p ((struct obd_export *)0xffff888b783ac400)-&amp;gt;exp_connection.c_peer.nid
$7 = 1407377771021236

crash&amp;gt; obd_device.obd_nid_hash ffff888bda5f0000 | grep tbl
&#160; &#160; &#160; tbl = 0xffff888bda00f800,

crash&amp;gt; obd_export.exp_nid_hash ffff888baa120c00
&#160; exp_nid_hash = {
&#160; &#160; rhead = {
&#160; &#160; &#160; next = 0xba
&#160; &#160; },
&#160; &#160; next = 0xffff888c1e67f4b8
&#160; }, &#160;
crash&amp;gt; obd_export.exp_nid_hash 0xffff888c1e67f400
&#160; exp_nid_hash = {&#160;
&#160; &#160; rhead = {&#160;
&#160; &#160; &#160; next = 0xb3
&#160; &#160; }, &#160;
&#160; &#160; next = 0x0
&#160; },

looking for 0xb3 crash&amp;gt; rd 0xffff888bda00f880 128 | grep 000000b1
ffff888bda00fb40: &#160;00000000000000b1 ffff888c1915ecb8 &#160; ................
crash&amp;gt;
crash&amp;gt; obd_export.exp_nid_hash ffff888c1915ec00
&#160; exp_nid_hash = {
&#160; &#160; rhead = {
&#160; &#160; &#160; next = 0xffff888baa120cb8
&#160; &#160; },
&#160; &#160; next = 0x0
&#160; },
Let&apos;s go again from the beginning:
crash&amp;gt; obd_export.exp_nid_hash ffff888c1915ec00
&#160; exp_nid_hash = {
&#160; &#160; rhead = {
&#160; &#160; &#160; next = 0xffff888baa120cb8
  &#160; }, &#160;
&#160; &#160; next = 0x0&#160;
&#160; },
crash&amp;gt; p ((struct obd_export *)0xffff888c1915ec00)-&amp;gt;exp_connection.c_peer.nid
$3 = 1407377771020507

crash&amp;gt; obd_export.exp_nid_hash 0xffff888baa120c00
&#160; exp_nid_hash = {&#160;
&#160; &#160; rhead = {&#160;
&#160; &#160; &#160; next = 0xba
&#160; &#160; }, &#160;
&#160; &#160; next = 0xffff888c1e67f4b8
&#160; },
crash&amp;gt; p ((struct obd_export *)0xffff888baa120c00)-&amp;gt;exp_connection.c_peer.nid
$4 = 1407377771020299

crash&amp;gt; obd_export.exp_nid_hash 0xffff888c1e67f400
&#160; exp_nid_hash = {&#160;
&#160; &#160; rhead = {&#160;
&#160; &#160; &#160; next = 0xb3
&#160; &#160; }, &#160;
&#160; &#160; next = 0x0&#160;
&#160; },
crash&amp;gt; p ((struct obd_export *)0xffff888c1e67f400)-&amp;gt;exp_connection.c_peer.nid
$5 = 1407377771020299
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;As can be seen from above, we again have 0xba instead of 0xb3. Thus it does not stop iterating and treats 0xba as an address.&lt;/p&gt;

&lt;p&gt;One more thing from the 2 latest investigations. In both cases the next value is larger than it should be by 7, i.e.:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;0xba instead of 0xb3
0x1be instead of 0x1b7&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;ul&gt;
	&lt;li&gt;0x1be instead of 0x1b7 - see comment&lt;/li&gt;
&lt;/ul&gt;
</comment>
                            <comment id="380390" author="gerrit" created="Thu, 27 Jul 2023 14:25:52 +0000"  >&lt;p&gt;&quot;Sergey Cheremencev &amp;lt;scherementsev@ddn.com&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51780&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51780&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16930&quot; title=&quot;BUG: nid_keycmp+0x6&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16930&quot;&gt;&lt;del&gt;LU-16930&lt;/del&gt;&lt;/a&gt; obd: trigger a panic if rhltable corrupted&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a0df4cab734895af784e3826876183b939ae3e2d&lt;/p&gt;</comment>
                            <comment id="380661" author="neilb" created="Mon, 31 Jul 2023 07:57:01 +0000"  >&lt;p&gt;This looks a lot like a use-after-free bug.&#160; There is no way that the rhead.next field would ever be set to the values it is getting, so it must be getting those values because some code thinks that byte of memory is not in rhead.next.&lt;/p&gt;

&lt;p&gt;So either lustre is continuing to read from it after freeing the memory, or some other code is continuing to write after freeing their memory.&#160; In either case, that other code appears to be performing a &quot;+7&quot; operation!&lt;/p&gt;

&lt;p&gt;I found 2 possible places were lustre can read the memory after freeing it.&lt;/p&gt;

&lt;p&gt;1/ the memory is freed directly rather than using RCU.&#160; So it could already be free when some other thread is walking past the memory in the rhash table.&lt;/p&gt;

&lt;p&gt;2/ ldlm_flock_lookup_cb does a &quot;refcount_inc()&quot; (in class_export_get()), but the final ref might already have been put, and the memory might be about to be freed.&#160; It should do refcount_inc_not_zero()&lt;/p&gt;

&lt;p&gt;These should both be fixed, but I don&apos;t see how that can contribute to the observed symptoms.&#160; In particular the details provided in the &quot;more complicated&quot; case show (if I&apos;m interpreting them properly) that the object is still in the rhashtable.&#160; The above two issues would not leave it there.&lt;/p&gt;

&lt;p&gt;So my hypothesis is that some &lt;b&gt;other&lt;/b&gt; code or driver on the crashing machine is corrupting data used by lustre.&#160;One way to test this would be to change the code which allocates a &apos;struct obd_export&apos; to allocate twice as much memory.&#160;This should cause it to be allocated from a different slab, and so reduce the risk of reusing memory that the other driver is abusing.&lt;/p&gt;

&lt;p&gt;But that is just a guess.&lt;/p&gt;</comment>
                            <comment id="380670" author="bzzz" created="Mon, 31 Jul 2023 09:24:27 +0000"  >&lt;p&gt;I&apos;m constantly testing Lustre with &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/51245&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/51245&lt;/a&gt; forwarding different memory allocations via alloc_page(), then mapped to a dedicated address space, but can&apos;t hit use-after-free.&lt;/p&gt;</comment>
                            <comment id="380821" author="neilb" created="Tue, 1 Aug 2023 02:53:21 +0000"  >&lt;p&gt;When you are testing with that patch do you get the corruption of the rhashtable chain?&lt;/p&gt;

&lt;p&gt;If you do, then I cannot guess what is happening.&lt;/p&gt;

&lt;p&gt;If you don&apos;t - that would make sense.&#160; The patch only tests for use-after-free in lustre code.&#160; I don&apos;t think the use-after-free is happening in the lustre code.&#160; I think it is happening in some OTHER code that happens to be running on the same machine and lustre is being corrupted by a bug somewhere else.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="380865" author="sergey" created="Tue, 1 Aug 2023 08:58:11 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=neilb&quot; class=&quot;user-hover&quot; rel=&quot;neilb&quot;&gt;neilb&lt;/a&gt; , we can&apos;t reproduce this issue. Just one of our customers periodically has this kind of failure.&lt;/p&gt;

&lt;p&gt;I also think that with high probability this could be caused by a wrong code outside lustre.&#160;It is interesting that in a part of failures the bucket table itself is not corrupted at the moment of crash. Instead of that exp_nid_hash.rhash-&amp;gt;next is corrupted. This could be the result of reading wrong value from the table and using it as null marker in a bucket.&lt;/p&gt;</comment>
                            <comment id="382636" author="sergey" created="Wed, 16 Aug 2023 10:21:23 +0000"  >&lt;p&gt;One more occurrence of the same problem:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[117224.294543] BUG: unable to handle kernel NULL pointer dereference at 000000000000002a
[117224.295928] IP: [&amp;lt;ffffffffb13b7476&amp;gt;] rht_deferred_worker+0x226/0x430
...
[117224.342181]  [&amp;lt;ffffffffb10c32ef&amp;gt;] process_one_work+0x17f/0x440
[117224.343597]  [&amp;lt;ffffffffb10c4436&amp;gt;] worker_thread+0x126/0x3c0
[117224.346515]  [&amp;lt;ffffffffb10cb621&amp;gt;] kthread+0xd1/0xe0
[117224.349319]  [&amp;lt;ffffffffb17c61dd&amp;gt;] ret_from_fork_nospec_begin+0x7/0x21
[117224.352394] Code: 8d 04 0b 48 8b 00 a8 01 0f 85 1d 01 00 00 48 8b 18 f6 c3 01 0f 85 06 01 00 00 49 89 c6 eb 0c 66 0f 1f 44 00 00 49 89 de 4c 89 e3 &amp;lt;4c&amp;gt; 8b 23 41 f6 c4 01 74 f1 41 0f b7 57 ce 49 8b 47 e8 48 89 df 
[117224.357641] RIP  [&amp;lt;ffffffffb13b7476&amp;gt;] rht_deferred_worker+0x226/0x430
[117224.359263]  RSP &amp;lt;ffff9843e3977da0&amp;gt;
[117224.360509] CR2: 000000000000002a 

crash&amp;gt; obd_export.exp_nid_hash ffff9862543a6800
  exp_nid_hash = {
    rhead = {
      next = 0x2a
    },
    next = 0x0
  },
crash&amp;gt; l *(rht_deferred_worker+0x226)
0xffffffffb13b7476 is in rht_deferred_worker (lib/rhashtable.c:275).
270     
271             err = -ENOENT;
272     
273             rht_for_each(entry, old_tbl, old_hash) {
274                     err = 0;
275                     next = rht_dereference_bucket(entry-&amp;gt;next, old_tbl, old_hash);
276     
277                     if (rht_is_a_nulls(next))
278                             break;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It failed again due to a wrong rhead.next value == 0x2a. It should be 0x23 and again the difference is 7.&lt;br/&gt;
This time it is seen in the logs that there was the last put for this export but no destroy_export. Probably destroy message is just stolen.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000020:00000080:18.0:1691676633.094263:0:25873:0:(genops.c:985:class_export_put()) final put ffff9862543a6800/d3484309-ce72-216b-074d-759e96edefd8
00000020:00080000:5.0:1691684259.028627:0:15635:0:(obd_config.c:264:obd_nid_add()) scratch-OST00ca: added exp ffff9862543a6800 nid 1407377771022124: rc = 0 &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="382671" author="sergey" created="Wed, 16 Aug 2023 14:07:43 +0000"  >&lt;p&gt;I believe that all kernel panics described in this ticket are caused by &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17034&quot; title=&quot;memory corruption caused by bug in qmt_seed_glbe_all&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17034&quot;&gt;&lt;del&gt;LU-17034&lt;/del&gt;&lt;/a&gt;. So please read the description in 17034 before continue.&lt;/p&gt;

&lt;p&gt;I&apos;ve analysed at least 10 different vmcores with corrupted rhashtables. In 80% of all cases I saw that the value stored in a low byte(usually 3 or 7) was increased by 7(0xA or 0xE). If we suppose that 2 buckets in bucket table matches struct lqe_glbl_entry, value 0x3 will be equal to lge_edquot==1 and lge_qunit_set==1 and 0x7 to lge_edquot==1, lge_qunit_set==1 and lgd.lge_qunit_nu==1.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;struct lqe_glbl_entry {
&#160; &#160; &#160; &#160; __u64 &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;lge_qunit;
&#160; &#160; &#160; &#160; unsigned long &#160; &#160; &#160; &#160; &#160; &#160;lge_edquot:1,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;/* true when minimum qunit is set */
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;lge_qunit_set:1,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;/* qunit or edquot is changed - need
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;* to send glimpse to appropriate slave */
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;lge_qunit_nu:1,
&#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160; &#160;lge_edquot_nu:1;
}; &lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I&apos;ve written a simple user mode program (test.c in attachment) that is partially copying the code from qmt_seed_glbe_all to show how 0x3 and 0x7 could be changed to 0xA and 0xE.&lt;br/&gt;
See the results:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@vm2 ~]# ./a.out 
rhashtable bkt 0x3(lge_edquot==1, lge_qunit_set==1), edquot==0
rhashtable bkt 0xa

rhashtable bkt 0x3(lge_edquot==1, lge_qunit_set==1), edquot==1
rhashtable bkt 0x3

rhashtable bkt 0x7(lge_edquot==1, lge_qunit_set==1), edquot==0
rhashtable bkt 0xe

rhashtable bkt 0x7(lge_edquot==1, lge_qunit_set==1), edquot==1
rhashtable bkt 0x7
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It explains why we often saw values increased by 7. However, it could be increased not only by 7 - I&apos;ve just provided an example of the most popular case.&lt;/p&gt;


&lt;p&gt;Note that sometimes lqe_qunit also could be set outside the array, corrupting 8 bytes of a neighboring memory region with something like 1024,4096, ...&lt;/p&gt;
                            <comment id="390488" author="sergey" created="Wed, 25 Oct 2023 13:05:42 +0000"  >&lt;p&gt;rhashtable was corrupted due to the bug in Quota Pools code - see &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17034&quot; title=&quot;memory corruption caused by bug in qmt_seed_glbe_all&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17034&quot;&gt;&lt;del&gt;LU-17034&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10010">
                    <name>Duplicate</name>
                                            <outwardlinks description="duplicates">
                                        <issuelink>
            <issuekey id="77477">LU-17034</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="72520">LU-16189</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="50054" name="test.c" size="2164" author="scherementsev" created="Wed, 16 Aug 2023 14:09:08 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03p4v:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>