<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:35:27 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-17440] after move from 2.14 to 2.15: LNetError: 31941:0:(peer.c:2194:lnet_destroy_peer_ni_locked()) ASSERTION( list_empty(&amp;lpni-&gt;lpni_peer_nis) ) </title>
                <link>https://jira.whamcloud.com/browse/LU-17440</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We combined a OS update from TOSS 4.6-6 to TOSS 4.7-2.1 with a move from lustre 2.14 to 2.15 (2.14.0_22.llnl-1 to lustre-2.15.4_1.llnl-1).&lt;/p&gt;

&lt;p&gt;A few hours later we began to see this error, and eventually saw it on all 12 asp server nodes.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2024-01-16 23:13:19 [39638.886090] Lustre: aspls3-OST0004: Client 8f15405e-d4cc-cf3c-7534-051e7352cf50 (at 192.168.128.24@o2ib35) reconnecting
2024-01-16 23:13:19 [39638.896879] Lustre: Skipped 98 previous similar messages
2024-01-16 23:13:34 [39654.404520] LNetError: 165557:0:(peer.c:2194:lnet_destroy_peer_ni_locked()) ASSERTION( list_empty(&amp;amp;lpni-&amp;gt;lpni_peer_nis) ) failed:
2024-01-16 23:13:34 [39654.416271] LNetError: 165557:0:(peer.c:2194:lnet_destroy_peer_ni_locked()) LBUG
2024-01-16 23:13:34 [39654.423671] Pid: 165557, comm: kiblnd_sd_00_01 4.18.0-513.9.1.1toss.t4.x86_64 #1 SMP Wed Nov 29 11:04:55 PST 2023
2024-01-16 23:13:34 [39654.433921] Call Trace TBD:
2024-01-16 23:13:34 [39654.436731] [&amp;lt;0&amp;gt;] libcfs_call_trace+0x6f/0xa0 [libcfs]
2024-01-16 23:13:34 [39654.441888] [&amp;lt;0&amp;gt;] lbug_with_loc+0x3f/0x70 [libcfs]
2024-01-16 23:13:34 [39654.446688] [&amp;lt;0&amp;gt;] lnet_destroy_peer_ni_locked+0x44d/0x4e0 [lnet]
2024-01-16 23:13:34 [39654.452722] [&amp;lt;0&amp;gt;] lnet_handle_find_routed_path+0x86c/0xee0 [lnet]
2024-01-16 23:13:34 [39654.458845] [&amp;lt;0&amp;gt;] lnet_select_pathway+0xb95/0x16c0 [lnet]
2024-01-16 23:13:34 [39654.464265] [&amp;lt;0&amp;gt;] lnet_send+0x6d/0x1e0 [lnet]
2024-01-16 23:13:34 [39654.468646] [&amp;lt;0&amp;gt;] lnet_parse_local+0x3ed/0xdd0 [lnet]
2024-01-16 23:13:34 [39654.473721] [&amp;lt;0&amp;gt;] lnet_parse+0xd7d/0x1490 [lnet]
2024-01-16 23:13:34 [39654.478366] [&amp;lt;0&amp;gt;] kiblnd_handle_rx+0x30e/0x900 [ko2iblnd]
2024-01-16 23:13:34 [39654.483782] [&amp;lt;0&amp;gt;] kiblnd_scheduler+0x104b/0x10d0 [ko2iblnd]
2024-01-16 23:13:34 [39654.489363] [&amp;lt;0&amp;gt;] kthread+0x14c/0x170
2024-01-16 23:13:34 [39654.493030] [&amp;lt;0&amp;gt;] ret_from_fork+0x1f/0x40
2024-01-16 23:13:34 [39654.497050] Kernel panic - not syncing: LBUG
2024-01-16 23:13:34 [39654.501320] CPU: 47 PID: 165557 Comm: kiblnd_sd_00_01 Kdump: loaded Tainted: P &#160; &#160; &#160; &#160; &#160; OE &#160;X --------- - &#160;- 4.18.0-513.9.1.1toss.t\
4.x86_64 #1
2024-01-16 23:13:34 [39654.514172] Hardware name: Supermicro SSG-229P-DN2R24264-LL013/X11DSN-TS, BIOS 3.4 11/04/2020
2024-01-16 23:13:34 [39654.522683] Call Trace:
2024-01-16 23:13:34 [39654.525137] &#160;dump_stack+0x41/0x60
2024-01-16 23:13:34 [39654.528457] &#160;panic+0xe7/0x2ac
2024-01-16 23:13:34 [39654.531429] &#160;? ret_from_fork+0x1f/0x40
2024-01-16 23:13:34 [39654.535182] &#160;lbug_with_loc.cold.8+0x18/0x18 [libcfs]
2024-01-16 23:13:34 [39654.540156] &#160;lnet_destroy_peer_ni_locked+0x44d/0x4e0 [lnet]
2024-01-16 23:13:35 [39654.545747] &#160;lnet_handle_find_routed_path+0x86c/0xee0 [lnet]
2024-01-16 23:13:35 [39654.551423] &#160;? lnet_peer_ni_find_locked+0x14/0x30 [lnet]
2024-01-16 23:13:35 [39654.556753] &#160;lnet_select_pathway+0xb95/0x16c0 [lnet]
2024-01-16 23:13:35 [39654.561735] &#160;? kiblnd_check_sends_locked+0x1a5/0x4a0 [ko2iblnd]
2024-01-16 23:13:35 [39654.567656] &#160;lnet_send+0x6d/0x1e0 [lnet]
2024-01-16 23:13:35 [39654.571600] &#160;lnet_parse_local+0x3ed/0xdd0 [lnet]
2024-01-16 23:13:35 [39654.576238] &#160;lnet_parse+0xd7d/0x1490 [lnet]
2024-01-16 23:13:35 [39654.580438] &#160;? try_to_wake_up+0x1c2/0x4f0
2024-01-16 23:13:35 [39654.584454] &#160;kiblnd_handle_rx+0x30e/0x900 [ko2iblnd]
2024-01-16 23:13:35 [39654.589427] &#160;? __wake_up_common+0x7a/0x190
2024-01-16 23:13:35 [39654.593526] &#160;kiblnd_scheduler+0x104b/0x10d0 [ko2iblnd]
2024-01-16 23:13:35 [39654.598665] &#160;? finish_wait+0x90/0x90
2024-01-16 23:13:35 [39654.602245] &#160;? kiblnd_cq_event+0x80/0x80 [ko2iblnd]
2024-01-16 23:13:35 [39654.607125] &#160;kthread+0x14c/0x170
2024-01-16 23:13:35 [39654.610357] &#160;? set_kthread_struct+0x50/0x50
2024-01-16 23:13:35 [39654.614544] &#160;ret_from_fork+0x1f/0x40
2024-01-16 23:13:36 [ &#160; &#160;0.000000] Linux version 4.18.0-513.9.1.1toss.t4.x86_64 (mockbuild@builder2-x86.buildfarm.internal) (gcc version 8.5.0 20210514 (Re\
d Hat 8.5.0-20) (GCC)) #1 SMP Wed Nov 29 11:04:55 PST 2023&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>TOSS 4.7-2.1&lt;br/&gt;
lustre 2.15.4_1.llnl-1&lt;br/&gt;
on lustre server asp</environment>
        <key id="80083">LU-17440</key>
            <summary>after move from 2.14 to 2.15: LNetError: 31941:0:(peer.c:2194:lnet_destroy_peer_ni_locked()) ASSERTION( list_empty(&amp;lpni-&gt;lpni_peer_nis) ) </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="ssmirnov">Serguei Smirnov</assignee>
                                    <reporter username="defazio">Gian-Carlo Defazio</reporter>
                        <labels>
                            <label>llnl</label>
                            <label>topllnl</label>
                    </labels>
                <created>Wed, 17 Jan 2024 20:02:06 +0000</created>
                <updated>Wed, 7 Feb 2024 21:21:04 +0000</updated>
                                            <version>Lustre 2.15.4</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="400096" author="defazio" created="Wed, 17 Jan 2024 20:03:24 +0000"  >&lt;p&gt;For my notes the local issue is TOSS-6214&lt;/p&gt;</comment>
                            <comment id="400122" author="defazio" created="Thu, 18 Jan 2024 00:32:19 +0000"  >&lt;p&gt;The asp cluster has 12 server nodes, asp&lt;span class=&quot;error&quot;&gt;&amp;#91;1-12&amp;#93;&lt;/span&gt;. asp&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; are MDT&lt;span class=&quot;error&quot;&gt;&amp;#91;0000-0003&amp;#93;&lt;/span&gt;, asp&lt;span class=&quot;error&quot;&gt;&amp;#91;5-12&amp;#93;&lt;/span&gt; are OST&lt;span class=&quot;error&quot;&gt;&amp;#91;0000-0007&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;The nids are 172.19.1.&lt;span class=&quot;error&quot;&gt;&amp;#91;141-152&amp;#93;&lt;/span&gt;@o2ib100 so the last octet of the IPV4 is (&amp;lt;node_number&amp;gt; + 140)&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I will also include some logs for orelic. orelic&lt;span class=&quot;error&quot;&gt;&amp;#91;2-5&amp;#93;&lt;/span&gt; were rebooted from a 2.12 image to a 2.15 image from about 2024-01-16 (15:00 - 17:00). orelic is a ib to tcp lustre router cluster.&lt;/p&gt;

&lt;p&gt;The nids for orelic&lt;span class=&quot;error&quot;&gt;&amp;#91;2-5&amp;#93;&lt;/span&gt; are 172.19.2.&lt;span class=&quot;error&quot;&gt;&amp;#91;22-25&amp;#93;&lt;/span&gt;@o2ib100 and 172.16.66.&lt;span class=&quot;error&quot;&gt;&amp;#91;22-25&amp;#93;&lt;/span&gt;@tcp&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The console logs are in asp-orelic-console.tar.gz&lt;/p&gt;</comment>
                            <comment id="400125" author="ssmirnov" created="Thu, 18 Jan 2024 00:41:13 +0000"  >&lt;p&gt;I can&apos;t explain yet how exactly this happened. Did you update all nodes to lustre-2.15.4_1.llnl-1 or just the servers? Please share a core file if you have it.&lt;/p&gt;

&lt;p&gt;I&apos;m a bit suspicious of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt; &quot;lnet: Update lnet_peer_*_decref_locked usage&quot; change. If you have room to experiment, it may be worth reverting this change to see if it helps. I&apos;ll work on reproducing locally, too.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="400129" author="defazio" created="Thu, 18 Jan 2024 01:01:43 +0000"  >&lt;p&gt;There&apos;s still a variety of lustre versions, 2.12, 2.14, and 2.15&lt;/p&gt;

&lt;p&gt;The only production server cluster we&apos;ve put 2.15 on is asp. The TOSS 4 (RHEL 8) server clusters are running 2.14 (2.14.0_24.llnl-1). asp was running 2.14.0_22.llnl-1 before the update. The TOSS 3 (RHEL 7) server clusters run 2.12.&lt;/p&gt;

&lt;p&gt;Most client clusters are running 2.12, some experimental and test clusters run 2.15.&lt;/p&gt;

&lt;p&gt;The relic clusters (zrelic and orelic) run 2.12, but orelic has been 2.15 for a while for testing. It got rebooted back to 2.12 on the morning of 2024-01-16 and then put back to 2.15 that afternoon.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;So to be clear, the only 2.15 updates were asp (a server cluster) and orelic (a router cluster).&lt;/p&gt;

&lt;p&gt;orelic had been 2.15 previously, asp had only ever been 2.14.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;One of the first things I checked was to make sure we didn&apos;t leave out &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; class=&quot;external-link&quot; rel=&quot;nofollow&quot;&gt;https://jira.whamcloud.com/browse/LU-17062&lt;/a&gt; in our 2.15_4-llnl branch. We&apos;ll look into testing with that patch removed.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Unfortunately we didn&apos;t get any core files, despite all the LBUGs.&lt;/p&gt;</comment>
                            <comment id="400383" author="ssmirnov" created="Fri, 19 Jan 2024 19:36:19 +0000"  >&lt;p&gt;I did a review of which fixes from Lustre master LLNL branch may be missing, but haven&apos;t found anything that would stand out as definitely related.&#160;&lt;/p&gt;

&lt;p&gt;I&apos;d suggest though that you consider porting &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16709&quot; title=&quot;LNet: locking multiple NIDs of the same MR peer as primary results in incorrect representation&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16709&quot;&gt;&lt;del&gt;LU-16709&lt;/del&gt;&lt;/a&gt; &quot;lnet: fix locking multiple NIDs of the MR peer&quot;. It fixes &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14668&quot; title=&quot;LNet: do discovery in the background&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14668&quot;&gt;&lt;del&gt;LU-14668&lt;/del&gt;&lt;/a&gt; &quot;lnet: Lock primary NID logic&quot; which LLNL branch does have.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Serguei.&lt;/p&gt;</comment>
                            <comment id="401343" author="defazio" created="Fri, 26 Jan 2024 00:53:03 +0000"  >&lt;p&gt;I was able to reproduce the bug.&lt;/p&gt;

&lt;p&gt;The original bug seems to have been between asp and the compute cluster ruby. ruby has 8 routers and is connected to asp:&lt;/p&gt;

&lt;p&gt;&amp;lt;ruby_compute_nodes&amp;gt; &amp;lt;&#8212;&amp;gt; o2ib39 &amp;lt;&#8212;&amp;gt; &amp;lt;ruby_routers&amp;gt; &amp;lt;&#8212;&amp;gt; o2ib100 &amp;lt;&#8212;&amp;gt; asp&lt;/p&gt;

&lt;p&gt;However a route is missing on asp. The routes to ruby should be 172.19.2.&lt;span class=&quot;error&quot;&gt;&amp;#91;39-46&amp;#93;&lt;/span&gt;, but the routes are 172.19.2.&lt;span class=&quot;error&quot;&gt;&amp;#91;39-45&amp;#93;&lt;/span&gt;, so 172.19.2.46 is missing. I noticed that this nid appears right before a lot of the LBUGs in the console logs.&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;I recreated this setup using test clusters mutt (compute) and garter (server).&lt;/p&gt;

&lt;p&gt;The garter nids for garter&lt;span class=&quot;error&quot;&gt;&amp;#91;1-8&amp;#93;&lt;/span&gt; are 172.19.1.&lt;span class=&quot;error&quot;&gt;&amp;#91;133-140&amp;#93;&lt;/span&gt;@o2ib100&lt;/p&gt;

&lt;p&gt;&amp;lt;mutt_compute_nodes&amp;gt; &amp;lt;&#8212;&amp;gt; o2ib44 &amp;lt;&#8212;&amp;gt; &amp;lt;mutt_routers&amp;gt; &amp;lt;&#8212;&amp;gt; o2ib100 &amp;lt;&#8212;&amp;gt; garter&lt;/p&gt;

&lt;p&gt;The mutt routers are mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; with nids 172.19.1.&lt;span class=&quot;error&quot;&gt;&amp;#91;105-108&amp;#93;&lt;/span&gt;@o2ib100 and 192.168.128.&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;@o2ib44. I removed the route through mutt4 (172.19.1.108@o2ib100) on all garter nodes so now the routes are 172.19.1.&lt;span class=&quot;error&quot;&gt;&amp;#91;105-107&amp;#93;&lt;/span&gt;@o2ib100.&lt;/p&gt;

&lt;p&gt;I then pinged from a mutt compute node, mutt7 (192.168.128.7@o2ib44)&#160; to garter3 (172.19.1.135@o2ib100) using lnetctl ping. I had to ping twice, I assume to round-robin my way to using mutt4, for which garter has no route.&lt;/p&gt;

&lt;p&gt;This caused the error on garter3. I had panic on LBUG turned off.&lt;/p&gt;

&lt;p&gt;I&apos;ll upload the debug logs for mutt7, the mutt router mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; and garter3 as mutt-garter-reproducer.tar.gz&lt;/p&gt;

&lt;p&gt;I haven&apos;t looked through these debug logs yet.&lt;/p&gt;

&lt;p&gt;I&apos;ll see if the patch recommendations you made make a difference.&lt;/p&gt;

&lt;p&gt;Also, we are currently in the configuration with a route to ruby missing on asp, but asp is running lustre-2.14.0_24.llnl-1.t4.x86_64 and we don&apos;t get this bug.&lt;/p&gt;</comment>
                            <comment id="401494" author="defazio" created="Fri, 26 Jan 2024 21:33:24 +0000"  >&lt;p&gt;I was unable to recreate the bug using our 2.14 branch lustre-2.14.0_24.llnl-1.t4.x86_64 or our 2.15 branch lustre 2.15.4_1.llnl-1 with the only changes being &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt; reverted. I&apos;ve uploaded the debug files for these tests.&lt;/p&gt;

&lt;p&gt;I noticed that I used garter4 for the server in these examples, whereas for the bug reproducer I used garter3.&lt;/p&gt;</comment>
                            <comment id="402321" author="defazio" created="Fri, 2 Feb 2024 00:24:09 +0000"  >&lt;p&gt;This is what I think is going wrong, do you agree?&lt;/p&gt;

&lt;p&gt;As noted before, without &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt;, the problem goes away.&lt;/p&gt;

&lt;p&gt;For the code lines below, I&apos;m referring to where we added &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt; in our 2.15.4_1-llnl branch &lt;a href=&quot;https://github.com/LLNL/lustre/commit/2e2719341590edef547e16b4051684781cf1b795&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/LLNL/lustre/commit/2e2719341590edef547e16b4051684781cf1b795&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The debug data is in mutt-garter-debug-2fcd4d27f.tar.gz in the dk.TOSS-6214.garter3.lustre-2.15.4_1.llnl_3_g2fcd4d2-1.t4.x86_64.1706816872 file. Lines 737 to 771&lt;br/&gt;
The debug patch version of lustre that was running on garter3 is at &lt;a href=&quot;https://github.com/LLNL/lustre/tree/debug_print-lnet_handle_find_routed_path&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/LLNL/lustre/tree/debug_print-lnet_handle_find_routed_path&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The setup is&lt;br/&gt;
server garter3: 172.19.1.135@o2ib100&lt;br/&gt;
routers mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;: 172.19.1.&lt;span class=&quot;error&quot;&gt;&amp;#91;105-108&amp;#93;&lt;/span&gt;@o2ib100 and 192.168.128.&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;@o2ib44&lt;br/&gt;
client mutt7: 192.168.128.7@o2ib44&lt;/p&gt;

&lt;p&gt;garter3 &amp;lt;&#8212;&amp;gt; o2ib100 &amp;lt;&#8212;&amp;gt; mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt; &amp;lt;&#8212;&amp;gt; o2ib44 &amp;lt;&#8212;&amp;gt; mutt7&lt;/p&gt;

&lt;p&gt;garter3 has routes to o2ib44 via mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-3&amp;#93;&lt;/span&gt;, mutt7 has routes to o2ib100 via mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1-4&amp;#93;&lt;/span&gt;, so there is an asymmetry regarding mutt4&lt;br/&gt;
mutt7 pings garter3 via mutt4&lt;/p&gt;

&lt;p&gt;Looking at function lnet_handle_find_routed_path():&lt;/p&gt;

&lt;p&gt;at line 2182, there is a call to lnet_peer_ni_find_locked()&lt;br/&gt;
the call stack is lnet_handle_find_routed_path() -&amp;gt; lnet_peer_ni_find_locked() -&amp;gt; lnet_get_peer_ni_locked() -&amp;gt; lnet_peer_ni_addref_locked()&lt;br/&gt;
The return value is gwni. If gwni is not NULL, it is the gateway NI. In the debug logs for garter3, I see the nid for mutt4 when gwni is returned.&lt;/p&gt;

&lt;p&gt;The refcount for mutt4 is incremented due to lnet_peer_ni_addref_locked() being in the call stack above.&lt;/p&gt;

&lt;p&gt;before &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt;, if gwni is not NULL, the refcount for mutt4 is decremented almost immediately at line 2184&lt;br/&gt;
after &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt;, the decrement happens at the end of the function.&lt;/p&gt;

&lt;p&gt;at line 2291, lnet_find_route_locked() is called. In the debug logs I can see the value of gwni-&amp;gt;lpni_nid change after this call.&lt;br/&gt;
It changes from the nid of mutt4 to the nid of another mutt router that garter3 does have a route through.&lt;/p&gt;

&lt;p&gt;before the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt;, there were no other calls to lnet_peer_ni_decref_locked()&lt;br/&gt;
after &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt; there is a call to lnet_peer_ni_decref_locked() if gwni was not NULL when it was modified at line 2182&lt;br/&gt;
and hasn&apos;t subsequently become NULL. It shouldn&apos;t have become NULL if there&apos;s some NI that garter3 can use to get to o2ib44, which&lt;br/&gt;
there is in this case.&lt;/p&gt;

&lt;p&gt;after &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17062&quot; title=&quot;Prevent use after free following *_decref_locked() usage&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17062&quot;&gt;&lt;del&gt;LU-17062&lt;/del&gt;&lt;/a&gt; at line 2365 lnet_peer_ni_decref_locked() is called on a router with a refcount of 1 (assuming the gwni changed values due to lnet_find_route_locked()). The refcount goes to 0 (or would go to 0) so lnet_destroy_peer_ni_locked() is called. lnet_destroy_peer_ni_locked() has asserts to make sure something like this doesn&apos;t happen, so the asserts fail.&lt;/p&gt;</comment>
                            <comment id="402328" author="ssmirnov" created="Fri, 2 Feb 2024 01:16:10 +0000"  >&lt;p&gt;Thanks &lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=defazio&quot; class=&quot;user-hover&quot; rel=&quot;defazio&quot;&gt;defazio&lt;/a&gt; for tracking this down! I think your analysis is correct: indeed it looks like essentially we decref on what we didn&apos;t addref here.&#160;&lt;/p&gt;

&lt;p&gt;(I wonder if this can be reproduced with just two routers, for the sake of adding a testcase.)&lt;/p&gt;

&lt;p&gt;Are you planning to push a patch for this or would you like me to?&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="402433" author="defazio" created="Fri, 2 Feb 2024 17:38:58 +0000"  >&lt;p&gt;I&apos;ll push a patch. I&apos;m not entirely sure what the best way to go about this is, but I guess I&apos;ll find out in the code reviews.&lt;/p&gt;

&lt;p&gt;I&apos;ll run a test with 2 routers to see if I can reproduce it.&lt;/p&gt;

&lt;p&gt;Another weird thing I noticed was that it didn&apos;t reproduce on a different test cluster. That cluster, slag, isn&apos;t the same hardware as asp and garter. So I&apos;m gonna go back and confirm that I didn&apos;t just have things set up wrong. However, the bug that I found doesn&apos;t look hardware specific.&lt;/p&gt;</comment>
                            <comment id="402458" author="defazio" created="Fri, 2 Feb 2024 22:00:46 +0000"  >&lt;p&gt;The bug does reproduce with 2 routers. I tested where mutt7 has routes through mutt&lt;span class=&quot;error&quot;&gt;&amp;#91;1,2&amp;#93;&lt;/span&gt; and garter3 has a route through mutt1. The debug files are in mutt-garter-debug-2routers.tar.gz&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;The bug does not reproduce in the case that garter3 has no routes to o2ib44. The debug files are in mutt-garter-debug-garter3_no_routes.tar.gz&lt;/p&gt;</comment>
                            <comment id="402489" author="gerrit" created="Sat, 3 Feb 2024 00:54:07 +0000"  >&lt;p&gt;&quot;Gian-Carlo DeFazio &amp;lt;defazio1@llnl.gov&amp;gt;&quot; uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/c/fs/lustre-release/+/53896&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/c/fs/lustre-release/+/53896&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-17440&quot; title=&quot;after move from 2.14 to 2.15: LNetError: 31941:0:(peer.c:2194:lnet_destroy_peer_ni_locked()) ASSERTION( list_empty(&amp;amp;lpni-&amp;gt;lpni_peer_nis) ) &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-17440&quot;&gt;LU-17440&lt;/a&gt; lnet: prevent errorneous decref for asym route&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 8466a612e1bf5ceda0e3bf32a5eada2277cbbc75&lt;/p&gt;</comment>
                            <comment id="402942" author="defazio" created="Tue, 6 Feb 2024 23:36:49 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=ssmirnov&quot; class=&quot;user-hover&quot; rel=&quot;ssmirnov&quot;&gt;ssmirnov&lt;/a&gt;&lt;br/&gt;
I&apos;ve gone with the patch that Chris Horn recommended. As for the test case, you mentioned using just two routers. Is there a vm configuration that already has 2 routers setup? Or do I need to set things up from within a test script like sanity-lnet.sh?&lt;/p&gt;</comment>
                            <comment id="403079" author="ssmirnov" created="Wed, 7 Feb 2024 21:21:04 +0000"  >&lt;p&gt;I was hoping it would be possible to go with sanity-lnet.sh. There are some routing tests there already, but I don&apos;t think there&apos;s a test which uses 2 routers.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=hornc&quot; class=&quot;user-hover&quot; rel=&quot;hornc&quot;&gt;hornc&lt;/a&gt; can correct me if I&apos;m wrong, but I think it should be possible to basically follow the existing pattern with new test if we make sure there are enough test nodes allocated.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="52991" name="asp-orelic-console.tar.gz" size="9197916" author="defazio" created="Thu, 18 Jan 2024 00:32:41 +0000"/>
                            <attachment id="53311" name="mutt-garter-2.14-no-bug.tar.gz" size="743663" author="defazio" created="Fri, 26 Jan 2024 21:33:42 +0000"/>
                            <attachment id="53310" name="mutt-garter-2.15-revert-LU-17062-no-bug.tar.gz" size="793643" author="defazio" created="Fri, 26 Jan 2024 21:33:36 +0000"/>
                            <attachment id="53527" name="mutt-garter-debug-2fcd4d27f.tar.gz" size="835634" author="defazio" created="Fri, 2 Feb 2024 00:24:27 +0000"/>
                            <attachment id="53552" name="mutt-garter-debug-2routers.tar.gz" size="986538" author="defazio" created="Fri, 2 Feb 2024 22:01:23 +0000"/>
                            <attachment id="53551" name="mutt-garter-debug-garter3_no_routes.tar.gz" size="461991" author="defazio" created="Fri, 2 Feb 2024 22:01:07 +0000"/>
                            <attachment id="53299" name="mutt-garter-reproducer.tar.gz" size="1292368" author="defazio" created="Fri, 26 Jan 2024 00:53:30 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i047zz:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>