<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:20:00 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
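For instance, a complete request for this issue might look like the following URL (an illustrative example; the exact issue-view path depends on the JIRA instance's configuration):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-15633/LU-15633.xml?field=key&field=summary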
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
        <title>[LU-15633] OSS hit txg_sync hung and then OOM during recovery</title>
        <link>https://jira.whamcloud.com/browse/LU-15633</link>
        <project id="10000" key="LU">Lustre</project>
        <description>&lt;p&gt;After switching soak to master-ib with EL7.9 and MOFED 4.9, the OSS crashed consistently.&lt;/p&gt;

&lt;p&gt;During recovery after an OSS failover, the paired OSS hit the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2022-03-07 23:42:50,054:fsmgmt.fsmgmt:INFO     ... soaked-OST0014 mounted successfully on soak-7
2022-03-07 23:42:50,055:fsmgmt.fsmgmt:INFO     ... soaked-OST0014 failed over
2022-03-07 23:42:50,055:fsmgmt.fsmgmt:INFO     Wait for recovery to complete
2022-03-07 23:42:50,949:fsmgmt.fsmgmt:DEBUG    Recovery Result Record: {&apos;soak-7&apos;: {&apos;soaked-OST000b&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST000a&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST000f&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST000e&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST0002&apos;: &apos;COMPLETE&apos;, &apos;soaked-OST0003&apos;: &apos;INACTIVE&apos;, &apos;soaked-OST0006&apos;: &apos;RECOVERING&apos;, &apos;soaked-OST0007&apos;: &apos;INACTIVE&apos;}}
2022-03-07 23:42:50,950:fsmgmt.fsmgmt:INFO     Node soak-7: &apos;soaked-OST0002&apos; recovery completed
2022-03-07 23:42:50,950:fsmgmt.fsmgmt:INFO     soaked-OST000a in status &apos;RECOVERING&apos;.
2022-03-07 23:42:50,950:fsmgmt.fsmgmt:INFO     soaked-OST0006 in status &apos;RECOVERING&apos;.
2022-03-07 23:42:50,951:fsmgmt.fsmgmt:INFO     soaked-OST000e in status &apos;RECOVERING&apos;.
2022-03-07 23:42:50,951:fsmgmt.fsmgmt:INFO     Next recovery check in 15s...
...
2022-03-07 23:43:03,746:sched.sched :DEBUG    JobID: 1074530 type: kcompile #tasks: 1 timelimit: 05:00:00
cmd: kcompile.sh /mnt/soaked/soaktest/test
2022-03-07 23:44:38,215:fsmgmt.fsmgmt:DEBUG    Recovery Result Record: {&apos;soak-7&apos;: &apos;error: 255&apos;}
2022-03-07 23:44:38,215:fsmgmt.fsmgmt:ERROR    Node &apos;soak-7&apos;, no recovery status information available for soaked-OST000a.
2022-03-07 23:44:38,215:fsmgmt.fsmgmt:ERROR    Node &apos;soak-7&apos;, no recovery status information available for soaked-OST0006.
2022-03-07 23:44:38,216:fsmgmt.fsmgmt:ERROR    Node &apos;soak-7&apos;, no recovery status information available for soaked-OST000e.
2022-03-07 23:44:38,216:fsmgmt.fsmgmt:INFO     Next recovery check in 15s...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt; 

&lt;p&gt;On soak-7, the console shows:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[19313.048562] Lustre: soaked-OST000e: deleting orphan objects from 0x680000400:3566 to 0x680000400:5057
[19440.127775] INFO: task txg_sync:15186 blocked for more than 120 seconds.
[19440.135262] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[19440.144004] txg_sync        D ffff8c21ebfb9080     0 15186      2 0x00000080
[19440.151895] Call Trace:
[19440.154633]  [&amp;lt;ffffffffb7d89179&amp;gt;] schedule+0x29/0x70
[19440.160179]  [&amp;lt;ffffffffb7d86d88&amp;gt;] schedule_timeout+0x168/0x2d0
[19440.166693]  [&amp;lt;ffffffffb76ad8b0&amp;gt;] ? requeue_timers+0x170/0x170
[19440.173208]  [&amp;lt;ffffffffb7706992&amp;gt;] ? ktime_get_ts64+0x52/0xf0
[19440.179526]  [&amp;lt;ffffffffb7d88a2d&amp;gt;] io_schedule_timeout+0xad/0x130
[19440.186244]  [&amp;lt;ffffffffc069c5cd&amp;gt;] __cv_timedwait_common+0x13d/0x190 [spl]
[19440.193829]  [&amp;lt;ffffffffb76c6f50&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[19440.200346]  [&amp;lt;ffffffffc069c659&amp;gt;] __cv_timedwait_io+0x19/0x20 [spl]
[19440.207391]  [&amp;lt;ffffffffc1876eb3&amp;gt;] zio_wait+0x143/0x290 [zfs]
[19440.213746]  [&amp;lt;ffffffffc17cf30f&amp;gt;] dsl_pool_sync+0xdf/0x500 [zfs]
[19440.220484]  [&amp;lt;ffffffffc1811e30&amp;gt;] ? spa_suspend_async_destroy+0x60/0x60 [zfs]
[19440.228481]  [&amp;lt;ffffffffc17fdab3&amp;gt;] spa_sync+0x5c3/0x10e0 [zfs]
[19440.234928]  [&amp;lt;ffffffffc181591b&amp;gt;] ? spa_txg_history_init_io+0x10b/0x120 [zfs]
[19440.242936]  [&amp;lt;ffffffffc181a00f&amp;gt;] txg_sync_thread+0x2cf/0x4d0 [zfs]
[19440.249964]  [&amp;lt;ffffffffc1819d40&amp;gt;] ? txg_fini+0x280/0x280 [zfs]
[19440.256485]  [&amp;lt;ffffffffc06a3873&amp;gt;] thread_generic_wrapper+0x73/0x80 [spl]
[19440.263969]  [&amp;lt;ffffffffc06a3800&amp;gt;] ? __thread_exit+0x20/0x20 [spl]
[19440.270774]  [&amp;lt;ffffffffb76c5e61&amp;gt;] kthread+0xd1/0xe0
[19440.276221]  [&amp;lt;ffffffffb76c5d90&amp;gt;] ? insert_kthread_work+0x40/0x40
[19440.283028]  [&amp;lt;ffffffffb7d95df7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[19440.290317]  [&amp;lt;ffffffffb76c5d90&amp;gt;] ? insert_kthread_work+0x40/0x40
[19469.327166] Lustre: soaked-OST0002: Client dd15d031-530f-484f-875a-a585acc53921 (at 192.168.1.125@o2ib) reconnecting
[19469.848067] Lustre: soaked-OST0003: Client bd579c0e-c6d9-4026-8251-a17bb20e4467 (at 192.168.1.119@o2ib) reconnecting
[19469.859817] Lustre: Skipped 5 previous similar messages
[19524.058650] Lustre: ll_ost_io01_037: service thread pid 23417 was inactive for 200.092 seconds. Watchdog stack traces are limited to 3 per 300 seconds, skipping this one.
[19524.058652] Lustre: ll_ost_io00_087: service thread pid 23495 was inactive for 200.087 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[19524.058654] Lustre: ll_ost_io00_068: service thread pid 23475 was inactive for 200.015 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[19524.058655] Lustre: ll_ost_io00_037: service thread pid 23405 was inactive for 200.090 seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:
[19524.058658] Pid: 23495, comm: ll_ost_io00_087 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Mon Dec 13 22:00:59 UTC 2021
[19524.058660] Call Trace:
[19524.058685] [&amp;lt;0&amp;gt;] cv_wait_common+0x125/0x150 [spl]
[19524.058688] [&amp;lt;0&amp;gt;] __cv_wait+0x15/0x20 [spl]
[19524.058729] [&amp;lt;0&amp;gt;] dbuf_read+0x286/0x570 [zfs]
[19524.058748] [&amp;lt;0&amp;gt;] dbuf_hold_impl+0x4fd/0x5d0 [zfs]
[19524.058767] [&amp;lt;0&amp;gt;] dbuf_hold_impl+0x4e2/0x5d0 [zfs]
[19524.058785] [&amp;lt;0&amp;gt;] dbuf_hold_level+0x35/0x60 [zfs]
[19524.058803] [&amp;lt;0&amp;gt;] dbuf_hold+0x16/0x20 [zfs]
[19524.058823] [&amp;lt;0&amp;gt;] dmu_buf_hold_array_by_dnode+0xee/0x4e0 [zfs]
[19524.058843] [&amp;lt;0&amp;gt;] dmu_buf_hold_array_by_bonus+0x69/0x90 [zfs]
[19524.058852] [&amp;lt;0&amp;gt;] osd_bufs_get+0x7c0/0xfb0 [osd_zfs]
[19524.058864] [&amp;lt;0&amp;gt;] ofd_preprw+0x801/0x18d0 [ofd]
[19524.058926] [&amp;lt;0&amp;gt;] tgt_brw_read+0x9d4/0x2040 [ptlrpc]
[19524.058956] [&amp;lt;0&amp;gt;] tgt_request_handle+0x92f/0x19c0 [ptlrpc]
[19524.058984] [&amp;lt;0&amp;gt;] ptlrpc_server_handle_request+0x253/0xc30 [ptlrpc]
[19524.059010] [&amp;lt;0&amp;gt;] ptlrpc_main+0xbf4/0x15e0 [ptlrpc]
[19524.059015] [&amp;lt;0&amp;gt;] kthread+0xd1/0xe0
[19524.059018] [&amp;lt;0&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[19524.059033] [&amp;lt;0&amp;gt;] 0xfffffffffffffffe
[19524.059034] Pid: 23475, comm: ll_ost_io00_068 3.10.0-1160.49.1.el7_lustre.x86_64 #1 SMP Mon Dec 13 22:00:59 UTC 2021
[19524.059034] Call Trace:
...

[19581.563091] Lustre: Skipped 2 previous similar messages
[19581.869624] Lustre: soaked-OST000b: Bulk IO read error with b5d7b2be-72fd-4f21-9635-1a800193ea85 (at 192.168.1.121@o2ib), client will retry: rc -107
[19583.779078] obd_memory max: 533504839, obd_memory current: 532366911
[19583.786625] kthreadd invoked oom-killer: gfp_mask=0x3000d0, order=2, oom_score_adj=0
[19583.795274] kthreadd cpuset=/ mems_allowed=0-1
[19583.800235] CPU: 14 PID: 2 Comm: kthreadd Kdump: loaded Tainted: P           OE  ------------   3.10.0-1160.49.1.el7_lustre.x86_64 #1
[19583.813622] Hardware name: Intel Corporation S2600GZ ........../S2600GZ, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
[19583.826137] Call Trace:
[19583.828872]  [&amp;lt;ffffffffb7d83539&amp;gt;] dump_stack+0x19/0x1b
[19583.834609]  [&amp;lt;ffffffffb7d7e5d8&amp;gt;] dump_header+0x90/0x229
[19583.840540]  [&amp;lt;ffffffffb7d7d3d8&amp;gt;] ? printk+0x60/0x77
[19583.846085]  [&amp;lt;ffffffffb77c254d&amp;gt;] oom_kill_process+0x2cd/0x490
[19583.852596]  [&amp;lt;ffffffffb77c1f3d&amp;gt;] ? oom_unkillable_task+0xcd/0x120
[19583.859498]  [&amp;lt;ffffffffb77c2c3a&amp;gt;] out_of_memory+0x31a/0x500
[19583.865720]  [&amp;lt;ffffffffb77c9854&amp;gt;] __alloc_pages_nodemask+0xad4/0xbe0
[19583.872817]  [&amp;lt;ffffffffb76c5d90&amp;gt;] ? insert_kthread_work+0x40/0x40
[19583.879624]  [&amp;lt;ffffffffb769882d&amp;gt;] copy_process+0x1dd/0x1a80
[19583.885846]  [&amp;lt;ffffffffb769a281&amp;gt;] do_fork+0x91/0x330
[19583.891390]  [&amp;lt;ffffffffb7d88df0&amp;gt;] ? __schedule+0x320/0x680
[19583.897516]  [&amp;lt;ffffffffb769a546&amp;gt;] kernel_thread+0x26/0x30
[19583.903548]  [&amp;lt;ffffffffb76c6841&amp;gt;] kthreadd+0x2e1/0x320
[19583.909286]  [&amp;lt;ffffffffb76c6560&amp;gt;] ? kthread_create_on_cpu+0x60/0x60
[19583.916287]  [&amp;lt;ffffffffb7d95df7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[19583.923575]  [&amp;lt;ffffffffb76c6560&amp;gt;] ? kthread_create_on_cpu+0x60/0x60
[19583.930571] Mem-Info:
...
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
        <environment>soak is running lustre-master-ib build #647 tag-2.15.0_RC2_2_g94f4e1f</environment>
        <key id="69018">LU-15633</key>
        <summary>OSS hit txg_sync hung and then OOM during recovery</summary>
        <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
        <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
        <statusCategory id="2" key="new" colorName="default"/>
        <resolution id="-1">Unresolved</resolution>
        <assignee username="wc-triage">WC Triage</assignee>
        <reporter username="sarah">Sarah Liu</reporter>
        <labels>
            <label>soak</label>
        </labels>
        <created>Tue, 8 Mar 2022 21:02:40 +0000</created>
        <updated>Tue, 8 Mar 2022 21:48:20 +0000</updated>
        <version>Lustre 2.15.0</version>
        <due></due>
        <votes>0</votes>
        <watches>2</watches>
        <attachments>
            <attachment id="42636" name="soak-7.log" size="1010381" author="sarah" created="Tue, 8 Mar 2022 21:05:42 +0000"/>
        </attachments>
        <subtasks>
        </subtasks>
        <customfields>
            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                <customfieldname>Development</customfieldname>
                <customfieldvalues>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                <customfieldname>Rank</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>1|i02kav:</customfieldvalue>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                <customfieldname>Rank (Obsolete)</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>9223372036854775807</customfieldvalue>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                <customfieldname>Severity</customfieldname>
                <customfieldvalues>
                    <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                </customfieldvalues>
            </customfield>
        </customfields>
    </item>
</channel>
</rss>