<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:46:14 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-11707] obdfilter-survey test 1c hangs with lctl blocked</title>
                <link>https://jira.whamcloud.com/browse/LU-11707</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;obdfilter-survey test_1c hangs. Looking at the logs at &lt;a href=&quot;https://testing.whamcloud.com/test_sets/77a3ca20-ef6b-11e8-86c0-52540065bddc&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/77a3ca20-ef6b-11e8-86c0-52540065bddc&lt;/a&gt;, the last output in the client test_log is:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== obdfilter-survey test 1c: Object Storage Targets survey, big batch ================================ 19:09:08 (1543000148)
CMD: trevis-54vm3 lctl dl | grep obdfilter
CMD: trevis-54vm3 /usr/sbin/lctl list_nids | grep tcp | cut -f 1 -d @
+ NETTYPE=tcp thrlo=32 nobjhi=1 thrhi=32 size=8192 case=disk rslt_loc=/tmp targets=&quot;10.9.6.102:lustre-OST0000 10.9.6.102:lustre-OST0001 10.9.6.102:lustre-OST0002 10.9.6.102:lustre-OST0003 10.9.6.102:lustre-OST0004 10.9.6.102:lustre-OST0005 10.9.6.102:lustre-OST0006 10.9.6.102:lustre-OST0007&quot; /usr/bin/obdfilter-survey
Fri Nov 23 19:09:22 UTC 2018 Obdfilter-survey for case=disk from trevis-54vm1.trevis.whamcloud.com
ost  8 sz 67108864K rsz 1024K obj    8 thr  256 write 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Looking at the OSS (vm3) console log, we see lctl blocked, and it looks like the problem (or at least a problem) started with test 1a:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[128050.794039] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == obdfilter-survey test 1a: Object Storage Targets survey =========================================== 18:04:49 \(1542996289\)
[128050.999311] Lustre: DEBUG MARKER: == obdfilter-survey test 1a: Object Storage Targets survey =========================================== 18:04:49 (1542996289)
[128051.201155] Lustre: DEBUG MARKER: lctl dl | grep obdfilter
[128051.547065] Lustre: DEBUG MARKER: /usr/sbin/lctl list_nids | grep tcp | cut -f 1 -d @
[131760.431733] INFO: task lctl:20810 blocked for more than 120 seconds.
[131760.433054] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[131760.434345] lctl            D ffff95bdb658bf40     0 20810  20695 0x00000080
[131760.435608] Call Trace:
[131760.436105]  [&amp;lt;ffffffffb8d18f39&amp;gt;] schedule+0x29/0x70
[131760.437209]  [&amp;lt;ffffffffc0c54577&amp;gt;] osd_do_bio.isra.25+0x717/0x8d0 [osd_ldiskfs]
[131760.438430]  [&amp;lt;ffffffffb86bef10&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[131760.439484]  [&amp;lt;ffffffffc0c54a17&amp;gt;] osd_read_prep+0x2e7/0x3f0 [osd_ldiskfs]
[131760.440721]  [&amp;lt;ffffffffc0d90979&amp;gt;] ofd_preprw+0x809/0x1170 [ofd]
[131760.441782]  [&amp;lt;ffffffffb879bf3e&amp;gt;] ? __get_free_pages+0xe/0x40
[131760.442759]  [&amp;lt;ffffffffb87f835e&amp;gt;] ? kmalloc_order_trace+0x2e/0xa0
[131760.443851]  [&amp;lt;ffffffffb87fbf01&amp;gt;] ? __kmalloc+0x211/0x230
[131760.444968]  [&amp;lt;ffffffffc0f7d17a&amp;gt;] echo_client_prep_commit.isra.49+0x33a/0xc30 [obdecho]
[131760.446338]  [&amp;lt;ffffffffc0f84ebf&amp;gt;] echo_client_iocontrol+0x95f/0x1be0 [obdecho]
[131760.447782]  [&amp;lt;ffffffffc07bd7a9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[131760.449040]  [&amp;lt;ffffffffc07a8609&amp;gt;] class_handle_ioctl+0x1929/0x1dc0 [obdclass]
[131760.450245]  [&amp;lt;ffffffffb8706640&amp;gt;] ? futex_wake+0x90/0x180
[131760.451231]  [&amp;lt;ffffffffb87caefd&amp;gt;] ? handle_mm_fault+0x39d/0x9b0
[131760.452288]  [&amp;lt;ffffffffb88d4bfe&amp;gt;] ? security_capable+0x1e/0x20
[131760.453313]  [&amp;lt;ffffffffc078d5d2&amp;gt;] obd_class_ioctl+0xd2/0x170 [obdclass]
[131760.454479]  [&amp;lt;ffffffffb8834100&amp;gt;] do_vfs_ioctl+0x360/0x550
[131760.455413]  [&amp;lt;ffffffffb8d2056c&amp;gt;] ? __do_page_fault+0x1bc/0x4f0
[131760.456411]  [&amp;lt;ffffffffb8834391&amp;gt;] SyS_ioctl+0xa1/0xc0
[131760.457266]  [&amp;lt;ffffffffb8d256d5&amp;gt;] ? system_call_after_swapgs+0xa2/0x146
[131760.458374]  [&amp;lt;ffffffffb8d2579b&amp;gt;] system_call_fastpath+0x22/0x27
[131760.459380]  [&amp;lt;ffffffffb8d256e1&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[131760.460481] INFO: task lctl:20813 blocked for more than 120 seconds.
[131760.461538] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[131760.462838] lctl            D ffff95bdb658eeb0     0 20813  20695 0x00000080
[131760.464077] Call Trace:
[131760.464576]  [&amp;lt;ffffffffb8d18f39&amp;gt;] schedule+0x29/0x70
[131760.465481]  [&amp;lt;ffffffffc0c54577&amp;gt;] osd_do_bio.isra.25+0x717/0x8d0 [osd_ldiskfs]
[131760.466745]  [&amp;lt;ffffffffb86bef10&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[131760.467729]  [&amp;lt;ffffffffc0c54a17&amp;gt;] osd_read_prep+0x2e7/0x3f0 [osd_ldiskfs]
[131760.468876]  [&amp;lt;ffffffffc0d90979&amp;gt;] ofd_preprw+0x809/0x1170 [ofd]
[131760.469860]  [&amp;lt;ffffffffb879bf3e&amp;gt;] ? __get_free_pages+0xe/0x40
[131760.470831]  [&amp;lt;ffffffffb87f835e&amp;gt;] ? kmalloc_order_trace+0x2e/0xa0
[131760.472017]  [&amp;lt;ffffffffb87fbf01&amp;gt;] ? __kmalloc+0x211/0x230
[131760.473021]  [&amp;lt;ffffffffc0f7d17a&amp;gt;] echo_client_prep_commit.isra.49+0x33a/0xc30 [obdecho]
[131760.474345]  [&amp;lt;ffffffffc0f84ebf&amp;gt;] echo_client_iocontrol+0x95f/0x1be0 [obdecho]
[131760.475556]  [&amp;lt;ffffffffc07bd7a9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[131760.476755]  [&amp;lt;ffffffffc07a8609&amp;gt;] class_handle_ioctl+0x1929/0x1dc0 [obdclass]
[131760.477947]  [&amp;lt;ffffffffb87caefd&amp;gt;] ? handle_mm_fault+0x39d/0x9b0
[131760.478941]  [&amp;lt;ffffffffb88d4bfe&amp;gt;] ? security_capable+0x1e/0x20
[131760.479986]  [&amp;lt;ffffffffc078d5d2&amp;gt;] obd_class_ioctl+0xd2/0x170 [obdclass]
[131760.481127]  [&amp;lt;ffffffffb8834100&amp;gt;] do_vfs_ioctl+0x360/0x550
[131760.482055]  [&amp;lt;ffffffffb8834391&amp;gt;] SyS_ioctl+0xa1/0xc0
[131760.482916]  [&amp;lt;ffffffffb8d256d5&amp;gt;] ? system_call_after_swapgs+0xa2/0x146
[131760.484012]  [&amp;lt;ffffffffb8d2579b&amp;gt;] system_call_fastpath+0x22/0x27
[131760.485016]  [&amp;lt;ffffffffb8d256e1&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[131760.486125] INFO: task lctl:20815 blocked for more than 120 seconds.
[131760.487186] &quot;echo 0 &amp;gt; /proc/sys/kernel/hung_task_timeout_secs&quot; disables this message.
[131760.488468] lctl            D ffff95bd48ab8fd0     0 20815  20695 0x00000080
[131760.489714] Call Trace:
[131760.490151]  [&amp;lt;ffffffffb8d18f39&amp;gt;] schedule+0x29/0x70
[131760.490993]  [&amp;lt;ffffffffc0c54577&amp;gt;] osd_do_bio.isra.25+0x717/0x8d0 [osd_ldiskfs]
[131760.492185]  [&amp;lt;ffffffffb86bef10&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[131760.493174]  [&amp;lt;ffffffffc0c54a17&amp;gt;] osd_read_prep+0x2e7/0x3f0 [osd_ldiskfs]
[131760.494309]  [&amp;lt;ffffffffc0d90979&amp;gt;] ofd_preprw+0x809/0x1170 [ofd]
[131760.495314]  [&amp;lt;ffffffffb879bf3e&amp;gt;] ? __get_free_pages+0xe/0x40
[131760.496277]  [&amp;lt;ffffffffb87f835e&amp;gt;] ? kmalloc_order_trace+0x2e/0xa0
[131760.497298]  [&amp;lt;ffffffffb87fbf01&amp;gt;] ? __kmalloc+0x211/0x230
[131760.498209]  [&amp;lt;ffffffffc0f7d17a&amp;gt;] echo_client_prep_commit.isra.49+0x33a/0xc30 [obdecho]
[131760.499522]  [&amp;lt;ffffffffc0f84ebf&amp;gt;] echo_client_iocontrol+0x95f/0x1be0 [obdecho]
[131760.500725]  [&amp;lt;ffffffffc07bd7a9&amp;gt;] ? lprocfs_counter_add+0xf9/0x160 [obdclass]
[131760.501935]  [&amp;lt;ffffffffc07a8609&amp;gt;] class_handle_ioctl+0x1929/0x1dc0 [obdclass]
[131760.503119]  [&amp;lt;ffffffffb88d4bfe&amp;gt;] ? security_capable+0x1e/0x20
[131760.504107]  [&amp;lt;ffffffffc078d5d2&amp;gt;] obd_class_ioctl+0xd2/0x170 [obdclass]
[131760.505205]  [&amp;lt;ffffffffb8834100&amp;gt;] do_vfs_ioctl+0x360/0x550
[131760.506134]  [&amp;lt;ffffffffb8d2056c&amp;gt;] ? __do_page_fault+0x1bc/0x4f0
[131760.507129]  [&amp;lt;ffffffffb8834391&amp;gt;] SyS_ioctl+0xa1/0xc0
[131760.507990]  [&amp;lt;ffffffffb8d256d5&amp;gt;] ? system_call_after_swapgs+0xa2/0x146
[131760.509101]  [&amp;lt;ffffffffb8d2579b&amp;gt;] system_call_fastpath+0x22/0x27
[131760.510154]  [&amp;lt;ffffffffb8d256e1&amp;gt;] ? system_call_after_swapgs+0xae/0x146
[131906.715976] Lustre: DEBUG MARKER: lctl set_param -n fail_loc=0 	    fail_val=0 2&amp;gt;/dev/null
[131907.490234] Lustre: DEBUG MARKER: rc=0;
[131907.490234] 			val=$(/usr/sbin/lctl get_param -n catastrophe 2&amp;gt;&amp;amp;1);
[131907.490234] 			if [[ $? -eq 0 &amp;amp;&amp;amp; $val -ne 0 ]]; then
[131907.490234] 				echo $(hostname -s): $val;
[131907.490234] 				rc=$val;
[131907.490234] 			fi;
[131907.490234] 			exit $rc
[131908.529978] Lustre: DEBUG MARKER: dmesg
[131909.803409] Lustre: DEBUG MARKER: /usr/sbin/lctl mark  SKIP: obdfilter-survey test_1b skipping ALWAYS excluded test 1b
[131910.013295] Lustre: DEBUG MARKER: SKIP: obdfilter-survey test_1b skipping ALWAYS excluded test 1b
[131910.263609] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == obdfilter-survey test 1c: Object Storage Targets survey, big batch ================================ 19:09:08 \(1543000148\)
[131910.479290] Lustre: DEBUG MARKER: == obdfilter-survey test 1c: Object Storage Targets survey, big batch ================================ 19:09:08 (1543000148)
[131910.678546] Lustre: DEBUG MARKER: lctl dl | grep obdfilter
[131911.110787] Lustre: DEBUG MARKER: /usr/sbin/lctl list_nids | grep tcp | cut -f 1 -d @
[132377.393312] SysRq : Changing Loglevel
[132377.394371] Loglevel set to 8
[132605.646160] SysRq : Show State
[132605.647102]   task                        PC stack   pid father
[132605.648386] systemd         S ffff95bdbc140000     0     1      0 0x00000000
[132605.650160] Call Trace:
[132605.650768]  [&amp;lt;ffffffffb8d18f39&amp;gt;] schedule+0x29/0x70
[132605.651958]  [&amp;lt;ffffffffb8d17fdd&amp;gt;] schedule_hrtimeout_range_clock+0x12d/0x150
[132605.653477]  [&amp;lt;ffffffffb886bdc9&amp;gt;] ? ep_scan_ready_list.isra.7+0x1b9/0x1f0
[132605.655070]  [&amp;lt;ffffffffb8d18013&amp;gt;] schedule_hrtimeout_range+0x13/0x20
[132605.656442]  [&amp;lt;ffffffffb886c05e&amp;gt;] ep_poll+0x23e/0x360
[132605.657646]  [&amp;lt;ffffffffb86d2010&amp;gt;] ? wake_up_state+0x20/0x20
[132605.658875]  [&amp;lt;ffffffffb886d50d&amp;gt;] SyS_epoll_wait+0xed/0x120
[132605.660172]  [&amp;lt;ffffffffb8d256d5&amp;gt;] ? system_call_after_swapgs+0xa2/0x146
[132605.661598]  [&amp;lt;ffffffffb8d2579b&amp;gt;] system_call_fastpath+0x22/0x27
[132605.662973]  [&amp;lt;ffffffffb8d256e1&amp;gt;] ? system_call_after_swapgs+0xae/0x146
&#8230;
[132606.070217] jbd2/vda1-8     D ffff95bd7612af70     0   262      2 0x00000000
[132606.071560] Call Trace:
[132606.072001]  [&amp;lt;ffffffffb8d16ec0&amp;gt;] ? bit_wait+0x50/0x50
[132606.072969]  [&amp;lt;ffffffffb8d18f39&amp;gt;] schedule+0x29/0x70
[132606.073822]  [&amp;lt;ffffffffb8d168a9&amp;gt;] schedule_timeout+0x239/0x2c0
[132606.074887]  [&amp;lt;ffffffffb866a14e&amp;gt;] ? kvm_clock_get_cycles+0x1e/0x20
[132606.075920]  [&amp;lt;ffffffffb8d16ec0&amp;gt;] ? bit_wait+0x50/0x50
[132606.076878]  [&amp;lt;ffffffffb8d1844d&amp;gt;] io_schedule_timeout+0xad/0x130
[132606.077880]  [&amp;lt;ffffffffb8d184e8&amp;gt;] io_schedule+0x18/0x20
[132606.078903]  [&amp;lt;ffffffffb8d16ed1&amp;gt;] bit_wait_io+0x11/0x50
[132606.079799]  [&amp;lt;ffffffffb8d169f7&amp;gt;] __wait_on_bit+0x67/0x90
[132606.080788]  [&amp;lt;ffffffffb8d16ec0&amp;gt;] ? bit_wait+0x50/0x50
[132606.081665]  [&amp;lt;ffffffffb8d16b61&amp;gt;] out_of_line_wait_on_bit+0x81/0xb0
[132606.082798]  [&amp;lt;ffffffffb86befd0&amp;gt;] ? wake_bit_function+0x40/0x40
[132606.083808]  [&amp;lt;ffffffffb885621a&amp;gt;] __wait_on_buffer+0x2a/0x30
[132606.357324]  [&amp;lt;ffffffffc00cd7f1&amp;gt;] jbd2_journal_commit_transaction+0x1781/0x19b0 [jbd2]
[132606.359015]  [&amp;lt;ffffffffb86cc7f0&amp;gt;] ? finish_task_switch+0x50/0x170
[132606.368501]  [&amp;lt;ffffffffc00d2ab9&amp;gt;] kjournald2+0xc9/0x260 [jbd2]
[132606.369630]  [&amp;lt;ffffffffb86bef10&amp;gt;] ? wake_up_atomic_t+0x30/0x30
[132606.370614]  [&amp;lt;ffffffffc00d29f0&amp;gt;] ? commit_timeout+0x10/0x10 [jbd2]
[132606.371738]  [&amp;lt;ffffffffb86bdf21&amp;gt;] kthread+0xd1/0xe0
[132606.372563]  [&amp;lt;ffffffffb86bde50&amp;gt;] ? insert_kthread_work+0x40/0x40
[132606.373577]  [&amp;lt;ffffffffb8d255f7&amp;gt;] ret_from_fork_nospec_begin+0x21/0x21
[132606.374762]  [&amp;lt;ffffffffb86bde50&amp;gt;] ? insert_kthread_work+0x40/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We have seen a similar OSS call trace where lctl is blocked in &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10872&quot; title=&quot;obdfilter-survey test 1a hangs in lctl&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10872&quot;&gt;LU-10872&lt;/a&gt;, but that ticket is for servers running ZFS.&lt;/p&gt;

&lt;p&gt;I can&#8217;t find any other examples of this failure for the past three months.&lt;/p&gt;</description>
                <environment></environment>
        <key id="54128">LU-11707</key>
            <summary>obdfilter-survey test 1c hangs with lctl blocked</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="jamesanunez">James Nunez</reporter>
                        <labels>
                    </labels>
                <created>Tue, 27 Nov 2018 17:21:50 +0000</created>
                <updated>Tue, 5 Nov 2019 22:22:17 +0000</updated>
                                            <version>Lustre 2.10.6</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>2</watches>
                                                                            <comments>
                            <comment id="257788" author="jamesanunez" created="Tue, 5 Nov 2019 22:15:20 +0000"  >&lt;p&gt;EDIT: Comment moved to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10872&quot; title=&quot;obdfilter-survey test 1a hangs in lctl&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10872&quot;&gt;LU-10872&lt;/a&gt;. &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="51627">LU-10872</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i0070f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>