<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:21:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
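For instance, a URL of the following form (shown only as an illustration; the exact issue-XML export path may vary by JIRA instance) would return just those two fields:
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-15763/LU-15763.xml?field=key&field=summary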
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-15763] large-scale test_3a: LustreError: (file.c:243:ll_close_inode_openhandle())  mdc close failed: rc = -108</title>
                <link>https://jira.whamcloud.com/browse/LU-15763</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Cliff White &amp;lt;cwhite@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.whamcloud.com/test_sets/cde24d83-d48d-4472-a697-ac0b030ae772&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/cde24d83-d48d-4472-a697-ac0b030ae772&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Test run timed out after nfsv4 scale, multiple stack dumps on various nodes, example: &lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
[11156.489772] Lustre: DEBUG MARKER: ./compilebench -D /mnt/lustre/d0.parallel-scale-nfs/d0.compilebench.513626 -i 2 -r 2 --makej
[16428.917978] compilebench (518595): drop_caches: 3
[16733.594234] compilebench (518595): drop_caches: 3
[18407.728842] sysrq: SysRq : Changing Loglevel
[18407.737126] sysrq: Loglevel set to 8
[18407.740113] sysrq: SysRq : Show State
[18407.740908] task:systemd         state:S stack:    0 pid:    1 ppid:     0 flags:0x00000000
[18407.742495] Call Trace:
[18407.743081]  __schedule+0x2c4/0x700
[18407.743825]  schedule+0x37/0xa0
[18407.744493]  schedule_hrtimeout_range_clock+0xf7/0x110
[18407.745550]  ? seq_release+0x24/0x30
[18407.746279]  ep_poll+0x403/0x460
[18407.747000]  ? wake_up_q+0x80/0x80
[18407.747713]  do_epoll_wait+0xb0/0xd0
[18407.748453]  __x64_sys_epoll_wait+0x1a/0x20
[18407.749299]  do_syscall_64+0x5b/0x1a0
[18407.750070]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[18407.751089] RIP: 0033:0x7f87afd9c0f7
[18407.751837] Code: Unable to access opcode bytes at RIP 0x7f87afd9c0cd.
[18407.753097] RSP: 002b:00007ffe859313f0 EFLAGS: 00000293 ORIG_RAX: 00000000000000e8
[18407.754551] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f87afd9c0f7
[18407.755919] RDX: 000000000000006a RSI: 00007ffe85931430 RDI: 0000000000000004
[18407.757299] RBP: 00007ffe85931430 R08: 0000000000000000 R09: 00007ffe85931a80
[18407.758669] R10: 00000000ffffffff R11: 0000000000000293 R12: 000000000000006a
[18407.760038] R13: 00000000ffffffff R14: 00007ffe85931430 R15: 0000000000000001
[18407.761415] task:kthreadd        state:S stack:    0 pid:    2 ppid:     0 flags:0x80004000
[18407.762995] Call Trace:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="69810">LU-15763</key>
            <summary>large-scale test_3a: LustreError: (file.c:243:ll_close_inode_openhandle())  mdc close failed: rc = -108</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="Deiter">Alex Deiter</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                            <label>failing_tests</label>
                    </labels>
                <created>Tue, 19 Apr 2022 18:33:16 +0000</created>
                <updated>Fri, 3 Nov 2023 19:18:34 +0000</updated>
                                            <version>Lustre 2.12.9</version>
                    <version>Lustre 2.15.0</version>
                                    <fixVersion>Lustre 2.16.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                    <comments>
                            <comment id="337748" author="sarah" created="Tue, 14 Jun 2022 18:31:51 +0000"  >&lt;p&gt;Similar error in 2.12.9 with el8.6 client large-scale test_3a and performance-sanity test_6&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/458d77ec-e890-4bd0-9b4a-b9efd3f31591&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/458d77ec-e890-4bd0-9b4a-b9efd3f31591&lt;/a&gt;&lt;br/&gt;
&lt;a href=&quot;https://testing.whamcloud.com/test_sets/33e08404-0251-46ee-910e-c88f3f8fe0db&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/test_sets/33e08404-0251-46ee-910e-c88f3f8fe0db&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;client 2 crash&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[14214.730002] Lustre: DEBUG MARKER: == rpc test complete, duration -o sec ================================================================ 03:44:34 (1655178274)
[14215.236907] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-46vm4.onyx.whamcloud.com: executing _wait_recovery_complete *.lustre-MDT0000.recovery_status 200
[14215.600050] Lustre: DEBUG MARKER: onyx-46vm4.onyx.whamcloud.com: executing _wait_recovery_complete *.lustre-MDT0000.recovery_status 200
[14414.697205] Lustre: DEBUG MARKER: /usr/sbin/lctl mark 3 : Starting failover on mds1
[14415.090008] Lustre: DEBUG MARKER: 3 : Starting failover on mds1
[14416.480451] LustreError: 11-0: lustre-MDT0000-mdc-ffff8bb04541b000: operation ldlm_enqueue to node 10.240.24.11@tcp failed: rc = -19
[14416.483207] LustreError: Skipped 2 previous similar messages
[14416.483304] Lustre: lustre-MDT0000-mdc-ffff8bb04541b000: Connection to lustre-MDT0000 (at 10.240.24.11@tcp) was lost; in progress operations using this service will wait for recovery to complete
[14416.487495] Lustre: Skipped 2 previous similar messages
[14447.015526] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == rpc test complete, duration -o sec ================================================================ 03:48:27 \(1655178507\)
[14447.375691] Lustre: DEBUG MARKER: == rpc test complete, duration -o sec ================================================================ 03:48:27 (1655178507)
[14447.951613] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-46vm4.onyx.whamcloud.com: executing set_default_debug -1 all 4
[14448.364838] Lustre: DEBUG MARKER: onyx-46vm4.onyx.whamcloud.com: executing set_default_debug -1 all 4
[14455.474927] Lustre: DEBUG MARKER: /usr/sbin/lctl mark == rpc test complete, duration -o sec ================================================================ 03:48:35 \(1655178515\)
[14456.067224] Lustre: DEBUG MARKER: == rpc test complete, duration -o sec ================================================================ 03:48:35 (1655178515)
[14456.877997] Lustre: DEBUG MARKER: /usr/sbin/lctl mark onyx-46vm4.onyx.whamcloud.com: executing _wait_recovery_complete *.lustre-MDT0000.recovery_status 200
[14457.333427] Lustre: DEBUG MARKER: onyx-46vm4.onyx.whamcloud.com: executing _wait_recovery_complete *.lustre-MDT0000.recovery_status 200
[20405.164682] sysrq: SysRq : Changing Loglevel
[20405.167754] sysrq: Loglevel set to 8
[20405.171169] sysrq: SysRq : Show State
[20405.171956] task:systemd         state:S stack:    0 pid:    1 ppid:     0 flags:0x00000000
[20405.173513] Call Trace:
[20405.174071]  __schedule+0x2d1/0x830
[20405.174837]  schedule+0x35/0xa0
[20405.175479]  schedule_hrtimeout_range_clock+0xf1/0x100
[20405.176494]  ? ep_scan_ready_list.constprop.23+0x20c/0x230
[20405.177576]  ? __handle_mm_fault+0x7c4/0x7e0
[20405.178444]  ep_poll+0x3f3/0x450
[20405.179134]  ? wake_up_q+0x70/0x70
[20405.179845]  do_epoll_wait+0xb0/0xd0
[20405.180576]  __x64_sys_epoll_wait+0x1a/0x20
[20405.181414]  do_syscall_64+0x5b/0x1a0
[20405.182197]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[20405.183190] RIP: 0033:0x7f63a7011a27
[20405.183935] Code: Unable to access opcode bytes at RIP 0x7f63a70119fd.
[20405.185171] RSP: 002b:00007ffdb172ee70 EFLAGS: 00000293 ORIG_RAX: 00000000000000e8
[20405.186592] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f63a7011a27
[20405.187932] RDX: 0000000000000058 RSI: 00007ffdb172eeb0 RDI: 0000000000000004
[20405.189273] RBP: 00007ffdb172eeb0 R08: 0000000000000000 R09: 00007ffdb172eeb0
[20405.190616] R10: 00000000ffffffff R11: 0000000000000293 R12: 0000000000000058
[20405.191965] R13: 00000000ffffffff R14: ffffffffffffffff R15: 00007ffdb172f430
[20405.193340] task:kthreadd        state:S stack:    0 pid:    2 ppid:     0 flags:0x80004000
[20405.194898] Call Trace:
[20405.195414]  __schedule+0x2d1/0x830
[20405.196131]  ? __schedule+0x2f3/0x830
[20405.196884]  schedule+0x35/0xa0
[20405.197522]  kthreadd+0x2d7/0x2f0
[20405.198249]  ? kthread_is_per_cpu+0x20/0x20
[20405.199092]  ret_from_fork+0x35/0x40
[20405.199825] task:rcu_gp          state:I stack:    0 pid:    3 ppid:     2 flags:0x80004000
[20405.201384] Call Trace:
[20405.201922]  __schedule+0x2d1/0x830
[20405.202620]  schedule+0x35/0xa0
[20405.203294]  rescuer_thread+0x289/0x330
[20405.204102]  ? process_one_work+0x360/0x360
[20405.204956]  kthread+0x10a/0x120
[20405.205641]  ? set_kthread_struct+0x40/0x40
[20405.206472]  ret_from_fork+0x35/0x40
[20405.207206] task:rcu_par_gp      state:I stack:    0 pid:    4 ppid:     2 flags:0x80004000
[20405.208767] Call Trace:
[20405.209288]  __schedule+0x2d1/0x830
[20405.210007]  schedule+0x35/0xa0
[20405.210659]  rescuer_thread+0x289/0x330
[20405.211442]  ? process_one_work+0x360/0x360
[20405.212274]  kthread+0x10a/0x120
[20405.212951]  ? set_kthread_struct+0x40/0x40
[20405.213782]  ret_from_fork+0x35/0x40
[20405.214501] task:kworker/0:0H    state:I stack:    0 pid:    6 ppid:     2 flags:0x80004000
[20405.216058] Workqueue:  0x0 (events_highpri)
[20405.216909] Call Trace:
[20405.217428]  __schedule+0x2d1/0x830
[20405.218138]  schedule+0x35/0xa0
[20405.218787]  worker_thread+0xb7/0x390
[20405.219519]  ? create_worker+0x1a0/0x1a0
[20405.220295]  kthread+0x10a/0x120
[20405.220969]  ? set_kthread_struct+0x40/0x40
[20405.221804]  ret_from_fork+0x35/0x40
[20405.222528] task:mm_percpu_wq    state:I stack:    0 pid:    8 ppid:     2 flags:0x80004000
[20405.224077] Call Trace:
[20405.224590]  __schedule+0x2d1/0x830
[20405.225300]  schedule+0x35/0xa0
[20405.225961]  rescuer_thread+0x289/0x330
[20405.226758]  ? process_one_work+0x360/0x360
[20405.227575]  kthread+0x10a/0x120
[20405.228238]  ? set_kthread_struct+0x40/0x40
[20405.229078]  ret_from_fork+0x35/0x40
[20405.229799] task:rcu_tasks_rude_ state:S stack:    0 pid:    9 ppid:     2 flags:0x80004000
[20405.231352] Call Trace:
[20405.231888]  __schedule+0x2d1/0x830
[20405.232579]  ? __switch_to_asm+0x41/0x70
[20405.233358]  ? rcu_tasks_kthread+0x102/0x1a0
[20405.234248]  schedule+0x35/0xa0
[20405.234909]  rcu_tasks_kthread+0x170/0x1a0
[20405.235734]  ? finish_wait+0x80/0x80
[20405.236457]  ? rcu_tasks_wait_gp+0x1f0/0x1f0
[20405.237300]  kthread+0x10a/0x120
[20405.237976]  ? set_kthread_struct+0x40/0x40
[20405.238821]  ret_from_fork+0x35/0x40
[20405.239539] task:rcu_tasks_trace state:S stack:    0 pid:   10 ppid:     2 flags:0x80004000
[20405.241087] Call Trace:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="346955" author="adilger" created="Fri, 16 Sep 2022 19:26:16 +0000"  >&lt;p&gt;It looks like large-scale test_3a has &lt;b&gt;never&lt;/b&gt; passed with RHEL8.6 (55/55 timeouts in past 4 weeks) or el9 (9/9 timeouts in past 4 weeks), while it passed 100% (309/309) on CentOS8.5 and 37/43 runs on CentOS8.3, and 100% (156/156) on el7.9.&lt;/p&gt;
</comment>
                            <comment id="361639" author="mdiep" created="Sat, 4 Feb 2023 01:25:30 +0000"  >&lt;p&gt;this large-scale test_3a failure really needs to be looked at carefully, it only failed on rhel8.6, rhel9.0, but passed on rhel8.7, sles15sp3, ubuntu20.04 and ubuntu22.04&lt;/p&gt;</comment>
                            <comment id="367305" author="adilger" created="Fri, 24 Mar 2023 22:36:20 +0000"  >&lt;p&gt;It looks like &lt;tt&gt;mdsrate&lt;/tt&gt; is failing due to &lt;tt&gt;UCX ERROR&lt;/tt&gt; seen in other uses of mdsrate.  I would prefer that all of these tests be moved over to use mdtest to avoid what seems like a bug in mdsrate...&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://testing.whamcloud.com/sub_tests/2883fff3-fd86-4e82-bd7a-c6fb77995b88&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.whamcloud.com/sub_tests/2883fff3-fd86-4e82-bd7a-c6fb77995b88&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;trevis-67vm4: *.lustre-MDT0000.recovery_status status: COMPLETE
CMD: trevis-67vm4 lctl get_param -n *.lustre-MDT0000.recovery_status
RECOVERY TIME: NFILES=50000 number of clients: 2 recovery_duration: 11
Rate: 526.79 eff 526.82 aggr 87.80 avg client creates/sec (total: 6 threads 50000 creates 1 dirs 6 threads/dir 94.91 secs)
0: trevis-67vm1.trevis.whamcloud.com finished at Wed Mar 22 02:50:30 2023
+ chmod 0777 /mnt/lustre
drwxrwxrwx 4 root root 65536 Mar 22 02:48 /mnt/lustre
+ su mpiuser sh -c &quot;/usr/lib64/openmpi/bin/mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 -mca boot ssh --oversubscribe --oversubscribe -machinefile /tmp/auster.machines -np 3 /usr/lib64/openmpi/bin/mdsrate --unlink --nfiles 50000 --dir /mnt/lustre/d0.test_3a --filefmt f%%d --ignore &quot;
0: trevis-67vm1.trevis.whamcloud.com starting at Wed Mar 22 02:50:31 2023
[1679453431.525929] [trevis-67vm2:342581:0]          select.c:514  UCX  ERROR   no active messages transport to : posix/memory - Destination is unreachable, sysv/memory - Destination is unreachable, self/memory0 - Destination is unreachable, tcp/lo - Destination is unreachable, tcp/eth0 - Destination is unreachable
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="384112" author="adilger" created="Tue, 29 Aug 2023 19:35:11 +0000"  >&lt;p&gt;There was patch &lt;a href=&quot;https://review.whamcloud.com/51414&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/51414&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-14697&quot; title=&quot;change performance-sanity to use mdtest&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-14697&quot;&gt;&lt;del&gt;LU-14697&lt;/del&gt;&lt;/a&gt; change performance-sanity to use mdtest&lt;/tt&gt;&quot; landed to convert the testing over to use &lt;tt&gt;mdtest&lt;/tt&gt; instead of &lt;tt&gt;mdsrate&lt;/tt&gt;.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="64367">LU-14697</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="70049">LU-15795</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i02nfj:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>