<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:55:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5904] Interop 2.6.0&lt;-&gt;master sanity-scrub test_13: BUG: ldlm_lock_create() on OST</title>
                <link>https://jira.whamcloud.com/browse/LU-5904</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for sarah &amp;lt;sarah@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://testing.hpdd.intel.com/test_sets/a99dab2e-68f9-11e4-9444-5254006e85c2&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://testing.hpdd.intel.com/test_sets/a99dab2e-68f9-11e4-9444-5254006e85c2&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_13 failed with the following error:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;test failed to respond and timed out
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;15:56:02:Lustre: DEBUG MARKER: == sanity-scrub test 13: OI scrub can rebuild missed /O entries == 15:55:38 (1415404538)
15:56:02:Lustre: DEBUG MARKER: /usr/sbin/lctl set_param fail_loc=0x196
15:56:02:Lustre: *** cfs_fail_loc=196, val=0***
15:56:02:Lustre: Skipped 63 previous similar messages
15:56:02:Lustre: DEBUG MARKER: /usr/sbin/lctl set_param fail_loc=0
15:56:02:Lustre: DEBUG MARKER: grep -c /mnt/ost1&apos; &apos; /proc/mounts
15:56:03:Lustre: DEBUG MARKER: umount -d /mnt/ost1
15:56:03:Lustre: Failing over lustre-OST0000
15:56:03:Lustre: DEBUG MARKER: lsmod | grep lnet &amp;gt; /dev/null &amp;amp;&amp;amp; lctl dl | grep &apos; ST &apos;
15:56:03:Lustre: DEBUG MARKER: mkdir -p /mnt/ost1
15:56:03:Lustre: DEBUG MARKER: test -b /dev/lvm-Role_OSS/P1
15:56:03:Lustre: DEBUG MARKER: mkdir -p /mnt/ost1; mount -t lustre -o user_xattr,noscrub  		                   /dev/lvm-Role_OSS/P1 /mnt/ost1
15:56:03:LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. quota=on. Opts: 
15:56:04:LustreError: 137-5: lustre-OST0000_UUID: not available for connect from 10.2.4.225@tcp (no target). If you are running an HA pair check that the target is mounted on the other server.
15:56:04:Lustre: DEBUG MARKER: PATH=/usr/lib64/lustre/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:/opt/iozone/bin:/opt/iozone/bin:/usr/lib64/lustre/tests/mpi:/usr/lib64/lustre/tests/racer:/usr/lib64/lustre/../lustre-iokit/sgpdd-survey:/usr/lib64/lustre/tests:/usr/lib64/lustre/u
15:56:04:Lustre: DEBUG MARKER: e2label /dev/lvm-Role_OSS/P1 2&amp;gt;/dev/null
15:56:04:Lustre: lustre-OST0000: Will be in recovery for at least 1:00, or until 2 clients reconnect
15:56:04:Lustre: lustre-OST0000: Denying connection for new client 02b0a6e6-f977-0e0b-772a-b2c3b66ac87b (at 10.2.4.220@tcp), waiting for all 2 known clients (0 recovered, 0 in progress, and 0 evicted) to recover in 0:59
15:56:04:Lustre: lustre-OST0000: Denying connection for new client 02b0a6e6-f977-0e0b-772a-b2c3b66ac87b (at 10.2.4.220@tcp), waiting for all 2 known clients (1 recovered, 0 in progress, and 0 evicted) to recover in 1:19
15:56:04:Lustre: lustre-OST0000: Recovery over after 0:06, of 2 clients 2 recovered and 0 were evicted.
15:56:04:Lustre: lustre-OST0000: deleting orphan objects from 0x0:738 to 0x0:769
15:56:04:LustreError: 11266:0:(ldlm_resource.c:1150:ldlm_resource_get()) lustre-OST0000: lvbo_init failed for resource 0x2d8:0x0: rc = -2
15:56:04:LustreError: 11266:0:(ldlm_resource.c:1150:ldlm_resource_get()) Skipped 63 previous similar messages
15:56:04:BUG: unable to handle kernel paging request at 000000005a5a5a5a
15:56:04:IP: [&amp;lt;ffffffffa07ea251&amp;gt;] ldlm_lock_create+0x201/0xd70 [ptlrpc]
15:56:04:PGD 0 
15:56:05:Oops: 0000 [#1] SMP 
15:56:05:last sysfs file: /sys/devices/system/cpu/online
15:56:05:CPU 0 
15:56:05:Modules linked in: osp(U) ofd(U) lfsck(U) ost(U) mgc(U) osd_ldiskfs(U) lquota(U) lustre(U) lov(U) mdc(U) fid(U) lmv(U) fld(U) ksocklnd(U) ptlrpc(U) obdclass(U) lnet(U) sha512_generic sha256_generic libcfs(U) ldiskfs(U) jbd2 nfs fscache nfsd lockd nfs_acl auth_rpcgss sunrpc exportfs autofs4 ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa ib_mad ib_core microcode virtio_balloon 8139too 8139cp mii i2c_piix4 i2c_core ext3 jbd mbcache virtio_blk virtio_pci virtio_ring virtio pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod [last unloaded: obdecho]
15:56:05:
15:56:05:Pid: 11261, comm: ll_ost00_014 Tainted: G        W  ---------------    2.6.32-431.20.3.el6_lustre.x86_64 #1 Red Hat KVM
15:56:06:RIP: 0010:[&amp;lt;ffffffffa07ea251&amp;gt;]  [&amp;lt;ffffffffa07ea251&amp;gt;] ldlm_lock_create+0x201/0xd70 [ptlrpc]
15:56:06:RSP: 0018:ffff88006c515c60  EFLAGS: 00010246
15:56:06:RAX: ffff88006ccc1358 RBX: ffff88006ccc1180 RCX: ffff880037a64880
15:56:06:RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88006ccc1290
15:56:06:RBP: ffff88006c515cb0 R08: 0000000000000005 R09: 0000000000000000
15:56:06:R10: ffff88006ccc1180 R11: 0000000000000200 R12: ffff88006ccc1180
15:56:06:R13: ffffffffa0916ea0 R14: ffff88007ba5e400 R15: 000000005a5a5a5a
15:56:06:FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
15:56:06:CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
15:56:06:CR2: 000000005a5a5a5a CR3: 000000007db32000 CR4: 00000000000006f0
15:56:06:DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
15:56:07:DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
15:56:07:Process ll_ost00_014 (pid: 11261, threadinfo ffff88006c514000, task ffff88006f803540)
15:56:07:Stack:
15:56:07: ffff880000000010 0000000000000000 0000000b00000004 ffffffffa05d5e31
15:56:07:&amp;lt;d&amp;gt; ffff88007ce36738 ffff88006e0ed850 ffff88007ce36818 ffffffffa0916ea0
15:56:07:&amp;lt;d&amp;gt; ffff88007ba5e400 0000000000000001 ffff88006c515d20 ffffffffa0811c74
15:56:07:Call Trace:
15:56:07: [&amp;lt;ffffffffa05d5e31&amp;gt;] ? lprocfs_counter_add+0x151/0x1c0 [obdclass]
15:56:07: [&amp;lt;ffffffffa0811c74&amp;gt;] ldlm_handle_enqueue0+0x174/0x11d0 [ptlrpc]
15:56:07: [&amp;lt;ffffffffa08947c2&amp;gt;] tgt_enqueue+0x62/0x1d0 [ptlrpc]
15:56:08: [&amp;lt;ffffffffa0894b6c&amp;gt;] tgt_request_handle+0x23c/0xac0 [ptlrpc]
15:56:08: [&amp;lt;ffffffffa084426a&amp;gt;] ptlrpc_main+0xd1a/0x1980 [ptlrpc]
15:56:08: [&amp;lt;ffffffffa0843550&amp;gt;] ? ptlrpc_main+0x0/0x1980 [ptlrpc]
15:56:08: [&amp;lt;ffffffff8109abf6&amp;gt;] kthread+0x96/0xa0
15:56:08: [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
15:56:09: [&amp;lt;ffffffff8109ab60&amp;gt;] ? kthread+0x0/0xa0
15:56:09: [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
15:56:09:Code: 00 00 49 8d 84 24 d8 01 00 00 49 c7 84 24 90 00 00 00 00 00 00 00 ba 01 00 00 00 49 89 84 24 d8 01 00 00 49 89 84 24 e0 01 00 00 &amp;lt;49&amp;gt; 8b 07 48 8b 00 48 8b b8 88 01 00 00 e8 7d ba de ff 4d 89 24 
15:56:09:RIP  [&amp;lt;ffffffffa07ea251&amp;gt;] ldlm_lock_create+0x201/0xd70 [ptlrpc]
15:56:09: RSP &amp;lt;ffff88006c515c60&amp;gt;
15:56:09:CR2: 000000005a5a5a5a
15:56:09:Initializing cgroup subsys cpuset
15:56:09:Initializing cgroup subsys cpu
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Info required for matching: sanity-scrub 13&lt;/p&gt;</description>
                <environment>server: 2.6.0&lt;br/&gt;
client: lustre-master build #2733</environment>
        <key id="27549">LU-5904</key>
            <summary>Interop 2.6.0&lt;-&gt;master sanity-scrub test_13: BUG: ldlm_lock_create() on OST</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="5">Cannot Reproduce</resolution>
                                        <assignee username="wc-triage">WC Triage</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Tue, 11 Nov 2014 17:52:28 +0000</created>
                <updated>Mon, 17 Apr 2017 21:31:57 +0000</updated>
                            <resolved>Mon, 17 Apr 2017 21:31:57 +0000</resolved>
                                    <version>Lustre 2.7.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>5</watches>
                                                                            <comments>
                            <comment id="99096" author="adilger" created="Thu, 13 Nov 2014 20:21:43 +0000"  >&lt;p&gt;Sarah, were both the MDS and OSS running 2.6.0 in this test?&lt;/p&gt;</comment>
                            <comment id="99272" author="yong.fan" created="Sat, 15 Nov 2014 06:10:34 +0000"  >&lt;p&gt;The crash happened at the line &apos;ls -ail $DIR/$tdir &amp;gt; /dev/null 2&amp;gt;&amp;amp;1 &amp;amp;&amp;amp; error &quot;(5) ls should fail&quot;&apos; after client remount. Because of some failure injection, the &quot;ls&quot; could NOT access some OST-objects before the OI scrub rebuilt the /O on OST. But at that time-point, the OI scrub was not started yet. So the &quot;ls&quot; should get an error.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;test_13() {
        check_mount_and_prep
        $SETSTRIPE -c 1 -i 0 $DIR/$tdir

        local count=$(precreated_ost_obj_count 0 0)

        #define OBD_FAIL_OSD_COMPAT_NO_ENTRY            0x196
        do_facet ost1 $LCTL set_param fail_loc=0x196
        createmany -o $DIR/$tdir/f $((count + 32))
        do_facet ost1 $LCTL set_param fail_loc=0

        umount_client $MOUNT || error &lt;span class=&quot;code-quote&quot;&gt;&quot;(1) Fail to stop client!&quot;&lt;/span&gt;

        stop ost1 || error &lt;span class=&quot;code-quote&quot;&gt;&quot;(2) Fail to stop ost1&quot;&lt;/span&gt;

        start ost1 $(ostdevname 1) $MOUNT_OPTS_NOSCRUB ||
                error &lt;span class=&quot;code-quote&quot;&gt;&quot;(3) Fail to start ost1&quot;&lt;/span&gt;

        mount_client $MOUNT || error &lt;span class=&quot;code-quote&quot;&gt;&quot;(4) Fail to start client!&quot;&lt;/span&gt;

==&amp;gt;        ls -ail $DIR/$tdir &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; 2&amp;gt;&amp;amp;1 &amp;amp;&amp;amp; error &lt;span class=&quot;code-quote&quot;&gt;&quot;(5) ls should fail&quot;&lt;/span&gt;

        $START_SCRUB_ON_OST -r || error &lt;span class=&quot;code-quote&quot;&gt;&quot;(6) Fail to start OI scrub on OST!&quot;&lt;/span&gt;

        wait_update_facet ost1 &quot;$LCTL get_param -n \
                osd-ldiskfs.$(facet_svc ost1).oi_scrub |
                awk &lt;span class=&quot;code-quote&quot;&gt;&apos;/^status/ { print \\\$2 }&apos;&lt;/span&gt;&lt;span class=&quot;code-quote&quot;&gt;&quot; &quot;&lt;/span&gt;completed&quot; 6 ||
                error &lt;span class=&quot;code-quote&quot;&gt;&quot;(7) Expected &lt;span class=&quot;code-quote&quot;&gt;&apos;$expected&apos;&lt;/span&gt; on ost1&quot;&lt;/span&gt;

        ls -ail $DIR/$tdir &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt; || error &lt;span class=&quot;code-quote&quot;&gt;&quot;(8) ls should succeed&quot;&lt;/span&gt;
}
run_test 13 &lt;span class=&quot;code-quote&quot;&gt;&quot;OI scrub can rebuild missed /O entries&quot;&lt;/span&gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;According to the test logs on the OST, the OST accessed some released RAM when handling an LDLM lock.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;(gdb) l * ldlm_lock_create+0x201
0x7521 is in ldlm_lock_create (include/linux/list.h:28).
23		struct list_head name = LIST_HEAD_INIT(name)
24	
25	&lt;span class=&quot;code-keyword&quot;&gt;static&lt;/span&gt; inline void INIT_LIST_HEAD(struct list_head *list)
26	{
27		list-&amp;gt;next = list;
28		list-&amp;gt;prev = list;
29	}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;ldlm_handle_enqueue0() =&amp;gt; ldlm_lock_create() =&amp;gt; ldlm_lock_new() =&amp;gt; INIT_LIST_HEAD().&lt;br/&gt;
It looks like the newly created LDLM lock was released by some other thread during the lock initialisation. So it was possible that the LDLM slab on the OST was corrupted. We need more clues for debugging. Anyway, it should NOT be an OI scrub issue since the OI scrub was not started yet. &lt;/p&gt;</comment>
                            <comment id="100341" author="sarah" created="Mon, 1 Dec 2014 19:17:09 +0000"  >&lt;p&gt;Andreas, both MDS and OSS were 2.6.0&lt;/p&gt;</comment>
                            <comment id="192360" author="adilger" created="Mon, 17 Apr 2017 21:31:57 +0000"  >&lt;p&gt;Close old issue.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzx0qf:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>16494</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>