<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:30:01 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2991] (osd_internal.h:909:osd_trans_exec_op()) ASSERTION( oti-&gt;oti_declare_ops_rb[rb] &gt; 0 ) failed: rb = 2</title>
                <link>https://jira.whamcloud.com/browse/LU-2991</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;1 mds, 2 oss with 150 osts each&lt;br/&gt;
1 client mounted doing&lt;br/&gt;
for i in $(seq 1 300); do&lt;br/&gt;
  lfs setstripe /mnt/lustre/dir$i&lt;br/&gt;
  touch /mnt/lustre/dir$i/file&lt;br/&gt;
  lfs getstripe /mnt/lustre/dir$i&lt;br/&gt;
done&lt;/p&gt;

&lt;p&gt;rm -rf /mnt/lustre/dir*&lt;/p&gt;

&lt;p&gt;LustreError: 3627:0:(osd_internal.h:909:osd_trans_exec_op()) ASSERTION( oti-&amp;gt;oti_declare_ops_rb&lt;span class=&quot;error&quot;&gt;&amp;#91;rb&amp;#93;&lt;/span&gt; &amp;gt; 0 ) failed: rb = 2&lt;br/&gt;
LustreError: 3627:0:(osd_internal.h:909:osd_trans_exec_op()) LBUG&lt;br/&gt;
Pid: 3627, comm: mdt02_001&lt;/p&gt;

&lt;p&gt;Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04cd895&amp;gt;&amp;#93;&lt;/span&gt; libcfs_debug_dumpstack+0x55/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04cde97&amp;gt;&amp;#93;&lt;/span&gt; lbug_with_loc+0x47/0xb0 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d1c688&amp;gt;&amp;#93;&lt;/span&gt; osd_trans_exec_op+0x128/0x160 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d26878&amp;gt;&amp;#93;&lt;/span&gt; osd_xattr_set+0x278/0x380 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0edef30&amp;gt;&amp;#93;&lt;/span&gt; lod_generate_and_set_lovea+0x350/0x720 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0eea6d6&amp;gt;&amp;#93;&lt;/span&gt; lod_striping_create+0x1e6/0x320 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0eea983&amp;gt;&amp;#93;&lt;/span&gt; lod_object_create+0x173/0x260 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c3b540&amp;gt;&amp;#93;&lt;/span&gt; mdd_object_create_internal+0xa0/0x1c0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c4e503&amp;gt;&amp;#93;&lt;/span&gt; mdd_create+0x963/0x1470 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0ee7460&amp;gt;&amp;#93;&lt;/span&gt; ? lod_index_lookup+0x0/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e4bf84&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_open+0x13d4/0x20d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04ea7ae&amp;gt;&amp;#93;&lt;/span&gt; ? upcall_cache_get_entry+0x28e/0x860 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07bb7ec&amp;gt;&amp;#93;&lt;/span&gt; ? lustre_msg_add_version+0x6c/0xc0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0638cf0&amp;gt;&amp;#93;&lt;/span&gt; ? lu_ucred+0x20/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e368c1&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e2ff23&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x4e3/0x7d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e304dd&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_reint+0x1ed/0x4f0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e2c0ae&amp;gt;&amp;#93;&lt;/span&gt; mdt_intent_policy+0x3ae/0x750 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0774351&amp;gt;&amp;#93;&lt;/span&gt; ldlm_lock_enqueue+0x361/0x8d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa079a3e7&amp;gt;&amp;#93;&lt;/span&gt; ldlm_handle_enqueue0+0x4f7/0x10b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e2c586&amp;gt;&amp;#93;&lt;/span&gt; mdt_enqueue+0x46/0x110 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e21168&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x628/0x1620 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e5a285&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07cc27c&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x41c/0xdf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04ce5de&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04dfd8f&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07c38b9&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81052223&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07cd7c5&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xb75/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ccc50&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ccc50&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07ccc50&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;/p&gt;

&lt;p&gt;LustreError: dumping log to /tmp/lustre-log.1363711711.3627&lt;/p&gt;</description>
                <environment>&lt;a href=&quot;https://build.whamcloud.com/job/lustre-master/1329/&quot;&gt;https://build.whamcloud.com/job/lustre-master/1329/&lt;/a&gt;</environment>
        <key id="18013">LU-2991</key>
            <summary>(osd_internal.h:909:osd_trans_exec_op()) ASSERTION( oti-&gt;oti_declare_ops_rb[rb] &gt; 0 ) failed: rb = 2</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bfaccini">Bruno Faccini</assignee>
                                    <reporter username="mdiep">Minh Diep</reporter>
                        <labels>
                            <label>MB</label>
                    </labels>
                <created>Tue, 19 Mar 2013 21:26:13 +0000</created>
                <updated>Mon, 15 Apr 2013 08:27:28 +0000</updated>
                            <resolved>Mon, 15 Apr 2013 08:27:28 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                                                                            <comments>
                            <comment id="54485" author="adilger" created="Wed, 20 Mar 2013 17:08:51 +0000"  >&lt;p&gt;Looks like the transaction credit debugging caught a case where we didn&apos;t declare any credits for the rollback or something.  The &lt;a href=&quot;http://review.whamcloud.com/5698&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5698&lt;/a&gt; change will disable this accounting for 2.3.90 and beyond, but I suspect we are missing some error handling code?&lt;/p&gt;</comment>
                            <comment id="54665" author="bfaccini" created="Fri, 22 Mar 2013 16:12:16 +0000"  >&lt;p&gt;Minh,&lt;br/&gt;
Where/How have you been able to run such &quot;2 oss with 150 osts each&quot; configuration ?&lt;br/&gt;
Is it easily reproducible ?&lt;br/&gt;
Have you been able to get a crash-dump from occurrence(s) you already got ?&lt;/p&gt;
</comment>
                            <comment id="54666" author="mdiep" created="Fri, 22 Mar 2013 16:19:38 +0000"  >&lt;p&gt;I ran this in our lab, creating OSTs on LVM. I&apos;ll see if I can get the crash dump. I&apos;ll ping you offline to see if you can access the system.&lt;/p&gt;</comment>
                            <comment id="54670" author="bfaccini" created="Fri, 22 Mar 2013 17:00:31 +0000"  >&lt;p&gt;Assert comes from Patch/commit 28fc8fcc8bfe092c9a1a8c192ab6fe22d92820e7 for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2640&quot; title=&quot;deactivate OSD_EXEC_OP() operation accounting if operation is being undone&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2640&quot;&gt;&lt;del&gt;LU-2640&lt;/del&gt;&lt;/a&gt;.&lt;br/&gt;
Could it be the number of credits exceeds/overflows the size of entries (unsigned char) in oti_declare_ops_rb[] ??&lt;/p&gt;

&lt;p&gt;Both the transaction credits counters declaration and the Assert comes from Patch/commit 28fc8fcc8bfe092c9a1a8c192ab6fe22d92820e7 for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2640&quot; title=&quot;deactivate OSD_EXEC_OP() operation accounting if operation is being undone&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2640&quot;&gt;&lt;del&gt;LU-2640&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;</comment>
                            <comment id="54672" author="bzzz" created="Fri, 22 Mar 2013 17:12:11 +0000"  >&lt;p&gt;with wide striping - yes. &lt;/p&gt;</comment>
                            <comment id="54739" author="bfaccini" created="Mon, 25 Mar 2013 08:14:16 +0000"  >&lt;p&gt;Thanks Alex, I assume this may have occurred with the 150 OSTs Minh was using!&lt;/p&gt;

&lt;p&gt;I changed the size of the transaction credits counters from uchar to ushort within patch &lt;a href=&quot;http://review.whamcloud.com/5830&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5830&lt;/a&gt; for master I just pushed to gerrit.&lt;/p&gt;

&lt;p&gt;Minh, if no build failures (I still expect side-effects during build+run when changing size !!) can you give it a try ??&lt;/p&gt;</comment>
                            <comment id="54828" author="bfaccini" created="Tue, 26 Mar 2013 09:34:09 +0000"  >&lt;p&gt;Humm, tests of build for my patch failed in sanity/test_24v with same LBUG :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;04:13:52:Lustre: DEBUG MARKER: == sanity test 24v: list directory with large files (handle hash collision, bug: 17560) == 04:04:38 (1364209478)
04:13:52:LustreError: 11418:0:(osd_internal.h:909:osd_trans_exec_op()) ASSERTION( oti-&amp;gt;oti_declare_ops_rb[rb] &amp;gt; 0 ) failed: rb = 5
04:13:52:LustreError: 11418:0:(osd_internal.h:909:osd_trans_exec_op()) LBUG
04:13:52:Pid: 11418, comm: mdt00_000
04:13:52:
04:13:53:Call Trace:
04:13:53: [&amp;lt;ffffffffa04d7895&amp;gt;] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
04:13:53: [&amp;lt;ffffffffa04d7e97&amp;gt;] lbug_with_loc+0x47/0xb0 [libcfs]
04:13:53: [&amp;lt;ffffffffa0d44360&amp;gt;] osd_object_ref_del+0x2d0/0x310 [osd_ldiskfs]
04:13:53: [&amp;lt;ffffffffa0f157fb&amp;gt;] lod_ref_del+0x3b/0xd0 [lod]
04:13:53: [&amp;lt;ffffffffa0c6acd5&amp;gt;] mdo_ref_del+0x35/0xc0 [mdd]
04:13:54: [&amp;lt;ffffffffa0c72446&amp;gt;] mdd_unlink+0x6c6/0xe20 [mdd]
04:13:54: [&amp;lt;ffffffffa0e61a18&amp;gt;] mdo_unlink+0x18/0x50 [mdt]
04:13:54: [&amp;lt;ffffffffa0e64cd9&amp;gt;] mdt_reint_unlink+0x739/0xfd0 [mdt]
04:13:54: [&amp;lt;ffffffffa0e616d1&amp;gt;] mdt_reint_rec+0x41/0xe0 [mdt]
04:13:54: [&amp;lt;ffffffffa0e5ad53&amp;gt;] mdt_reint_internal+0x4e3/0x7d0 [mdt]
04:13:55: [&amp;lt;ffffffffa0e5b084&amp;gt;] mdt_reint+0x44/0xe0 [mdt]
04:13:55: [&amp;lt;ffffffffa0e49078&amp;gt;] mdt_handle_common+0x648/0x1660 [mdt]
04:13:56: [&amp;lt;ffffffffa0e85125&amp;gt;] mds_regular_handle+0x15/0x20 [mdt]
04:13:56: [&amp;lt;ffffffffa07f91dc&amp;gt;] ptlrpc_server_handle_request+0x41c/0xdf0 [ptlrpc]
04:13:56: [&amp;lt;ffffffffa04d85de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
04:13:57: [&amp;lt;ffffffffa07f0819&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
04:13:57: [&amp;lt;ffffffff81052223&amp;gt;] ? __wake_up+0x53/0x70
04:13:57: [&amp;lt;ffffffffa07fa725&amp;gt;] ptlrpc_main+0xb75/0x1870 [ptlrpc]
04:13:58: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:13:58: [&amp;lt;ffffffff8100c0ca&amp;gt;] child_rip+0xa/0x20
04:13:58: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:13:58: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:13:58: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
04:13:58:
04:13:59:Kernel panic - not syncing: LBUG
04:13:59:Pid: 11418, comm: mdt00_000 Not tainted 2.6.32-279.19.1.el6_lustre.gc4681d8.x86_64 #1
04:13:59:Call Trace:
04:14:00: [&amp;lt;ffffffff814e9811&amp;gt;] ? panic+0xa0/0x168
04:14:00: [&amp;lt;ffffffffa04d7eeb&amp;gt;] ? lbug_with_loc+0x9b/0xb0 [libcfs]
04:14:00: [&amp;lt;ffffffffa0d44360&amp;gt;] ? osd_object_ref_del+0x2d0/0x310 [osd_ldiskfs]
04:14:00: [&amp;lt;ffffffffa0f157fb&amp;gt;] ? lod_ref_del+0x3b/0xd0 [lod]
04:14:00: [&amp;lt;ffffffffa0c6acd5&amp;gt;] ? mdo_ref_del+0x35/0xc0 [mdd]
04:14:01: [&amp;lt;ffffffffa0c72446&amp;gt;] ? mdd_unlink+0x6c6/0xe20 [mdd]
04:14:01: [&amp;lt;ffffffffa0e61a18&amp;gt;] ? mdo_unlink+0x18/0x50 [mdt]
04:14:01: [&amp;lt;ffffffffa0e64cd9&amp;gt;] ? mdt_reint_unlink+0x739/0xfd0 [mdt]
04:14:01: [&amp;lt;ffffffffa0e616d1&amp;gt;] ? mdt_reint_rec+0x41/0xe0 [mdt]
04:14:01: [&amp;lt;ffffffffa0e5ad53&amp;gt;] ? mdt_reint_internal+0x4e3/0x7d0 [mdt]
04:14:01: [&amp;lt;ffffffffa0e5b084&amp;gt;] ? mdt_reint+0x44/0xe0 [mdt]
04:14:02: [&amp;lt;ffffffffa0e49078&amp;gt;] ? mdt_handle_common+0x648/0x1660 [mdt]
04:14:02: [&amp;lt;ffffffffa0e85125&amp;gt;] ? mds_regular_handle+0x15/0x20 [mdt]
04:14:02: [&amp;lt;ffffffffa07f91dc&amp;gt;] ? ptlrpc_server_handle_request+0x41c/0xdf0 [ptlrpc]
04:14:02: [&amp;lt;ffffffffa04d85de&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
04:14:02: [&amp;lt;ffffffffa07f0819&amp;gt;] ? ptlrpc_wait_event+0xa9/0x290 [ptlrpc]
04:14:03: [&amp;lt;ffffffff81052223&amp;gt;] ? __wake_up+0x53/0x70
04:14:03: [&amp;lt;ffffffffa07fa725&amp;gt;] ? ptlrpc_main+0xb75/0x1870 [ptlrpc]
04:14:03: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:14:03: [&amp;lt;ffffffff8100c0ca&amp;gt;] ? child_rip+0xa/0x20
04:14:03: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:14:03: [&amp;lt;ffffffffa07f9bb0&amp;gt;] ? ptlrpc_main+0x0/0x1870 [ptlrpc]
04:14:03: [&amp;lt;ffffffff8100c0c0&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Unless my very little change of counter&apos;s size introduced a regression, I am afraid that recent landed patches from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2640&quot; title=&quot;deactivate OSD_EXEC_OP() operation accounting if operation is being undone&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2640&quot;&gt;&lt;del&gt;LU-2640&lt;/del&gt;&lt;/a&gt; may need some re-work ...&lt;br/&gt;
I will try to run this build on Toro and reproduce in order to get a crash-dump for analysis, but Minh if you already have been able to reproduce with your own test/configuration, just tell me.&lt;/p&gt;</comment>
                            <comment id="54846" author="bfaccini" created="Tue, 26 Mar 2013 16:06:22 +0000"  >&lt;p&gt;I am trying to reproduce on a single Toro node and loop devices but no success/LBUG till now ...&lt;br/&gt;
I added &quot;max_loop=164&quot; Kernel boot parameter and then ran the following script based on your reproducer &lt;br/&gt;
original text :&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;export OSTCOUNT=150
/usr/lib64/lustre/tests/llmount.sh
while true ; do 
     for i in $(seq 1 300); do 
          mkdir /mnt/lustre/dir$i
          lfs setstripe -c 150 /mnt/lustre/dir$i
         touch /mnt/lustre/dir$i/file
         lfs getstripe /mnt/lustre/dir$i
     done
     rm -rf /mnt/lustre/dir*
done
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;may be I need more Nodes/VMs to trigger it ??&lt;/p&gt;</comment>
                            <comment id="54848" author="mdiep" created="Tue, 26 Mar 2013 16:13:33 +0000"  >&lt;p&gt;Yes, I think you might need more than one node to reproduce it. Did auto test provide a crash dump?&lt;/p&gt;</comment>
                            <comment id="54849" author="bfaccini" created="Tue, 26 Mar 2013 16:24:35 +0000"  >&lt;p&gt;Yes, according to MDS Console log, but then how/where can I find it ?&lt;/p&gt;</comment>
                            <comment id="54860" author="mdiep" created="Tue, 26 Mar 2013 17:47:31 +0000"  >&lt;p&gt;I hit this during your patch test. I am not sure if it&apos;s related, likely not. I will reproduce this bug on master and collect the crash dump.&lt;/p&gt;


&lt;p&gt;-----------&lt;del&gt;[ cut here ]&lt;/del&gt;-----------&lt;br/&gt;
kernel BUG at fs/jbd2/transaction.c:1033!&lt;br/&gt;
invalid opcode: 0000 &lt;a href=&quot;#1&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;1&lt;/a&gt; SMP&lt;br/&gt;
last sysfs file: /sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_map&lt;br/&gt;
CPU 8&lt;br/&gt;
Modules linked in: osp(U) lod(U) mdt(U) mgs(U) mgc(U) fsfilt_ldiskfs(U) osd_ldiskfs(U) lquota(U) mdd(U) lustre(U) lov(U) osc(U) mdc(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) lvfs(U) sha512_generic sha256_generic libcfs(U) ldiskfs(U) jbd2 nfsd exportfs nfs lockd fscache nfs_acl auth_rpcgss sunrpc cpufreq_ondemand acpi_cpufreq freq_table mperf ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_addr ipv6 ib_sa microcode serio_raw mlx4_ib ib_mad ib_core mlx4_en mlx4_core i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support ioatdma i7core_edac edac_core ses enclosure sg igb dca ext3 jbd mbcache sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix mpt2sas scsi_transport_sas raid_class dm_mirror dm_region_hash dm_log dm_mod &lt;span class=&quot;error&quot;&gt;&amp;#91;last unloaded: scsi_wait_scan&amp;#93;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Pid: 674, comm: mdt00_003 Not tainted 2.6.32-279.19.1.el6_lustre.gc4681d8.x86_64 #1 Supermicro X8DTH-i/6/iF/6F/X8DTH&lt;br/&gt;
RIP: 0010:&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa044c86d&amp;gt;&amp;#93;&lt;/span&gt;  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa044c86d&amp;gt;&amp;#93;&lt;/span&gt; jbd2_journal_dirty_metadata+0x10d/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
RSP: 0018:ffff88032d16d6b0  EFLAGS: 00010246&lt;br/&gt;
RAX: ffff88035d5bf380 RBX: ffff880359dd4678 RCX: ffff880f985172d0&lt;br/&gt;
RDX: 0000000000000000 RSI: ffff880f985172d0 RDI: 0000000000000000&lt;br/&gt;
RBP: ffff88032d16d6d0 R08: a010000000000000 R09: f049b2734454b402&lt;br/&gt;
R10: 0000000000000000 R11: ffff880eb0d34000 R12: ffff88038e527ba8&lt;br/&gt;
R13: ffff880f985172d0 R14: ffff8808298c2800 R15: 0000000000001000&lt;br/&gt;
FS:  00007fac03683700(0000) GS:ffff880028280000(0000) knlGS:0000000000000000&lt;br/&gt;
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b&lt;br/&gt;
CR2: 00000000006dbf98 CR3: 0000000826eb6000 CR4: 00000000000006e0&lt;br/&gt;
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000&lt;br/&gt;
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400&lt;br/&gt;
Process mdt00_003 (pid: 674, threadinfo ffff88032d16c000, task ffff88032d213540)&lt;br/&gt;
Stack:&lt;br/&gt;
 ffff880359dd4678 ffffffffa0d82b50 ffff880f985172d0 0000000000000000&lt;br/&gt;
&amp;lt;d&amp;gt; ffff88032d16d710 ffffffffa04721bb ffff88032d16d710 ffffffffa047f898&lt;br/&gt;
&amp;lt;d&amp;gt; 0000000000001000 0000000000001000 0000000000001000 ffff880f985172d0&lt;br/&gt;
Call Trace:&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04721bb&amp;gt;&amp;#93;&lt;/span&gt; __ldiskfs_handle_dirty_metadata+0x7b/0x100 &lt;span class=&quot;error&quot;&gt;&amp;#91;ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa047f898&amp;gt;&amp;#93;&lt;/span&gt; ? ldiskfs_bread+0x18/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d6d1ec&amp;gt;&amp;#93;&lt;/span&gt; osd_ldiskfs_write_record+0xec/0x330 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d6e038&amp;gt;&amp;#93;&lt;/span&gt; osd_write+0x148/0x2a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa066aeb5&amp;gt;&amp;#93;&lt;/span&gt; dt_record_write+0x45/0x130 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c8acd0&amp;gt;&amp;#93;&lt;/span&gt; ? md_capainfo+0x20/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa063788e&amp;gt;&amp;#93;&lt;/span&gt; llog_osd_write_blob+0x2fe/0x730 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa063b157&amp;gt;&amp;#93;&lt;/span&gt; llog_osd_write_rec+0x6b7/0x1200 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d679a5&amp;gt;&amp;#93;&lt;/span&gt; ? iam_path_fini+0x25/0x30 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06073b8&amp;gt;&amp;#93;&lt;/span&gt; llog_write_rec+0xc8/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa060f57d&amp;gt;&amp;#93;&lt;/span&gt; llog_cat_add_rec+0xad/0x480 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa06071b1&amp;gt;&amp;#93;&lt;/span&gt; llog_add+0x91/0x1d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;obdclass&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f71ef7&amp;gt;&amp;#93;&lt;/span&gt; osp_sync_add_rec+0x247/0x8a0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osp&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0d650af&amp;gt;&amp;#93;&lt;/span&gt; ? osd_oi_delete+0x2af/0x4b0 &lt;span class=&quot;error&quot;&gt;&amp;#91;osd_ldiskfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f725fb&amp;gt;&amp;#93;&lt;/span&gt; osp_sync_add+0x7b/0x80 &lt;span class=&quot;error&quot;&gt;&amp;#91;osp&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f664e6&amp;gt;&amp;#93;&lt;/span&gt; osp_object_destroy+0x106/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;osp&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0f22347&amp;gt;&amp;#93;&lt;/span&gt; lod_object_destroy+0x1a7/0x350 &lt;span class=&quot;error&quot;&gt;&amp;#91;lod&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c83a19&amp;gt;&amp;#93;&lt;/span&gt; mdd_finish_unlink+0x229/0x380 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0c86748&amp;gt;&amp;#93;&lt;/span&gt; mdd_unlink+0x9c8/0xe20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdd&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e6ba18&amp;gt;&amp;#93;&lt;/span&gt; mdo_unlink+0x18/0x50 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e6ecd9&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_unlink+0x739/0xfd0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e6b6d1&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_rec+0x41/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e64d53&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint_internal+0x4e3/0x7d0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e65084&amp;gt;&amp;#93;&lt;/span&gt; mdt_reint+0x44/0xe0 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e53078&amp;gt;&amp;#93;&lt;/span&gt; mdt_handle_common+0x648/0x1660 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0e8f125&amp;gt;&amp;#93;&lt;/span&gt; mds_regular_handle+0x15/0x20 &lt;span class=&quot;error&quot;&gt;&amp;#91;mdt&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07fc1dc&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_server_handle_request+0x41c/0xdf0 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa04f45de&amp;gt;&amp;#93;&lt;/span&gt; ? cfs_timer_arm+0xe/0x10 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa0505d8f&amp;gt;&amp;#93;&lt;/span&gt; ? lc_watchdog_touch+0x6f/0x170 &lt;span class=&quot;error&quot;&gt;&amp;#91;libcfs&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07f3819&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_wait_event+0xa9/0x290 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff81052223&amp;gt;&amp;#93;&lt;/span&gt; ? __wake_up+0x53/0x70&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07fd725&amp;gt;&amp;#93;&lt;/span&gt; ptlrpc_main+0xb75/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07fcbb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0ca&amp;gt;&amp;#93;&lt;/span&gt; child_rip+0xa/0x20&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07fcbb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa07fcbb0&amp;gt;&amp;#93;&lt;/span&gt; ? ptlrpc_main+0x0/0x1870 &lt;span class=&quot;error&quot;&gt;&amp;#91;ptlrpc&amp;#93;&lt;/span&gt;&lt;br/&gt;
 &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffff8100c0c0&amp;gt;&amp;#93;&lt;/span&gt; ? child_rip+0x0/0x20&lt;br/&gt;
Code: c6 9c 03 00 00 4c 89 f7 e8 a1 ff 09 e1 48 8b 33 ba 01 00 00 00 4c 89 e7 e8 11 ec ff ff 4c 89 f0 66 ff 00 66 66 90 e9 73 ff ff ff &amp;lt;0f&amp;gt; 0b eb fe 0f 0b eb fe 0f 0b 66 0f 1f 84 00 00 00 00 00 eb f5&lt;br/&gt;
RIP  &lt;span class=&quot;error&quot;&gt;&amp;#91;&amp;lt;ffffffffa044c86d&amp;gt;&amp;#93;&lt;/span&gt; jbd2_journal_dirty_metadata+0x10d/0x150 &lt;span class=&quot;error&quot;&gt;&amp;#91;jbd2&amp;#93;&lt;/span&gt;&lt;br/&gt;
 RSP &amp;lt;ffff88032d16d6b0&amp;gt;&lt;/p&gt;

&lt;p&gt;I will file a separate bug if I hit this on master&lt;/p&gt;</comment>
                            <comment id="54899" author="bfaccini" created="Wed, 27 Mar 2013 09:13:40 +0000"  >&lt;p&gt;According to stack-trace, I don&apos;t think it is related, but as I already said, with item size changes some weird consequences can show-up ...&lt;/p&gt;

&lt;p&gt;Also, Chris confirmed that actually auto-test generated crash-dumps are not saved, it is in their to-do list.&lt;/p&gt;

&lt;p&gt;Thus I rely on your testing to reproduce the LBUG, in // I try to build a configuration with more Nodes/VMs too, like the failed auto-test one.&lt;/p&gt;</comment>
                            <comment id="54920" author="mdiep" created="Wed, 27 Mar 2013 15:37:39 +0000"  >&lt;p&gt;I have reproduced it on master and got the debug log with debug=-1. I also did a crash dump but since this is a LBUG, not sure if a crash dump is useful.&lt;/p&gt;</comment>
                            <comment id="54996" author="bfaccini" created="Thu, 28 Mar 2013 11:46:10 +0000"  >&lt;p&gt;crash-dump is not really useful in this case since it can only confirm that oti-&amp;gt;oti_declare_ops_rb&lt;span class=&quot;error&quot;&gt;&amp;#91;OSD_OT_XATTR_SET&amp;#93;&lt;/span&gt; = 0 which caused the Assert to trigger.&lt;/p&gt;

&lt;p&gt;But the lustre-logs are much more interesting since they confirm that the thread that triggered the Assert was gathering transaction credits stats for a work-load where wide-striping was used, and each time near the 256 (in fact 253 + ??) &quot;fatal&quot; value. So I still suspect an overflow due to uchar size ...&lt;/p&gt;

&lt;p&gt;Minh, I know that for the crash-dump you were running with a genuine master build, but was it also the case for the full/debug=1 trace case ?&lt;/p&gt;

&lt;p&gt;In fact, I would like to know if you got such Assert/LBUG when running with my patch ??&lt;/p&gt;</comment>
                            <comment id="54997" author="bzzz" created="Thu, 28 Mar 2013 11:52:06 +0000"  >&lt;p&gt;to check the theory about uchar size, you could put LASSERT(current_value &amp;lt; 255); just before increment?&lt;/p&gt;</comment>
                            <comment id="55004" author="bfaccini" created="Thu, 28 Mar 2013 14:34:10 +0000"  >&lt;p&gt;You are right Alex, I thought I can find it by counting accurate traces in the full debug log from Minh, but seems we miss these traces in fact ...&lt;/p&gt;

&lt;p&gt;Minh, I just submitted &lt;a href=&quot;http://review.whamcloud.com/5869&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5869&lt;/a&gt; to trigger if we reach UCHAR_MAX, can you give it a try and expose it to your reproducer+configuration ??&lt;/p&gt;</comment>
                            <comment id="55081" author="bfaccini" created="Fri, 29 Mar 2013 08:05:08 +0000"  >&lt;p&gt;Ok, the fact that we can reach UCHAR_MAX and overflow the oti_declare_ops counter has been definitely proven.&lt;/p&gt;

&lt;p&gt;Then, changing the size of both oti_declare_ops/oti_declare_ops_rb from uchar to ushort, as per 1st patch/change 5830, seems to allow us to go further, but then we finally trigger the same Assert, now because oti_rollback is true but oti_declare_ops_rb is Null !!&#8230;&lt;/p&gt;

&lt;p&gt;Seems that we missed something in the rollback logic &#8230; So, we continue to debug by adding traces/prints to see how counters progress and find where things break.&lt;/p&gt;</comment>
                            <comment id="55082" author="bzzz" created="Fri, 29 Mar 2013 08:17:47 +0000"  >&lt;p&gt;notice the checks in rollback path are not quite correct. in short, from ldiskfs/zfs point of view, there is no point to declare rollback changes: if you have insert for a name declared, then you&apos;ll be able to remove that name as part of rollback. this is why we plan to just disable these checks in production (until a better mechanism to track changes is implemented).&lt;/p&gt;</comment>
                            <comment id="55207" author="mdiep" created="Mon, 1 Apr 2013 17:53:58 +0000"  >&lt;p&gt;Alex, can you confirm that if we disable these checks in production, this ASSERTION will not happen?&lt;/p&gt;</comment>
                            <comment id="55208" author="bzzz" created="Mon, 1 Apr 2013 18:02:55 +0000"  >&lt;p&gt;the assertions in OSD - yes. but I think we still need to understand the root cause for kernel BUG at fs/jbd2/transaction.c:1033!&lt;/p&gt;
</comment>
                            <comment id="55211" author="bfaccini" created="Mon, 1 Apr 2013 18:18:46 +0000"  >&lt;p&gt;Minh,&lt;/p&gt;

&lt;p&gt;Yes, undefine of compile-time OSD_TRACK_DECLARES will fully disable the concerned transaction credits counters feature/code.&lt;br/&gt;
Concerning the &quot;fs/jbd2/transaction.c:1033&quot; kernel BUG()/Oops, am I wrong when I think it was a one-shot and never re-occurred during the next heavy testing you ran ?&lt;br/&gt;
Also did you get a crash-dump at that time ? And did you open a new ticket for it finally ?&lt;/p&gt;</comment>
                            <comment id="55212" author="mdiep" created="Mon, 1 Apr 2013 18:21:03 +0000"  >&lt;p&gt;Disable by taking out the define OSD_TRACK_DECLARES results in compile error&lt;/p&gt;

&lt;p&gt;/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c: In function &#8216;osd_trans_create&#8217;:&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:683: error: &#8216;struct osd_thread_info&#8217; has no member named &#8216;oti_declare_ops&#8217;&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:684: error: &#8216;struct osd_thread_info&#8217; has no member named &#8216;oti_declare_ops_rb&#8217;&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:685: error: &#8216;struct osd_thread_info&#8217; has no member named &#8216;oti_declare_ops_cred&#8217;&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:686: error: &#8216;struct osd_thread_info&#8217; has no member named &#8216;oti_rollback&#8217;&lt;br/&gt;
cc1: warnings being treated as errors&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c: In function &#8216;osd_trans_start&#8217;:&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:717: error: unused variable &#8216;last_credits&#8217;&lt;br/&gt;
/home/minh.diep/build/lu2991/lustre/osd-ldiskfs/osd_handler.c:716: error: unused variable &#8216;last_printed&#8217;&lt;/p&gt;

&lt;p&gt;Bruno, no bug was filed for the kernel BUG and haven&apos;t hit it again&lt;/p&gt;</comment>
                            <comment id="55216" author="bfaccini" created="Mon, 1 Apr 2013 18:46:45 +0000"  >&lt;p&gt;Humm, in osd_trans_create() function oti_declare_ops&lt;span class=&quot;error&quot;&gt;&amp;#91;_rb,_cred&amp;#93;&lt;/span&gt; and rollback fields memset()s/initializations need to be put under #ifdef  OSD_TRACK_DECLARES.&lt;/p&gt;

&lt;p&gt;Same for last_credits/last_printed variables declares in osd_trans_start().&lt;/p&gt;

&lt;p&gt;Then we may run your tests again and see if we trigger Oops in jbd2 again.&lt;/p&gt;</comment>
                            <comment id="55251" author="mdiep" created="Tue, 2 Apr 2013 04:46:22 +0000"  >&lt;p&gt;ok, fixed the compile issue and ran the test for couple hours now. No ASSERTION. I will try to reproduce the Oops but in the mean time, this bug&apos;s priority can be lowered.&lt;/p&gt;</comment>
                            <comment id="55382" author="bfaccini" created="Wed, 3 Apr 2013 13:44:55 +0000"  >&lt;p&gt;Patch-set #3 submitted because patch-set #1 did not correctly initialize/memset() the newly resized fields.&lt;/p&gt;

&lt;p&gt;If it still shows the same wrong behavior we will definitely revert unset/undefine OSD_TRACK_DECLARES.&lt;/p&gt;</comment>
                            <comment id="55625" author="bfaccini" created="Fri, 5 Apr 2013 17:39:02 +0000"  >&lt;p&gt;I had to re-base patch #6 due to auto-tests failures like racer/test_1 &lt;span class=&quot;error&quot;&gt;&amp;#91;LustreError: 31880:0:(file.c:930:ll_file_io_generic()) ASSERTION( io-&amp;gt;u.ci_rw.crw_count == count ) failed: 21504 != 4194304&amp;#93;&lt;/span&gt; which has been fixed with &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3035&quot; title=&quot;Failure on racer: ASSERTION( io-&amp;gt;u.ci_rw.crw_count == count ) failed: 785408 != 4194304 &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3035&quot;&gt;&lt;del&gt;LU-3035&lt;/del&gt;&lt;/a&gt; ...&lt;/p&gt;</comment>
                            <comment id="55683" author="bfaccini" created="Sun, 7 Apr 2013 17:42:06 +0000"  >&lt;p&gt;Wow too bad, now patch-set #6 experienced 2 auto-tests failures (sanity/test_56w linked to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3092&quot; title=&quot;sanity 56w fails on current master.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3092&quot;&gt;&lt;del&gt;LU-3092&lt;/del&gt;&lt;/a&gt; and sanity/test_65ic linked to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3099&quot; title=&quot;Test failure on test suite sanity, subtest test_65ic&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3099&quot;&gt;&lt;del&gt;LU-3099&lt;/del&gt;&lt;/a&gt;), so a new re-base is required to integrate &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3057&quot; title=&quot;Hang at sanity test_56x and test_56w&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3057&quot;&gt;&lt;del&gt;LU-3057&lt;/del&gt;&lt;/a&gt; fix since both &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3092&quot; title=&quot;sanity 56w fails on current master.&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3092&quot;&gt;&lt;del&gt;LU-3092&lt;/del&gt;&lt;/a&gt;/&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-3099&quot; title=&quot;Test failure on test suite sanity, subtest test_65ic&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-3099&quot;&gt;&lt;del&gt;LU-3099&lt;/del&gt;&lt;/a&gt; have been dup&apos;ed to it ...&lt;/p&gt;

&lt;p&gt;So, patch-set #7 just submitted after rebase ...&lt;/p&gt;
</comment>
                            <comment id="55713" author="bfaccini" created="Mon, 8 Apr 2013 09:05:14 +0000"  >&lt;p&gt;Again 4x, and different from the preceding!, auto-tests failures. But all in review-zfs group, with at least 2 of them being known/perf ZFS issues (&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2872&quot; title=&quot;Test timeout failure on test suite sanity-quota test_1&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2872&quot;&gt;&lt;del&gt;LU-2872&lt;/del&gt;&lt;/a&gt;, &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2547&quot; title=&quot;test: recovery-small test_24a, test_24b: multiop didn&amp;#39;t fail fsync: rc 0&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2547&quot;&gt;&lt;del&gt;LU-2547&lt;/del&gt;&lt;/a&gt;) still under investigations. Thus, considering that this bug+patch is pure osd-ldiskfs stuff and since all review (pure ldiskfs ?) group tests have been successful, we may reasonably think that patch is ok and verified ...&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvlq7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7290</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>