<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:51:05 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-5392] kernel BUG at fs/jbd2/transaction.c:1030! on the MDS while starting OSTs</title>
                <link>https://jira.whamcloud.com/browse/LU-5392</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We hit the following bug on our MDS while starting the OSTs. We were registering all the targets after a writeconf, so the MDT had been started before the OSTs.&lt;/p&gt;

&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;kernel BUG at fs/jbd2/transaction.c:1030!
invalid opcode: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:80/0000:80:05.0/0000:85:00.0/host12/rport-12:0-0/target12:0:0/12:0:0:0/state
CPU 1
Modules linked in: osp(U) lod(U) mdt(U) mgs(U) mgc(U) osd_ldiskfs(U) mdd(U) lustre(U) lov(U) osc(U) mdc(U) lquota(U) fid(U) fld(U) ko2iblnd(U) ptlrpc(U) obdclass(U) lnet(U) fsfilt_ldiskfs(U) lvfs(U) libcfs(U) ldiskfs(U) sha512_generic sha256_generic crc32c_intel nfs lockd fscache auth_rpcgss nfs_acl sunrpc ipmi_devintf cpufreq_ondemand acpi_cpufreq freq_table mperf bonding 8021q garp stp llc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) ib_mad(U) ib_core(U) mlx4_core(U) dm_round_robin scsi_dh_rdac dm_multipath uinput sg lpc_ich mfd_core ioatdma igb dca i2c_algo_bit i2c_core ptp pps_core lpfc scsi_transport_fc scsi_tgt ext4 jbd2 mbcache sd_mod crc_t10dif ahci dm_mirror dm_region_hash dm_log dm_mod megaraid_sas [last unloaded: libcfs]

Pid: 18550, comm: osp-syn-1 Tainted: G        W  ---------------    2.6.32-431.17.1.el6.Bull.50.x86_64 #1 BULL bullx &lt;span class=&quot;code-keyword&quot;&gt;super&lt;/span&gt;-node
RIP: 0010:[&amp;lt;ffffffffa006f79d&amp;gt;]  [&amp;lt;ffffffffa006f79d&amp;gt;] jbd2_journal_dirty_metadata+0x10d/0x150 [jbd2]
RSP: 0018:ffff880c68cb74a0  EFLAGS: 00010246
RAX: ffff88087a587680 RBX: ffff88087a13d588 RCX: ffff881075405678
RDX: 0000000000000000 RSI: ffff881075405678 RDI: 0000000000000000
RBP: ffff880c68cb74c0 R08: f010000000000000 R09: ef6cc3c6d80d9e02
R10: 0000000000000001 R11: 0000000000000000 R12: ffff88107542cf28
R13: ffff881075405678 R14: ffff881060c97000 R15: 0000000000000008
FS:  00007f0798d0f700(0000) GS:ffff88048e400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000000006d3aa8 CR3: 000000046dea3000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
&lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; osp-syn-1 (pid: 18550, threadinfo ffff880c68cb6000, task ffff880c78a1d500)
Stack:
 ffff88087a13d588 ffffffffa0eb5df0 ffff881075405678 0000000000000000
&amp;lt;d&amp;gt; ffff880c68cb7500 ffffffffa0e730eb ffff880c68cb74f0 ffffffff8109a1bf
&amp;lt;d&amp;gt; ffff88107543c310 ffff88087a13d588 0000000000000018 ffff881075405678
Call Trace:
 [&amp;lt;ffffffffa0e730eb&amp;gt;] __ldiskfs_handle_dirty_metadata+0x7b/0x100 [ldiskfs]
 [&amp;lt;ffffffff8109a1bf&amp;gt;] ? wake_up_bit+0x2f/0x40
 [&amp;lt;ffffffffa0ea9145&amp;gt;] ldiskfs_quota_write+0x165/0x210 [ldiskfs]
 [&amp;lt;ffffffff811eeb31&amp;gt;] v2_write_file_info+0xa1/0xe0
 [&amp;lt;ffffffff811eac38&amp;gt;] dquot_acquire+0x138/0x140
 [&amp;lt;ffffffffa0ea83e6&amp;gt;] ldiskfs_acquire_dquot+0x66/0xb0 [ldiskfs]
 [&amp;lt;ffffffff811ecbac&amp;gt;] dqget+0x2ac/0x390
 [&amp;lt;ffffffff811ed13b&amp;gt;] dquot_initialize+0x7b/0x240
 [&amp;lt;ffffffffa0ea8602&amp;gt;] ldiskfs_dquot_initialize+0x62/0xc0 [ldiskfs]
 [&amp;lt;ffffffffa0522d64&amp;gt;] osd_write+0x104/0x2a0 [osd_ldiskfs]
 [&amp;lt;ffffffffa1003945&amp;gt;] dt_record_write+0x45/0x130 [obdclass]
 [&amp;lt;ffffffffa0fdc45b&amp;gt;] llog_osd_write_blob+0x57b/0x850 [obdclass]
 [&amp;lt;ffffffffa0fdf7d4&amp;gt;] llog_osd_write_rec+0x274/0x1370 [obdclass]
 [&amp;lt;ffffffffa0fab438&amp;gt;] llog_write_rec+0xc8/0x290 [obdclass]
 [&amp;lt;ffffffffa0facbad&amp;gt;] llog_write+0x2ad/0x420 [obdclass]
 [&amp;lt;ffffffffa0face0c&amp;gt;] llog_cancel_rec+0xbc/0x7c0 [obdclass]
 [&amp;lt;ffffffffa0fb3447&amp;gt;] llog_cat_cancel_records+0x107/0x340 [obdclass]
 [&amp;lt;ffffffffa070a12b&amp;gt;] osp_sync_process_queues+0x12bb/0x15e0 [osp]
 [&amp;lt;ffffffffa050ac4b&amp;gt;] ? osd_object_read_unlock+0x8b/0xd0 [osd_ldiskfs]
 [&amp;lt;ffffffffa0fade0b&amp;gt;] llog_process_thread+0x8fb/0xe00 [obdclass]
 [&amp;lt;ffffffffa0708e70&amp;gt;] ? osp_sync_process_queues+0x0/0x15e0 [osp]
 [&amp;lt;ffffffffa0fafc7d&amp;gt;] llog_process_or_fork+0x12d/0x660 [obdclass]
 [&amp;lt;ffffffffa0fb261a&amp;gt;] llog_cat_process_cb+0x56a/0x620 [obdclass]
 [&amp;lt;ffffffff81054619&amp;gt;] ? __wake_up_common+0x59/0x90
 [&amp;lt;ffffffffa0708e70&amp;gt;] ? osp_sync_process_queues+0x0/0x15e0 [osp]
 [&amp;lt;ffffffffa0fb1509&amp;gt;] llog_cat_process+0x19/0x20 [obdclass]
 [&amp;lt;ffffffffa0edb75a&amp;gt;] ? cfs_waitq_signal+0x1a/0x20 [libcfs]
 [&amp;lt;ffffffffa070aa60&amp;gt;] osp_sync_thread+0x240/0x7e0 [osp]
 [&amp;lt;ffffffffa070a820&amp;gt;] ? osp_sync_thread+0x0/0x7e0 [osp]
 [&amp;lt;ffffffff8100c20a&amp;gt;] child_rip+0xa/0x20
 [&amp;lt;ffffffffa070a820&amp;gt;] ? osp_sync_thread+0x0/0x7e0 [osp]
 [&amp;lt;ffffffffa070a820&amp;gt;] ? osp_sync_thread+0x0/0x7e0 [osp]
 [&amp;lt;ffffffff8100c200&amp;gt;] ? child_rip+0x0/0x20
Code: c6 9c 03 00 00 4c 89 f7 e8 81 bf 4b e1 48 8b 33 ba 01 00 00 00 4c 89 e7 e8 b1 ec ff ff 4c 89 f0 66 ff 00 66 66 90 e9 73 ff ff ff &amp;lt;0f&amp;gt; 0b eb fe 0f 0b eb fe 0f 0b 66 0f 1f 84 00 00 00 00 00 eb f5
RIP  [&amp;lt;ffffffffa006f79d&amp;gt;] jbd2_journal_dirty_metadata+0x10d/0x150 [jbd2]
 RSP &amp;lt;ffff880c68cb74a0&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This bug looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-5040&quot; title=&quot;kernel BUG at fs/jbd2/transaction.c:1033&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-5040&quot;&gt;&lt;del&gt;LU-5040&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Before the crash, we had a lot of VFS errors about quotas in the console:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;VFS: find_free_dqentry(): Data block full but it shouldn&apos;t.
VFS: Error -5 occurred &lt;span class=&quot;code-keyword&quot;&gt;while&lt;/span&gt; creating quota.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;After the crash, we tried to start the MDT and hit another BUG:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;kernel BUG at fs/inode.c:1358!
invalid opcode: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:80/0000:80:05.0/0000:85:00.0/host12/rport-12:0-0/target12:0:0/12:0:0:0/state
CPU 1
Modules linked in: ldiskfs(U) sha512_generic sha256_generic crc32c_intel libcfs(U) nfs lockd fscache auth_rpcgss nfs_acl sunrpc ipmi_devintf cpufreq_ondemand acpi_cpufreq freq_table mperf bonding 8021q garp stp llc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U) ib_ipoib(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U) ib_mad(U) ib_core(U) mlx4_core(U) dm_round_robin scsi_dh_rdac dm_multipath uinput sg lpc_ich mfd_core ioatdma lpfc scsi_transport_fc scsi_tgt igb dca i2c_algo_bit i2c_core ptp pps_core ext4 jbd2 mbcache sd_mod crc_t10dif ahci dm_mirror dm_region_hash dm_log dm_mod megaraid_sas [last unloaded: scsi_wait_scan]

Pid: 6003, comm: mount Tainted: G        W  ---------------    2.6.32-431.17.1.el6.Bull.50.x86_64 #1 BULL bullx &lt;span class=&quot;code-keyword&quot;&gt;super&lt;/span&gt;-node
RIP: 0010:[&amp;lt;ffffffff811a58d9&amp;gt;]  [&amp;lt;ffffffff811a58d9&amp;gt;] iput+0x69/0x70
RSP: 0018:ffff880c78495c28  EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff88087a3eb528 RCX: 0000000000000034
RDX: 0000000000000001 RSI: ffff88087a348d80 RDI: ffff88087a3eb528
RBP: ffff880c78495c38 R08: 8038000000000000 R09: f767c3674f91d007
R10: ffff880c7a6cd000 R11: 7fffffffffffffff R12: ffff88087a663000
R13: ffff88087a3eb528 R14: 00000000ffffffea R15: 0000000000000000
FS:  00007f8051d2f7e0(0000) GS:ffff88048e400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00007f2e4552f00f CR3: 0000000878ca6000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
&lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; mount (pid: 6003, threadinfo ffff880c78494000, task ffff880c78608080)
Stack:
 ffff880879cabc00 ffff880879cabc00 ffff880c78495d58 ffffffffa056d947
&amp;lt;d&amp;gt; 00000004000000fd 78495d8800000004 ffff880877e32000 ffff88086ce2c000
&amp;lt;d&amp;gt; 0000000000000004 ffff88087a6630d0 00000000ffffffff ffff88087a6630f8
Call Trace:
 [&amp;lt;ffffffffa056d947&amp;gt;] ldiskfs_fill_super+0x1757/0x2ac0 [ldiskfs]
 [&amp;lt;ffffffff8128ceb4&amp;gt;] ? snprintf+0x34/0x40
 [&amp;lt;ffffffff8118c98e&amp;gt;] get_sb_bdev+0x18e/0x1d0
 [&amp;lt;ffffffffa056c1f0&amp;gt;] ? ldiskfs_fill_super+0x0/0x2ac0 [ldiskfs]
 [&amp;lt;ffffffffa0567018&amp;gt;] ldiskfs_get_sb+0x18/0x20 [ldiskfs]
 [&amp;lt;ffffffff8118be1b&amp;gt;] vfs_kern_mount+0x7b/0x1b0
 [&amp;lt;ffffffff8118bfc2&amp;gt;] do_kern_mount+0x52/0x130
 [&amp;lt;ffffffff811acf9b&amp;gt;] do_mount+0x2fb/0x930
 [&amp;lt;ffffffff81140d34&amp;gt;] ? strndup_user+0x64/0xc0
 [&amp;lt;ffffffff811ad660&amp;gt;] sys_mount+0x90/0xe0
 [&amp;lt;ffffffff8100b072&amp;gt;] system_call_fastpath+0x16/0x1b
Code: 38 48 c7 c0 20 6a 1a 81 48 85 d2 74 12 48 8b 42 20 48 c7 c2 20 6a 1a 81 48 85 c0 48 0f 44 c2 48 89 df ff d0 48 83 c4 08 5b c9 c3 &amp;lt;0f&amp;gt; 0b eb fe 0f 1f 00 55 48 89 e5 41 55 41 54 53 48 83 ec 08 0f
RIP  [&amp;lt;ffffffff811a58d9&amp;gt;] iput+0x69/0x70
 RSP &amp;lt;ffff880c78495c28&amp;gt;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;We then ran a fsck on the MDT and were able to start the MDS.&lt;/p&gt;

&lt;p&gt;You can find attached a bundle of logs containing:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;the log of the fscks&lt;/li&gt;
	&lt;li&gt;the syslog of the MDS&lt;/li&gt;
	&lt;li&gt;the console log of the MDS&lt;/li&gt;
	&lt;li&gt;some traces from the two crashes (bt, foreach bt, ps -l, ps, log)&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;My concern is the first crash as I believe the second is a consequence of the first one.&lt;/p&gt;</description>
                <environment>RHEL 6 w/ patched kernel for Lustre</environment>
        <key id="25687">LU-5392</key>
            <summary>kernel BUG at fs/jbd2/transaction.c:1030! on the MDS while starting OSTs</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="1" iconUrl="https://jira.whamcloud.com/images/icons/statuses/open.png" description="The issue is open and ready for the assignee to start work on it.">Open</status>
                    <statusCategory id="2" key="new" colorName="default"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="bobijam">Zhenyu Xu</assignee>
                                    <reporter username="bruno.travouillon">Bruno Travouillon</reporter>
                        <labels>
                    </labels>
                <created>Tue, 22 Jul 2014 06:45:13 +0000</created>
                <updated>Tue, 7 Jun 2016 15:38:31 +0000</updated>
                                            <version>Lustre 2.4.3</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>9</watches>
                                                                            <comments>
                            <comment id="89710" author="bobijam" created="Tue, 22 Jul 2014 07:21:28 +0000"  >&lt;p&gt;I think &lt;a href=&quot;http://review.whamcloud.com/#/c/11096/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/11096/&lt;/a&gt; can also handle this credit deficiency issue.&lt;/p&gt;</comment>
                            <comment id="89712" author="bfaccini" created="Tue, 22 Jul 2014 08:50:04 +0000"  >&lt;p&gt;Bobi, it is unclear to me, but just in case, should they not also integrate &lt;a href=&quot;http://review.whamcloud.com/#/c/10293/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#/c/10293/&lt;/a&gt; from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-4382&quot; title=&quot;kernel BUG at fs/jbd2/transaction.c:1033&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-4382&quot;&gt;&lt;del&gt;LU-4382&lt;/del&gt;&lt;/a&gt;?&lt;/p&gt;</comment>
                            <comment id="89713" author="bobijam" created="Tue, 22 Jul 2014 09:01:34 +0000"  >&lt;p&gt;#11096 can cure the exact issue you encountered, and #10293 will handle another credit deficiency issue, you can/should include both of them.&lt;/p&gt;</comment>
                            <comment id="92723" author="bruno.travouillon" created="Thu, 28 Aug 2014 16:36:45 +0000"  >&lt;p&gt;Thanks guys.&lt;/p&gt;

&lt;p&gt;Can I safely use #11096 on top of b2_4?&lt;/p&gt;</comment>
                            <comment id="92773" author="bobijam" created="Fri, 29 Aug 2014 01:49:26 +0000"  >&lt;p&gt;You can wait for its review and autotest cycle to finish before using it in your production system.&lt;/p&gt;</comment>
                            <comment id="97459" author="bruno.travouillon" created="Fri, 24 Oct 2014 20:01:21 +0000"  >&lt;p&gt;FYI, we are running patches in both 2.4.3 (#11096) and 2.5.3 (#11097) and haven&apos;t hit this bug since.&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="22452">LU-4382</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="24640">LU-5040</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="15401" name="crash_mds.tgz" size="577928" author="bruno.travouillon" created="Tue, 22 Jul 2014 06:45:13 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10490" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>End date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Fri, 24 Oct 2014 06:45:13 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10030" key="com.atlassian.jira.plugin.system.customfieldtypes:labels">
                        <customfieldname>Epic/Theme</customfieldname>
                        <customfieldvalues>
                                        <label>Quota</label>
    
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzwrzb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>15008</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                        <customfield id="customfield_10493" key="com.atlassian.jira.plugin.system.customfieldtypes:datepicker">
                        <customfieldname>Start date</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>Tue, 22 Jul 2014 06:45:13 +0000</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                    </customfields>
    </item>
</channel>
</rss>