<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:55:02 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary, append 'field=key&field=summary' to the URL of your request.
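For instance, the complete field-restricted request for this issue would look something
like this (illustrative URL, assuming the stock JIRA issue-XML view path):
https://jira.whamcloud.com/si/jira.issueviews:issue-xml/LU-12717/LU-12717.xml?field=key&field=summary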
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-12717] ASSERTION( !lod_obj_is_striped(child) ) failed</title>
                <link>https://jira.whamcloud.com/browse/LU-12717</link>
                <project id="10000" key="LU">Lustre</project>
<description>&lt;p&gt;LBUG today on oak-MDT0000; we have never seen this one before. We have had some big data transfers using dsync running on Sherlock (2.12.0 clients). Might be related, or not.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[4954375.921845] LustreError: 15102:0:(tgt_handler.c:628:process_req_last_xid()) @@@ Unexpected xid 5d6425ffe4140 vs. last_xid 5d6425ffe418f
  req@ffffa1597f41f200 x1642955450237248/t0(0) o101-&amp;gt;98bbe778-4f70-8a89-d80e-d6a8120c693b@10.8.2.23@o2ib6:663/0 lens 736/0 e 0 to 0 dl 1567111883 ref 1 fl Interpret:/2/ffffffff rc 0/-1
[4954542.487326] LustreError: 15290:0:(mdt_lib.c:961:mdt_attr_valid_xlate()) Unknown attr bits: 0x60000
[4954542.517377] LustreError: 15290:0:(mdt_lib.c:961:mdt_attr_valid_xlate()) Skipped 3754300 previous similar messages
[4954874.316190] LustreError: 15347:0:(lod_object.c:3919:lod_ah_init()) ASSERTION( !lod_obj_is_striped(child) ) failed: 
[4954874.351112] LustreError: 15347:0:(lod_object.c:3919:lod_ah_init()) LBUG
[4954874.373452] Pid: 15347, comm: mdt01_049 3.10.0-862.14.4.el7_lustre.x86_64 #1 SMP Mon Oct 8 11:21:37 PDT 2018
[4954874.406359] Call Trace:
[4954874.414973]  [&amp;lt;ffffffffc08af7cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
[4954874.437035]  [&amp;lt;ffffffffc08af87c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
[4954874.459664]  [&amp;lt;ffffffffc135a89f&amp;gt;] lod_ah_init+0x23f/0xde0 [lod]
[4954874.479751]  [&amp;lt;ffffffffc13d306b&amp;gt;] mdd_object_make_hint+0xcb/0x190 [mdd]
[4954874.502388]  [&amp;lt;ffffffffc13bed50&amp;gt;] mdd_create_data+0x330/0x730 [mdd]
[4954874.523606]  [&amp;lt;ffffffffc129140c&amp;gt;] mdt_mfd_open+0xc5c/0xe70 [mdt]
[4954874.544523]  [&amp;lt;ffffffffc1291b9b&amp;gt;] mdt_finish_open+0x57b/0x690 [mdt]
[4954874.565743]  [&amp;lt;ffffffffc1293478&amp;gt;] mdt_reint_open+0x17c8/0x3190 [mdt]
[4954874.587229]  [&amp;lt;ffffffffc1288cb3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
[4954874.607567]  [&amp;lt;ffffffffc126a19b&amp;gt;] mdt_reint_internal+0x5fb/0x9c0 [mdt]
[4954874.630197]  [&amp;lt;ffffffffc126a6c2&amp;gt;] mdt_intent_reint+0x162/0x430 [mdt]
[4954874.651677]  [&amp;lt;ffffffffc126d4cb&amp;gt;] mdt_intent_opc+0x1eb/0xaf0 [mdt]
[4954874.672619]  [&amp;lt;ffffffffc1275d68&amp;gt;] mdt_intent_policy+0x138/0x320 [mdt]
[4954874.694668]  [&amp;lt;ffffffffc0be82dd&amp;gt;] ldlm_lock_enqueue+0x38d/0x980 [ptlrpc]
[4954874.719320]  [&amp;lt;ffffffffc0c11c03&amp;gt;] ldlm_handle_enqueue0+0xa83/0x1670 [ptlrpc]
[4954874.743104]  [&amp;lt;ffffffffc0c977f2&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
[4954874.764026]  [&amp;lt;ffffffffc0c9b72a&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
[4954874.787245]  [&amp;lt;ffffffffc0c4404b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
[4954874.813872]  [&amp;lt;ffffffffc0c47792&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
[4954874.835628]  [&amp;lt;ffffffff8babdf21&amp;gt;] kthread+0xd1/0xe0
[4954874.852252]  [&amp;lt;ffffffff8c1255f7&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
[4954874.873448]  [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
[4954874.890366] Kernel panic - not syncing: LBUG
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;I do have a crash dump if you&apos;re interested. MDT failover was smooth, so not a big deal:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Aug 29 14:04:49 oak-md1-s1 kernel: Lustre: oak-MDT0000: Recovery over after 0:55, of 1464 clients 1464 recovered and 0 were evicted.
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</description>
                <environment>Clients: 2.12.0, CentOS 7.6</environment>
        <key id="56804">LU-12717</key>
            <summary>ASSERTION( !lod_obj_is_striped(child) ) failed</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="3" iconUrl="https://jira.whamcloud.com/images/icons/statuses/inprogress.png" description="This issue is being actively worked on at the moment by the assignee.">In Progress</status>
                    <statusCategory id="4" key="indeterminate" colorName="inprogress"/>
                                    <resolution id="-1">Unresolved</resolution>
                                        <assignee username="laisiyao">Lai Siyao</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                    </labels>
                <created>Thu, 29 Aug 2019 21:24:06 +0000</created>
                <updated>Thu, 30 Apr 2020 23:28:13 +0000</updated>
                                            <version>Lustre 2.10.8</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>7</watches>
                    <comments>
                            <comment id="253875" author="sthiell" created="Thu, 29 Aug 2019 21:26:03 +0000"  >&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[4954375.921845] LustreError: 15102:0:(tgt_handler.c:628:process_req_last_xid()) @@@ Unexpected xid 5d6425ffe4140 vs. last_xid 5d6425ffe418f
  req@ffffa1597f41f200 x1642955450237248/t0(0) o101-&amp;gt;98bbe778-4f70-8a89-d80e-d6a8120c693b@10.8.2.23@o2ib6:663/0 lens 736/0 e 0 to 0 dl 1567111883 ref 1 fl Interpret:/2/ffffffff rc 0/-1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;&lt;tt&gt;10.8.2.23@o2ib6&lt;/tt&gt; is confirmed to have run dsync at the time of the MDT LBUG.&lt;/p&gt;</comment>
                            <comment id="253878" author="adilger" created="Thu, 29 Aug 2019 22:09:33 +0000"  >&lt;blockquote&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[4954542.487326] LustreError: 15290:0:(mdt_lib.c:961:mdt_attr_valid_xlate()) Unknown attr bits: 0x60000
[4954542.517377] LustreError: 15290:0:(mdt_lib.c:961:mdt_attr_valid_xlate()) Skipped 3754300 previous similar messages
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/blockquote&gt;
&lt;p&gt;Not directly related to this problem, but the &lt;tt&gt;mdt_attr_valid_xlate()&lt;/tt&gt; error message is quieted by patch &lt;a href=&quot;https://review.whamcloud.com/34343&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/34343&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12021&quot; title=&quot;Error message of mdt_attr_valid_xlate() when 2.12 client to 2.10 server&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12021&quot;&gt;&lt;del&gt;LU-12021&lt;/del&gt;&lt;/a&gt; lsom: Add an OBD_CONNECT2_LSOM connect flag&lt;/tt&gt;&quot; (applied only on the MDS) so you don&apos;t get a constant console spew from 2.12.0 clients.&lt;/p&gt;

&lt;p&gt;&lt;b&gt;Note:&lt;/b&gt; if applying this patch to a 2.10.x MDS, please remove &lt;tt&gt;OBD_CONNECT2_LSOM&lt;/tt&gt; from &lt;tt&gt;MDT_CONNECT_SUPPORTED2&lt;/tt&gt; and from &lt;tt&gt;data-&amp;gt;ocd_connect_flags2&lt;/tt&gt; on the client so that the 2.10 nodes do not advertise support for that feature.&lt;/p&gt;</comment>
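<!--
A sketch of the adjustment described in the note above, i.e. keeping 2.10.x nodes from
advertising LSOM. The exact flag list in MDT_CONNECT_SUPPORTED2 and the place where
ocd_connect_flags2 is filled in vary by release, so treat this as an illustration
rather than the actual patch hunks:

    /* MDS side: remove the flag from the advertised feature mask */
    - #define MDT_CONNECT_SUPPORTED2 (/* existing flags */ | OBD_CONNECT2_LSOM)
    + #define MDT_CONNECT_SUPPORTED2 (/* existing flags */)

    /* client side: never set the flag in the connect data */
    data->ocd_connect_flags2 &= ~OBD_CONNECT2_LSOM;
-->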
                            <comment id="253953" author="pjones" created="Fri, 30 Aug 2019 17:27:16 +0000"  >&lt;p&gt;Stephane &lt;/p&gt;

&lt;p&gt;It would probably be a good idea to upload the crash dump to the WC ftp server just in case it is required&lt;/p&gt;

&lt;p&gt;Lai&lt;/p&gt;

&lt;p&gt;Could you please advise?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="254018" author="sthiell" created="Tue, 3 Sep 2019 04:29:54 +0000"  >&lt;p&gt;Thanks Peter.&lt;/p&gt;

&lt;p&gt;We lost the original crash dump because we were in the process of updating the kernel on Oak, but we had another occurrence of the same crash with the new kernel, so I just uploaded the latest vmcore as &lt;tt&gt;vmcore-oak-md1-s2-2019-08-31-18-02-59&lt;/tt&gt;. New kernel debuginfo rpms have also been uploaded to the WC ftp server as &lt;tt&gt;kernel-debuginfo-3.10.0-957.27.2.el7_lustre.pl1.x86_64.rpm&lt;/tt&gt; and &lt;tt&gt;kernel-debuginfo-common-x86_64-3.10.0-957.27.2.el7_lustre.pl1.x86_64.rpm&lt;/tt&gt;. For convenience, I&apos;m attaching the output of &lt;tt&gt;foreach bt&lt;/tt&gt; as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33447/33447_foreach_bt-crash-oak-md1-s2-2019-08-31-18-02-59.log&quot; title=&quot;foreach_bt-crash-oak-md1-s2-2019-08-31-18-02-59.log attached to LU-12717&quot;&gt;foreach_bt-crash-oak-md1-s2-2019-08-31-18-02-59.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; and the vmcore dmesg as &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/33448/33448_vmcore-dmesg-oak-md1-s2-2019-08-31-18-02-59.txt&quot; title=&quot;vmcore-dmesg-oak-md1-s2-2019-08-31-18-02-59.txt attached to LU-12717&quot;&gt;vmcore-dmesg-oak-md1-s2-2019-08-31-18-02-59.txt&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;Lai, thanks for looking into this. I wanted to add that it happened under heavy load, both in terms of read/write bandwidth on the OSTs and metadata load. Big parallel dsync transfers were running along with other user jobs doing unusually large sequential reads (plink2). My (2-cent) theory is that the OSTs were so loaded that this MDT couldn&apos;t keep up allocating new stripes on them, and we hit that bug. But let me know what you think and what else we can provide. Thanks much!&lt;/p&gt;</comment>
                            <comment id="254019" author="sthiell" created="Tue, 3 Sep 2019 04:35:53 +0000"  >&lt;p&gt;You can see the heavy load at the moment of the crash from the vmcore I uploaded:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;      KERNEL: /usr/lib/debug/lib/modules/3.10.0-957.27.2.el7_lustre.pl1.x86_64/vmlinux
    DUMPFILE: vmcore-oak-md1-s2-2019-08-31-18-02-59  [PARTIAL DUMP]
        CPUS: 24
        DATE: Sat Aug 31 18:02:35 2019
      UPTIME: 1 days, 11:25:03
LOAD AVERAGE: 425.53, 389.13, 349.79
       TASKS: 1533
    NODENAME: oak-md1-s2
     RELEASE: 3.10.0-957.27.2.el7_lustre.pl1.x86_64
     VERSION: #1 SMP Mon Aug 5 15:28:37 PDT 2019
     MACHINE: x86_64  (3399 Mhz)
      MEMORY: 127.8 GB
       PANIC: &quot;Kernel panic - not syncing: LBUG&quot;
         PID: 33146
     COMMAND: &quot;mdt00_037&quot;
        TASK: ffff8e69f5a8d140  [THREAD_INFO: ffff8e6957ae8000]
         CPU: 2
       STATE: TASK_RUNNING (PANIC)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="254036" author="laisiyao" created="Tue, 3 Sep 2019 08:35:22 +0000"  >&lt;p&gt;Stephane, I&apos;m not familiar with dsync, where can I get its source code? And how do you use it?&lt;/p&gt;</comment>
                            <comment id="254075" author="sthiell" created="Tue, 3 Sep 2019 16:13:21 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;A colleague has been playing with it for the last few days to perform large parallel transfers. dsync is part of mpiFileUtils:&lt;/p&gt;
&lt;ul class=&quot;alternate&quot; type=&quot;square&quot;&gt;
	&lt;li&gt;man: &lt;a href=&quot;https://github.com/hpc/mpifileutils/blob/master/doc/rst/dsync.1.rst&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/hpc/mpifileutils/blob/master/doc/rst/dsync.1.rst&lt;/a&gt;&lt;/li&gt;
	&lt;li&gt;source: &lt;a href=&quot;https://github.com/hpc/mpifileutils/blob/master/src/dsync/dsync.c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://github.com/hpc/mpifileutils/blob/master/src/dsync/dsync.c&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;I think what makes it different (vs. rsync/fpsync) is that it first removes all files from the destination, then creates directories and empty files, then copies the data, and finally sets permissions on all copied files. Below is an example run:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;   0: [2019-09-02T08:25:31] Walking source path                                 
   0: [2019-09-02T08:25:31] Walking /scratch/groups/pritch                      
   0: [2019-09-02T08:54:15] Items walked 863997                                 
   0: [2019-09-02T09:19:38] Items walked 7289486                                
   0: [2019-09-02T09:19:46] Items walked 9080593                                
   0: [2019-09-02T09:19:55] Items walked 9334035                                
   0: [2019-09-02T09:20:05] Items walked 9564552                                
   0: [2019-09-02T09:20:20] Items walked 9746523                                
   0: [2019-09-02T09:20:30] Items walked 9956014                                
   0: [2019-09-02T09:20:46] Items walked 10128637                               
   0: [2019-09-02T09:21:10] Items walked 10389513                               
   0: [2019-09-02T09:21:20] Items walked 10528663                               
   0: [2019-09-02T09:21:33] Items walked 10528663                               
   0: [2019-09-02T09:21:43] Items walked 10528663                               
   0: [2019-09-02T09:21:54] Items walked 10528663                               
   0: [2019-09-02T09:22:03] Walked 10528663 items in 3391.952981 seconds (3104.012072 files/sec)
   0: [2019-09-02T09:22:03] Walking destination path                            
   0: [2019-09-02T09:22:03] Walking /oak/stanford/groups/pritch/scratch.copy/groups/pritch
   0: [2019-09-02T09:39:47] Items walked 59941                                  
   0: [2019-09-02T09:40:00] Items walked 8726652                                
   0: [2019-09-02T09:40:08] Items walked 8811009
   0: [2019-09-02T09:40:20] Removing 10404883 items                             
   0: [2019-09-02T09:40:26] level=26 min=0 max=1 sum=3 rate=7.486081 secs=0.400744
   0: [2019-09-02T09:40:27] level=25 min=0 max=1 sum=185 rate=403.045488 secs=0.459005
   0: [2019-09-02T09:40:27] level=24 min=0 max=1 sum=783 rate=1699.719168 secs=0.460664
   0: [2019-09-02T09:40:28] level=23 min=3 max=4 sum=3176 rate=5708.211666 secs=0.556391
   0: [2019-09-02T09:40:29] level=22 min=7 max=8 sum=7293 rate=11232.519041 secs=0.649276
   0: [2019-09-02T09:40:29] level=21 min=8 max=9 sum=8477 rate=11066.037630 secs=0.766038
   0: [2019-09-02T09:40:31] level=20 min=23 max=24 sum=24537 rate=13170.356973 secs=1.863047
   0: [2019-09-02T09:40:33] level=19 min=44 max=45 sum=45833 rate=22867.105954 secs=2.004320
   0: [2019-09-02T09:40:36] level=18 min=76 max=77 sum=78818 rate=27849.612712 secs=2.830129
   0: [2019-09-02T09:40:41] level=17 min=138 max=139 sum=142200 rate=32225.704622 secs=4.412627
   0: [2019-09-02T09:41:04] level=16 min=625 max=626 sum=640878 rate=27090.786318 secs=23.656678
   0: [2019-09-02T09:42:44] level=15 min=611 max=612 sum=625755 rate=6303.651528 secs=99.268654
   0: [2019-09-02T09:47:07] level=14 min=1291 max=1292 sum=1322053 rate=5012.764106 secs=263.737326
   0: [2019-09-02T10:16:42] level=13 min=4246 max=4247 sum=4348287 rate=2450.833574 secs=1774.207374
   0: [2019-09-02T10:34:13] level=12 min=2252 max=2253 sum=2306862 rate=2193.813409 secs=1051.530632
   0: [2019-09-02T10:36:53] level=11 min=771 max=772 sum=790333 rate=4943.302018 secs=159.879570
   0: [2019-09-02T10:36:56] level=10 min=56 max=57 sum=58175 rate=25132.111486 secs=2.314768
   0: [2019-09-02T10:36:56] level=9 min=1 max=2 sum=1225 rate=1446.678564 secs=0.846767
   0: [2019-09-02T10:36:57] level=8 min=0 max=1 sum=10 rate=14.656902 secs=0.682272
   0: [2019-09-02T10:36:57] Removed 10404883 items in 3397.492401 seconds (3062.518402 items/sec)
   0: [2019-09-02T10:36:58] Copying items to destination                        
   0: [2019-09-02T10:36:58] Copying to /oak/stanford/groups/pritch/scratch.copy/groups/pritch
   0: [2019-09-02T10:36:58] Items: 10408855                                     
   0: [2019-09-02T10:36:58]   Directories: 104                                  
   0: [2019-09-02T10:36:58]   Files: 10408751                                   
   0: [2019-09-02T10:36:58]   Links: 0                                          
   0: [2019-09-02T10:36:58] Data: 40.035 TB (4.033 MB per file)                 
   0: [2019-09-02T10:37:10] Creating directories.                               
   0: [2019-09-02T10:37:10]   level=4 min=0 max=0 sum=0 rate=0.000000/sec secs=0.017780
   0: [2019-09-02T10:37:10]   level=5 min=0 max=0 sum=0 rate=0.000000/sec secs=0.012852
   0: [2019-09-02T10:37:10]   level=6 min=0 max=0 sum=0 rate=0.000000/sec secs=0.022612
&#8230;
   0: [2019-09-02T10:37:12] Created 104 directories in 2.017779 seconds (51.541817 items/sec)
   0: [2019-09-02T10:37:12] Creating files.    
   0: [2019-09-02T11:12:10] Created 10408751 items in 2097.987293 seconds (4961.303166 items/sec)
   0: [2019-09-02T11:12:10] Copying data.                                       
   0: [2019-09-02T14:12:18] Copy data: 40.035 TB (44018758689546 bytes)         
   0: [2019-09-02T14:12:18] Copy rate: 3.793 GB/s (44018758689546 bytes in 10807.864147 seconds)
   0: [2019-09-02T14:12:18] Syncing data to disk.                               
   0: [2019-09-02T14:12:32] Sync completed in 14.066516 seconds.                
   0: [2019-09-02T14:12:32] Setting ownership, permissions, and timestamps.     
   0: [2019-09-02T14:32:09] Updated 10408855 items in 1177.167491 seconds (8842.288869 items/sec)
   0: [2019-09-02T14:32:09] Syncing directory updates to disk.                  
   0: [2019-09-02T14:32:22] Sync completed in 13.326690 seconds.                
   0: [2019-09-02T14:32:23] Started: Sep-02-2019,10:36:58                       
   0: [2019-09-02T14:32:23] Completed: Sep-02-2019,14:32:22                     
   0: [2019-09-02T14:32:23] Seconds: 14124.547                                  
   0: [2019-09-02T14:32:23] Items: 10408855                                     
   0: [2019-09-02T14:32:23]   Directories: 104                                  
   0: [2019-09-02T14:32:23]   Files: 10408751                                   
   0: [2019-09-02T14:32:23]   Links: 0                                          
   0: [2019-09-02T14:32:23] Data: 40.035 TB (44018758689546 bytes)              
   0: [2019-09-02T14:32:23] Rate: 2.902 GB/s (44018758689546 bytes in 14124.547 seconds)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Oak doesn&apos;t use SSDs on the MDTs at the moment, and I fear that the JBD2 journal (4GB) was full at some point. I&apos;m investigating a file-level backup/restore-to-SSD option (we would also like to move from 512-byte inodes to 1024-byte inodes for PFL later).&lt;/p&gt;</comment>
                            <comment id="254076" author="sthiell" created="Tue, 3 Sep 2019 16:28:17 +0000"  >&lt;p&gt;Another hint:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@oak-md1-s2 127.0.0.1-2019-08-31-18:02:59]# grep -c __jbd2_log_wait_for_space foreach_bt-crash-oak-md1-s2-2019-08-31-18-02-59 
290
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;That doesn&apos;t sound good... Still, it shouldn&apos;t LBUG. Let me know what you think. &lt;/p&gt;</comment>
                            <comment id="254354" author="gerrit" created="Sun, 8 Sep 2019 08:48:34 +0000"  >&lt;p&gt;Lai Siyao (lai.siyao@whamcloud.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/36100&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36100&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-12717&quot; title=&quot;ASSERTION( !lod_obj_is_striped(child) ) failed&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-12717&quot;&gt;LU-12717&lt;/a&gt; mdd: free striping upon LOV setting error&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 47dd1cae1c02f63840cb9e301239917ca2138de9&lt;/p&gt;</comment>
                            <comment id="254355" author="laisiyao" created="Sun, 8 Sep 2019 09:00:28 +0000"  >&lt;p&gt;Hi Stephane, I uploaded a patch, you can apply it and see whether dsync works with it.&lt;/p&gt;</comment>
                            <comment id="254356" author="laisiyao" created="Sun, 8 Sep 2019 09:03:14 +0000"  >&lt;p&gt;It may be related with journal full: upon journal full, the LOV setting transaction may fail, but it doesn&apos;t free allocated striping in LOV declare_set, and next LOV setting will trigger lod_obj_is_striped() assertion.&lt;/p&gt;</comment>
                            <comment id="254448" author="sthiell" created="Tue, 10 Sep 2019 14:11:28 +0000"  >&lt;p&gt;Hi Lai,&lt;/p&gt;

&lt;p&gt;Thanks! As we&apos;re still running 2.10.8 on Oak, this patch will apparently require some backporting:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Making all in .
/tmp/rpmbuild-lustre-sthiell-SRrpJSP1/BUILD/lustre-2.10.8_4_g05af3ab/lustre/lod/lod_object.c: In function &apos;lod_invalidate&apos;:
/tmp/rpmbuild-lustre-sthiell-SRrpJSP1/BUILD/lustre-2.10.8_4_g05af3ab/lustre/lod/lod_object.c:4883:2: error: implicit declaration of function &apos;lod_striping_free&apos; [-Werror=implicit-function-declaration]
  lod_striping_free(env, lod_dt_obj(dt));
  ^
cc1: all warnings being treated as errors
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;In any case, we&apos;re in the process of migrating MDT0 to an SSD-backed RAID-10 volume (offline device-level copy + resize2fs). We&apos;ll do the same for MDT1 later (as soon as I get more SSDs). Meanwhile we have tested NRS TBF to limit the number of ops/s; this has been helpful, and the server didn&apos;t crash again even with dsync running, but of course performance is not optimal.&lt;/p&gt;</comment>
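<!--
On the backport compile error above: the striping-release helper was renamed between
releases, so the missing symbol is expected. If memory serves, the 2.10 tree carries
lod_object_free_striping() in an equivalent role, so a backport would map the call
roughly as follows (an assumption to verify against lod_internal.h in the b2_10
branch):

    /* master (patch 36100) */
    lod_striping_free(env, lod_dt_obj(dt));

    /* assumed 2.10.x equivalent */
    lod_object_free_striping(env, lod_dt_obj(dt));
-->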
                            <comment id="269048" author="sthiell" created="Thu, 30 Apr 2020 23:28:13 +0000"  >&lt;p&gt;Hi! This issue hit us again today, even though we&apos;re now using SSDs on all Oak&apos;s MDTs. I see that Lai&apos;s patch above (&lt;a href=&quot;https://review.whamcloud.com/36100&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/36100&lt;/a&gt;) was almost ready to land and even had Andreas&apos; approval. It would probably be too much effort to port it to 2.10.8 (that we&apos;re still running on Oak), but would it be possible that you look at the patch again so that it can land into master. That way, this rare issue would be avoided in the future. Thanks!&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Apr 30 15:28:38 oak-md1-s2 kernel: LustreError: 9033:0:(lod_object.c:3919:lod_ah_init()) ASSERTION( !lod_obj_is_striped(child) ) failed: 
Apr 30 15:28:38 oak-md1-s2 kernel: LustreError: 9033:0:(lod_object.c:3919:lod_ah_init()) LBUG
Apr 30 15:28:38 oak-md1-s2 kernel: Pid: 9033, comm: mdt01_088 3.10.0-957.27.2.el7_lustre.pl1.x86_64 #1 SMP Mon Aug 5 15:28:37 PDT 2019
Apr 30 15:28:38 oak-md1-s2 kernel: Call Trace:
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc0ddb7cc&amp;gt;] libcfs_call_trace+0x8c/0xc0 [libcfs]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc0ddb87c&amp;gt;] lbug_with_loc+0x4c/0xa0 [libcfs]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc174fb4f&amp;gt;] lod_ah_init+0x23f/0xde0 [lod]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc17cc09b&amp;gt;] mdd_object_make_hint+0xcb/0x190 [mdd]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc17b7d50&amp;gt;] mdd_create_data+0x330/0x730 [mdd]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc168a3fc&amp;gt;] mdt_mfd_open+0xc5c/0xe70 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc168ab8b&amp;gt;] mdt_finish_open+0x57b/0x690 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc168d09d&amp;gt;] mdt_reint_open+0x23fd/0x3190 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc1681ca3&amp;gt;] mdt_reint_rec+0x83/0x210 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc166318b&amp;gt;] mdt_reint_internal+0x5fb/0x9c0 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc16636b2&amp;gt;] mdt_intent_reint+0x162/0x430 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc16664bb&amp;gt;] mdt_intent_opc+0x1eb/0xaf0 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc166ed58&amp;gt;] mdt_intent_policy+0x138/0x320 [mdt]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc123d2dd&amp;gt;] ldlm_lock_enqueue+0x38d/0x980 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc1266c03&amp;gt;] ldlm_handle_enqueue0+0xa83/0x1670 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc12ec892&amp;gt;] tgt_enqueue+0x62/0x210 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc12f07ca&amp;gt;] tgt_request_handle+0x92a/0x1370 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc129905b&amp;gt;] ptlrpc_server_handle_request+0x23b/0xaa0 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffc129c7a2&amp;gt;] ptlrpc_main+0xa92/0x1e40 [ptlrpc]
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffff866c2e81&amp;gt;] kthread+0xd1/0xe0
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffff86d77c37&amp;gt;] ret_from_fork_nospec_end+0x0/0x39
Apr 30 15:28:38 oak-md1-s2 kernel: [&amp;lt;ffffffffffffffff&amp;gt;] 0xffffffffffffffff
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                <outwardlinks description="is related to ">
                                </outwardlinks>
                            </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="33447" name="foreach_bt-crash-oak-md1-s2-2019-08-31-18-02-59.log" size="1341948" author="sthiell" created="Tue, 3 Sep 2019 04:24:42 +0000"/>
                            <attachment id="33448" name="vmcore-dmesg-oak-md1-s2-2019-08-31-18-02-59.txt" size="708907" author="sthiell" created="Tue, 3 Sep 2019 04:26:22 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i00m3r:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>