<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 03:29:10 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-16689] upgrade to 2.15.2 lost several top level directories</title>
                <link>https://jira.whamcloud.com/browse/LU-16689</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;After upgrading the filesystem from 2.12 to 2.15.2, several top-level directories got corrupted.&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@nbp11-srv1 ~]# ls -l /nobackupp11/
ls: cannot access '/nobackupp11/ylin4': No such file or directory
ls: cannot access '/nobackupp11/mbarad': No such file or directory
ls: cannot access '/nobackupp11/ldgrant': No such file or directory
ls: cannot access '/nobackupp11/kknizhni': No such file or directory
ls: cannot access '/nobackupp11/mzhao4': No such file or directory
ls: cannot access '/nobackupp11/afahad': No such file or directory
ls: cannot access '/nobackupp11/jliu7': No such file or directory
ls: cannot access '/nobackupp11/jswest': No such file or directory
ls: cannot access '/nobackupp11/hsp': No such file or directory
ls: cannot access '/nobackupp11/vjespos1': No such file or directory
ls: cannot access '/nobackupp11/ssepka': No such file or directory
ls: cannot access '/nobackupp11/cjang1': No such file or directory
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;debugfs:  stat ylin4
Inode: 43051102   Type: directory    Mode:  0000   Flags: 0x80000
Generation: 503057142    Version: 0x00000000:00000000
User:     0   Group:     0   Project:     0   Size: 4096
File ACL: 0
Links: 2   Blockcount: 8
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x63dd8e2f:22c83c08 -- Fri Feb  3 14:43:59 2023
 atime: 0x63dd8e2f:22c83c08 -- Fri Feb  3 14:43:59 2023
 mtime: 0x63dd8e2f:22c83c08 -- Fri Feb  3 14:43:59 2023
crtime: 0x63dd8e2f:22c83c08 -- Fri Feb  3 14:43:59 2023
Size of extra inode fields: 32
Extended attributes:
  lma: fid=[0x280015902:0x2:0x0] compat=0 incompat=2
EXTENTS:
(0):671099035
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
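
&lt;p&gt;(The stat above came from debugfs on the MDT; a read-only invocation roughly like the following reproduces it. The device path is a placeholder, not taken from this ticket.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# open the ldiskfs MDT read-only and stat the directory inode
# /ROOT is the filesystem namespace root on an ldiskfs MDT
mds# debugfs -c -R 'stat /ROOT/ylin4' /dev/mdt_device
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;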


&lt;p&gt;I do not think I deleted these via ldiskfs. The data is still there; how can we recover the directory data?&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# lfs quota -u ylin4 /nobackupp11
Disk quotas for usr ylin4 (uid 11560):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
   /nobackupp11 11337707848* 1073741824 2147483648       -  208359  500000  600000       -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;


</description>
                <environment></environment>
        <key id="75366">LU-16689</key>
            <summary>upgrade to 2.15.2 lost several top level directories</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="3">Duplicate</resolution>
                                        <assignee username="adilger">Andreas Dilger</assignee>
                                    <reporter username="mhanafi">Mahmoud Hanafi</reporter>
                        <labels>
                    </labels>
                <created>Thu, 30 Mar 2023 18:58:30 +0000</created>
                <updated>Sat, 13 May 2023 14:40:40 +0000</updated>
                            <resolved>Sat, 13 May 2023 14:40:19 +0000</resolved>
                                                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="367922" author="mhanafi" created="Thu, 30 Mar 2023 19:34:47 +0000"  >&lt;p&gt;I started an lfsck dry-run.&lt;/p&gt;
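
&lt;p&gt;(For reference, a namespace dry-run like this can be started and checked roughly as follows; the commands are a sketch against this filesystem&apos;s MDT0 name, not a transcript from the system.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# start a namespace LFSCK in dry-run mode on MDT0 (sketch)
mds# lctl lfsck_start -M nbp11-MDT0000 -t namespace --dryrun on
# watch its progress
mds# lctl get_param mdd.nbp11-MDT0000.lfsck_namespace
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;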

&lt;p&gt;On MDT0 I am getting a lot of these errors, for files with hard links:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 30 12:32:28 nbp11-srv1 kernel: ret_from_fork+0x1f/0x40
Mar 30 12:32:28 nbp11-srv1 kernel: Lustre: nbp11-MDT0000-osd: namespace LFSCK add flags for [0x20004ca8c:0x8986:0x0] in the trace file, flags 1, old 0, new 1: rc = -22
Mar 30 12:32:28 nbp11-srv1 kernel: CPU: 34 PID: 1520983 Comm: lfsck Kdump: loaded Tainted: G           OE    --------- -  - 4.18.0-425.3.1.el8_lustre.x86_64 #1
Mar 30 12:32:28 nbp11-srv1 kernel: Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 04/21/2022
Mar 30 12:32:28 nbp11-srv1 kernel: Call Trace:
Mar 30 12:32:28 nbp11-srv1 kernel: dump_stack+0x41/0x60
Mar 30 12:32:28 nbp11-srv1 kernel: lfsck_trans_create.part.58+0x63/0x70 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: lfsck_namespace_trace_update+0x972/0x980 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: lfsck_namespace_exec_oit+0x87d/0x970 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: lfsck_master_oit_engine+0xc56/0x1360 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: lfsck_master_engine+0x512/0xcd0 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: ? __schedule+0x2d9/0x860
Mar 30 12:32:28 nbp11-srv1 kernel: ? finish_wait+0x80/0x80
Mar 30 12:32:28 nbp11-srv1 kernel: ? lfsck_master_oit_engine+0x1360/0x1360 [lfsck]
Mar 30 12:32:28 nbp11-srv1 kernel: kthread+0x10a/0x120
Mar 30 12:32:28 nbp11-srv1 kernel: ? set_kthread_struct+0x50/0x50
Mar 30 12:32:28 nbp11-srv1 kernel: ret_from_fork+0x1f/0x40
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;p&gt;On MDT2 I am getting these errors:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 30 12:33:43 nbp11-srv5 kernel: Lustre: nbp11-MDT0002-osd: layout LFSCK master found bad lmm_oi for [0x2400ecb78:0x1e8bb:0x0]: rc = 56
Mar 30 12:33:43 nbp11-srv5 kernel: Lustre: nbp11-MDT0002-osd: layout LFSCK master found bad lmm_oi for [0x2400ecb78:0x1e8bc:0x0]: rc = 56
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These are the files for the bad directories.&lt;/p&gt;</comment>
                            <comment id="367932" author="mhanafi" created="Thu, 30 Mar 2023 20:54:32 +0000"  >&lt;p&gt;I recovered the files.&lt;/p&gt;

&lt;p&gt;I found the parent FID, cd&apos;d into /fs/.lustre/fid/fidnum, then just moved all contents to a newly created directory.&lt;/p&gt;
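
&lt;p&gt;(A sketch of that sequence, reusing the ylin4 FID from the description; the recovery directory name is only an example.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# on a Lustre client, reach the orphaned directory by its FID
client# mkdir /nobackupp11/ylin4.recovered
client# cd /nobackupp11/.lustre/fid/[0x280015902:0x2:0x0]
# move the contents into the newly created directory
# (hidden files, if any, need to be moved separately)
client# mv * /nobackupp11/ylin4.recovered/
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;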

&lt;p&gt;I&apos;d still like to understand what caused the corruption.&lt;/p&gt;</comment>
                            <comment id="367936" author="dongyang" created="Thu, 30 Mar 2023 21:14:15 +0000"  >&lt;p&gt;Hi Mahmoud, 2 questions:&lt;br/&gt;
What does stat look like on nobackupp11 via debugfs?&lt;br/&gt;
How did you find ylin4 in debugfs?&lt;/p&gt;</comment>
                            <comment id="367944" author="adilger" created="Fri, 31 Mar 2023 00:07:00 +0000"  >&lt;p&gt;This looks like &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt;, which was caused by a bad code change breaking the on-disk file format for OI Scrub.  If Scrub has been run on a filesystem prior to upgrade then it will incorrectly read the fields from this file.  The patch &lt;a href=&quot;https://review.whamcloud.com/50455&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/50455&lt;/a&gt; &quot;&lt;tt&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt; scrub: upgrade scrub_file from 2.12 format&lt;/tt&gt;&quot; fixes this issue and &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt; describes the details (though it is too late to avoid this bug for your system).&lt;/p&gt;</comment>
                            <comment id="367945" author="adilger" created="Fri, 31 Mar 2023 00:08:26 +0000"  >&lt;p&gt;It may be that mounting the MDT with &quot;&lt;tt&gt;-o resetoi&lt;/tt&gt;&quot; would have rebuilt the OI files without having to move them from &lt;tt&gt;lost+found&lt;/tt&gt;, in case someone finds this ticket in the future.&lt;/p&gt;</comment>
                            <comment id="367950" author="mhanafi" created="Fri, 31 Mar 2023 01:22:18 +0000"  >&lt;p&gt;I used debugfs to dump all FIDs in /REMOTE_DIR on each MDT. Then I did a fid2path lookup to match the directories that were missing. I then cd&apos;d into /fs/.lustre/fid/fidnum and moved all contents to their new location.&lt;/p&gt;
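
&lt;p&gt;(Roughly the lookup side of that procedure; the MDT device path is a placeholder and the FID shown is the ylin4 one from the description.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# list the FID-named entries of /REMOTE_DIR on the ldiskfs MDT
mds# debugfs -c -R 'ls -l /REMOTE_DIR' /dev/mdt_device
# from a client, map a FID back to its pathname
client# lfs fid2path /nobackupp11 [0x280015902:0x2:0x0]
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;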

&lt;p&gt;The dry-run lfsck is still running and finding lots of these:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;Mar 30 12:53:39 nbp11-srv5 kernel: Lustre: nbp11-MDT0002-osd: layout LFSCK master found bad lmm_oi for [0x2400ecb98:0x8467:0x0]: rc = 56
Mar 30 12:53:39 nbp11-srv5 kernel: Lustre: nbp11-MDT0002-osd: layout LFSCK master found bad lmm_oi for [0x2400ecb98:0x8468:0x0]: rc = 56
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These are files under the directories that got corrupted.&lt;/p&gt;</comment>
                            <comment id="368051" author="adilger" created="Fri, 31 Mar 2023 18:15:43 +0000"  >&lt;p&gt;Mahmoud, do you have any logs from the mount after the upgrade that indicate OI Scrub has been run/completed on the MDTs?  It would be worthwhile to check the state of the OI files on the MDTs to confirm that they are correct:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;mds# lctl get_param osd-ldiskfs.*.oi_scrub
osd-ldiskfs.testfs-MDT0000.oi_scrub=
name: OI_scrub
magic: 0x4c5fd252
oi_files: 64
status: completed
flags:
param:
time_since_last_completed: 6 seconds
time_since_latest_start: 6 seconds
time_since_last_checkpoint: 6 seconds

:
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The important information here is that it reports &lt;tt&gt;oi_files: 64&lt;/tt&gt; and not some other number (which is what the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt; bug broke).  The OI files map the Lustre FIDs to local inode numbers, so without these most of the by-FID lookups will be broken.  The OI Scrub can rebuild the OI files from the FID xattr stored in each inode.&lt;/p&gt;
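
&lt;p&gt;(A quick way to pull out just those fields on each MDS; the grep pattern is only a sketch.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# report the device name, OI file count, and scrub status per MDT
mds# lctl get_param osd-ldiskfs.*.oi_scrub | grep -E 'oi_scrub=|oi_files:|status:'
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;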

&lt;p&gt;If this is showing &quot;&lt;tt&gt;oi_files: 1&lt;/tt&gt;&quot; or 2 or similar, my recommendation would be to mount the MDTs with &quot;&lt;tt&gt;-o resetoi&lt;/tt&gt;&quot; to force a rebuild of the OI files, or alternatively mount the MDTs with ldiskfs and move the &quot;&lt;tt&gt;oi.16.X&lt;/tt&gt;&quot; files out of the filesystem, then remount as Lustre and it should rebuild them automatically at mount (this will take a few minutes). Having a small number of OI files will cause scalability/performance issues.&lt;/p&gt;
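
&lt;p&gt;(Both options sketched out; the device and mount-point paths are placeholders.)&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;# option 1: force an OI rebuild at mount time
mds# mount -t lustre -o resetoi /dev/mdt_device /mnt/lustre-mdt

# option 2: move the OI files aside under ldiskfs, then remount as Lustre
mds# mount -t ldiskfs /dev/mdt_device /mnt/ldiskfs
mds# mv /mnt/ldiskfs/oi.16.* /root/oi-backup/
mds# umount /mnt/ldiskfs
mds# mount -t lustre /dev/mdt_device /mnt/lustre-mdt
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>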
                            <comment id="370144" author="pjones" created="Fri, 21 Apr 2023 15:34:48 +0000"  >&lt;p&gt;Mahmoud&lt;/p&gt;

&lt;p&gt;I&apos;m just checking in on this one. Presumably you have the &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt; fix in place on the NASA distribution now (and it is already merged for the upcoming 2.15.3 release), but has the restoration of the OI files for the impacted system been completed?&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="372211" author="pjones" created="Sat, 13 May 2023 14:40:19 +0000"  >&lt;p&gt;Closing this as a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-16655&quot; title=&quot;Files not accessible after 2.12 -&amp;gt; 2.14/2.15 upgrade&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-16655&quot;&gt;&lt;del&gt;LU-16655&lt;/del&gt;&lt;/a&gt;&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="75162">LU-16655</issuekey>
        </issuelink>
                            </outwardlinks>
                                                        </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                    <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|i03hpb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                    <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                </customfields>
    </item>
</channel>
</rss>