<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:31:31 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-10040] nodemap and quota issues (ineffective GID mapping)</title>
                <link>https://jira.whamcloud.com/browse/LU-10040</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;We&apos;re using the nodemap feature with map_mode=gid_only in production and we are seeing more and more issues with GID mapping, which seems to default to squash_gid instead of being properly mapped. The nodemap hasn&apos;t changed for these groups, we just add new groups from time to time.&lt;/p&gt;

&lt;p&gt;Example, configuration for mapping &apos;sherlock&apos; on MGS:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-md1-s1 sherlock]# pwd
/proc/fs/lustre/nodemap/sherlock

[root@oak-md1-s1 sherlock]# cat ranges 
[
 { id: 6, start_nid: 0.0.0.0@o2ib4, end_nid: 255.255.255.255@o2ib4 },
 { id: 5, start_nid: 0.0.0.0@o2ib3, end_nid: 255.255.255.255@o2ib3 }
]

[root@oak-md1-s1 sherlock]# cat idmap 
[
 { idtype: gid, client_id: 3525, fs_id: 3741 } { idtype: gid, client_id: 6401, fs_id: 3752 } { idtype: gid, client_id: 99001, fs_id: 3159 } { idtype: gid, client_id: 10525, fs_id: 3351 } { idtype: gid, client_id: 11886, fs_id: 3593 } { idtype: gid, client_id: 12193, fs_id: 3636 } { idtype: gid, client_id: 13103, fs_id: 3208 } { idtype: gid, client_id: 17079, fs_id: 3700 } { idtype: gid, client_id: 19437, fs_id: 3618 } { idtype: gid, client_id: 22959, fs_id: 3745 } { idtype: gid, client_id: 24369, fs_id: 3526 } { idtype: gid, client_id: 26426, fs_id: 3352 } { idtype: gid, client_id: 29361, fs_id: 3746 } { idtype: gid, client_id: 29433, fs_id: 3479 } { idtype: gid, client_id: 30289, fs_id: 3262 } { idtype: gid, client_id: 32264, fs_id: 3199 } { idtype: gid, client_id: 32774, fs_id: 3623 } { idtype: gid, client_id: 38517, fs_id: 3702 } { idtype: gid, client_id: 40387, fs_id: 3708 } { idtype: gid, client_id: 47235, fs_id: 3674 } { idtype: gid, client_id: 48931, fs_id: 3325 } { idtype: gid, client_id: 50590, fs_id: 3360 } { idtype: gid, client_id: 52892, fs_id: 3377 } { idtype: gid, client_id: 56316, fs_id: 3353 } { idtype: gid, client_id: 56628, fs_id: 3411 } { idtype: gid, client_id: 59943, fs_id: 3372 } { idtype: gid, client_id: 63938, fs_id: 3756 } { idtype: gid, client_id: 100533, fs_id: 3281 } { idtype: gid, client_id: 244300, fs_id: 3617 } { idtype: gid, client_id: 254778, fs_id: 3362 } { idtype: gid, client_id: 267829, fs_id: 3748 } { idtype: gid, client_id: 270331, fs_id: 3690 } { idtype: gid, client_id: 305454, fs_id: 3371 } { idtype: gid, client_id: 308753, fs_id: 3367 }

[root@oak-md1-s1 sherlock]# cat squash_gid 
99
[root@oak-md1-s1 sherlock]# cat map_mode 
gid_only

[root@oak-md1-s1 sherlock]# cat admin_nodemap 
0
[root@oak-md1-s1 sherlock]# cat deny_unknown 
1
[root@oak-md1-s1 sherlock]# cat trusted_nodemap 
0


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Issue with group: GID 3593 (mapped to GID 11886 on sherlock)&lt;/p&gt;

&lt;p&gt;lfs quota, not mapped (using canonical GID 3593):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# lfs quota -g oak_euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; group oak_euan (gid 3593):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 33255114444  50000000000 50000000000       -  526016  7500000 7500000       -

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Broken lfs quota mapped on sherlock (o2ib4):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@sh-113-01 ~]# lfs quota -g euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp euan (gid 11886):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 2875412844*      1       1       -      26*      1       1       -
[root@sh-113-01 ~]# lctl list_nids
10.9.113.1@o2ib4

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;It matches the quota usage for squash_gid:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# lfs quota -g 99 /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; group 99 (gid 99):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 2875412844*      1       1       -      26*      1       1       -

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;Please note that GID mapping works OK for most of the groups though:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;3199 -&amp;gt; 32264(sherlock)

canonical:
[root@oak-rbh01 ~]# lfs quota -g oak_ruthm /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; group oak_ruthm (gid 3199):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 10460005688  20000000000 20000000000       - 1683058  3000000 3000000       -

mapped (sherlock):
[root@sh-113-01 ~]# lfs quota -g ruthm /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp ruthm (gid 32264):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 10460005688  20000000000 20000000000       - 1683058  3000000 3000000       -


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Failing over the MDT resolved a few groups, but not all. Failing the MDT back showed an issue on the exact same original groups having issues (currently 4-5).&lt;/p&gt;

&lt;p&gt;While I haven&apos;t seen it by myself yet, the issue &lt;em&gt;seems&lt;/em&gt; to affect users as a few of them reported erroneous EDQUOT errors. This is why it is quite urgent to figure out what&apos;s wrong. Please note that the issue was already there before using the patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9929&quot; title=&quot;Use &amp;quot;setfacl&amp;quot; to set  &amp;quot;default&amp;quot; setting fail when nodemap enabled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9929&quot;&gt;&lt;del&gt;LU-9929&lt;/del&gt;&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;I&apos;m willing to attach some debug logs, but what debug flags should I enable to troubleshoot such a quota+nodemap issue on client and server?&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
 Stephane&lt;/p&gt;</description>
                <environment>client: lustre-client-2.10.0-1.el7.x86_64, lustre-2.10.1_RC1_srcc01-1.el7.centos.x86_64 (2.10.1-RC1 + patch from &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9929&quot; title=&quot;Use &amp;quot;setfacl&amp;quot; to set  &amp;quot;default&amp;quot; setting fail when nodemap enabled&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9929&quot;&gt;&lt;strike&gt;LU-9929&lt;/strike&gt;&lt;/a&gt;)</environment>
        <key id="48502">LU-10040</key>
            <summary>nodemap and quota issues (ineffective GID mapping)</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="emoly.liu">Emoly Liu</assignee>
                                    <reporter username="sthiell">Stephane Thiell</reporter>
                        <labels>
                            <label>patch</label>
                    </labels>
                <created>Thu, 28 Sep 2017 04:07:17 +0000</created>
                <updated>Fri, 15 Mar 2019 08:33:40 +0000</updated>
                            <resolved>Wed, 22 Nov 2017 14:44:38 +0000</resolved>
                                    <version>Lustre 2.10.0</version>
                    <version>Lustre 2.10.1</version>
                                    <fixVersion>Lustre 2.11.0</fixVersion>
                    <fixVersion>Lustre 2.10.2</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>4</watches>
                                                                            <comments>
                            <comment id="209816" author="emoly.liu" created="Thu, 28 Sep 2017 08:17:25 +0000"  >&lt;p&gt;Stephane,&lt;br/&gt;
There is no debugging mask for nodemap, so I think enabling quota should be OK. Is this issue related to &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9671&quot; title=&quot;Group quota not enforced on clients with mapped GID&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9671&quot;&gt;&lt;del&gt;LU-9671&lt;/del&gt;&lt;/a&gt;?&lt;br/&gt;
Thanks,&lt;br/&gt;
Emoly&lt;br/&gt;
&#160;&lt;/p&gt;</comment>
                            <comment id="209872" author="sthiell" created="Thu, 28 Sep 2017 18:38:39 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;Yes it might be related to &lt;del&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-9671&quot; title=&quot;Group quota not enforced on clients with mapped GID&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-9671&quot;&gt;&lt;del&gt;LU-9671&lt;/del&gt;&lt;/a&gt;&lt;/del&gt;. I just attached some logs from the MDS and one client. I did the following on the client, one that is incorrectly mapped, followed by one correctly mapped:&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@sh-113-01 ~]# lfs quota -hg euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp euan (gid 11886):
     Filesystem    used   quota   limit   grace   files   quota   limit   grace
           /oak  2.678T*     1k      1k       -      26*      1       1       -
[root@sh-113-01 ~]# lfs quota -hg ruthm /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp ruthm (gid 32264):
     Filesystem    used   quota   limit   grace   files   quota   limit   grace
           /oak  9.886T  18.63T  18.63T       - 1683246  3000000 3000000       -


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I think the first lfs quota ends up like this on the MDT:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;1691 00040000:04000000:0.0:1506621079.799639:0:200838:0:(qmt_handler.c:65:qmt_get()) $$$ fetch settings qmt:oak-QMT0000 pool:0-md id:99 enforced:1 hard:1 soft:1 granted:27 time:1506754418 qunit:1024 edquot:1 may_rel:0 revoke:135062
1692 00040000:04000000:0.0:1506621079.799642:0:200838:0:(qmt_handler.c:65:qmt_get()) $$$ fetch settings qmt:oak-QMT0000 pool:0-dt id:99 enforced:1 hard:1 soft:1 granted:2875370220 time:1506717740 qunit:1024 edquot:1 may_rel:0 revoke:135032


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Notice the id:99, which is the squash_gid&lt;/p&gt;

&lt;p&gt;The good one seems to be:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;3417 00040000:04000000:7.0:1506621082.680316:0:200864:0:(qmt_handler.c:65:qmt_get()) $$$ fetch settings qmt:oak-QMT0000 pool:0-md id:3199 enforced:1 hard:3000000 soft:3000000 granted:1769473 time:0 qunit:262144 edquot:0 may_rel:0 revoke:0
3418 00040000:04000000:7.0:1506621082.680324:0:200864:0:(qmt_handler.c:65:qmt_get()) $$$ fetch settings qmt:oak-QMT0000 pool:0-dt id:3199 enforced:1 hard:20000000000 soft:20000000000 granted:12762252828 time:0 qunit:67108864 edquot:0 may_rel:0 revoke:0
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Any idea of what could be wrong there?&lt;br/&gt;
Thanks!!&lt;br/&gt;
Stephane&lt;/p&gt;</comment>
                            <comment id="209892" author="emoly.liu" created="Fri, 29 Sep 2017 03:17:03 +0000"  >&lt;p&gt;Stephane,&lt;br/&gt;
 Can you provide the gid of user &quot;euan&quot; and &quot;ruthm&quot; in your example above? because I don&apos;t know if they are 11886 and 32264 respectively.&lt;br/&gt;
 And BTW, I just noticed in your following example, the users are different:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;lfs quota, not mapped (using canonical GID 3593):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# lfs quota -g oak_euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; group oak_euan (gid 3593):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 33255114444  50000000000 50000000000       -  526016  7500000 7500000       -

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Broken lfs quota mapped on sherlock (o2ib4):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@sh-113-01 ~]# lfs quota -g euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp euan (gid 11886):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 2875412844*      1       1       -      26*      1       1       -
[root@sh-113-01 ~]# lctl list_nids
10.9.113.1@o2ib4

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;&lt;/blockquote&gt;
&lt;p&gt;Can you show me&#160;the gid of&#160;&quot;oak_euan&quot; on oak_rbh01 and &quot;euan&quot; on sh-110-01? I will analyze your logs and see if I can reproduce this issue.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
 Emoly&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="209896" author="emoly.liu" created="Fri, 29 Sep 2017 03:46:39 +0000"  >&lt;p&gt;And, can you run &quot;getfacl /oak&quot; and post the output here? Thanks.&lt;/p&gt;</comment>
                            <comment id="209897" author="sthiell" created="Fri, 29 Sep 2017 03:54:26 +0000"  >&lt;p&gt;oak_euan GID 3593 maps to euan GID 11886:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# getent group oak_euan
oak_euan:*:3593:(...)

[root@sh-113-01 ~]# getent group euan
euan:*:11886:(...)



&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Similarly, oak_ruthm GID 3199 maps to ruthm GID 32264&lt;/p&gt;

&lt;p&gt;getfacl /oak results below:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# getfacl /oak
getfacl: Removing leading &lt;span class=&quot;code-quote&quot;&gt;&apos;/&apos;&lt;/span&gt; from absolute path names
# file: oak
# owner: root
# group: root
user::rwx
group::r-x
other::r-x

[root@sh-113-01 ~]# getfacl /oak
getfacl: Removing leading &lt;span class=&quot;code-quote&quot;&gt;&apos;/&apos;&lt;/span&gt; from absolute path names
# file: oak
# owner: root
# group: nobody
user::rwx
group::r-x
other::r-x


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Thanks!&lt;/p&gt;</comment>
                            <comment id="209900" author="emoly.liu" created="Fri, 29 Sep 2017 04:50:29 +0000"  >&lt;p&gt;Your nodemap policy { idtype: gid, client_id: 11886, fs_id: 3593 } means a user GID=11886 from a client with any IP address will be mapped to a canonical GID=3593 on server. But oak-euan(3593) is not defined as a client_id in any idmap, so there is no mapping for it. That&apos;s why you see the following output. This is right.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@oak-rbh01 ~&amp;#93;&lt;/span&gt;# lfs quota -g oak_euan /oak&lt;br/&gt;
 Disk quotas for group oak_euan (gid 3593):&lt;/p&gt;&lt;/blockquote&gt;</comment>
                            <comment id="209901" author="sthiell" created="Fri, 29 Sep 2017 04:58:37 +0000"  >&lt;p&gt;No, my nodemap policy is for o2ib3 and o2ib4 (sherlock). oak-rbh01 is o2ib5 and is in the canonical GID range, sorry for the confusion, I used it to get the canonical GID, that&apos;s all. sh-113-01 is the client with mapped GID in my example.&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-md1-s2 ~]# cat /proc/fs/lustre/nodemap/sherlock/ranges 
[
 { id: 6, start_nid: 0.0.0.0@o2ib4, end_nid: 255.255.255.255@o2ib4 },
 { id: 5, start_nid: 0.0.0.0@o2ib3, end_nid: 255.255.255.255@o2ib3 }
]


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;The &apos;sherlock&apos; policy has&lt;/p&gt;

&lt;p&gt;{ idtype: gid, client_id: 11886, fs_id: 3593 }&lt;/p&gt;

&lt;p&gt;with ranges=o2ib3 and o2ib4, so GID=11886 on sherlock (eg. sh-113-01, o2ib4) should be mapped to the canonical GID 3593, but it is not...&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;

&lt;p&gt;&#160;&lt;/p&gt;</comment>
                            <comment id="209904" author="sthiell" created="Fri, 29 Sep 2017 05:09:10 +0000"  >&lt;p&gt;Actually, I wasn&apos;t able to reproduce the &apos;disk quota exceeded errors&apos; that a few users from the &apos;euan&apos; group are getting and reported. That means the gid mapping seems to be done correctly when using the filesystem  from &apos;sherlock&apos;. But what is clearly broken right now is &quot;lfs quota -g euan /oak&quot; from sherlock (with the sherlock policy nodemap). But it works for almost all other groups, weird. Thanks for your help.&lt;/p&gt;</comment>
                            <comment id="209905" author="emoly.liu" created="Fri, 29 Sep 2017 05:17:41 +0000"  >&lt;p&gt;When you run &quot;&lt;span class=&quot;error&quot;&gt;&amp;#91;root@sh-113-01 ~&amp;#93;&lt;/span&gt;# lfs quota -g euan /oak&quot;, as you said, since sh-113-01 is&#160;o2ib4 and euan GID=11886, it will be mapped to canonical GID=3593 on server. But,&#160;this is&#160;only&#160;half, because there is an unmapping process when server returns the GID to the client, 3593 will be unmapped to 11886.&lt;/p&gt;</comment>
                            <comment id="209906" author="sthiell" created="Fri, 29 Sep 2017 05:19:01 +0000"  >&lt;p&gt;To illustrate what I am trying to say:&lt;/p&gt;

&lt;p&gt;Using policy&lt;/p&gt;

&lt;p&gt;{ idtype: gid, client_id: 11886, fs_id: 3593 }&lt;/p&gt;

&lt;p&gt;On a client with mapped GID 11886:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[euan@sherlock-ln01 login_node ~]$ id
uid=11886(euan) gid=11886(euan) groups=11886(euan),&#8230;
[euan@sherlock-ln01 login_node ~]$ cd $OAK
[euan@sherlock-ln01 login_node /oak/stanford/groups/euan]$ touch test
[euan@sherlock-ln01 login_node /oak/stanford/groups/euan]$ ls -l test
-rw-rw----+ 1 euan euan 0 Sep 28 22:14 test
[euan@sherlock-ln01 login_node /oak/stanford/groups/euan]$ ls -ln test
-rw-rw----+ 1 11886 11886 0 Sep 28 22:14 test
[euan@sherlock-ln01 login_node /oak/stanford/groups/euan]$ lfs quota -g euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp euan (gid 11886):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 2875169252*      1       1       -      25*      1       1       -


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;On a client with canonical GID 3593:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-rbh01 ~]# ls -l /oak/stanford/groups/euan/test
-rw-rw----+ 1 euan oak_euan 0 Sep 28 22:17 /oak/stanford/groups/euan/test
[root@oak-rbh01 ~]# ls -ln /oak/stanford/groups/euan/test
-rw-rw----+ 1 11886 3593 0 Sep 28 22:17 /oak/stanford/groups/euan/test

[root@oak-rbh01 ~]# lfs quota -g oak_euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; group oak_euan (gid 3593):
     Filesystem  kbytes   quota   limit   grace   files   quota   limit   grace
           /oak 33246312992  50000000000 50000000000       -  525598  7500000 7500000       -
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="209907" author="emoly.liu" created="Fri, 29 Sep 2017 05:23:46 +0000"  >&lt;p&gt;Stephane, never mind. If you can hit this issue next time, can you collect the logs for the whole lfs quota command? Because this time&#160;I didn&apos;t see any nodemap related information in your mds log file. I need to check if there is something wrong with nodemap mapping or unmapping. Thanks.&lt;/p&gt;</comment>
                            <comment id="209908" author="sthiell" created="Fri, 29 Sep 2017 05:26:27 +0000"  >&lt;p&gt;Ok, will do that first thing tomorrow morning, thanks! If you have the debug flags and subs I should use, please let me know. Otherwise, I will enable full debug, but the MDS is in production so that might be a bit verbose. Thanks!&lt;/p&gt;</comment>
                            <comment id="209914" author="emoly.liu" created="Fri, 29 Sep 2017 07:17:42 +0000"  >&lt;p&gt;I see your question now, you mean both of these two users should return same quota information. I have checked our code that we have nodemap handling in quota. I will keep investigation.&lt;/p&gt;</comment>
                            <comment id="209948" author="emoly.liu" created="Fri, 29 Sep 2017 14:33:20 +0000"  >&lt;p&gt;Stephane, &lt;/p&gt;

&lt;p&gt;Could you please run the following&#160;command on all the MDS server(s)? If the output is too big to paste here, you can save it as a file and upload here.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;lctl get_param qmt.*.*.glb-grp
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;BTW, as you said, you failed over the MDT, could you please check if this MDT has the same lustre version to others?&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
 Emoly&lt;/p&gt;</comment>
                            <comment id="209971" author="sthiell" created="Fri, 29 Sep 2017 16:48:41 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;Attached oak-md1-s2.glb-grp.txt (output of glb-grp of the primary MDS), then I failed over the MDT to oak-md1-s1 (MDS failover partner) and did the same command,&#160; output in attached&#160;file oak-md1-s1.glb-grp.txt.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
 Stephane&lt;/p&gt;</comment>
                            <comment id="210030" author="emoly.liu" created="Sat, 30 Sep 2017 08:55:52 +0000"  >&lt;p&gt;Since GID mapping works OK for most of the groups and this issue can&apos;t be reproduced now, I suspect it is related to the process of assigning or getting the nodemap to/from the given export. I will have a check.&lt;/p&gt;</comment>
                            <comment id="210054" author="sthiell" created="Mon, 2 Oct 2017 05:44:28 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;The issue with &apos;lfs quota&apos; is easily reproducible (for me). The one with the real quota error isn&apos;t, but a few users provided job logs with disk quota exceeded errors. So far, users that have reported this error are from the groups corresponding to the broken groups in &quot;lfs quota&quot;, so I think there is a link.&lt;/p&gt;

&lt;p&gt;I took new, cleaner logs from MDT and one mapped client (sh-101-59)&lt;/p&gt;

&lt;p&gt;&lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/28382/28382_oak-md1-s2.mdt.dk.full.log&quot; title=&quot;oak-md1-s2.mdt.dk.full.log attached to LU-10040&quot;&gt;oak-md1-s2.mdt.dk.full.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt; &lt;span class=&quot;nobr&quot;&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/attachment/28383/28383_sh-101-59.client.dk.full.log&quot; title=&quot;sh-101-59.client.dk.full.log attached to LU-10040&quot;&gt;sh-101-59.client.dk.full.log&lt;sup&gt;&lt;img class=&quot;rendericon&quot; src=&quot;https://jira.whamcloud.com/images/icons/link_attachment_7.gif&quot; height=&quot;7&quot; width=&quot;7&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt;&lt;/sup&gt;&lt;/a&gt;&lt;/span&gt;&lt;/p&gt;

&lt;p&gt;with two lfs quota commands:&lt;/p&gt;
&lt;ul&gt;
	&lt;li&gt;one lfs quota that doesn&apos;t work (euan),&lt;/li&gt;
	&lt;li&gt;and the second one that does work (ruthm).&lt;/li&gt;
&lt;/ul&gt;


&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@sh-101-59 ~]# lfs quota -gh euan /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp euan (gid 11886):
     Filesystem    used   quota   limit   grace   files   quota   limit   grace
           /oak  2.687T*     1k      1k       -      40*      1       1       -
[root@sh-101-59 ~]# lfs quota -gh ruthm /oak
Disk quotas &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; grp ruthm (gid 32264):
     Filesystem    used   quota   limit   grace   files   quota   limit   grace
           /oak  14.86T  18.63T  18.63T       - 2958342  3000000 3000000       -
[root@sh-101-59 ~]# lctl dk /tmp/dk.full.log


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;As a reminder, gid 11886 is mapped to 3593 on MDT, and gid 32264 to 3199.&lt;/p&gt;

&lt;p&gt;For the working GID 32264-&amp;gt;3199, I can find two log entries in the MDT log:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;115253 00000001:00000001:1.0:1506921877.591507:0:124387:0:(nodemap_handler.c:672:nodemap_map_id()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=3199 : 3199 : c7f)

116197 00000001:00000001:1.0:1506921877.595021:0:124387:0:(nodemap_handler.c:672:nodemap_map_id()) &lt;span class=&quot;code-object&quot;&gt;Process&lt;/span&gt; leaving (rc=3199 : 3199 : c7f)


&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Unfortunately, for the first one, if I look for &quot;rc=3593&quot;, I cannot find anything :/&lt;/p&gt;

&lt;p&gt;Please note that the client was doing nothing when the logs were taken, but the MDT is in production.&lt;/p&gt;

&lt;p&gt;Please let me know if I can further help debug this issue.&lt;/p&gt;

&lt;p&gt;Thanks!&lt;br/&gt;
 Stephane&lt;/p&gt;</comment>
                            <comment id="210347" author="sthiell" created="Thu, 5 Oct 2017 00:40:42 +0000"  >&lt;p&gt;I tried to reproduce the issue on two VMs today, one client and one server (llmount.sh) after importing the exact same &apos;sherlock&apos; idmap config, but I wasn&apos;t able to reproduce the issue, all groups are properly mapped including &apos;euan&apos; and the two other broken groups. This is super weird.&lt;br/&gt;
Also, users from the group &apos;euan&apos; are still reporting sporadic Disk quota exceeded error, apparently on disk quota not inode. We have now 2.10.1 GA on clients and 2.10.1 RC1 on servers (I&apos;ll upgrade soon to 2.10.1 GA too).&lt;br/&gt;
It is like this particular (euan) group mapping is ignored for lfs quota and sometimes for I/Os too.&lt;br/&gt;
The nodemap code doesn&apos;t have many debugging capabilities, this is very unfortunate. Could you help me add pertinent debugging code lines so I can compile a special version to retrieve more info about that?&lt;br/&gt;
Thanks!&lt;br/&gt;
Stephane&lt;/p&gt;</comment>
                            <comment id="210575" author="sthiell" created="Sat, 7 Oct 2017 03:33:58 +0000"  >&lt;p&gt;I just found this useful server-side &apos;lctl nodemap_test_id&apos; command. At least, I know I&apos;m not crazy now &lt;img class=&quot;emoticon&quot; src=&quot;https://jira.whamcloud.com/images/icons/emoticons/smile.png&quot; height=&quot;16&quot; width=&quot;16&quot; align=&quot;absmiddle&quot; alt=&quot;&quot; border=&quot;0&quot;/&gt; .&lt;/p&gt;

&lt;p&gt;From the MGS (2.10.1 RC1):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-md1-s1 ~]# cat /proc/fs/lustre/nodemap/sherlock/ranges 
[
 { id: 6, start_nid: 0.0.0.0@o2ib4, end_nid: 255.255.255.255@o2ib4 },
 { id: 5, start_nid: 0.0.0.0@o2ib3, end_nid: 255.255.255.255@o2ib3 }
]
[root@oak-md1-s1 ~]# cat /proc/fs/lustre/nodemap/sherlock/idmap 
[
 { idtype: gid, client_id: 3525, fs_id: 3741 } { idtype: gid, client_id: 6401, fs_id: 3752 } { idtype: gid, client_id: 99001, fs_id: 3159 } { idtype: gid, client_id: 10525, fs_id: 3351 } { idtype: gid, client_id: 11886, fs_id: 3593 } { idtype: gid, client_id: 12193, fs_id: 3636 } { idtype: gid, client_id: 13103, fs_id: 3208 } { idtype: gid, client_id: 17079, fs_id: 3700 } { idtype: gid, client_id: 19437, fs_id: 3618 } { idtype: gid, client_id: 22959, fs_id: 3745 } { idtype: gid, client_id: 24369, fs_id: 3526 } { idtype: gid, client_id: 26426, fs_id: 3352 } { idtype: gid, client_id: 29361, fs_id: 3746 } { idtype: gid, client_id: 29433, fs_id: 3479 } { idtype: gid, client_id: 30289, fs_id: 3262 } { idtype: gid, client_id: 32264, fs_id: 3199 } { idtype: gid, client_id: 32774, fs_id: 3623 } { idtype: gid, client_id: 38517, fs_id: 3702 } { idtype: gid, client_id: 40387, fs_id: 3708 } { idtype: gid, client_id: 40837, fs_id: 3768 } { idtype: gid, client_id: 47235, fs_id: 3674 } { idtype: gid, client_id: 48931, fs_id: 3325 } { idtype: gid, client_id: 50590, fs_id: 3360 } { idtype: gid, client_id: 52892, fs_id: 3377 } { idtype: gid, client_id: 56316, fs_id: 3353 } { idtype: gid, client_id: 56628, fs_id: 3411 } { idtype: gid, client_id: 59943, fs_id: 3372 } { idtype: gid, client_id: 63938, fs_id: 3756 } { idtype: gid, client_id: 100533, fs_id: 3281 } { idtype: gid, client_id: 244300, fs_id: 3617 } { idtype: gid, client_id: 254778, fs_id: 3362 } { idtype: gid, client_id: 267829, fs_id: 3748 } { idtype: gid, client_id: 270331, fs_id: 3690 } { idtype: gid, client_id: 305454, fs_id: 3371 } { idtype: gid, client_id: 308753, fs_id: 3367 }
]

This is pertinent below:
{ idtype: gid, client_id: 32264, fs_id: 3199 }
{ idtype: gid, client_id: 11886, fs_id: 3593 }

[root@oak-md1-s1 ~]# lctl nodemap_test_id --nid 10.210.47.253@o2ib3 --idtype gid --id 32264
3199
[root@oak-md1-s1 ~]# lctl nodemap_test_id --nid 10.210.47.253@o2ib3 --idtype gid --id 11886
99
[root@oak-md1-s1 ~]# cat /proc/fs/lustre/nodemap/sherlock/squash_gid 
99

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="210576" author="sthiell" created="Sat, 7 Oct 2017 06:52:26 +0000"  >&lt;p&gt;Ok so&#160;I found out the origin of the problem.&#160;It looks&#160;like the problem came when we&#160;changed a client GID, then this broke the nodemap idmap rbtree somehow (only the client_to_fs_gidmap I think, the reverse seems to work). I would bet the &apos;replace&apos; code part of&#160;idmap_insert() is broken.&lt;/p&gt;

&lt;p&gt;I was able to reproduce the problem on&#160;a test VM. It is easy. Please try llmount followed by ./break_nodemap_rbtree.sh&lt;/p&gt;

&lt;p&gt;Reproducer logs and script attached. You&apos;ll see that client GID 99001 is not mapped and cannot be removed anymore, but listed in idmap.&lt;/p&gt;

&lt;p&gt;On the impacted production system, I&apos;m not sure this can be fixed without stopping production and recreating the whole nodemap. Please advise.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="210582" author="gerrit" created="Sat, 7 Oct 2017 18:05:54 +0000"  >&lt;p&gt;Stephan Thiell (sthiell@stanford.edu) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/29364&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29364&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10040&quot; title=&quot;nodemap and quota issues (ineffective GID mapping)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10040&quot;&gt;&lt;del&gt;LU-10040&lt;/del&gt;&lt;/a&gt; nodemap: only replace exact same idmap in idmap_insert&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 95d487db29c6a6d9e3289ea865d5dc8ea6ad51a9&lt;/p&gt;</comment>
                            <comment id="210583" author="sthiell" created="Sat, 7 Oct 2017 21:47:22 +0000"  >&lt;p&gt;With a better understanding of the situation, we were able to fix the issue of the production system without downtime. We&#160;duplicated the corrupted &apos;sherlock&apos; idmap config into a new &apos;sherlock2&apos; nodemap (without the nid ranges at first). After double-checking the config, we then moved the ranges from sherlock to sherlock2,&#160;that might have generated some bad GIDs&#160;during this short period of time (a few seconds), but now&#160;all the groups are properly mapped. I submitted a patch to review to avoid corrupting the idmap red black trees in the future. I&apos;ll delete the corrupted nodemap config on Monday (it worked on the test system). The priority of this ticket&#160;may be lowered.&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="210586" author="emoly.liu" created="Mon, 9 Oct 2017 01:32:47 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/secure/ViewProfile.jspa?name=sthiell&quot; class=&quot;user-hover&quot; rel=&quot;sthiell&quot;&gt;sthiell&lt;/a&gt;,&lt;/p&gt;

&lt;p&gt;Thanks for your investigation on this issue and your patch! I&apos;m just back to work from my vacation. I will review your patch now.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Emoly&lt;/p&gt;</comment>
                            <comment id="210609" author="emoly.liu" created="Mon, 9 Oct 2017 13:48:55 +0000"  >&lt;p&gt;The root cause does come from idmap_insert():&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;    if (!replace) {
        ...
    } else {
                rb_replace_node(&amp;amp;cur-&amp;gt;id_client_to_fs,
                                &amp;amp;idmap-&amp;gt;id_client_to_fs,
                                fwd_root);
                rb_replace_node(&amp;amp;cur-&amp;gt;id_fs_to_client,
                                &amp;amp;idmap-&amp;gt;id_fs_to_client,
                                bck_root);
                idmap_destroy(cur);
    }

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;When idmap_insert() calls rb_replace_node() to replace the &quot;cur&quot; id, only id_client_to_fs or id_fs_to_client is changed accordingly correctly, but the other one is not.&lt;br/&gt;
 In your &quot;break_nodemap_rbtree.sh&quot; example, {3525:3741, 10000(99001):3159, 10525:3351, 270331:3690}, 3159 was found in id_fs_to_client, but 10000 was replaced with 99001 in a wrong place, I mean it has wrong left node and right node, so when idmap_search() was called later, nothing was found. My following debugging information shows that 10525&apos;s right node was not 99001 but 270331.&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;00000001:00000040:1.0:1507545424.048143:0:18591:0:(nodemap_idmap.c:254:idmap_search()) tree_type=1, id=99001, idmap-&amp;gt;id_client=10525
00000001:00000040:1.0:1507545424.048144:0:18591:0:(nodemap_idmap.c:254:idmap_search()) tree_type=1, id=99001, idmap-&amp;gt;id_client=270331
00000001:00000001:1.0:1507545424.048144:0:18591:0:(nodemap_idmap.c:264:idmap_search()) Process leaving (rc=0 : 0 : 0)

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But when you call &quot;lctl get_param nodemap.sherlock.idmap&quot;, it outputs everything correctly, as follows, because it calls rb_first() to print idmap from the leftmost one:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;nodemap.sherlock.idmap=
[
 { idtype: gid, client_id: 3525, fs_id: 3741 } { idtype: gid, client_id: 99001, fs_id: 3159 } { idtype: gid, client_id: 10525, fs_id: 3351 } { idtype: gid, client_id: 270331, fs_id: 3690 }
]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I will fix this issue.&lt;/p&gt;</comment>
                            <comment id="210687" author="sthiell" created="Tue, 10 Oct 2017 02:35:10 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;Thanks! That was my understanding too. Glad to have you back!&lt;/p&gt;

&lt;p&gt;When you say &quot;when you call &quot;lctl get_param nodemap.sherlock.idmap&quot;, it outputs everything correctly&quot;, I&#160;noticed that it is usually sorted by client_ids, but when the tree is &quot;corrupted&quot;, the output is not sorted (in the case of the reproducer, we have: 3525,&#160;99001,&#160;10525,&#160;270331 instead of&#160;3525,&#160;10525, 99001,&#160;270331).&lt;/p&gt;

&lt;p&gt;The full idmap output of my current (good) gid mapping&#160;looks&#160;like that:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@oak-md1-s1 ~]# cat /proc/fs/lustre/nodemap/sherlock2/idmap 
[
 { idtype: gid, client_id: 3525, fs_id: 3741 } { idtype: gid, client_id: 6401, fs_id: 3752 } { idtype: gid, client_id: 10525, fs_id: 3351 } { idtype: gid, client_id: 11886, fs_id: 3593 } { idtype: gid, client_id: 12193, fs_id: 3636 } { idtype: gid, client_id: 13103, fs_id: 3208 } { idtype: gid, client_id: 17079, fs_id: 3700 } { idtype: gid, client_id: 19437, fs_id: 3618 } { idtype: gid, client_id: 22959, fs_id: 3745 } { idtype: gid, client_id: 24369, fs_id: 3526 } { idtype: gid, client_id: 26426, fs_id: 3352 } { idtype: gid, client_id: 29361, fs_id: 3746 } { idtype: gid, client_id: 29433, fs_id: 3479 } { idtype: gid, client_id: 30289, fs_id: 3262 } { idtype: gid, client_id: 32264, fs_id: 3199 } { idtype: gid, client_id: 32774, fs_id: 3623 } { idtype: gid, client_id: 38517, fs_id: 3702 } { idtype: gid, client_id: 40387, fs_id: 3708 } { idtype: gid, client_id: 40837, fs_id: 3768 } { idtype: gid, client_id: 47235, fs_id: 3674 } { idtype: gid, client_id: 48931, fs_id: 3325 } { idtype: gid, client_id: 50590, fs_id: 3360 } { idtype: gid, client_id: 52892, fs_id: 3377 } { idtype: gid, client_id: 56316, fs_id: 3353 } { idtype: gid, client_id: 56628, fs_id: 3411 } { idtype: gid, client_id: 59943, fs_id: 3372 } { idtype: gid, client_id: 63938, fs_id: 3756 } { idtype: gid, client_id: 99001, fs_id: 3159 } { idtype: gid, client_id: 100533, fs_id: 3281 } { idtype: gid, client_id: 244300, fs_id: 3617 } { idtype: gid, client_id: 254778, fs_id: 3362 } { idtype: gid, client_id: 267829, fs_id: 3748 } { idtype: gid, client_id: 270331, fs_id: 3690 } { idtype: gid, client_id: 305454, fs_id: 3371 } { idtype: gid, client_id: 308753, fs_id: 3367 }
]

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This is very clean now, and users don&apos;t seem to have any issue anymore.&lt;/p&gt;

&lt;p&gt;But, unfortunately, when I tried to remove the corrupted nodemap &apos;sherlock&apos; using &quot;lctl nodemap_del sherlock&quot;, it was properly removed on the MGS, but I got the following message on all MDS and OSS:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[1197072.957459] Lustre: 123959:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Thanks,&lt;br/&gt;
 Stephane&lt;/p&gt;</comment>
                            <comment id="211247" author="sthiell" created="Mon, 16 Oct 2017 21:51:36 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;Since we tried to remove the bad nodemap &apos;sherlock&apos; using:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;lctl nodemap_del sherlock

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;we cannot add or remove any idmap anymore on the servers. It only works on the MGS, and then the following errors occur on the servers:&lt;br/&gt;
 &#160;&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;# clush -w@mds,@oss &lt;span class=&quot;code-quote&quot;&gt;&apos;dmesg | tail -1&apos;&lt;/span&gt;
oak-md1-s2: [1784564.587104] Lustre: 123959:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2
oak-io2-s1: [1025664.072965] Lustre: 317684:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2
oak-io2-s2: [1025667.518836] Lustre: 309962:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2
oak-io1-s2: [1465169.609494] Lustre: 149882:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2
oak-io1-s1: [1462808.944808] Lustre: 226436:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;I think because of the idmap_search() bug, the servers cannot properly remove the nodemap. I should have kept as is ... do you think your patch could handle this case to fix the trees and allow processing the next nodemap logs? I&apos;ll start to work on a new build with your patch just in case.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
 Stephane&lt;/p&gt;</comment>
                            <comment id="211248" author="sthiell" created="Mon, 16 Oct 2017 21:57:42 +0000"  >&lt;p&gt;Also, do you know a way to force purge the nodemap log on the MGS? I would like to get rid of that &apos;nodemap_del&apos; entry, I can easily replay all next ones.&lt;/p&gt;

&lt;p&gt;Many thanks,&lt;br/&gt;
Stephane&lt;/p&gt;</comment>
                            <comment id="211250" author="sthiell" created="Mon, 16 Oct 2017 22:32:58 +0000"  >&lt;p&gt;Hi,&lt;br/&gt;
 I applied the patch and tried on the MDS, but unfortunately it is not able to process nodemap log. I will need to find a way to purge the nodemap log.&lt;/p&gt;

&lt;p&gt;oak-MDT0000:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[  127.492117] Lustre: Lustre: Build Version: 2.10.1_srcc02
[  127.527461] LNet: Using FMR &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; registration
[  127.553475] LNet: Added LNI 10.0.2.52@o2ib5 [8/256/0/180]
[  190.367048] LDISKFS-fs (dm-0): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,acl,no_mbcache,nodelalloc
[  191.433340] LustreError: 137-5: oak-MDT0000_UUID: not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 10.210.45.60@o2ib3 (no target). If you are running an HA pair check that the target is mounted on the other server.
[  191.452861] LustreError: Skipped 3 previous similar messages
[  191.471790] Lustre: 13119:0:(mgc_request.c:1797:mgc_process_recover_nodemap_log()) MGC10.0.2.51@o2ib5: error processing nodemap log nodemap: rc = -2
[  191.523256] Lustre: oak-MDT0000: Not available &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; connect from 10.210.47.38@o2ib3 (not set up)
[  191.532970] Lustre: Skipped 3 previous similar messages
[  192.015060] Lustre: oak-MDT0000: Imperative Recovery enabled, recovery window shrunk from 300-900 down to 150-900
[  192.501895] Lustre: oak-MDD0000: changelog on
[  192.549977] Lustre: oak-MDT0000: Will be in recovery &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; at least 2:30, or until 1212 clients reconnect
[  192.560492] Lustre: oak-MDT0000: Connection restored to cd0e08e0-aa22-f4da-21ed-94f218f886a1 (at 10.210.45.100@o2ib3)
[  192.595309] Lustre: oak-MDT0000: root_squash is set to 99:99
[  192.603004] Lustre: oak-MDT0000: nosquash_nids set to 10.0.2.[1-3]@o2ib5 10.0.2.[51-58]@o2ib5 10.0.2.[101-120]@o2ib5 10.0.2.[221-223]@o2ib5 10.0.2.[226-229]@o2ib5 10.0.2.[232-235]@o2ib5 10.0.2.[240-241]@o2ib5 10.210.47.253@o2ib3 10.9.0.[1-2]@o2ib4
...

&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Thanks,&lt;br/&gt;
Stephane&lt;/p&gt;</comment>
                            <comment id="211251" author="sthiell" created="Mon, 16 Oct 2017 22:48:12 +0000"  >&lt;p&gt;Also, I cannot remount the MGS anymore (2.10.1 + patch gerrit 29364):&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[ 1174.919438] LDISKFS-fs (dm-1): mounted filesystem with ordered data mode. Opts: user_xattr,errors=remount-ro,no_mbcache,nodelalloc
[ 1174.932548] Lustre: 17247:0:(osd_handler.c:7008:osd_mount()) MGS-osd: device /dev/mapper/md1-rbod1-mgt was upgraded from Lustre-1.x without enabling the dirdata feature. If you &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; not want to downgrade to Lustre-1.x again, you can enable it via &lt;span class=&quot;code-quote&quot;&gt;&apos;tune2fs -O dirdata device&apos;&lt;/span&gt;
[ 1175.062057] Lustre: 17247:0:(nodemap_storage.c:914:nodemap_load_entries()) MGS-osd: failed to load nodemap configuration: rc = -2
[ 1175.075067] LustreError: 17247:0:(mgs_fs.c:187:mgs_fs_setup()) MGS: error loading nodemap config file, file must be removed via ldiskfs: rc = -2
[ 1175.089557] LustreError: 17247:0:(mgs_handler.c:1297:mgs_init0()) MGS: MGS filesystem method init failed: rc = -2
[ 1175.145812] LustreError: 17247:0:(obd_config.c:608:class_setup()) setup MGS failed (-2)
[ 1175.154748] LustreError: 17247:0:(obd_mount.c:203:lustre_start_simple()) MGS setup error -2
[ 1175.164081] LustreError: 17247:0:(obd_mount_server.c:135:server_deregister_mount()) MGS not registered
[ 1175.174463] LustreError: 15e-a: Failed to start MGS &lt;span class=&quot;code-quote&quot;&gt;&apos;MGS&apos;&lt;/span&gt; (-2). Is the &lt;span class=&quot;code-quote&quot;&gt;&apos;mgs&apos;&lt;/span&gt; module loaded?
[ 1175.282230] Lustre: server umount MGS complete
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="211260" author="emoly.liu" created="Tue, 17 Oct 2017 02:53:20 +0000"  >&lt;p&gt;Stephane, I saw the same &quot;-2&quot; logs from my server on Oct. 9 when I tried to reproduce this issue. Let me see how to purge the nodemap log. &lt;/p&gt;</comment>
                            <comment id="211261" author="emoly.liu" created="Tue, 17 Oct 2017 05:02:00 +0000"  >&lt;p&gt;Here are some steps to remove nodemap config log from MGS. This will remove all nodemap information from MGS, &lt;font color=&quot;#d04437&quot;&gt;so before doing that, you&apos;d better save all of the nodemap information by &quot;&lt;tt&gt;cp -r /proc/fs/lustre/nodemap $nodemap_dir&lt;/tt&gt;&quot; or &quot;lctl get_param nodemap.&amp;#42;.&amp;#42;&#160;&amp;gt; $nodemap_file&quot;.&lt;/font&gt;&lt;/p&gt;
&lt;ol&gt;
	&lt;li&gt;umount your MGS&lt;/li&gt;
	&lt;li&gt;mount your MGS with ldiskfs type, by the command: &lt;tt&gt;mount -t ldiskfs $your_MGS_device $mountpoint&lt;/tt&gt;&lt;/li&gt;
	&lt;li&gt;cd &lt;tt&gt;$mountpoint&lt;/tt&gt;, you will see file &lt;tt&gt;./CONFIGS/nodemap&lt;/tt&gt;. I also suggest saving a backup (e.g. /tmp/nodemap) before removing it.&lt;/li&gt;
	&lt;li&gt;umount your MGS and remount it with&#160;lustre type&lt;/li&gt;
&lt;/ol&gt;


&lt;p&gt;Please let me know if this works for you.&lt;/p&gt;</comment>
                            <comment id="211286" author="sthiell" created="Tue, 17 Oct 2017 15:17:22 +0000"  >&lt;p&gt;Hi Emoly,&lt;/p&gt;

&lt;p&gt;Good news. I renamed ./CONFIGS/nodemap into ./CONFIGS/nodemap.corrupted instead of removing it, but it worked! I was then able to mount the MGS and recreate all nodemaps by hand from there. And now, I can add new&#160;idmaps again&#160;and they are properly propagated to the targets. The corrupted &apos;sherlock&apos; nodemap can&apos;t be seen anymore from the MGS.&lt;/p&gt;

&lt;p&gt;After some time, like a few minutes maybe (not immediately),&#160;the&#160;corrupted &apos;sherlock&apos; nodemap was also automatically removed from all targets (MDT, OST). This is great.&lt;/p&gt;

&lt;p&gt;Thanks again! By the way, I am now running 2.10.1 with the patch on the MGS/MDS.&lt;/p&gt;

&lt;p&gt;Stephane&lt;/p&gt;</comment>
                            <comment id="211355" author="emoly.liu" created="Wed, 18 Oct 2017 03:21:59 +0000"  >&lt;p&gt;Stephane,&lt;/p&gt;

&lt;p&gt;That&apos;s great. After the MGS restarts/remounts, the&#160;other targets will detect that the config log lock has changed&#160;and then&#160;fetch the config log from MGS to update their local copy.&lt;/p&gt;

&lt;p&gt;Thanks,&lt;/p&gt;

&lt;p&gt;Emoly&lt;/p&gt;</comment>
                            <comment id="214390" author="gerrit" created="Wed, 22 Nov 2017 03:54:45 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/29364/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/29364/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10040&quot; title=&quot;nodemap and quota issues (ineffective GID mapping)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10040&quot;&gt;&lt;del&gt;LU-10040&lt;/del&gt;&lt;/a&gt; nodemap: add nodemap idmap correctly&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: 253ccbd55ffe7fcdc405c9fcc4f72a47578920fe&lt;/p&gt;</comment>
                            <comment id="214429" author="pjones" created="Wed, 22 Nov 2017 14:44:38 +0000"  >&lt;p&gt;Landed for 2.11&lt;/p&gt;</comment>
                            <comment id="214431" author="gerrit" created="Wed, 22 Nov 2017 14:48:09 +0000"  >&lt;p&gt;James Nunez (james.a.nunez@intel.com) uploaded a new patch: &lt;a href=&quot;https://review.whamcloud.com/30206&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30206&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10040&quot; title=&quot;nodemap and quota issues (ineffective GID mapping)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10040&quot;&gt;&lt;del&gt;LU-10040&lt;/del&gt;&lt;/a&gt; nodemap: add nodemap idmap correctly&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: a4de3c0f0ae3dbb684ba63874fa70e171c219cdf&lt;/p&gt;</comment>
                            <comment id="214669" author="gerrit" created="Mon, 27 Nov 2017 14:34:09 +0000"  >&lt;p&gt;John L. Hammond (john.hammond@intel.com) merged in patch &lt;a href=&quot;https://review.whamcloud.com/30206/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://review.whamcloud.com/30206/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-10040&quot; title=&quot;nodemap and quota issues (ineffective GID mapping)&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-10040&quot;&gt;&lt;del&gt;LU-10040&lt;/del&gt;&lt;/a&gt; nodemap: add nodemap idmap correctly&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: b2_10&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: e881c665bb60543fd2bbbd2d195ccce99a65f16b&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                                        </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="48800">LU-10135</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                            <attachment id="28419" name="break_nodemap_rbtree.sh" size="759" author="sthiell" created="Sat, 7 Oct 2017 06:49:54 +0000"/>
                            <attachment id="28370" name="oak-md1-s1.glb-grp.txt" size="11740" author="sthiell" created="Fri, 29 Sep 2017 16:44:59 +0000"/>
                            <attachment id="28367" name="oak-md1-s2.dk.log" size="1310033" author="sthiell" created="Thu, 28 Sep 2017 18:33:00 +0000"/>
                            <attachment id="28371" name="oak-md1-s2.glb-grp.txt" size="11740" author="sthiell" created="Fri, 29 Sep 2017 16:44:59 +0000"/>
                            <attachment id="28382" name="oak-md1-s2.mdt.dk.full.log" size="56450733" author="sthiell" created="Mon, 2 Oct 2017 05:41:45 +0000"/>
                            <attachment id="28420" name="reproducer.log" size="2728" author="sthiell" created="Sat, 7 Oct 2017 06:49:58 +0000"/>
                            <attachment id="28383" name="sh-101-59.client.dk.full.log" size="2362222" author="sthiell" created="Mon, 2 Oct 2017 05:41:56 +0000"/>
                            <attachment id="28368" name="sh-113-01.dk.log" size="559789" author="sthiell" created="Thu, 28 Sep 2017 18:33:07 +0000"/>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzzkxr:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10021"><![CDATA[2]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>