<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 02:17:41 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-8456] HSM configuration not persistent for DNE</title>
                <link>https://jira.whamcloud.com/browse/LU-8456</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;error happened during soaktesting of build &apos;20160727&apos; (see &lt;a href=&quot;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160727&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160727&lt;/a&gt;)&lt;br/&gt;
OSTs formatted with zfs, MDSs formatted with ldiskfs&lt;br/&gt;
DNE is enabled, HSM/robinhood enable and integrated&lt;br/&gt;
4 MDSs with 1 MDT / MDS&lt;br/&gt;
6 OSSs with 4 OSTs / OSS&lt;br/&gt;
Server nodes configured in active-active HA configuration&lt;/p&gt;


&lt;p&gt;The error occurs every time an MDS is rebooted or passes through a failover/failback sequence during soak testing. The manual command sequence below shows the effect.&lt;/p&gt;

&lt;p&gt;The missing persistent setting makes it impossible to start the Lustre POSIX copytool; if the copytool is already running, any archive action will fail for those files whose metadata references reside on the MDT with the stopped HSM configuration.&lt;/p&gt;

&lt;p&gt;Parameter &lt;tt&gt;hsm_control&lt;/tt&gt; enabled on all MDSs:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-16 ~]# pdsh -g mds &apos;lctl get_param mdt.*.hsm_control&apos;
lola-11: mdt.soaked-MDT0003.hsm_control=enabled
lola-8: mdt.soaked-MDT0000.hsm_control=enabled
lola-9: mdt.soaked-MDT0001.hsm_control=enabled
lola-10: mdt.soaked-MDT0002.hsm_control=enabled
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Reboot single node for test (&lt;tt&gt;lola-9&lt;/tt&gt;):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-16 ~]# date; pdsh -g mds &apos; lctl get_param mdt.*.hsm_control&apos; ; date
Mon Aug  1 03:50:15 PDT 2016
lola-11: mdt.soaked-MDT0003.hsm_control=enabled
lola-9: mdt.soaked-MDT0001.hsm_control=stopped
lola-8: mdt.soaked-MDT0000.hsm_control=enabled
lola-10: mdt.soaked-MDT0002.hsm_control=enabled
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Archiving of files in DNE striped dir failed with&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-12 lhsm_parameter_test_2]# ps -ef | grep lhsmtool | grep -v grep
root      16560      1  0 Jul28 ?        00:00:11 /usr/sbin/lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked

[root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 `
&amp;gt; do
&amp;gt; lfs hsm_archive file_$i
&amp;gt; done
Cannot send HSM request (use of file_4): Resource temporarily unavailable
Cannot send HSM request (use of file_8): Resource temporarily unavailable
Cannot send HSM request (use of file_13): Resource temporarily unavailable
Cannot send HSM request (use of file_17): Resource temporarily unavailable
Cannot send HSM request (use of file_22): Resource temporarily unavailable
Cannot send HSM request (use of file_26): Resource temporarily unavailable

[root@lola-12 lhsm_parameter_test_2]# lfs getdirstripe .
.
lmv_stripe_count: 4 lmv_stripe_offset: 1
mdtidx           FID[seq:oid:ver]
     1           [0x240006990:0x1507:0x0]
     2           [0x280007930:0x1507:0x0]
     3           [0x2c0004280:0x1507:0x0]
     0           [0x200004a51:0x1507:0x0]
[root@lola-12 lhsm_parameter_test_2]# ls -l
total 20073833
-rw-r--r-- 1 root root 1073741824 Aug  1 04:05 file_1
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_10
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_11
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_12
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_13
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_14
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_15
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_16
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_17
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_18
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_19
-rw-r--r-- 1 root root 1073741824 Aug  1 04:06 file_2
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_20
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_21
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_22
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_23
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_24
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_25
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_26
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_27
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_28
-rw-r--r-- 1 root root   31457280 Aug  1 04:08 file_29
-rw-r--r-- 1 root root 1073741824 Aug  1 04:07 file_3
-rw-r--r-- 1 root root 1073741824 Aug  1 04:07 file_4
-rw-r--r-- 1 root root 1073741824 Aug  1 04:07 file_5
-rw-r--r-- 1 root root 1073741824 Aug  1 04:07 file_6
-rw-r--r-- 1 root root 1073741824 Aug  1 04:07 file_7
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_8
-rw-r--r-- 1 root root 1073741824 Aug  1 04:08 file_9
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Also, startup of lhsmtool_posix fails if a (single) MDS has lost its HSM configuration (hsm_control != enabled):&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt; 
---&amp;gt; Copy tool truly stopped
[root@lola-12 lhsm_parameter_test_2]# ps -ef | grep -v grep | grep lhsmtool_posix
[root@lola-12 lhsm_parameter_test_2]# echo $?
1
[root@lola-12 lhsm_parameter_test_2]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked
1470051347.328262 lhsmtool_posix[34413]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked
[root@lola-12 lhsm_parameter_test_2]# cannot start copytool on &apos;/mnt/soaked&apos;: No such device or address (6)
1470051347.356775 lhsmtool_posix[34414]: cannot start copytool interface: No such device or address (6)
1470051347.356850 lhsmtool_posix[34414]: process finished, errs: 0 major, 0 minor, rc=-6 (No such device or address)

---&amp;gt; Startup fails
[root@lola-12 hsm_test]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked
1470048821.789862 lhsmtool_posix[34161]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked
[root@lola-12 hsm_test]# cannot start copytool on &apos;/mnt/soaked&apos;: No such device or address (6)
1470048821.826642 lhsmtool_posix[34162]: cannot start copytool interface: No such device or address (6)
1470048821.826682 lhsmtool_posix[34162]: process finished, errs: 0 major, 0 minor, rc=-6 (No such device or address)
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;But it works again if the setting is corrected:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;on MDS:
[root@lola-9 ~]# lctl get_param mdt.soaked-MDT0001.hsm_control
mdt.soaked-MDT0001.hsm_control=stopped
[root@lola-9 ~]# lctl set_param mdt.soaked-MDT0001.hsm_control=enabled
mdt.soaked-MDT0001.hsm_control=enabled
on HSM node:
[root@lola-12 lhsm_parameter_test_2]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked
1470051717.918156 lhsmtool_posix[34423]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked
[root@lola-12 lhsm_parameter_test_2]# 1470051717.923027 lhsmtool_posix[34424]: waiting for message from kernel

[root@lola-12 lhsm_parameter_test_2]# 1470051721.627906 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1
1470051721.628060 lhsmtool_posix[34424]: waiting for message from kernel
1470051721.628145 lhsmtool_posix[34425]: &apos;[0x200004a57:0x7:0x0]&apos; action ARCHIVE reclen 72, cookie=0x579a2021
1470051721.628820 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1
1470051721.628904 lhsmtool_posix[34424]: waiting for message from kernel
1470051721.628960 lhsmtool_posix[34426]: &apos;[0x200004a57:0x6:0x0]&apos; action ARCHIVE reclen 72, cookie=0x579a2020
1470051721.629582 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1
1470051721.629665 lhsmtool_posix[34424]: waiting for message from kernel
...
...

[root@lola-12 lhsm_parameter_test_2]# ps -ef | grep -v grep | grep lhsmtool_posix
root      34424      1 26 04:41 ?        00:00:07 lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked

...
-&amp;gt;archiving works again:
[root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 `; do lfs hsm_archive file_$i; done
1470051862.686419 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1
1470051862.686490 lhsmtool_posix[34424]: waiting for message from kernel
1470051862.686523 lhsmtool_posix[34479]: &apos;[0x240007161:0x1:0x0]&apos; action ARCHIVE reclen 72, cookie=0x579f3538
1470051862.688868 lhsmtool_posix[34479]: processing file &apos;soaktest/hsm_test/lhsm_parameter_test_2/file_4&apos;
1470051862.702707 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1
...
...

[root@lola-12 lhsm_parameter_test_2]# echo $?
0
[root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 `; do lfs hsm_state file_$i; done
file_1: (0x00000009) exists archived, archive_id:1
file_2: (0x00000009) exists archived, archive_id:1
file_3: (0x00000009) exists archived, archive_id:1
file_4: (0x00000009) exists archived, archive_id:1
file_5: (0x00000009) exists archived, archive_id:1
file_6: (0x00000009) exists archived, archive_id:1
file_7: (0x00000009) exists archived, archive_id:1
file_8: (0x00000009) exists archived, archive_id:1
file_9: (0x00000009) exists archived, archive_id:1
file_10: (0x00000009) exists archived, archive_id:1
file_11: (0x00000009) exists archived, archive_id:1
file_12: (0x00000009) exists archived, archive_id:1
file_13: (0x00000009) exists archived, archive_id:1
file_14: (0x00000009) exists archived, archive_id:1
file_15: (0x00000009) exists archived, archive_id:1
file_16: (0x00000009) exists archived, archive_id:1
file_17: (0x00000001) exists, archive_id:1
file_18: (0x00000009) exists archived, archive_id:1
file_19: (0x00000009) exists archived, archive_id:1
file_20: (0x00000009) exists archived, archive_id:1
file_21: (0x00000009) exists archived, archive_id:1
file_22: (0x00000001) exists, archive_id:1
file_23: (0x00000009) exists archived, archive_id:1
file_24: (0x00000009) exists archived, archive_id:1
file_25: (0x00000009) exists archived, archive_id:1
file_26: (0x00000001) exists, archive_id:1
file_27: (0x00000009) exists archived, archive_id:1
file_28: (0x00000009) exists archived, archive_id:1
file_29: (0x00000009) exists archived, archive_id:1
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment>lola&lt;br/&gt;
build: tip of master, commit 0f37c051158a399f7b00536eeec27f5dbdd54168 </environment>
        <key id="38516">LU-8456</key>
            <summary>HSM configuration not persistent for DNE</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="6">Not a Bug</resolution>
                                        <assignee username="jhammond">John Hammond</assignee>
                                    <reporter username="heckes">Frank Heckes</reporter>
                        <labels>
                            <label>soak</label>
                    </labels>
                <created>Mon, 1 Aug 2016 12:01:24 +0000</created>
                <updated>Tue, 2 Aug 2016 09:22:15 +0000</updated>
                            <resolved>Tue, 2 Aug 2016 09:22:15 +0000</resolved>
                                    <version>Lustre 2.9.0</version>
                                                        <due></due>
                            <votes>0</votes>
                                    <watches>6</watches>
                                                                            <comments>
                            <comment id="160462" author="jgmitter" created="Mon, 1 Aug 2016 17:50:24 +0000"  >&lt;p&gt;Hi John,&lt;/p&gt;

&lt;p&gt;Could you please assess this issue?&lt;/p&gt;

&lt;p&gt;Thanks.&lt;br/&gt;
Joe&lt;/p&gt;</comment>
                            <comment id="160467" author="jhammond" created="Mon, 1 Aug 2016 18:01:15 +0000"  >&lt;p&gt;How was &lt;tt&gt;mdt.*.hsm_control&lt;/tt&gt; set initially? Was it a persistent set param?&lt;/p&gt;</comment>
                            <comment id="160507" author="heckes" created="Tue, 2 Aug 2016 09:22:00 +0000"  >&lt;p&gt;Sorry, I was confused by the error messages from the &lt;tt&gt;-P&lt;/tt&gt; option when I committed the configuration the first time:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;[root@lola-16 ~]# pdsh -g mds &apos;lctl set_param -P mdt.*.hsm_control=enabled&apos;
lola-11: No device found for name MGS: Invalid argument
lola-11: This command must be run on the MGS.
lola-11: error: executing set_param: No such device
lola-9: No device found for name MGS: Invalid argument
lola-9: This command must be run on the MGS.
lola-9: error: executing set_param: No such device
lola-10: No device found for name MGS: Invalid argument
lola-10: This command must be run on the MGS.
lola-10: error: executing set_param: No such device
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;HSM is configured correctly and the parameter turns out to be persistent using &lt;tt&gt;-P&lt;/tt&gt;. (Rebooted nodes twice).&lt;br/&gt;
I&apos;m sorry for the spam.&lt;/p&gt;</comment>
                    </comments>
                    <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzyj2f:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>9223372036854775807</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>