Details
-
Bug
-
Resolution: Not a Bug
-
Critical
-
None
-
Lustre 2.9.0
-
lola
build: tip of master, commit 0f37c051158a399f7b00536eeec27f5dbdd54168
-
3
-
9223372036854775807
Description
The error happened during soak testing of build '20160727' (see https://wiki.hpdd.intel.com/display/Releases/Soak+Testing+on+Lola#SoakTestingonLola-20160727)
OSTs formatted with zfs, MDSs formatted with ldiskfs
DNE is enabled, HSM/robinhood enabled and integrated
4 MDSs with 1 MDT / MDS
6 OSSs with 4 OSTs / OSS
Server nodes configured in active-active HA configuration
The error occurred every time an MDS was rebooted or passed through a failover/failback sequence during soak testing. The manual command sequence below shows the effect.
Because the setting is not persistent, it becomes impossible to start the Lustre POSIX copytool; if the copytool is already running, any archive action will fail for those files whose metadata resides on the MDT with the stopped HSM configuration.
Parameter hsm_control enabled on all MDSs:
[root@lola-16 ~]# pdsh -g mds 'lctl get_param mdt.*.hsm_control' lola-11: mdt.soaked-MDT0003.hsm_control=enabled lola-8: mdt.soaked-MDT0000.hsm_control=enabled lola-9: mdt.soaked-MDT0001.hsm_control=enabled lola-10: mdt.soaked-MDT0002.hsm_control=enabled
Reboot single node for test (lola-9):
[root@lola-16 ~]# date; pdsh -g mds ' lctl get_param mdt.*.hsm_control' ; date Mon Aug 1 03:50:15 PDT 2016 lola-11: mdt.soaked-MDT0003.hsm_control=enabled lola-9: mdt.soaked-MDT0001.hsm_control=stopped lola-8: mdt.soaked-MDT0000.hsm_control=enabled lola-10: mdt.soaked-MDT0002.hsm_control=enabled
Archiving of files in a DNE striped directory failed with:
[root@lola-12 lhsm_parameter_test_2]# ps -ef | grep lhsmtool | grep -v grep root 16560 1 0 Jul28 ? 00:00:11 /usr/sbin/lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked [root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 ` > do > lfs hsm_archive file_$i > done Cannot send HSM request (use of file_4): Resource temporarily unavailable Cannot send HSM request (use of file_8): Resource temporarily unavailable Cannot send HSM request (use of file_13): Resource temporarily unavailable Cannot send HSM request (use of file_17): Resource temporarily unavailable Cannot send HSM request (use of file_22): Resource temporarily unavailable Cannot send HSM request (use of file_26): Resource temporarily unavailable [root@lola-12 lhsm_parameter_test_2]# lfs getdirstripe . . lmv_stripe_count: 4 lmv_stripe_offset: 1 mdtidx FID[seq:oid:ver] 1 [0x240006990:0x1507:0x0] 2 [0x280007930:0x1507:0x0] 3 [0x2c0004280:0x1507:0x0] 0 [0x200004a51:0x1507:0x0] [root@lola-12 lhsm_parameter_test_2]# ls -l total 20073833 -rw-r--r-- 1 root root 1073741824 Aug 1 04:05 file_1 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_10 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_11 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_12 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_13 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_14 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_15 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_16 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_17 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_18 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_19 -rw-r--r-- 1 root root 1073741824 Aug 1 04:06 file_2 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_20 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_21 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_22 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_23 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_24 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 
file_25 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_26 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_27 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_28 -rw-r--r-- 1 root root 31457280 Aug 1 04:08 file_29 -rw-r--r-- 1 root root 1073741824 Aug 1 04:07 file_3 -rw-r--r-- 1 root root 1073741824 Aug 1 04:07 file_4 -rw-r--r-- 1 root root 1073741824 Aug 1 04:07 file_5 -rw-r--r-- 1 root root 1073741824 Aug 1 04:07 file_6 -rw-r--r-- 1 root root 1073741824 Aug 1 04:07 file_7 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_8 -rw-r--r-- 1 root root 1073741824 Aug 1 04:08 file_9
Startup of lhsmtool_posix also fails if a (single) MDS has lost its HSM configuration (hsm_control != enabled):
---> Copy tool truly stopped [root@lola-12 lhsm_parameter_test_2]# ps -ef | grep -v grep | grep lhsmtool_posix [root@lola-12 lhsm_parameter_test_2]# echo $? 1 [root@lola-12 lhsm_parameter_test_2]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked 1470051347.328262 lhsmtool_posix[34413]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked [root@lola-12 lhsm_parameter_test_2]# cannot start copytool on '/mnt/soaked': No such device or address (6) 1470051347.356775 lhsmtool_posix[34414]: cannot start copytool interface: No such device or address (6) 1470051347.356850 lhsmtool_posix[34414]: process finished, errs: 0 major, 0 minor, rc=-6 (No such device or address) ---> Startup fails [root@lola-12 hsm_test]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked 1470048821.789862 lhsmtool_posix[34161]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked [root@lola-12 hsm_test]# cannot start copytool on '/mnt/soaked': No such device or address (6) 1470048821.826642 lhsmtool_posix[34162]: cannot start copytool interface: No such device or address (6) 1470048821.826682 lhsmtool_posix[34162]: process finished, errs: 0 major, 0 minor, rc=-6 (No such device or address)
But it works again once the setting is corrected:
on MDS: [root@lola-9 ~]# lctl get_param mdt.soaked-MDT0001.hsm_control mdt.soaked-MDT0001.hsm_control=stopped [root@lola-9 ~]# lctl set_param mdt.soaked-MDT0001.hsm_control=enabled mdt.soaked-MDT0001.hsm_control=enabled on HSM node: [root@lola-12 lhsm_parameter_test_2]# lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked 1470051717.918156 lhsmtool_posix[34423]: action=0 src=(null) dst=(null) mount_point=/mnt/soaked [root@lola-12 lhsm_parameter_test_2]# 1470051717.923027 lhsmtool_posix[34424]: waiting for message from kernel [root@lola-12 lhsm_parameter_test_2]# 1470051721.627906 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1 1470051721.628060 lhsmtool_posix[34424]: waiting for message from kernel 1470051721.628145 lhsmtool_posix[34425]: '[0x200004a57:0x7:0x0]' action ARCHIVE reclen 72, cookie=0x579a2021 1470051721.628820 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1 1470051721.628904 lhsmtool_posix[34424]: waiting for message from kernel 1470051721.628960 lhsmtool_posix[34426]: '[0x200004a57:0x6:0x0]' action ARCHIVE reclen 72, cookie=0x579a2020 1470051721.629582 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1 1470051721.629665 lhsmtool_posix[34424]: waiting for message from kernel ... ... [root@lola-12 lhsm_parameter_test_2]# ps -ef | grep -v grep | grep lhsmtool_posix root 34424 1 26 04:41 ? 00:00:07 lhsmtool_posix --daemon --hsm_root /mnt/soaked-arch/ --archive 1 /mnt/soaked ... 
->archiving works again: [root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 `; do lfs hsm_archive file_$i; done 1470051862.686419 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1 1470051862.686490 lhsmtool_posix[34424]: waiting for message from kernel 1470051862.686523 lhsmtool_posix[34479]: '[0x240007161:0x1:0x0]' action ARCHIVE reclen 72, cookie=0x579f3538 1470051862.688868 lhsmtool_posix[34479]: processing file 'soaktest/hsm_test/lhsm_parameter_test_2/file_4' 1470051862.702707 lhsmtool_posix[34424]: copytool fs=soaked archive#=1 item_count=1 ... ... [root@lola-12 lhsm_parameter_test_2]# echo $? 0 [root@lola-12 lhsm_parameter_test_2]# for i in `seq 1 29 `; do lfs hsm_state file_$i; done file_1: (0x00000009) exists archived, archive_id:1 file_2: (0x00000009) exists archived, archive_id:1 file_3: (0x00000009) exists archived, archive_id:1 file_4: (0x00000009) exists archived, archive_id:1 file_5: (0x00000009) exists archived, archive_id:1 file_6: (0x00000009) exists archived, archive_id:1 file_7: (0x00000009) exists archived, archive_id:1 file_8: (0x00000009) exists archived, archive_id:1 file_9: (0x00000009) exists archived, archive_id:1 file_10: (0x00000009) exists archived, archive_id:1 file_11: (0x00000009) exists archived, archive_id:1 file_12: (0x00000009) exists archived, archive_id:1 file_13: (0x00000009) exists archived, archive_id:1 file_14: (0x00000009) exists archived, archive_id:1 file_15: (0x00000009) exists archived, archive_id:1 file_16: (0x00000009) exists archived, archive_id:1 file_17: (0x00000001) exists, archive_id:1 file_18: (0x00000009) exists archived, archive_id:1 file_19: (0x00000009) exists archived, archive_id:1 file_20: (0x00000009) exists archived, archive_id:1 file_21: (0x00000009) exists archived, archive_id:1 file_22: (0x00000001) exists, archive_id:1 file_23: (0x00000009) exists archived, archive_id:1 file_24: (0x00000009) exists archived, archive_id:1 file_25: (0x00000009) exists archived, archive_id:1 
file_26: (0x00000001) exists, archive_id:1 file_27: (0x00000009) exists archived, archive_id:1 file_28: (0x00000009) exists archived, archive_id:1 file_29: (0x00000009) exists archived, archive_id:1