Details
-
Bug
-
Resolution: Unresolved
-
Medium
-
None
-
None
-
None
-
3
-
9223372036854775807
Description
After an MDT crash/failover, STARTED archive/remove actions hang indefinitely (until active_request_timeout expires).
Reproducer
[root@server]# lctl set_param -P mdt.lustre-MDT0000.hsm_control=enabled
[root@client-el8 ~]# pkill -STOP lhsm
[root@client-el8 ~]# printf "%s\n" test{1..10} | xargs -P10 -I{} dd if=/dev/urandom of={} bs=1M count=1
[root@client-el8 ~]# lfs hsm_archive test*
[root@server]# lctl get_param mdt.lustre-MDT0000.hsm.active_requests
fid=[0x200000401:0xb:0x0] dfid=[0x200000401:0xb:0x0] compound/cookie=0x0/0x690cc770 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
fid=[0x200000401:0x13:0x0] dfid=[0x200000401:0x13:0x0] compound/cookie=0x0/0x690cc771 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
fid=[0x200000401:0xf:0x0] dfid=[0x200000401:0xf:0x0] compound/cookie=0x0/0x690cc772 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
[root@server]# umount /mnt/lustre-mds1
[root@client-el8]# pkill -CONT lhsm
[root@server]# mount -tlustre /dev/mapper/mds1_flakey /mnt/lustre-mds1
[root@server]# sleep 60; lctl get_param mdt.lustre-MDT0000.hsm.actions
mdt.lustre-MDT0000.hsm.actions=
lrh=[type=10680000 len=136 idx=7/1] fid=[0x200000401:0xb:0x0] dfid=[0x200000401:0xb:0x0] compound/cookie=0x0/0x690cc770 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
lrh=[type=10680000 len=136 idx=7/2] fid=[0x200000401:0x13:0x0] dfid=[0x200000401:0x13:0x0] compound/cookie=0x0/0x690cc771 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
lrh=[type=10680000 len=136 idx=7/3] fid=[0x200000401:0xf:0x0] dfid=[0x200000401:0xf:0x0] compound/cookie=0x0/0x690cc772 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
[root@server]# lctl get_param mdt.lustre-MDT0000.hsm.active_requests
mdt.lustre-MDT0000.hsm.active_requests=
[root@server]#
Copytool:
lt-lhsmtool_posix: 1762508823.726096 [36631]: llapi_hsm_action_begin() on '/mnt/lustre/.lustre/fid/0x200000401:0x13:0x0' failed: No such file or directory (2)
lt-lhsmtool_posix: 1762508823.726195 [36631]: Action completed, notifying coordinator cookie=0x690cc771, FID=[0x200000401:0x13:0x0], hp_flags=0 err=2
lt-lhsmtool_posix: 1762508823.726161 [36630]: Action completed, notifying coordinator cookie=0x690cc770, FID=[0x200000401:0xb:0x0], hp_flags=0 err=2
lt-lhsmtool_posix: 1762508823.726337 [36631]: llapi_hsm_action_end() on '/mnt/lustre/.lustre/fid/0x200000401:0x13:0x0' failed: No such file or directory (2)
lt-lhsmtool_posix: 1762508823.726421 [36630]: llapi_hsm_action_end() on '/mnt/lustre/.lustre/fid/0x200000401:0xb:0x0' failed: No such file or directory (2)
The action is not re-registered in the active_requests hash table after the coordinator restarts. As a result, the coordinator returns "No such file or directory" to the copytool, but the action stays in the "STARTED" state.