Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-19579

HSM recovery does not work for STARTED actions

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Unresolved
    • Icon: Medium Medium
    • None
    • None
    • None
    • 3
    • 9223372036854775807

      After an MDT crash/failover, archive/remove actions in the STARTED state hang indefinitely (until active_request_timeout expires).

      Reproducer

      [root@server]# lctl set_param -P mdt.lustre-MDT0000.hsm_control=enabled
      
      [root@client-el8 ~]# pkill -STOP lhsm                  
      [root@client-el8 ~]# printf "%s\n" test{1..10} | xargs -P10 -I{} dd if=/dev/urandom of={} bs=1M count=1
      [root@client-el8 ~]# lfs hsm_archive test*             
      
      [root@server]# lctl get_param mdt.lustre-MDT0000.hsm.active_requests
      fid=[0x200000401:0xb:0x0] dfid=[0x200000401:0xb:0x0] compound/cookie=0x0/0x690cc770 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
      fid=[0x200000401:0x13:0x0] dfid=[0x200000401:0x13:0x0] compound/cookie=0x0/0x690cc771 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
      fid=[0x200000401:0xf:0x0] dfid=[0x200000401:0xf:0x0] compound/cookie=0x0/0x690cc772 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 data=[] canceled=0 uuid=beb75c8b-e53a-4436-be82-b288d258a08a done=0
      
      [root@server]# umount /mnt/lustre-mds1
      
      [root@client-el8]# pkill -CONT lhsm
      
      [root@server]# mount -tlustre /dev/mapper/mds1_flakey /mnt/lustre-mds1
      [root@server]# sleep 60; lctl get_param mdt.lustre-MDT0000.hsm.actions                                                                                                                                                               
      mdt.lustre-MDT0000.hsm.actions=
      lrh=[type=10680000 len=136 idx=7/1] fid=[0x200000401:0xb:0x0] dfid=[0x200000401:0xb:0x0] compound/cookie=0x0/0x690cc770 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
      lrh=[type=10680000 len=136 idx=7/2] fid=[0x200000401:0x13:0x0] dfid=[0x200000401:0x13:0x0] compound/cookie=0x0/0x690cc771 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
      lrh=[type=10680000 len=136 idx=7/3] fid=[0x200000401:0xf:0x0] dfid=[0x200000401:0xf:0x0] compound/cookie=0x0/0x690cc772 action=ARCHIVE archive#=1 flags=0x0 extent=0x0-0xffffffffffffffff gid=0x0 datalen=0 status=STARTED data=[]
      [root@server]# lctl get_param mdt.lustre-MDT0000.hsm.active_requests
      mdt.lustre-MDT0000.hsm.active_requests=
      [root@server]# 
      

      Copytool:

      lt-lhsmtool_posix: 1762508823.726096 [36631]: llapi_hsm_action_begin() on '/mnt/lustre/.lustre/fid/0x200000401:0x13:0x0' failed: No such file or directory (2)
      lt-lhsmtool_posix: 1762508823.726195 [36631]: Action completed, notifying coordinator cookie=0x690cc771, FID=[0x200000401:0x13:0x0], hp_flags=0 err=2
      lt-lhsmtool_posix: 1762508823.726161 [36630]: Action completed, notifying coordinator cookie=0x690cc770, FID=[0x200000401:0xb:0x0], hp_flags=0 err=2
      lt-lhsmtool_posix: 1762508823.726337 [36631]: llapi_hsm_action_end() on '/mnt/lustre/.lustre/fid/0x200000401:0x13:0x0' failed: No such file or directory (2)
      lt-lhsmtool_posix: 1762508823.726421 [36630]: llapi_hsm_action_end() on '/mnt/lustre/.lustre/fid/0x200000401:0xb:0x0' failed: No such file or directory (2)
      

      The action is not re-registered in the active_requests hash table after the coordinator restarts. As a result, the coordinator returns "No such file or directory" to the copytool, but the action remains in the "STARTED" state.

            eaujames Etienne Aujames
            eaujames Etienne Aujames
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated: