Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-17687

'lctl set_param/conf_param' hung running on older system

Details

    • Bug
    • Resolution: Fixed
    • Minor
    • Lustre 2.16.0
    • None
    • 3
    • 9223372036854775807

    Description

      Running "lctl set_param -P lod.*.mdt_hash=crush" or "lctl conf_param testfs.quota.mdt=ug3" hangs when running on an older system (el7.9 running 2.14.0 kernel modules):

      Using TIMEOUT=20
      Writer error: failed to resolve Netlink family id
      Writer error: failed to resolve Netlink family id
      osc.testfs-OST0000-osc-ffff95fc0529ae98.idle_timeout=debug
      osc.testfs-OST0001-osc-ffff95fc0529ae98.idle_timeout=debug
      osc.testfs-OST0002-osc-ffff95fc0529ae98.idle_timeout=debug
      osc.testfs-OST0003-osc-ffff95fc0529ae98.idle_timeout=debug
      disable quota as required
      Writer error: failed to resolve Netlink family id
      

      Running with "ltrace lctl set_param -P lod.*.mdt_hash=crush" it looks like it is looping forever in yaml code:

      __libc_start_main(0x4209b5, 4, 0x7ffdf2c34e38, 0x42af80 <unfinished ...>
      setlinebuf(0x7fcc8c685400)                       = <void>
      register_ioc_dev(0, 0x42b257, 0x7ffdf2c34e38, 0) = 0
      register_ioc_dev(1, 0x42e0ce, 0x7ffdf2c34e38, 0) = 1
      llapi_set_command_name(0x7ffdf2c36fa9, 0x7ffdf2c34e38, 0x7ffdf2c34e38, 0) = 0x7fcc8d88c5a0
      cfs_parser(4, 0x7ffdf2c34e38, 0x63ea40, 0x7ffdf2c34e38 <unfinished ...>
      memset(0x7ffdf2c34c70, '\0', 8)                  = 0x7ffdf2c34c70
      getopt(3, 0x7ffdf2c34e40, "dFnPt::")             = 80
      getopt(3, 0x7ffdf2c34e40, "dFnPt::")             = -1
      memset(0x7ffdf2c34b00, '\0', 104)                = 0x7ffdf2c34b00
      strlen("general")                                = 7
      strlen("lod.*.mdt_hash=crush")                   = 20
      malloc(72)                                       = 0x1dc5010
      memcpy(0x1dc5038, "general\0", 8)                = 0x1dc5038
      memcpy(0x1dc5040, "lod.*.mdt_hash=crush\0", 21)  = 0x1dc5040
      memset(0x7ffdf2c34890, '\0', 576)                = 0x7ffdf2c34890
      strspn("$MGS", "0123456789")                     = 0
      strlen("$MGS")                                   = 4
      nl_socket_alloc(0x7ffdf2c32841, 0x7ffdf2c32841, 0xffffffff, 0x7ffdf2c32841) = 0x1dc5150
      yaml_parser_initialize(0x7ffdf2c30170, 0x1dc5190, 16, 9) = 1
      yaml_parser_set_input_netlink(0x7ffdf2c30170, 0x1dc5150, 0, 0x1dc5150) = 1
      yaml_emitter_initialize(0x7ffdf2c30350, 3, 0, 0x1dd5ca0) = 1
      yaml_emitter_set_output_netlink(0x7ffdf2c30350, 0x1dc5150, 0x42f96f, 1) = 0
      yaml_emitter_log_error(0x7ffdf2c30350, 0x7fcc8c6851c0, 0x7fcc8c6851c0, 0x1df2500Writer error: failed to resolve Netlink family id
      ) = 50
      yaml_emitter_delete(0x7ffdf2c30350, 0, 0, 50)    = 0
      yaml_parser_parse(0x7ffdf2c30170, 0x7ffdf2c30100, 0x7ffdf2c30100, 0) = 1
      yaml_event_delete(0x7ffdf2c30100, 0x7ffdf2c30100, 0, 0) = 0
      yaml_parser_parse(0x7ffdf2c30170, 0x7ffdf2c30100, 0x7ffdf2c30100, 0) = 1
      yaml_event_delete(0x7ffdf2c30100, 0x7ffdf2c30100, 0, 0) = 0
      yaml_parser_parse(0x7ffdf2c30170, 0x7ffdf2c30100, 0x7ffdf2c30100, 0) = 1
      yaml_event_delete(0x7ffdf2c30100, 0x7ffdf2c30100, 0, 0) = 0
      yaml_parser_parse(0x7ffdf2c30170, 0x7ffdf2c30100, 0x7ffdf2c30100, 0) = 1
      yaml_event_delete(0x7ffdf2c30100, 0x7ffdf2c30100, 0, 0) = 0
      yaml_parser_parse(0x7ffdf2c30170, 0x7ffdf2c30100, 0x7ffdf2c30100, 0) = 1
      :
      [repeats forever]
      :
      

      Running "lctl get_param lod.*.mdt_hash" or "lctl set_param lod.*.mdt_hash=crush" works OK, so it appears to be something to do with setting permanent parameters using netlink:

      # ltrace ./lustre/utils/.libs/lt-lctl set_param lod.*.mdt_hash=crush |& tee /tmp/lctl-ok.log
      __libc_start_main(0x4209b5, 3, 0x7fffaf57bb78, 0x42af80 <unfinished ...>
      setlinebuf(0x7fe438feb400)                       = <void>
      register_ioc_dev(0, 0x42b257, 0x7fffaf57bb78, 0) = 0
      register_ioc_dev(1, 0x42e0ce, 0x7fffaf57bb78, 0) = 1
      llapi_set_command_name(0x7fffaf57bfac, 0x7fffaf57bb78, 0x7fffaf57bb78, 0) = 0x7fe43a1f25a0
      cfs_parser(3, 0x7fffaf57bb78, 0x63ea40, 0x7fffaf57bb78 <unfinished ...>
      memset(0x7fffaf57b9b0, '\0', 8)                  = 0x7fffaf57b9b0
      getopt(2, 0x7fffaf57bb80, "dFnPt::")             = -1
      strchr("lod.*.mdt_hash=crush", '=')              = "=crush"
      strrchr("lod.*.mdt_hash", '\\')                  = nil
      strchr("lod.*.mdt_hash", '@')                    = nil
      strchr("lod.*.mdt_hash", '.')                    = ".*.mdt_hash"
      strchr("*.mdt_hash", '.')                        = ".mdt_hash"
      strchr("mdt_hash", '.')                          = nil
      llapi_param_get_paths(0x7fffaf57bfb6, 0x7fffaf57b8a0, 0x7fffaf57b8a0, 2) = 0
      calloc(4, 8)                                     = 0x20d4090
      __xstat64(1, "/sys/fs/lustre/lod/testfs-MDT000"..., 0x7fffaf579800) = 0
      strstr("/sys/fs/lustre/lod/testfs-MDT000"..., "/lustre/") = "/lustre/lod/testfs-MDT0000-mdtlo"...
      strdup("lod/testfs-MDT0000-mdtlov/mdt_ha"...)    = 0x20d4300
      strchr("lod/testfs-MDT0000-mdtlov/mdt_ha"..., '/') = "/testfs-MDT0000-mdtlov/mdt_hash"
      strchr(".testfs-MDT0000-mdtlov/mdt_hash", '/')   = "/mdt_hash"
      strchr(".mdt_hash", '/')                         = nil
      open64("/sys/fs/lustre/lod/testfs-MDT000"..., 1, 025725734660) = 3
      strlen("crush")                                  = 5
      write(3, "crush", 5)                             = 5
      strlen("crush")                                  = 5
      printf("%s=%s\n", "lod.testfs-MDT0000-mdtlov.mdt_ha"..., "crush"lod.testfs-MDT0000-mdtlov.mdt_hash=crush
      ) = 41
      close(3)                                         = 0
      :
      [repeats for other MDTs]
      :
      

      I thought there were fallbacks for old behavior in case the Netlink/YAML was not working?

      Attachments

        Issue Links

          Activity

            [LU-17687] 'lctl set_param/conf_param' hung running on older system
            pjones Peter Jones added a comment -

            Merged for 2.16

            pjones Peter Jones added a comment - Merged for 2.16

            "Oleg Drokin <green@whamcloud.com>" merged in patch https://review.whamcloud.com/c/fs/lustre-release/+/55881/
            Subject: LU-17687 utils: Netlink doesn't return for missing MGS device
            Project: fs/lustre-release
            Branch: master
            Current Patch Set:
            Commit: e67edc5fc9ed83aa140ab2ce798db154a450c0a0

            gerrit Gerrit Updater added a comment - "Oleg Drokin <green@whamcloud.com>" merged in patch https://review.whamcloud.com/c/fs/lustre-release/+/55881/ Subject: LU-17687 utils: Netlink doesn't return for missing MGS device Project: fs/lustre-release Branch: master Current Patch Set: Commit: e67edc5fc9ed83aa140ab2ce798db154a450c0a0

            I wouldn't say a hard requirement.

            simmonsja James A Simmons added a comment - I wouldn't say a hard requirement.
            pjones Peter Jones added a comment -

            OK, but presumably that is not a must-have for 2.16?

            pjones Peter Jones added a comment - OK, but presumably that is not a must-have for 2.16?

            The patch I pushed did have some fixes. We should still land it.

            simmonsja James A Simmons added a comment - The patch I pushed did have some fixes. We should still land it.
            yujian Jian Yu added a comment -

            It turned out I had an old lnet.ko installed in /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/kernel/net/lustre/, while the new lnet.ko was installed in /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/, which caused the following failure:

            # depmod -eF /boot/System.map-4.18.0-477.27.1.el8_lustre.x86_64 
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_ni_add_interface
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_setup
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_strnid
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_md_discarded
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_nid_to_ni_addref
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_nidstr_r
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_idstr
            depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_inet_select
            

            After removing the old modules, the issue was resolved.

            yujian Jian Yu added a comment - It turned out I had an old lnet.ko installed in /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/ kernel/net/lustre /, while the new lnet.ko installed in /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/ lustre/net /, which caused the following failure: # depmod -eF /boot/System.map-4.18.0-477.27.1.el8_lustre.x86_64 depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_ni_add_interface depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_setup depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_strnid depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_md_discarded depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_nid_to_ni_addref depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_nidstr_r depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol libcfs_idstr depmod: WARNING: /lib/modules/4.18.0-477.27.1.el8_lustre.x86_64/extra/lustre/net/in-kernel-ko2iblnd.ko needs unknown symbol lnet_inet_select After removing the old modules, the issue was resolved.
            yujian Jian Yu added a comment -

            Sure, simmonsja. Let me figure it out.

            yujian Jian Yu added a comment - Sure, simmonsja . Let me figure it out.

            This code is more than a year old. Why is it breaking now? Did something land that broke this? Can you try a git bisect?

            simmonsja James A Simmons added a comment - This code is more than a year old. Why is it breaking now? Did something land that broke this? Can you try a git bisect?
            yujian Jian Yu added a comment -

            Separating client and server on two nodes also hit the same issue.
            The failure can be reproduced by just running llmount.sh after installing the el8.8 kernel and lustre rpms from build https://build.whamcloud.com/job/lustre-reviews/106407/:

            # cd /usr/lib64/lustre/tests
            #  ./llmount.sh 
            mgs: Red Hat Enterprise Linux release 8.8 (Ootpa)
            MGS_OS_ID_LIKE=fedora rhel
            MGS_OS_VERSION_ID=8.8
            MGS_OS_ID=rhel
            MGS_OS_VERSION_CODE=134742016
            mds1: Red Hat Enterprise Linux release 8.8 (Ootpa)
            MDS1_OS_VERSION_ID=8.8
            MDS1_OS_VERSION_CODE=134742016
            MDS1_OS_ID_LIKE=fedora rhel
            MDS1_OS_ID=rhel
            ost1: Red Hat Enterprise Linux release 8.8 (Ootpa)
            OST1_OS_VERSION_CODE=134742016
            OST1_OS_ID_LIKE=fedora rhel
            OST1_OS_VERSION_ID=8.8
            OST1_OS_ID=rhel
            client: Red Hat Enterprise Linux release 8.8 (Ootpa)
            CLIENT_OS_ID=rhel
            CLIENT_OS_VERSION_CODE=134742016
            CLIENT_OS_VERSION_ID=8.8
            CLIENT_OS_ID_LIKE=fedora rhel
            Stopping clients: vm88 /mnt/lustre (opts:-f)
            Stopping clients: vm88 /mnt/lustre2 (opts:-f)
            vm88: executing set_hostid
            Loading modules from /usr/lib64/lustre/tests/..
            detected 1 online CPUs by sysfs
            libcfs will create CPU partition based on online CPUs
            ../lnet/lnet/lnet options: 'networks=tcp0(enp0s8) accept=all'
            ptlrpc/ptlrpc options: 'lbug_on_grant_miscount=1'
            quota/lquota options: 'hash_lqs_cur_bits=3'
            Formatting mgs, mds, osts
            Format mds1: /tmp/lustre-mdt1
            Writer error: failed to resolve Netlink family id
            Format ost1: /tmp/lustre-ost1
            Writer error: failed to resolve Netlink family id
            Format ost2: /tmp/lustre-ost2
            Writer error: failed to resolve Netlink family id
            Checking servers environments
            Checking clients vm88 environments
            Loading modules from /usr/lib64/lustre/tests/..
            detected 1 online CPUs by sysfs
            libcfs will create CPU partition based on online CPUs
            Setup mgs, mdt, osts
            Starting mds1: -o localrecov  /dev/mapper/mds1_flakey /mnt/lustre-mds1
            Commit the device label on /tmp/lustre-mdt1
            Started lustre-MDT0000
            Writer error: failed to resolve Netlink family id
            No device found for name MGS: Invalid argument
            This command must be run on the MGS.
            error: executing set_param: No such device
            
            yujian Jian Yu added a comment - Separating client and server on two nodes also hit the same issue. The failure can be reproduced by just running llmount.sh after installing the el8.8 kernel and lustre rpms from build https://build.whamcloud.com/job/lustre-reviews/106407/: # cd /usr/lib64/lustre/tests # ./llmount.sh mgs: Red Hat Enterprise Linux release 8.8 (Ootpa) MGS_OS_ID_LIKE=fedora rhel MGS_OS_VERSION_ID=8.8 MGS_OS_ID=rhel MGS_OS_VERSION_CODE=134742016 mds1: Red Hat Enterprise Linux release 8.8 (Ootpa) MDS1_OS_VERSION_ID=8.8 MDS1_OS_VERSION_CODE=134742016 MDS1_OS_ID_LIKE=fedora rhel MDS1_OS_ID=rhel ost1: Red Hat Enterprise Linux release 8.8 (Ootpa) OST1_OS_VERSION_CODE=134742016 OST1_OS_ID_LIKE=fedora rhel OST1_OS_VERSION_ID=8.8 OST1_OS_ID=rhel client: Red Hat Enterprise Linux release 8.8 (Ootpa) CLIENT_OS_ID=rhel CLIENT_OS_VERSION_CODE=134742016 CLIENT_OS_VERSION_ID=8.8 CLIENT_OS_ID_LIKE=fedora rhel Stopping clients: vm88 /mnt/lustre (opts:-f) Stopping clients: vm88 /mnt/lustre2 (opts:-f) vm88: executing set_hostid Loading modules from /usr/lib64/lustre/tests/.. detected 1 online CPUs by sysfs libcfs will create CPU partition based on online CPUs ../lnet/lnet/lnet options: 'networks=tcp0(enp0s8) accept=all' ptlrpc/ptlrpc options: 'lbug_on_grant_miscount=1' quota/lquota options: 'hash_lqs_cur_bits=3' Formatting mgs, mds, osts Format mds1: /tmp/lustre-mdt1 Writer error: failed to resolve Netlink family id Format ost1: /tmp/lustre-ost1 Writer error: failed to resolve Netlink family id Format ost2: /tmp/lustre-ost2 Writer error: failed to resolve Netlink family id Checking servers environments Checking clients vm88 environments Loading modules from /usr/lib64/lustre/tests/.. 
detected 1 online CPUs by sysfs libcfs will create CPU partition based on online CPUs Setup mgs, mdt, osts Starting mds1: -o localrecov /dev/mapper/mds1_flakey /mnt/lustre-mds1 Commit the device label on /tmp/lustre-mdt1 Started lustre-MDT0000 Writer error: failed to resolve Netlink family id No device found for name MGS: Invalid argument This command must be run on the MGS. error: executing set_param: No such device
            simmonsja James A Simmons added a comment - - edited

            Is this only the case when everything is on one node on RHEL8? The above error shows it going back to the old ioctl but still failing.

            simmonsja James A Simmons added a comment - - edited Is this only the case when everything is on one node on RHEL8? The above error shows it going back to the old ioctl but still failing.

            People

              simmonsja James A Simmons
              adilger Andreas Dilger
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: