Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-18439

LLTPfcntl tests are failing during regression IO test

Details

    • Bug
    • Resolution: Unresolved
    • Major
    • None
    • Lustre 2.16.0
    • None
    • 3
    • 9223372036854775807

    Description

      A number of CL_LLTP/LLTPfcntl tags failed during the regression IO run. Failures of the following kind were found in the tag output.

       Job Script: command started at Wed Oct 16 01:52:35 CDT 2024
       fcntl19     0  TINFO  :  Enter block 1
       fcntl19     0  TINFO  :  Test block 1: PASSED
       fcntl19     0  TINFO  :  Exit block 1
       fcntl19     0  TINFO  :  Enter block 2
       fcntl19     0  TINFO  :  Test block 2: PASSED
       fcntl19     0  TINFO  :  Exit block 2
       fcntl19     0  TINFO  :  Enter block 3
       fcntl19     0  TINFO  :  Test block 3: PASSED
       fcntl19     0  TINFO  :  Exit block 3
       fcntl19     0  TINFO  :  Enter blcok 4
       fcntl19     0  TINFO  :  Test block 4: PASSED
       fcntl19     0  TINFO  :  Exit block 4
       fcntl19     0  TINFO  :  Enter block 5
       fcntl19     1  TFAIL  :  fcntl19.c:190: region length is wrong, should be 3 is 7
       fcntl19     0  TINFO  :  Test block 5: FAILED
       fcntl19     0  TINFO  :  Exit block 5
       fcntl19     0  TINFO  :  Enter block 6
       fcntl19     2  TFAIL  :  fcntl19.c:190: region length is wrong, should be 4 is 8
       fcntl19     0  TINFO  :  Test block 6: FAILED
       fcntl19     0  TINFO  :  Exit block 6
       fcntl19     0  TINFO  :  Enter block 7
       fcntl19     0  TINFO  :  Test block 7: PASSED
       fcntl19     0  TINFO  :  Exit block 7
       Application 8246098 exit codes: 1 

      Test command line : 

      test cmdline ......: ubrun -t -o -T LLTPfcntl11 -E "LTPROOT=$LLTPROOT" -e LLTPROOT -p LLTPROOT=testcases/bin LLTPROOT=testcases/bin/fcntl11 

      Passed job :

      ubrun: Env LLTPROOT_CL=/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/xtcnl/ltp.latest/ROOT.latest
      ubrun: Prepending path  : '/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/xtcnl/ltp.latest/ROOT.latest/testcases/bin'
      ubrun: Binary Info: -rwxrwxr-x 1 vers tsttool 392704 May  7  2021 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/xtcnl/ltp.latest/ROOT.latest/testcases/bin/ftest04
      ubrun: Execute Cmd: '  aptrun -n1 -M1 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/xtcnl/ltp.latest/ROOT.latest/testcases/bin/ftest04'
      CL_LLTPftest04    1  PASS  :  No failures found with the command 'aptrun'     + The return value was 0 as expected.
      aptrun : Launch cmd is 'aprun -n 1 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/xtcnl/ltp.latest/ROOT.latest/testcases/bin/ftest04'
      Job Script: command started at Sun Nov 3 23:18:35 CST 2024
      ftest04     1  TPASS  :  Test passed.
      Application 8328916 resources: utime ~0s, stime ~1s, Rss ~9100, inblocks ~2152, outblocks ~83832
      Job Script: command stopped at Sun Nov 3 23:18:39 CST 2024
      Job Script: command runtime was 4 seconds
      
      jupiter-p1:/lus/kjcf05/nivi/rerun # ubrun -t -o -T LLTPmsgctl11 -E "LTPROOT=$LLTPROOT" -e LLTPROOT -p LLTPROOT=testcases/bin LLTPROOT=testcases/bin/msgctl11
      ubrun: Env LLTPROOT=/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest
      ubrun: Prepending path  : '/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin'
      ubrun: Binary Info: -rwxrwxr-x 1 vers tsttool 414048 May  7  2021 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/msgctl11
      ubrun: Execute Cmd: '  /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/msgctl11'
      LLTPmsgctl11    1  PASS  :  No failures found with the command 'LLTPROOT=testcases/bin/msgctl11'     + The return value was 0 as expected.
      msgctl11    0  TINFO  :  Found 32000 available message queues
      msgctl11    0  TINFO  :  Using upto 16114 pids
      msgctl11    1  TPASS  :  msgctl11 ran successfully!
      jupiter-p1:/lus/kjcf05/nivi/rerun #
      
      jupiter-p1:/lus/kjcf05/nivi/rerun #  jupiter-p1:/lus/kjcf05/nivi/rerun # ubrun -t -o -T LLTPftest01 -E "LTPROOT=$LLTPROOT" -e LLTPROOT -p LLTPROOT=testcases/bin LLTPROOT=testcases/bin/ftest01
      ubrun: Env LLTPROOT=/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest
      ubrun: Prepending path  : '/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin'
      ubrun: Binary Info: -rwxrwxr-x 1 vers tsttool 400080 May  7  2021 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/ftest01
      ubrun: Execute Cmd: '  /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/ftest01'
      LLTPftest01    1  PASS  :  No failures found with the command 'LLTPROOT=testcases/bin/ftest01'     + The return value was 0 as expected.
      ftest01     1  TPASS  :  Test passed in fork and wait.
      ftest01     2  TPASS  :  Test passed.
      
      jupiter-p1:/lus/kjcf05/nivi/rerun # ubrun -t -o -T LLTPftest06 -E "LTPROOT=$LLTPROOT" -e LLTPROOT -p LLTPROOT=testcases/bin LLTPROOT=testcases/bin/ftest06
      ubrun: Env LLTPROOT=/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest
      ubrun: Prepending path  : '/hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin'
      ubrun: Binary Info: -rwxrwxr-x 1 vers tsttool 393168 May  7  2021 /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/ftest06
      ubrun: Execute Cmd: '  /hpcdc/project/craytest/ostest/binaries/xt/rel.70up03.aries.cray-sp2/baselinux/ltp.latest/ROOT.latest/testcases/bin/ftest06'
      LLTPftest06    1  PASS  :  No failures found with the command 'LLTPROOT=testcases/bin/ftest06'     + The return value was 0 as expected.
      ftest06     1  TPASS  :  Test passed.
      ftest06     2  TPASS  :  Test passed.
      jupiter-p1:/lus/kjcf05/nivi/rerun #  

      console-20241103

      Attachments

        1. console-20241103
          96 kB
        2. console-20241104
          93 kB
        3. kern
          11.01 MB
        4. messages
          33.30 MB
        5. messages-20241103
          898 kB
        6. messages-20241104
          1.12 MB

        Issue Links

          Activity

            [LU-18439] LLTPfcntl tests are failing during regression IO test
            rajeevm Rajeev Mishra added a comment - fixed by https://review.whamcloud.com/c/fs/lustre-release/+/57105

            Multiple commits could be the cause of the problem, as the flock logic has been changed:

            0e2f9f8568 LU-11085 tests: add a test case for same range lock
            a863101084 LU-17276 tests: performance test for same range lock
            df0663f5a3 LU-11085 lustre: remove interval-tree code
            ea012dfb2a LU-17276 ldlm: use interval tree for searching in flock
            1c635e263f LU-17276 ldlm: convert flock locks to linux interval tree.
            0cf356c4e4 LU-11085 ldlm: optimise extent locks with identical extent
            d2ff746a99 LU-11085 ldlm: convert ldlm extent locks to linux extent-tree
            f5e73b3efb LU-11085 ldlm: move interval_insert call from ldlm_lock to ldlm_extent
            d199b178ed LU-11085 ldlm: save space in struct ldlm_lock
            c5b8ac83bd LU-11085 ldlm: simplify use of interval-tree.
            887889b333 LU-11085 tests: Add performance test for ldlm_extent code
            f94dca9dcb LU-17276 tests: Enqueue same range flocks
            4527751bec LU-17276 ldlm: add interval in flock
            6377859352 LU-17276 tests: performance test case for flock
            0e008ef67c LU-11085 llite: reimplement range_lock with Linux interval_tree
            f684172237 LU-11085 mdt: revise recording of hsm progress updates.
            f55fdfff5d LU-11085 nodemap: switch interval tree to in-kernel impl.
            ec138c5c58 LU-11085 ldlm: change lock_matches() to return bool.

            I am still working on the fix

            rajeevm Rajeev Mishra added a comment - multiple commit which can be cause of the problem, as flock logic has been changed 0e2f9f8568 LU-11085 tests: add a test case for same range lock a863101084 LU-17276 tests: performance test for same range lock df0663f5a3 LU-11085 lustre: remove interval-tree code ea012dfb2a LU-17276 ldlm: use interval tree for searching in flock 1c635e263f LU-17276 ldlm: convert flock locks to linux interval tree. 0cf356c4e4 LU-11085 ldlm: optimise extent locks with identical extent d2ff746a99 LU-11085 ldlm: convert ldlm extent locks to linux extent-tree f5e73b3efb LU-11085 ldlm: move interval_insert call from ldlm_lock to ldlm_extent d199b178ed LU-11085 ldlm: save space in struct ldlm_lock c5b8ac83bd LU-11085 ldlm: simplify use of interval-tree. 887889b333 LU-11085 tests: Add performance test for ldlm_extent code f94dca9dcb LU-17276 tests: Enqueue same range flocks 4527751bec LU-17276 ldlm: add interval in flock 6377859352 LU-17276 tests: performance test case for flock 0e008ef67c LU-11085 llite: reimplement range_lock with Linux interval_tree f684172237 LU-11085 mdt: revise recording of hsm progress updates. f55fdfff5d LU-11085 nodemap: switch interval tree to in-kernel impl. ec138c5c58 LU-11085 ldlm: change lock_matches() to return bool. I am still working on the fix

            This commit may be the cause of the problem. Multiple LTP fcntl tests are failing because of it:
            commit ea012dfb2a94060450c75c4f425f2d8b8c4329f5
            Author: Mr NeilBrown <neilb@suse.de>
            Date: Fri Apr 26 10:40:20 2024 -0400

            LU-17276 ldlm: use interval tree for searching in flock

            This patch converts ldlm_process_flock_lock() to use the new interval
            tree to find flock locks more efficiently.

            Previously all locks the the same owner were adjacent in the
            lr_granted list. This was used for the second stage of merging
            overlapping locks once it was confirmed that there were no conflicts.
            Now instead we build up a temporary list of locks in the target range
            that have the same owner, and use that for the second stage.

            Signed-off-by: Mr NeilBrown <neilb@suse.de>
            Signed-off-by: Yang Sheng <ys@whamcloud.com>
            Change-Id: I0a4f1e833d8db36827c318a020de564a78b0adb5
            Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53951
            Tested-by: jenkins <devops@whamcloud.com>
            Tested-by: Maloo <maloo@whamcloud.com>
            Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
            Reviewed-by: James Simmons <jsimmons@infradead.org>
            Reviewed-by: Oleg Drokin <green@whamcloud.com>

            rajeevm Rajeev Mishra added a comment - This commit may be cause of the problem. Multiple ltp fcntl test are failing due to this commit ea012dfb2a94060450c75c4f425f2d8b8c4329f5 Author: Mr NeilBrown <neilb@suse.de> Date: Fri Apr 26 10:40:20 2024 -0400 LU-17276 ldlm: use interval tree for searching in flock This patch converts ldlm_process_flock_lock() to use the new interval tree to find flock locks more efficiently. Previously all locks the the same owner were adjacent in the lr_granted list. This was used for the second stage of merging overlapping locks once it was confirmed that there were no conflicts. Now instead we build up a temporary list of locks in the target range that have the same owner, and use that for the second stage. Signed-off-by: Mr NeilBrown <neilb@suse.de> Signed-off-by: Yang Sheng <ys@whamcloud.com> Change-Id: I0a4f1e833d8db36827c318a020de564a78b0adb5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53951 Tested-by: jenkins <devops@whamcloud.com> Tested-by: Maloo <maloo@whamcloud.com> Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com> Reviewed-by: James Simmons <jsimmons@infradead.org> Reviewed-by: Oleg Drokin <green@whamcloud.com>

            Code to reproduce the issue:

            // code placeholder
            [rocky@test-build-fcntl ~]$ cat fcntltest.c
            #include <stdio.h>
            #include <stdlib.h>
            #include <unistd.h>
            #include <fcntl.h>
            #include <errno.h>
            #include <string.h>
            #include <sys/stat.h>
            #include <sys/wait.h>
            
            
            /* Descriptor of the test file; assigned in main() and used by the lock helpers. */
            int fd;

            /*
             * Length-modifier prefixes used to build printf conversions:
             * "l" when __WORDSIZE is 64, otherwise "ll" (with an empty pointer prefix).
             */
            #if __WORDSIZE != 64
            #define __PRI64_PREFIX "ll"
            #define __PRIPTR_PREFIX
            #else
            #define __PRI64_PREFIX "l"
            #define __PRIPTR_PREFIX "l"
            #endif

            /* Conversion suffix built from the prefix above: expands to "ld" or "lld". */
            #define PRId64 __PRI64_PREFIX "d"
            
            
            /*
             * Return a printable name for an fcntl lock type.
             *
             * type: lock type value, normally F_RDLCK, F_WRLCK or F_UNLCK.
             *
             * Returns a string literal for the three known types.  Any other value
             * yields "BAD VALUE: <type>" formatted into a static buffer, so that
             * particular result is overwritten by the next call made with an
             * unknown type.
             */
            char *str_type(int type)
            {
                    /*
                     * 20 bytes was too small: "BAD VALUE: -2147483648" needs 23
                     * including the terminator.  The buffer is sized generously and
                     * the write is bounded so it can never overflow.
                     */
                    static char buf[32];

                    switch (type) {
                    case F_RDLCK:
                            return "F_RDLCK";
                    case F_WRLCK:
                            return "F_WRLCK";
                    case F_UNLCK:
                            return "F_UNLCK";
                    default:
                            snprintf(buf, sizeof buf, "BAD VALUE: %d", type);
                            return buf;
                    }
            }
            
            
            /*
             * Apply a byte-range lock operation to the global descriptor fd.
             *
             * cmd:    fcntl command, passed through unchanged (main() uses F_SETLK).
             * type:   value stored in l_type (F_RDLCK, F_WRLCK or F_UNLCK).
             * whence: value stored in l_whence.
             * start:  value stored in l_start.
             * len:    value stored in l_len.
             *
             * Returns 0 on success.  On failure the error is reported with perror()
             * and -1 is returned.
             */
            int set_lock(int cmd, short type, short whence, int start, int len) {
                /*
                 * A designated initialiser zeroes every member that is not named
                 * (l_pid in particular), so no uninitialised bytes are handed to
                 * fcntl().
                 */
                struct flock lock = {
                    .l_type = type,
                    .l_whence = whence,
                    .l_start = start,
                    .l_len = len,
                };

                printf ("fd = %d\n", fd);
                if (fcntl(fd, cmd, &lock) == -1) {
                    perror("fcntl (set_lock)");
                    return -1;
                }
                return 0;
            }
            
            
            /*
             * Query the lock state of the whole file from a forked child and compare
             * the answer with the expected lock description.
             *
             * The F_GETLK request (l_start = 0, l_len = 0, i.e. the whole file) is
             * issued in a child process and the resulting struct flock is sent back
             * through a pipe.  POSIX record locks belong to a process and are not
             * inherited across fork(), so the child sees the locks held by the
             * parent as conflicting locks.
             *
             * type:   lock type used for the query and expected in the answer.
             * whence: l_whence used for the query and expected in the answer.
             * start:  expected l_start of the reported lock.
             * len:    expected l_len of the reported lock.
             * pid:    unused; kept so existing callers keep compiling.
             *
             * Every mismatch is printed to stdout.  Returns 0 when the comparison ran
             * (whether or not mismatches were found) and -1 when the pipe, the fork
             * or the child's query failed.
             */
            int check_lock(short type, short whence, int start, int len, pid_t pid)
            {
                struct flock lock = {
                    .l_type = type,
                    .l_whence = whence,
                    .l_start = 0,
                    .l_len = 0,
                    .l_pid = 0,
                };
                struct flock *fl = &lock;
                int pipefd[2];
                pid_t child;
                ssize_t got;

                (void) pid;

                if (pipe(pipefd) == -1) {
                    perror("pipe");
                    return -1;
                }

                /* Flush now so buffered output is not emitted a second time by the child. */
                fflush(stdout);

                child = fork();
                if (child == -1) {
                    perror("fork");
                    close(pipefd[0]);
                    close(pipefd[1]);
                    return -1;
                }

                if (child == 0) { /* Child: run the query and report the result. */
                    close(pipefd[0]);
                    if (fcntl(fd, F_GETLK, &lock) == -1) {
                        perror("fcntl get (child)");
                        _exit(1);
                    }
                    if (write(pipefd[1], &lock, sizeof(lock)) != (ssize_t) sizeof(lock)) {
                        perror("write (child)");
                        _exit(1);
                    }
                    printf("Child: Lock type: %d\n", lock.l_type);
                    fflush(stdout);
                    /* _exit() skips the stdio/atexit cleanup inherited from the parent. */
                    _exit(0);
                }

                /*
                 * Parent: close the write end first, otherwise read() would block
                 * forever when the child exits without writing anything.
                 */
                close(pipefd[1]);
                got = read(pipefd[0], &lock, sizeof(lock));
                close(pipefd[0]);
                waitpid(child, NULL, 0);

                if (got != (ssize_t) sizeof(lock)) {
                    fprintf(stderr, "check_lock: no lock information received from child\n");
                    return -1;
                }

                printf("Parent: Lock type: %d\n", lock.l_type);
                printf ("i am working \n");

                if (fl->l_type != type)
                    printf ("lock type is wrong should be %s is %s \n",
                            str_type(type), str_type(fl->l_type));

                if (fl->l_whence != whence)
                    printf ("lock whence is wrong should be %d is %d \n",
                            whence, fl->l_whence);

                /* off_t values are widened to long long so "%lld" matches on every ABI. */
                if (fl->l_start != start)
                    printf ("region starts in wrong place, should be "
                            "%d is %lld\n", start, (long long) fl->l_start);

                if (fl->l_len != len)
                    printf ("region length is wrong, should be %d is %lld \n",
                            len, (long long) fl->l_len);

                return 0;
            }
            #define STRINGSIZE      27
            #define STRING          "abcdefghijklmnopqrstuvwxyz\n"

            /*
             * Reproducer for the wrong region length reported by F_GETLK.
             *
             * Creates a temporary file under /mnt/lustre, writes STRING to it, takes
             * a write lock on bytes 10-14 and then a read lock on bytes 14-18, and
             * asks check_lock() to verify that a write lock of length 4 starting at
             * offset 10 is reported.  Finally the whole file is unlocked and the
             * file is removed.
             *
             * Returns 0 when every step could be carried out, 1 otherwise.  Mismatches
             * printed by check_lock() do not affect the exit status.
             */
            int main(void) {
                char template[] = "/mnt/lustre/lock_testXXXXXX";
                const char *buf = STRING;
                int rc = 0;

                fd = mkstemp(template);
                if (fd == -1) {
                    perror("open (create)");
                    return 1;
                }

                if (write(fd, buf, STRINGSIZE) != STRINGSIZE) {
                    perror("write");
                    close(fd);
                    unlink(template);
                    return 1;
                }

                /*
                 * The helpers report their own errors.  errno may have been changed
                 * by the time they return, so plain messages are printed here
                 * instead of calling perror() a second time.
                 */
                if (set_lock(F_SETLK, (short)F_WRLCK, (short)0, 10, 5) < 0) {
                    fprintf(stderr, "set_lck write  faled\n");
                    rc = 1;
                }
                if (set_lock(F_SETLK, (short)F_RDLCK, (short)0, 14, 5) < 0) {
                    fprintf(stderr, "set_lck read faled\n");
                    rc = 1;
                }

                /* Any non-zero result from check_lock() means the check could not run. */
                if (check_lock((short)F_WRLCK, (short)0, 10, 4, getpid()) != 0) {
                    fprintf(stderr, "check faled\n");
                    rc = 1;
                }

                /* start = 0 and len = 0 address the whole file. */
                if (set_lock(F_SETLK, (short)F_UNLCK, (short)0, 0, 0) < 0) {
                    fprintf(stderr, "set_lck unlock faled\n");
                    rc = 1;
                }

                close(fd);
                unlink(template);
                printf("Test complete.\n");
                return rc;
            }
            
            rajeevm Rajeev Mishra added a comment - code to recreate the issue // code placeholder [rocky@test-build-fcntl ~]$ cat fcntltest.c #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <fcntl.h> #include <errno.h> #include <string.h> #include <sys/stat.h> #include <sys/wait.h> int fd; # if __WORDSIZE == 64 # define __PRI64_PREFIX "l" # define __PRIPTR_PREFIX "l" # else # define __PRI64_PREFIX "ll" # define __PRIPTR_PREFIX # endif # define PRId64 __PRI64_PREFIX "d" char *str_type( int type) { static char buf[20]; switch (type) { case F_RDLCK: return ( "F_RDLCK" ); case F_WRLCK: return ( "F_WRLCK" ); case F_UNLCK: return ( "F_UNLCK" ); default : sprintf(buf, "BAD VALUE: %d" , type); return (buf); } } int set_lock( int cmd, short type, short whence, int start, int len) { struct flock lock; lock.l_type = type; lock.l_whence = whence; lock.l_start = start; lock.l_len = len; printf ( "fd = %d\n" , fd); if (fcntl(fd, cmd, &lock) == -1) { perror( "fcntl (set_lock)" ); return -1; } return 0; } int check_lock( short type, short whence, int start, int len, pid_t pid) { struct flock lock; struct flock *fl = &lock; fl->l_type = type; fl->l_whence = whence; fl->l_start = 0; fl->l_len = 0; fl->l_pid = ( short )0; int pipefd[2]; if (pipe(pipefd) == -1) { perror( "pipe" ); return 1; } pid = fork(); if (pid == -1) { perror( "fork" ); return 1; } if (pid == 0) { // Child close(pipefd[0]); // Close the read end of the pipe if (fcntl(fd, F_GETLK, &lock) == -1) { perror( "fcntl get (child)" ); exit(1); } ssize_t bytes_written = write(pipefd[1], &lock, sizeof(lock)); printf( "Child: Lock type: %d\n" , lock.l_type); // Child's copy exit(0); } else { // Parent sleep(1); // Give child time to run waitpid(pid, NULL, 0); read(pipefd[0], &lock, sizeof(lock)); printf( "Parent: Lock type: %d\n" , lock.l_type); // Parent's copy (will be unchanged) printf ( "i am working \n" ); if (fl->l_type != type) printf ( "lock type is wrong should be %s is %s \n" , str_type(type), 
str_type(fl->l_type)); if (fl->l_whence != whence) printf ( "lock whence is wrong should be %d is %d \n" , whence, fl->l_whence); if (fl->l_start != start) printf ( "region starts in wrong place, should be " "%d is %\n" PRId64, start, (int64_t) fl->l_start); if (fl->l_len != len) printf ( "region length is wrong, should be %d is %d \n" PRId64, len, (int64_t) fl->l_len); } return 0; } #define STRINGSIZE 27 #define STRING "abcdefghijklmnopqrstuvwxyz\n" int main() { char template[] = "/mnt/lustre/lock_testXXXXXX" ; // #define FILE_NAME "/mnt/lustre/lock_test.txt" // #define FILE_NAME "lock_test.txt" // strcpy(template, "/mnt/lustre/lock_test.txt" ); fd = mkstemp(template); if (fd == -1) { perror( "open (create)" ); return 1; } char *buf = STRING; if (write(fd, buf, STRINGSIZE) != STRINGSIZE) { perror( "write" ); close(fd); unlink(template); return 1; } if (set_lock(F_SETLK, ( short )F_WRLCK, ( short )0, 10, 5) < 0) { perror( "set_lck write faled" ); } if (set_lock(F_SETLK, ( short )F_RDLCK, ( short )0, 14, 5) < 0) { perror( "set_lck read faled" ); } if (check_lock(( short )F_WRLCK, ( short )0,10, 4, getpid())< 0) { perror( "check faled" ); } if (set_lock(F_SETLK, ( short )F_UNLCK, ( short )0, 0, 0) < 0) { perror( "set_lck unlock faled" ); } close(fd); unlink(template); printf( "Test complete.\n" ); return 0; }

            People

              rajeevm Rajeev Mishra
              prasannakumar Prasannakumar Nagasubramani
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated: