# This file describes how we checked whether SLES11 SP3 clients work with # Lustre 2.4.3. Previously we had done similar tests with Lustre 2.4.2. # We also applied new debug settings as instructed by DDN support. # Check installed versions # ======================== [root@pfscn1 ~]# pdsh -a cat /proc/fs/lustre/version | dshbak -c ---------------- pfscn[1-4] ---------------- lustre: 2.4.1 kernel: patchless_client build: EXAScaler-ddn1.0--PRISTINE-2.6.32-358.18.1.el6_lustre.es143.devel.x86_64 er2341@iccn997:~> sudo pdsh -a cat /proc/fs/lustre/version | dshbak -c ---------------- iccn[001-009,996-997,999] ---------------- lustre: 2.4.3 kernel: patchless_client build: jenkins-arch=x86_64,build_type=client,distro=sles11sp2,ib_stack=inkernel-73--PRISTINE-../lustre/scripts ---------------- # Preparation steps # ================= # see file check_SLES11SP3_20140210.txt # Follow instructions of support for debug settings: er2341@iccn997:~> sudo pdsh -w iccn[001-008] /usr/sbin/lctl set_param debug=cache er2341@iccn997:~> sudo pdsh -w iccn[001-008] '/usr/sbin/lctl set_param debug="+rpctrace +dlmtrace"' er2341@iccn997:~> sudo pdsh -w iccn[001-008] /usr/sbin/lctl set_param debug_mb=1024 er2341@iccn997:~> sudo pdsh -w iccn[001-008] lctl clear [root@pfscn1 ~]# pdsh -a /usr/sbin/lctl set_param debug=cache [root@pfscn1 ~]# pdsh -a '/usr/sbin/lctl set_param debug="+rpctrace +dlmtrace"' [root@pfscn1 ~]# pdsh -a /usr/sbin/lctl set_param debug_mb=1024 [root@pfscn1 ~]# pdsh -a lctl clear # Start tests to run in parallel on icc # ===================================== # We also test failover with oss2 of the pfscdat2 file system. [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 # We used screen to create 4 sessions. 
# On first session run parallel_dd: er2341@iccn997:~> /pfs/data1/perftest/bin/parallel_dd -b /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -s 10000 -n iccn[001-003] -p 1:1:12 -v # On second session run parallel_bonnie: er2341@iccn997:~> /pfs/data1/perftest/bin/parallel_bonnie -e /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -n 16 -l iccn[004-006] -p 12 # On third session run fstest: er2341@iccn007:~> /pfs/data2/perftest/fstest/fstest /pfs/data2/perftest/fstest/tmp > /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.out 2> /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.err # On fourth session copy file tree: er2341@iccn008:/pfs/data2/perftest/failover> for iter in $(seq 2 30); do echo $iter; date; mkdir bind_cp$iter; cp -p -r bind_cp`expr $iter - 1`/* bind_cp$iter/; date; done # Reboot OSS # ========== # wait 3 minutes [root@mds1 ~]# pdsh -w pfscn4 shutdown -r now # wait a minute [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn3 # wait a few minutes [root@pfscn1 ~]# pdsh -a uptime pfscn1: 17:02:40 up 3 days, 1:45, 2 users, load average: 0.08, 0.07, 0.01 pfscn2: 17:02:40 up 3 days, 1:26, 0 users, load average: 1.06, 0.59, 0.32 pfscn4: 17:02:40 up 6 min, 0 users, load average: 0.76, 1.30, 0.77 pfscn3: 17:02:40 up 101 days, 1:49, 0 users, load average: 0.36, 0.54, 0.62 [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 [root@pfscn1 ~]# pdsh -a lustre_recovery_status.sh | grep -v COMPLETE # Check and stop tests # ==================== # Check parallel_dd: ... Wrote 9999 MB on 1 nodes (3 tasks) in 35.53 sec: throughput is 281.4 MB/s. Read 9999 MB on 1 nodes (3 tasks) in 1.24 sec: throughput is 8033.0 MB/s. Deleted 3 files on 1 nodes (3 tasks) in 0.25 sec: 12.0 deletes/s. Wrote 9996 MB on 2 nodes (3 tasks) in 28.83 sec: throughput is 346.8 MB/s. Read 9996 MB on 2 nodes (3 tasks) in 4.77 sec: throughput is 2094.6 MB/s. 
Deleted 6 files on 2 nodes (3 tasks) in 0.29 sec: 20.4 deletes/s. !! Error in last command (pdsh -f 3 -w iccn[001,002,003] LANG=C /pfs/data1/perftest/bin/parallel_dd_doit w /pfs/data2/perftest/tmp 3 1111 2>/tmp/.parallel_dd.err.34806): !! iccn002: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_3_iccn002 bs=1M count=1111 2> /tmp/.iccn002.3.ddout.17793) did not have expected output on STDERR: !! iccn002: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_iccn002': Input/output error" !! iccn002: Exiting... !! pdsh@iccn997: iccn002: ssh exited with exit code 1 !! iccn001: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_3_iccn001 bs=1M count=1111 2> /tmp/.iccn001.3.ddout.21693) did not have expected output on STDERR: !! iccn001: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_iccn001': Input/output error" !! iccn001: Exiting... !! pdsh@iccn997: iccn001: ssh exited with exit code 1 !! iccn003: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_3_iccn003 bs=1M count=1111 2> /tmp/.iccn003.3.ddout.14031) did not have expected output on STDERR: !! iccn003: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_iccn003': Input/output error" !! iccn003: Exiting... !! pdsh@iccn997: iccn003: ssh exited with exit code 1 Exiting... # Check parallel_bonnie: | 1 | 2 | 2 | 3 | 3 | 3 | 1 | 1 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 3 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 !! Error in last command (pdsh -f 3 -w iccn[004,005,006] /pfs/data1/perftest/bin/parallel_bonnie_doit /pfs/data1/perftest/bin /pfs/data2/perftest/tmp 4 1 2>/tmp/.parallel_bonnie.err.34813): !! iccn004: Output string of time command in file /tmp/.iccn004.2.time.11861 has wrong format: "Command exited with non-zero status 1 !! iccn004: 7:22.22"! Exiting... !! pdsh@iccn997: iccn004: ssh exited with exit code 1 Exiting... # Check fstest: # Was still running. 
Typed Ctrl+C er2341@iccn007:~> ls -l /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.* -rw-r--r-- 1 er2341 scc 0 20. Mär 16:47 /pfs/work1/perftest/tmp/fstest_20140320.err -rw-r--r-- 1 er2341 scc 1624 20. Mär 16:50 /pfs/work1/perftest/tmp/fstest_20140320.out er2341@iccn007:~> egrep -i "err|fail" /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.out # Check copy tests: ... cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/win32/bindevt/bindevt.dsw“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/win32/bindevt/bindevt.mak“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/win32/bindevt/bindevt.dsp“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/win32/bindevt/bindevt.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/tests/include/Makefile.in“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/tests/include/tests/Makefile.in“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/tests/include/tests/t_api.h“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/tests/Makefile.in“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/lib/tests/t_api.c“: Eingabe-/Ausgabefehler Do 20. Mär 17:00:42 CET 2014 7 Do 20. Mär 17:00:42 CET 2014 Do 20. Mär 17:01:18 CET 2014 8 Do 20. Mär 17:01:18 CET 2014 Do 20. Mär 17:01:51 CET 2014 9 Do 20. Mär 17:01:51 CET 2014 Do 20. Mär 17:02:25 CET 2014 10 Do 20. Mär 17:02:25 CET 2014 Do 20. Mär 17:02:59 CET 2014 11 Do 20. Mär 17:02:59 CET 2014 Do 20. Mär 17:03:34 CET 2014 12 Do 20. Mär 17:03:34 CET 2014 Do 20. Mär 17:04:09 CET 2014 13 Do 20. Mär 17:04:09 CET 2014 Do 20. Mär 17:04:43 CET 2014 14 Do 20. Mär 17:04:43 CET 2014 ^C # Collect data for problem report # =============================== # Collect lctl dk on servers: [root@pfscn1 ~]# pdsh -a 'cd /tmp; lctl dk `hostname`.llog' pfscn4: Debug log: 61048 lines, 61048 kept, 0 dropped, 0 bad. pfscn1: Debug log: 614588 lines, 614588 kept, 0 dropped, 0 bad. 
pfscn3: Debug log: 1150996 lines, 1150996 kept, 0 dropped, 0 bad. pfscn2: Debug log: 2311345 lines, 2311345 kept, 0 dropped, 0 bad. [root@pfscn1 ~]# mkdir logs/lctl_dk [root@pfscn1 ~]# pdsh -a 'scp /tmp/`hostname`.llog pfscn1:/root/logs/lctl_dk/' [root@pfscn1 ~]# ls -lh logs/lctl_dk/ [root@pfscn1 ~]# cd logs; tar cvzf server_lctl_dk_`date +%Y%m%d`.tgz lctl_dk/ [root@pfscn1 logs]# rm -fr lctl_dk/; cd - rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp root@pfscn1:logs/server_lctl_dk_`date +%Y%m%d`.tgz . # Collect lctl dk on clients: er2341@iccn997:~> sudo pdsh -w iccn[001-008] 'cd /tmp; lctl dk `hostname`.llog' iccn007: Debug log: 860599 lines, 860599 kept, 0 dropped, 0 bad. iccn008: Debug log: 1168339 lines, 1168339 kept, 0 dropped, 0 bad. iccn003: Debug log: 1265640 lines, 1265640 kept, 0 dropped, 0 bad. iccn004: Debug log: 1563204 lines, 1563204 kept, 0 dropped, 0 bad. iccn002: Debug log: 1881922 lines, 1881922 kept, 0 dropped, 0 bad. iccn005: Debug log: 1548983 lines, 1548983 kept, 0 dropped, 0 bad. iccn006: Debug log: 1514179 lines, 1514179 kept, 0 dropped, 0 bad. iccn001: Debug log: 2158394 lines, 2158394 kept, 0 dropped, 0 bad. er2341@iccn997:~> mkdir /tmp/problem_lctl_dk er2341@iccn997:~> sudo pdsh -w iccn[001-008] 'scp /tmp/`hostname`.llog iccn997:/tmp/problem_lctl_dk/' er2341@iccn997:~> sudo chown er2341 /tmp/problem_lctl_dk/* er2341@iccn997:~> cd /tmp; tar cvzf client_lctl_dk_`date +%Y%m%d`.tgz problem_lctl_dk/ er2341@iccn997:/tmp> rm -rf problem_lctl_dk/; cd - rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp er2341@icc:/tmp/client_lctl_dk_`date +%Y%m%d`.tgz . er2341@iccn997:~> rm /tmp/client_lctl_dk_`date +%Y%m%d`.tgz # Collect es_showall on servers: [root@pfscn1 ~]# es_showall --include-logs [root@pfscn1 ~]# mv es_lustre_showall_`date +%Y-%m-%d`_*.tar.bz2 logs/ rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp root@pfscn1:logs/es_lustre_showall_`date +%Y-%m-%d`_\*.tar.bz2 . 
# Collect messages on clients: er2341@iccn997:~> mkdir /tmp/logs/ er2341@iccn997:~> sudo pdsh -w iccn[001-008] 'scp /var/log/messages iccn997:/tmp/logs/messages_$(hostname | sed -e "s/\..*//")' er2341@iccn997:~> sudo chown er2341 /tmp/logs/* er2341@iccn997:~> cd /tmp/logs/; tar czvf client_messages_`date +%Y%m%d`.tgz messages_* rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp er2341@icc:/tmp/logs/client_messages_`date +%Y%m%d`.tgz . er2341@iccn997:/tmp/logs> cd -; rm -rf /tmp/logs/ # Send data to ftp site: rz54@rzm-laifer2:~/UCFS/Problem/Problem_data$ ls | grep $(date +%Y-%m-%d) es_lustre_showall_2014-03-20_170825.tar.bz2 rz54@rzm-laifer2:~/UCFS/Problem/Problem_data$ ls | grep $(date +%Y%m%d) client_lctl_dk_20140320.tgz client_messages_20140320.tgz server_lctl_dk_20140320.tgz # ftp upload ftp ftp.ddntsr.com User: anonymous Pass: roland.laifer@kit.edu cd /upload bin put # repeat for all 4 files bye # Cleanup # ======= # parallel_dd: er2341@iccn997:~> rm /pfs/data2/perftest/tmp/dd_test* # parallel_bonnie: er2341@iccn997:~> rm -rf /pfs/data2/perftest/tmp/icc* # fstest: er2341@iccn007:~> rm -rf /pfs/data2/perftest/fstest/tmp/fstest* # copy file tree: er2341@iccn008:/pfs/data2/perftest/failover> rm -rf bind_cp1[0-9]* bind_cp[2-9]* # For further discussion see SR 30502.