# This file describes how we checked if SLES11 SP3 clients with # Lustre 2.4.2 work correctly on icc and hcd. # Check installed versions # ======================== [root@pfscn1 ~]# pdsh -a cat /proc/fs/lustre/version | dshbak -c ---------------- pfscn[1-4] ---------------- lustre: 2.4.1 kernel: patchless_client build: EXAScaler-ddn1.0--PRISTINE-2.6.32-358.18.1.el6_lustre.es143.devel.x86_64 er2341@iccn997:~> sudo pdsh -a cat /proc/fs/lustre/version | dshbak -c ---------------- iccn[001-009,996-997,999] ---------------- lustre: 2.4.2 kernel: patchless_client build: jenkins-arch=x86_64,build_type=client,distro=sles11sp2,ib_stack=inkernel-70--PRISTINE-../lustre/scripts ---------------- iccn998 ---------------- lustre: 2.4.2 kernel: patchless_client build: jenkins-arch=x86_64,build_type=client,distro=sles11sp2,ib_stack=inkernel-70--PRISTINE-2.6.32-431.3.1.el6.x86_64 er2341@hcdn997:~> sudo pdsh -a cat /proc/fs/lustre/version | dshbak -c hcdn999: cat: /proc/fs/lustre/version: Datei oder Verzeichnis nicht gefunden pdsh@hcdn997: hcdn999: ssh exited with exit code 1 hcdn998: ssh: connect to host hcdn998 port 22: No route to host pdsh@hcdn997: hcdn998: ssh exited with exit code 255 ---------------- hcdn[988-997] ---------------- lustre: 2.4.2 kernel: patchless_client build: jenkins-arch=x86_64,build_type=client,distro=sles11sp2,ib_stack=inkernel-70--PRISTINE-../lustre/scripts # Preparation steps # ================= # see file failover_pfsc_20130711.txt root@hcdn997:~# chown er2341 /pfs/data2/perftest/tmp root@hcdn997:~# chown er2341 /pfs/data2/perftest/failover root@hcdn997:~# chown er2341 /pfs/work1/perftest/tmp root@hcdn997:~# chown er2341 /pfs/data2/perftest/fstest root@hcdn997:~# chown er2341 /pfs/data2/perftest/fstest/* root@hcdn997:~# chmod 755 /pfs/data1/perftest/bin/* rz54@rzm-laifer2:~/PFS/Software/aakef-ql-fstest-1df78602a8d1$ scp * er2341@hcd:/pfs/data2/perftest/fstest er2341@hcdn997:~> cd /pfs/data2/perftest/fstest er2341@hcdn997:/pfs/data2/perftest/fstest> make 
clean er2341@hcdn997:/pfs/data2/perftest/fstest> make # Start tests to run in parallel on hcd # ===================================== # We also test failover with oss2 of the pfscdat2 file system. [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 # We used screen to create 4 sessions. # On first session run parallel_dd: er2341@hcdn997:~> /pfs/data1/perftest/bin/parallel_dd -b /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -s 10000 -n hcdn[993-995] -p 1:1:12 -v # On second session run parallel_bonnie: er2341@hcdn997:~> /pfs/data1/perftest/bin/parallel_bonnie -e /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -n 16 -l hcdn[990-992] -p 12 # On third session run fstest: er2341@hcdn997:~> /pfs/data2/perftest/fstest/fstest /pfs/data2/perftest/fstest/tmp > /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.out 2> /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.err # On fourth session copy file tree: er2341@hcdn997:/pfs/data2/perftest/failover> for iter in $(seq 2 30); do echo $iter; date; mkdir bind_cp$iter; cp -p -r bind_cp`expr $iter - 1`/* bind_cp$iter/; date; done # Reboot OSS # ========== # wait 3 minutes [root@mds1 ~]# pdsh -w pfscn4 shutdown -r now # wait a minute [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn3 # wait a few minutes [root@pfscn1 ~]# pdsh -a uptime pfscn1: 17:13:00 up 63 days, 1:59, 1 user, load average: 0.00, 0.01, 0.00 pfscn2: 17:13:00 up 63 days, 1:59, 0 users, load average: 0.24, 0.42, 0.29 pfscn3: 17:13:00 up 63 days, 1:59, 0 users, load average: 1.03, 1.08, 0.77 pfscn4: 17:13:00 up 3 min, 0 users, load average: 1.56, 0.70, 0.27 [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 [root@pfscn1 ~]# pdsh -a lustre_recovery_status.sh | grep -v COMPLETE # Check and stop tests # ==================== # Check parallel_dd: ... 
Read 10000 MB on 2 nodes (2 tasks) in 5.75 sec: throughput is 1738.7 MB/s. Deleted 4 files on 2 nodes (2 tasks) in 0.42 sec: 9.4 deletes/s. !! Error in last command (pdsh -f 3 -w hcdn[993,994,995] LANG=C /pfs/data1/perftest/bin/parallel_dd_doit w /pfs/data2/perftest/tmp 2 1666 2>/tmp/.parallel_dd.err.20520): !! hcdn995: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_2_hcdn995 bs=1M count=1666 2> /tmp/.hcdn995.2.ddout.327) did not have expected output on STDERR: !! hcdn995: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_hcdn995': Input/output error" !! hcdn995: Exiting... !! pdsh@hcdn997: hcdn995: ssh exited with exit code 1 !! hcdn993: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_2_hcdn993 bs=1M count=1666 2> /tmp/.hcdn993.2.ddout.6398) did not have expected output on STDERR: !! hcdn993: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_hcdn993': Input/output error" !! hcdn993: Exiting... !! hcdn994: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_2_hcdn994 bs=1M count=1666 2> /tmp/.hcdn994.2.ddout.5404) did not have expected output on STDERR: !! hcdn994: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_hcdn994': Input/output error" !! hcdn994: Exiting... !! pdsh@hcdn997: hcdn993: ssh exited with exit code 1 !! pdsh@hcdn997: hcdn994: ssh exited with exit code 1 Exiting... # Check parallel_bonnie: | 1 | 2 | 2 | 3 | 3 | 3 | 1 | 1 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 3 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 !! Error in last command (pdsh -f 3 -w hcdn[990,991,992] /pfs/data1/perftest/bin/parallel_bonnie_doit /pfs/data1/perftest/bin /pfs/data2/perftest/tmp 3 1 2>/tmp/.parallel_bonnie.err.20527): !! hcdn991: Output string of time command in file /tmp/.hcdn991.1.time.26958 has wrong format: "Command exited with non-zero status 1 !! hcdn991: 6:43.53"! Exiting... !! pdsh@hcdn997: hcdn991: ssh exited with exit code 1 !! 
hcdn992: Output string of time command in file /tmp/.hcdn992.1.time.32178 has wrong format: "Command exited with non-zero status 1 !! hcdn992: 6:44.28"! Exiting... !! pdsh@hcdn997: hcdn992: ssh exited with exit code 1 !! hcdn990: Output string of time command in file /tmp/.hcdn990.1.time.28724 has wrong format: "Command exited with non-zero status 1 !! hcdn990: 6:50.88"! Exiting... !! pdsh@hcdn997: hcdn990: ssh exited with exit code 1 Exiting... # Check fstest: # Type Ctrl+C er2341@hcdn997:~> ls -l /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.* -rw-r--r-- 1 er2341 scc 0 10. Feb 17:03 /pfs/work1/perftest/tmp/fstest_20140210.err -rw-r--r-- 1 er2341 scc 1625 10. Feb 17:05 /pfs/work1/perftest/tmp/fstest_20140210.out er2341@hcdn997:~> egrep -i "err|fail" /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.* # Check copy tests: ... cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/byname_test.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/wire_test.data“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/Kchild.example.+003+04017.private“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/b9t.mk“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/shutdown_test.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/entropy2_test.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/tasks/t_tasks.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp5/bind-9.2.4rc4/bin/tests/tasks/Makefile.in“: Eingabe-/Ausgabefehler Mo 10. Feb 17:13:58 CET 2014 7 Mo 10. 
Feb 17:13:58 CET 2014 ^C # Cleanup # ======= # parallel_dd: er2341@hcdn997:~> rm /pfs/data2/perftest/tmp/dd_test* # parallel_bonnie: er2341@hcdn997:~> rm -rf /pfs/data2/perftest/tmp/hcd* # fstest: er2341@hcdn997:~> rm -rf /pfs/data2/perftest/fstest/tmp/fstest* # copy file tree: er2341@hcdn997:/pfs/data2/perftest/failover> rm -rf bind_cp1[0-9]* bind_cp[2-9]* # Preparation steps # ================= # see file failover_pfsc_20130711.txt # Follow the instructions from support: er2341@hcdn997:~> sudo pdsh -w hcdn[990-997] 'sysctl lnet.debug="+vfstrace rpctrace"' er2341@hcdn997:~> sudo pdsh -w hcdn[990-997] lctl clear [root@pfscn1 ~]# pdsh -a 'sysctl lnet.debug="+vfstrace rpctrace"' [root@pfscn1 ~]# pdsh -a lctl clear # Start tests to run in parallel on hcd # ===================================== # We also test failover with oss2 of the pfscdat2 file system. [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 # We used screen to create 4 sessions. 
# On first session run parallel_dd: er2341@hcdn997:~> /pfs/data1/perftest/bin/parallel_dd -b /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -s 10000 -n hcdn[993-995] -p 1:1:12 -v # On second session run parallel_bonnie: er2341@hcdn997:~> /pfs/data1/perftest/bin/parallel_bonnie -e /pfs/data1/perftest/bin -d /pfs/data2/perftest/tmp -n 16 -l hcdn[990-992] -p 12 # On third session run fstest: er2341@hcdn996:~> /pfs/data2/perftest/fstest/fstest /pfs/data2/perftest/fstest/tmp > /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.out 2> /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.err # On fourth session copy file tree: er2341@hcdn997:/pfs/data2/perftest/failover> for iter in $(seq 2 30); do echo $iter; date; mkdir bind_cp$iter; cp -p -r bind_cp`expr $iter - 1`/* bind_cp$iter/; date; done # Reboot OSS # ========== # wait 3 minutes [root@mds1 ~]# pdsh -w pfscn4 shutdown -r now # wait a minute [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn3 # wait a few minutes [root@pfscn1 ~]# pdsh -a uptime pfscn3: 17:43:06 up 71 days, 2:30, 0 users, load average: 1.21, 1.41, 1.02 pfscn1: 17:43:06 up 71 days, 2:29, 1 user, load average: 0.00, 0.00, 0.00 pfscn2: 17:43:06 up 71 days, 2:29, 0 users, load average: 0.15, 0.09, 0.05 pfscn4: 17:43:06 up 3 min, 0 users, load average: 1.72, 1.65, 0.77 [root@pfscn1 ~]# crm_mon -1 | grep ost | grep pfscdat2 ost_pfscdat2_0 (ocf::ddn:lustre_server): Started pfscn4 [root@pfscn1 ~]# pdsh -a lustre_recovery_status.sh | grep -v COMPLETE # Check and stop tests # ==================== # Check parallel_dd: ... Wrote 10000 MB on 1 nodes (1 tasks) in 24.78 sec: throughput is 403.5 MB/s. Read 10000 MB on 1 nodes (1 tasks) in 2.74 sec: throughput is 3649.8 MB/s. Deleted 1 files on 1 nodes (1 tasks) in 0.37 sec: 2.7 deletes/s. Wrote 10000 MB on 2 nodes (1 tasks) in 22.92 sec: throughput is 436.3 MB/s. Read 10000 MB on 2 nodes (1 tasks) in 9.12 sec: throughput is 1096.7 MB/s. 
Deleted 2 files on 2 nodes (1 tasks) in 0.41 sec: 4.9 deletes/s. Wrote 9999 MB on 3 nodes (1 tasks) in 27.80 sec: throughput is 359.7 MB/s. Read 9999 MB on 3 nodes (1 tasks) in 6.28 sec: throughput is 1591.5 MB/s. Deleted 3 files on 3 nodes (1 tasks) in 0.44 sec: 6.8 deletes/s. !! Error in last command (pdsh -f 3 -w hcdn[993] LANG=C /pfs/data1/perftest/bin/parallel_dd_doit w /pfs/data2/perftest/tmp 2 5000 2>/tmp/.parallel_dd.err.4220): !! hcdn993: Last dd command (dd if=/dev/zero of=/pfs/data2/perftest/tmp/dd_test_2_hcdn993 bs=1M count=5000 2> /tmp/.hcdn993.2.ddout.11783) did not have expected output on STDERR: !! hcdn993: Line: "dd: writing `/pfs/data2/perftest/tmp/dd_test_1_hcdn993': Input/output error" !! hcdn993: Exiting... !! pdsh@hcdn997: hcdn993: ssh exited with exit code 1 Exiting... # Check parallel_bonnie: | 1 | 2 | 2 | 3 | 3 | 3 !! Error in last command (pdsh -f 3 -w hcdn[990] /pfs/data1/perftest/bin/parallel_bonnie_doit /pfs/data1/perftest/bin /pfs/data2/perftest/tmp 2 8 2>/tmp/.parallel_bonnie.err.4227): !! hcdn990: Output string of time command in file /tmp/.hcdn990.1.time.27503 has wrong format: "Command exited with non-zero status 1 !! hcdn990: 7:07.50"! Exiting... !! pdsh@hcdn997: hcdn990: ssh exited with exit code 1 Exiting... # Check fstest: Bus-Zugriffsfehler er2341@hcdn997:~> ls -l /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.* -rw-r--r-- 1 er2341 scc 32 18. Feb 17:43 /pfs/work1/perftest/tmp/fstest_20140218.err -rw-r--r-- 1 er2341 scc 900 18. Feb 17:36 /pfs/work1/perftest/tmp/fstest_20140218.out er2341@hcdn996:~> cat /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.err statvfs(): : Input/output error er2341@hcdn997:~> egrep -i "err|fail" /pfs/work1/perftest/tmp/fstest_`date +%Y%m%d`.out # Check copy tests: ... 
cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/lcl_pw.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/getnetent_r.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/lcl_ng.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/dns_sv.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/lcl_ho.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/lcl_nw.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/irpmarshall.c“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/irs_p.h“: Eingabe-/Ausgabefehler cp: Schließen von „bind_cp2/bind-9.2.4rc4/lib/bind/irs/lcl_p.h“: Eingabe-/Ausgabefehler Di 18. Feb 17:43:47 CET 2014 4 Di 18. Feb 17:43:47 CET 2014 Di 18. Feb 17:44:25 CET 2014 5 Di 18. Feb 17:44:25 CET 2014 Di 18. Feb 17:45:03 CET 2014 6 Di 18. Feb 17:45:03 CET 2014 Di 18. Feb 17:45:38 CET 2014 7 Di 18. Feb 17:45:38 CET 2014 ^C # Collect data for problem report # =============================== # Collect lctl dk on servers: [root@pfscn1 ~]# pdsh -a 'cd /tmp; lctl dk `hostname`.llog' pfscn4: Debug log: 56471 lines, 56471 kept, 0 dropped, 0 bad. pfscn1: Debug log: 97176 lines, 97176 kept, 0 dropped, 0 bad. pfscn3: Debug log: 91449 lines, 91449 kept, 0 dropped, 0 bad. pfscn2: Debug log: 631769 lines, 631769 kept, 0 dropped, 0 bad. [root@pfscn1 ~]# mkdir logs/lctl_dk [root@pfscn1 ~]# pdsh -a 'scp /tmp/`hostname`.llog pfscn1:/root/logs/lctl_dk/' [root@pfscn1 ~]# ls -lh logs/lctl_dk/ [root@pfscn1 ~]# cd logs; tar cvzf server_lctl_dk_`date +%Y%m%d`.tgz lctl_dk/ [root@pfscn1 logs]# rm -fr lctl_dk/; cd - rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp root@pfscn1:logs/server_lctl_dk_`date +%Y%m%d`.tgz . 
# Collect lctl dk on clients: er2341@hcdn997:~> sudo pdsh -w hcdn[990-997] 'cd /tmp; lctl dk `hostname`.llog' er2341@hcdn997:~> mkdir /tmp/problem_lctl_dk er2341@hcdn997:~> sudo pdsh -w hcdn[990-997] 'scp /tmp/`hostname`.llog hcdn997:/tmp/problem_lctl_dk/' er2341@hcdn997:~> sudo chown er2341 /tmp/problem_lctl_dk/* er2341@hcdn997:~> cd /tmp; tar cvzf client_lctl_dk_`date +%Y%m%d`.tgz problem_lctl_dk/ er2341@hcdn997:/tmp> rm -rf problem_lctl_dk/; cd - rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp er2341@hcd:/tmp/client_lctl_dk_`date +%Y%m%d`.tgz . er2341@hcdn997:~> rm /tmp/client_lctl_dk_`date +%Y%m%d`.tgz # Collect es_showall on servers: [root@pfscn1 ~]# es_showall --include-logs [root@pfscn1 ~]# mv es_lustre_showall_`date +%Y-%m-%d`_*.tar.bz2 logs/ rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp root@pfscn1:logs/es_lustre_showall_`date +%Y-%m-%d`_\*.tar.bz2 . # Collect messages on clients: er2341@hcdn997:~> mkdir /tmp/logs/ er2341@hcdn997:~> sudo pdsh -w hcdn[990-997] 'scp /var/log/messages hcdn997:/tmp/logs/messages_$(hostname | sed -e "s/\..*//")' er2341@hcdn997:~> sudo chown er2341 /tmp/logs/* er2341@hcdn997:~> cd /tmp/logs/; tar czvf client_messages_`date +%Y%m%d`.tgz messages_* rz54@rzm-laifer2:~$ cd ~/UCFS/Problem/Problem_data/; scp er2341@hcd:/tmp/logs/client_messages_`date +%Y%m%d`.tgz . 
er2341@hcdn997:/tmp/logs> cd -; rm -rf /tmp/logs/ # Send data to ftp site: rz54@rzm-laifer2:~/UCFS/Problem/Problem_data$ ls | grep $(date +%Y-%m-%d) es_lustre_showall_2014-02-19_161417.tar.bz2 rz54@rzm-laifer2:~/UCFS/Problem/Problem_data$ ls | grep $(date +%Y%m%d) client_lctl_dk_20140219.tgz client_messages_20140219.tgz rz54@rzm-laifer2:~/UCFS/Problem/Problem_data$ ls | grep 20140218 server_lctl_dk_20140218.tgz # ftp upload ftp ftp.ddntsr.com User: anonymous Pass: roland.laifer@kit.edu cd /upload bin put # repeat for all 4 files bye # Cleanup # ======= # parallel_dd: er2341@hcdn997:~> rm /pfs/data2/perftest/tmp/dd_test* # parallel_bonnie: er2341@hcdn997:~> rm -rf /pfs/data2/perftest/tmp/hcd* # fstest: er2341@hcdn997:~> rm -rf /pfs/data2/perftest/fstest/tmp/fstest* # copy file tree: er2341@hcdn997:/pfs/data2/perftest/failover> rm -rf bind_cp1[0-9]* bind_cp[2-9]* # For further discussion, see SR 30502.