00000400:00000080:37.0F:1538518798.533823:0:143165:0:(module.c:120:libcfs_ioctl()) libcfs ioctl cmd 3221775678 00000400:00000200:37.0:1538518798.533842:0:143165:0:(lib-move.c:4692:LNetGet()) LNetGet -> 12345-10.9.11.3@tcp 00000400:00000200:37.0:1538518798.533849:0:143165:0:(lib-move.c:2428:lnet_handle_send_case_locked()) Source ANY to MR: 10.9.11.3@tcp local destination 00000400:00000200:37.0:1538518798.533854:0:143165:0:(lib-move.c:1502:lnet_get_best_ni()) compare ni 10.9.10.2@tcp [c:256, d:10, s:14] with best_ni not seleced [c:-2147483648, d:-1, s:0] 00000400:00000200:37.0:1538518798.533860:0:143165:0:(lib-move.c:1545:lnet_get_best_ni()) selected best_ni 10.9.10.2@tcp 00000400:00000200:37.0:1538518798.533863:0:143165:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:37.0:1538518798.533866:0:143165:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:37.0:1538518798.533869:0:143165:0:(lib-move.c:1750:lnet_select_peer_ni()) 10.9.10.3@tcp c:[8, 8], s:[8, 8] 00000400:00000200:37.0:1538518798.533873:0:143165:0:(lib-move.c:1800:lnet_select_peer_ni()) sd_best_lpni = 10.9.11.3@tcp 00000400:00000200:37.0:1538518798.533882:0:143165:0:(lib-move.c:1706:lnet_handle_send()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp:) -> 10.9.11.3@tcp(10.9.11.3@tcp:10.9.11.3@tcp) : GET 00000800:00000200:37.0:1538518798.533887:0:143165:0:(socklnd_cb.c:996:ksocknal_send()) sending 0 bytes in 0 frags to 12345-10.9.11.3@tcp 00000800:00000200:37.0:1538518798.533893:0:143165:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff883db9bfba00] -> 12345-10.9.11.3@tcp (2) 00000800:00000200:37.1F:1538518798.533897:0:143165:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff883db9bfba00] -> 12345-10.9.11.3@tcp (2) 00000800:00020000:51.0F:1538518798.534248:0:142552:0:(socklnd_cb.c:1784:ksocknal_recv_hello()) Error -104 reading HELLO from 10.9.11.3 00000400:02020000:51.0:1538518798.534269:0:142552:0:(acceptor.c:126:lnet_connect_console_error()) 11b-b: Connection to 10.9.11.3@tcp at host 10.9.11.3 on port 988 was reset: is it running a compatible version of Lustre and is 10.9.11.3@tcp one of its NIDs? 00000400:00000200:51.0:1538518798.534278:0:142552:0:(router.c:1749:lnet_notify()) 10.9.10.2@tcp notifying 10.9.11.3@tcp: down 00000400:00000200:51.0:1538518798.534280:0:142552:0:(router.c:120:lnet_notify_locked()) Old news 00000800:00000100:51.0:1538518798.534290:0:142552:0:(socklnd_cb.c:435:ksocknal_txlist_done()) Deleting packet type 2 len 0 10.9.10.2@tcp->10.9.11.3@tcp 00000800:00020000:51.0:1538518798.534293:0:142552:0:(socklnd_cb.c:414:ksocknal_tx_done()) tx failure rc = -104, hstatus = 5 00000400:00000200:51.0:1538518798.534295:0:142552:0:(lib-msg.c:784:lnet_is_health_check()) health check = 1, status = -104, hstatus = 5 00000400:00000200:51.0:1538518798.534300:0:142552:0:(lib-msg.c:644:lnet_health_check()) health check: 10.9.10.2@tcp->10.9.11.3@tcp: GET: LOCAL_ERROR 00000400:00000200:37.0:1538518798.534340:0:143165:0:(api-ni.c:3963:lnet_ping()) poll 1(5 -104) 00000400:00000200:37.0:1538518798.534346:0:143165:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff884503a79ed0 00000400:00000200:37.0:1538518798.534350:0:143165:0:(api-ni.c:3963:lnet_ping()) poll 1(6 0) unlinked 00000400:00000080:37.0:1538518800.194504:0:143166:0:(module.c:120:libcfs_ioctl()) libcfs ioctl cmd 3221775678 00000400:00000200:37.0:1538518800.194526:0:143166:0:(lib-move.c:4692:LNetGet()) LNetGet -> 12345-10.9.11.3@tcp 00000400:00000200:37.0:1538518800.194532:0:143166:0:(lib-move.c:2428:lnet_handle_send_case_locked()) Source ANY to MR: 10.9.11.3@tcp local destination 00000400:00000200:37.0:1538518800.194537:0:143166:0:(lib-move.c:1502:lnet_get_best_ni()) compare ni 10.9.10.2@tcp [c:256, d:10, s:15] with best_ni not seleced [c:-2147483648, d:-1, s:0] 00000400:00000200:37.0:1538518800.194542:0:143166:0:(lib-move.c:1545:lnet_get_best_ni()) selected best_ni 10.9.10.2@tcp 00000400:00000200:37.0:1538518800.194545:0:143166:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:37.0:1538518800.194548:0:143166:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:37.0:1538518800.194551:0:143166:0:(lib-move.c:1750:lnet_select_peer_ni()) 10.9.10.3@tcp c:[8, 8], s:[8, 9] 00000400:00000200:37.0:1538518800.194554:0:143166:0:(lib-move.c:1800:lnet_select_peer_ni()) sd_best_lpni = 10.9.10.3@tcp 00000400:00000200:37.0:1538518800.194563:0:143166:0:(lib-move.c:1706:lnet_handle_send()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp:) -> 10.9.10.3@tcp(10.9.11.3@tcp:10.9.10.3@tcp) : GET 00000800:00000200:37.0:1538518800.194568:0:143166:0:(socklnd_cb.c:996:ksocknal_send()) sending 0 bytes in 0 frags to 12345-10.9.10.3@tcp 00000800:00000200:37.0:1538518800.194573:0:143166:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff8844fbe76400] -> 12345-10.9.10.3@tcp (5) 00000800:00000200:37.0:1538518800.194579:0:143166:0:(socklnd_cb.c:758:ksocknal_queue_tx_locked()) Sending to 12345-10.9.10.3@tcp ip 10.9.10.3:1023 00000800:00000200:37.0:1538518800.194582:0:143166:0:(socklnd_cb.c:777:ksocknal_queue_tx_locked()) Packet ffff8844fbe63d00 type 2, nob 96 niov 1 nkiov 0 00000800:00000200:45.0F:1538518800.194643:0:142559:0:(socklnd_cb.c:550:ksocknal_process_transmit()) send(0) 0 00000400:00000200:45.0:1538518800.194655:0:142559:0:(lib-msg.c:784:lnet_is_health_check()) health check = 1, status = 0, hstatus = 0 00000400:00000200:37.0:1538518800.194660:0:143166:0:(api-ni.c:3963:lnet_ping()) poll 1(5 0) 00000400:00000200:45.0:1538518800.194661:0:142559:0:(lib-msg.c:644:lnet_health_check()) health check: 10.9.10.2@tcp->10.9.10.3@tcp: GET: OK 00000400:00000200:45.0:1538518800.194882:0:142559:0:(lib-move.c:4034:lnet_parse()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp) <- 10.9.10.3@tcp : REPLY - for me 00000400:00000200:45.0:1538518800.194891:0:142559:0:(lib-move.c:3801:lnet_parse_reply()) 10.9.10.2@tcp: Reply from 12345-10.9.10.3@tcp of length 48/48 into md 0xa9 00000400:00000200:45.0:1538518800.194900:0:142559:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff884503a79ed0 00000400:00000200:45.0:1538518800.194902:0:142559:0:(lib-msg.c:784:lnet_is_health_check()) health check = 0, status = 0, hstatus = 0 00000400:00000200:37.0:1538518800.194902:0:143166:0:(api-ni.c:3963:lnet_ping()) poll 1(3 0) unlinked 00000800:00000200:5.0F:1538518801.362107:0:142554:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff88453a285800] -> 12345-10.9.10.2@tcp (6) 00000800:00000200:5.0:1538518801.362117:0:142554:0:(socklnd_cb.c:758:ksocknal_queue_tx_locked()) Sending to 12345-10.9.10.2@tcp ip 10.9.10.2:988 00000800:00000200:5.0:1538518801.362120:0:142554:0:(socklnd_cb.c:777:ksocknal_queue_tx_locked()) Packet ffff884d1a27c600 type 192, nob 24 niov 1 nkiov 0 00000800:00000200:47.0F:1538518801.362200:0:142558:0:(socklnd_cb.c:550:ksocknal_process_transmit()) send(0) 0 00000400:00000080:55.0F:1538518802.274750:0:143168:0:(module.c:120:libcfs_ioctl()) libcfs ioctl cmd 3221775678 00000400:00000200:55.0:1538518802.274778:0:143168:0:(lib-move.c:4692:LNetGet()) LNetGet -> 12345-10.9.11.3@tcp 00000400:00000200:55.0:1538518802.274784:0:143168:0:(lib-move.c:2428:lnet_handle_send_case_locked()) Source ANY to MR: 10.9.11.3@tcp local destination 00000400:00000200:55.0:1538518802.274789:0:143168:0:(lib-move.c:1502:lnet_get_best_ni()) compare ni 10.9.10.2@tcp [c:256, d:21, s:16] with best_ni not seleced [c:-2147483648, d:-1, s:0] 00000400:00000200:55.0:1538518802.274795:0:143168:0:(lib-move.c:1545:lnet_get_best_ni()) selected best_ni 10.9.10.2@tcp 00000400:00000200:55.0:1538518802.274798:0:143168:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:55.0:1538518802.274801:0:143168:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:55.0:1538518802.274803:0:143168:0:(lib-move.c:1750:lnet_select_peer_ni()) 10.9.10.3@tcp c:[8, 8], s:[9, 9] 00000400:00000200:55.0:1538518802.274807:0:143168:0:(lib-move.c:1800:lnet_select_peer_ni()) sd_best_lpni = 10.9.11.3@tcp 00000400:00000200:55.0:1538518802.274817:0:143168:0:(lib-move.c:1706:lnet_handle_send()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp:) -> 10.9.11.3@tcp(10.9.11.3@tcp:10.9.11.3@tcp) : GET 00000800:00000200:55.0:1538518802.274822:0:143168:0:(socklnd_cb.c:996:ksocknal_send()) sending 0 bytes in 0 frags to 12345-10.9.11.3@tcp 00000800:00000200:55.0:1538518802.274827:0:143168:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff883db9bfba00] -> 12345-10.9.11.3@tcp (2) 00000800:00000200:55.0:1538518802.274831:0:143168:0:(socklnd_cb.c:857:ksocknal_find_connectable_route_locked()) Too soon to retry route 10.9.11.3 (cnted 0, interval 8, 4 secs later) 00000800:00000200:55.1F:1538518802.274836:0:143168:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff883db9bfba00] -> 12345-10.9.11.3@tcp (2) 00000800:00000200:55.1:1538518802.274839:0:143168:0:(socklnd_cb.c:857:ksocknal_find_connectable_route_locked()) Too soon to retry route 10.9.11.3 (cnted 0, interval 8, 4 secs later) 00000800:00000100:55.0:1538518802.274844:0:143168:0:(socklnd_cb.c:973:ksocknal_launch_packet()) No usable routes to 12345-10.9.11.3@tcp 00000400:00020000:55.0:1538518802.274847:0:143168:0:(lib-msg.c:779:lnet_is_health_check()) Msg is in inconsistent state, don't perform health checking (-5, 0) 00000400:00000200:55.0:1538518802.293202:0:143168:0:(lib-msg.c:784:lnet_is_health_check()) health check = 0, status = -5, hstatus = 0 00000400:00000200:55.0:1538518802.293209:0:143168:0:(api-ni.c:3963:lnet_ping()) poll 1(5 -5) 00000400:00000200:55.0:1538518802.293212:0:143168:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff884d8b692990 00000400:00000200:55.0:1538518802.293215:0:143168:0:(api-ni.c:3963:lnet_ping()) poll 1(6 0) unlinked 00000400:00000080:55.0:1538518805.314757:0:143169:0:(module.c:120:libcfs_ioctl()) libcfs ioctl cmd 3221775678 00000400:00000200:55.0:1538518805.314783:0:143169:0:(lib-move.c:4692:LNetGet()) LNetGet -> 12345-10.9.11.3@tcp 00000400:00000200:55.0:1538518805.314789:0:143169:0:(lib-move.c:2428:lnet_handle_send_case_locked()) Source ANY to MR: 10.9.11.3@tcp local destination 00000400:00000200:55.0:1538518805.314794:0:143169:0:(lib-move.c:1502:lnet_get_best_ni()) compare ni 10.9.10.2@tcp [c:256, d:21, s:17] with best_ni not seleced [c:-2147483648, d:-1, s:0] 00000400:00000200:55.0:1538518805.314799:0:143169:0:(lib-move.c:1545:lnet_get_best_ni()) selected best_ni 10.9.10.2@tcp 00000400:00000200:55.0:1538518805.314802:0:143169:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:55.0:1538518805.314805:0:143169:0:(lib-move.c:1744:lnet_select_peer_ni()) 10.9.10.2@tcp ni_is_pref = 0 00000400:00000200:55.0:1538518805.314807:0:143169:0:(lib-move.c:1750:lnet_select_peer_ni()) 10.9.10.3@tcp c:[8, 8], s:[9, 10] 00000400:00000200:55.0:1538518805.314812:0:143169:0:(lib-move.c:1800:lnet_select_peer_ni()) sd_best_lpni = 10.9.10.3@tcp 00000400:00000200:55.0:1538518805.314821:0:143169:0:(lib-move.c:1706:lnet_handle_send()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp:) -> 10.9.10.3@tcp(10.9.11.3@tcp:10.9.10.3@tcp) : GET 00000800:00000200:55.0:1538518805.314827:0:143169:0:(socklnd_cb.c:996:ksocknal_send()) sending 0 bytes in 0 frags to 12345-10.9.10.3@tcp 00000800:00000200:55.0:1538518805.314832:0:143169:0:(socklnd.c:195:ksocknal_find_peer_locked()) got peer_ni [ffff8844fbe76400] -> 12345-10.9.10.3@tcp (5) 00000800:00000200:55.0:1538518805.314838:0:143169:0:(socklnd_cb.c:758:ksocknal_queue_tx_locked()) Sending to 12345-10.9.10.3@tcp ip 10.9.10.3:1023 00000800:00000200:55.0:1538518805.314841:0:143169:0:(socklnd_cb.c:777:ksocknal_queue_tx_locked()) Packet ffff884cf0e8f300 type 2, nob 96 niov 1 nkiov 0 00000800:00000200:45.0:1538518805.314910:0:142559:0:(socklnd_cb.c:550:ksocknal_process_transmit()) send(0) 0 00000400:00000200:45.0:1538518805.314919:0:142559:0:(lib-msg.c:784:lnet_is_health_check()) health check = 1, status = 0, hstatus = 0 00000400:00000200:45.0:1538518805.314926:0:142559:0:(lib-msg.c:644:lnet_health_check()) health check: 10.9.10.2@tcp->10.9.10.3@tcp: GET: OK 00000400:00000200:55.0:1538518805.314927:0:143169:0:(api-ni.c:3963:lnet_ping()) poll 1(5 0) 00000400:00000200:45.0:1538518805.315203:0:142559:0:(lib-move.c:4034:lnet_parse()) TRACE: 10.9.10.2@tcp(10.9.10.2@tcp) <- 10.9.10.3@tcp : REPLY - for me 00000400:00000200:45.0:1538518805.315214:0:142559:0:(lib-move.c:3801:lnet_parse_reply()) 10.9.10.2@tcp: Reply from 12345-10.9.10.3@tcp of length 48/48 into md 0x385d 00000400:00000200:45.0:1538518805.315225:0:142559:0:(lib-md.c:69:lnet_md_unlink()) Unlinking md ffff884d8b692990 00000400:00000200:45.0:1538518805.315228:0:142559:0:(lib-msg.c:784:lnet_is_health_check()) health check = 0, status = 0, hstatus = 0 00000400:00000200:55.0:1538518805.315232:0:143169:0:(api-ni.c:3963:lnet_ping()) poll 1(3 0) unlinked Debug log: 95 lines, 95 kept, 0 dropped, 0 bad.