Uploaded image for project: 'Lustre'
  1. Lustre
  2. LU-11591

MDS n003 crash, LustreError: 20558:0:(osp_sync.c:1258:osp_sync_thread()) ASSERTION( thread->t_flags != SVC_RUNNING ) failed: 4632 changes, 16403 in progress, 0 in flight

    XMLWordPrintable

Details

    • 3
    • 9223372036854775807

    Description

      2018-10-28 14:17:05 [357676.303755] LustreError: 20558:0:(llog_cat.c:269:llog_cat_id2handle()) snx11205-OST0003-osc-MDT0001: error opening log id [0x36ba7:0x1:0x0]:0: rc = -2
      2018-10-28 14:17:05 [357676.320844] LustreError: 20558:0:(osp_sync.c:1258:osp_sync_thread()) ASSERTION( thread->t_flags != SVC_RUNNING ) failed: 4632 changes, 16403 in progress, 0 in flight
      2018-10-28 14:17:05 [357676.339185] LustreError: 20558:0:(osp_sync.c:1258:osp_sync_thread()) LBUG
      2018-10-28 14:17:05 [357676.347801] Pid: 20558, comm: osp-syn-3-1
      2018-10-28 14:17:05 [357676.353657]
      2018-10-28 14:17:05 [357676.353657] Call Trace:
      2018-10-28 14:17:05 [357676.361029]  [<ffffffffc09aa7ae>] libcfs_call_trace+0x4e/0x60 [libcfs]
      2018-10-28 14:17:05 [357676.369345]  [<ffffffffc09aa83c>] lbug_with_loc+0x4c/0xb0 [libcfs]
      2018-10-28 14:17:05 [357676.377231]  [<ffffffffc157705b>] osp_sync_thread+0xa1b/0xa70 [osp]
      2018-10-28 14:17:05 [357676.385441]  [<ffffffff816b3e2c>] ? __schedule+0x47c/0xa30
      2018-10-28 14:17:05 [357676.392679]  [<ffffffffc1576640>] ? osp_sync_thread+0x0/0xa70 [osp]
      2018-10-28 14:17:05 [357676.400867]  [<ffffffff810b4031>] kthread+0xd1/0xe0
      2018-10-28 14:17:05 [357676.407508]  [<ffffffff810b3f60>] ? kthread+0x0/0xe0
      2018-10-28 14:17:05 [357676.414352]  [<ffffffff816c155d>] ret_from_fork+0x5d/0xb0
      2018-10-28 14:17:05 [357676.421426]  [<ffffffff810b3f60>] ? kthread+0x0/0xe0
      2018-10-28 14:17:05 [357676.428211]
      2018-10-28 14:17:05 [357676.431329] Kernel panic - not syncing: LBUG
      

      catalog llog_handle

      crash> llog_handle 0xffff880fcb784200
      struct llog_handle {
        lgh_lock = {
          {
            count = {
        lgh_id = {
          lgl_oi = {
            {
              oi = {
                oi_id = 8,
                oi_seq = 1
              },
              oi_fid = {
                f_seq = 8,
                f_oid = 1,
                f_ver = 0
              }
            }
          },
          lgl_ogen = 0
        },
        lgh_hdr = 0xffff880f60a70000,
        lgh_hdr_size = 8192,
        lgh_obj = 0xffff880fcb784d00,
        lgh_last_idx = 54914,
        lgh_cur_offset = 3530688,
        lgh_ctxt = 0xffff880f6bd4e840,
        u = {
          phd = {
            phd_entry = {
              next = 0xffff880f0dd95a90,
              prev = 0xffff88051ca0b290
            },
      crash> llog_log_hdr 0xffff880f60a70000
      struct llog_log_hdr {
        llh_hdr = {
          lrh_len = 8192,
          lrh_index = 0,
          lrh_type = 275010873,
          lrh_id = 0
        },
        llh_timestamp = 1519928037,
        llh_count = 2,
        llh_bitmap_offset = 88,
        llh_size = 64,
        llh_flags = 18,
        llh_cat_idx = 54912,
        llh_tgtuuid = {
          uuid = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
        },
        llh_reserved = {0},
        llh_bitmap = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...},
        llh_tail = {
          lrt_len = 8192,
          lrt_index = 54914
        }
      }
      

      The catalog processing and writing operates the same record. So the race is very close to LU-11392/LU-7001. I've identify one more place for race.

      1 Thread with write                            2 Thread with read
      llog_osd_write_rec()                           llog_process_thread()
      lgh_last_idx++                             
      lock lgh_hdr_mutex
      ext2_set_bit() 
      dt_write_rec (write header)                ext2_test_bit()
                                                                 check lgh_last_idx was changed
                                                                 dt_read_rec() reread the record, and here we got the old value of record
      unlock lgh_hdr_mutex
      dt_write_rec (write the record)
      

      When llog_process_thread test the bit from bitmap and reread the record, we cannot be sure that the record was written by another thread. The lgh_hdr_mutex guarantee only the consistent of llog header, not the record itself.

      Attachments

        Activity

          People

            aboyko Alexander Boyko
            aboyko Alexander Boyko
            Votes:
            0 Vote for this issue
            Watchers:
            6 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: