Loading...

Type: Bug
Resolution: Fixed
Priority: Minor
Fix Version/s: Lustre 2.11.0, Lustre 2.10.2
Affects Version/s: None
Labels:
None

Severity:
3
Rank (Obsolete):
9223372036854775807

The panic stack is :

[1001604.771625] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088
[1001605.163344] IP: [<ffffffffa1314227>] ll_statahead_thread+0x107/0x2860 [lustre]
[1001605.525225] PGD 0 
[1001605.631331] Oops: 0002 [#1] SMP 
[1001605.797101] Modules linked in: fuse osc(OE) mgc(OE) lustre(OE) lmv(OE) fld(OE) mdc(OE) fid(OE) lov(OE) ko2iblnd(OE) ptlrpc(OE) obdclass(OE) lnet(OE) sha512_ssse3 sha512_generic cts rpcsec_gss_krb5 nfsv4 crypto_null libcfs(OE) dns_resolver rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_uverbs(OE) ib_umad(OE) mlx5_ib(OE) mlx5_core(OE) mlx4_en(OE) ipt_REJECT nf_reject_ipv4 nf_log_ipv4 nf_log_common xt_LOG nf_conntrack_ipv4 nf_defrag_ipv4 xt_multiport xt_recent xt_conntrack nf_conntrack iptable_filter dm_mirror mlx4_ib(OE) dm_region_hash dm_log dm_mod ib_sa(OE) ib_mad(OE) ib_core(OE) ib_addr(OE) ib_netlink(OE) intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel mgag200 lrw gf128mul ttm glue_helper ablk_helper
[1001609.295329]  cryptd drm_kms_helper mlx4_core(OE) syscopyarea sysfillrect sysimgblt fb_sys_fops drm iTCO_wdt mlx_compat(OE) iTCO_vendor_support ses hpwdt enclosure sb_edac pcspkr lpc_ich ipmi_devintf edac_core i2c_i801 shpchp acpi_power_meter wmi fjes ipmi_si ipmi_msghandler pcc_cpufreq nfsd binfmt_misc auth_rpcgss ip_tables bridge stp llc xfs isci libsas qla3xxx e1000e igb i2c_algo_bit i2c_core megaraid_sas aacraid aic79xx ata_piix mpt2sas raid_class mptspi scsi_transport_spi mptsas mptscsih mptbase arcmsr ahci libahci sata_nv sata_svw bnx2x libcrc32c bnx2 ext4 mbcache jbd2 sata_sil libata tg3 e1000 nfsv3 nfs_acl nfs lockd grace sunrpc fscache tun sd_mod crc_t10dif crct10dif_generic sg crct10dif_pclmul crct10dif_common ixgbe dca mdio hpilo hpsa ptp scsi_transport_sas pps_core [last unloaded: ipmi_msghandler]
[1001612.730562] 
[1001612.739169] CPU: 1 PID: 22492 Comm: ll_sa_22485 Tainted: G           OE  ------------   3.10.0-514.21.2.el7.x86_64 #1
[1001613.265741] Hardware name: HP ProLiant BL460c Gen9, BIOS I36 09/12/2016
[1001613.596748] task: ffff882bf63c5e20 ti: ffff882db9748000 task.ti: ffff882db9748000
[1001613.970335] RIP: 0010:[<ffffffffa1314227>]  [<ffffffffa1314227>] ll_statahead_thread+0x107/0x2860 [lustre]
[1001614.450601] RSP: 0018:ffff882db974bd20  EFLAGS: 00010246
[1001614.718782] RAX: 00000000000057dc RBX: 0000000000000000 RCX: 0000000000000000
[1001615.074787] RDX: 0000000000000046 RSI: 0000000000000000 RDI: ffff882e68eeeca8
[1001615.431144] RBP: ffff882db974bec0 R08: ffff882db9748000 R09: 0000000000000000
[1001615.790666] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8822d1bc4f00
[1001616.150001] R13: ffffffffa1314120 R14: 0000000000000000 R15: 0000000000000000
[1001616.511851] FS:  0000000000000000(0000) GS:ffff881fffa40000(0000) knlGS:0000000000000000
[1001616.916485] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1001617.206037] CR2: 0000000000000088 CR3: 00000000019be000 CR4: 00000000003407e0
[1001617.564110] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[1001617.921246] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[1001618.281884] Stack:
[1001618.391733]  0000000000000001 0000000000000014 ffff881b99375e20 ffff882db974bd78
[1001618.762956]  ffffffff810ce4c4 00059af4b94ae14e ffff882db974bd78 ffffffff810caebe
[1001619.134595]  ffff883fb0bfc000 ffff882e68eeeca8 0000000000000001 ffff882db974bdc8
[1001619.502128] Call Trace:
[1001619.632070]  [<ffffffff810ce4c4>] ? update_curr+0x104/0x190
[1001619.913794]  [<ffffffff810caebe>] ? account_entity_dequeue+0xae/0xd0
[1001620.233652]  [<ffffffff810ce9ac>] ? dequeue_entity+0x11c/0x5d0
[1001620.531744]  [<ffffffff810c1f55>] ? check_preempt_curr+0x75/0xa0
[1001620.835630]  [<ffffffff810cf27e>] ? dequeue_task_fair+0x41e/0x660
[1001621.143820]  [<ffffffff810c8395>] ? sched_clock_cpu+0x85/0xc0
[1001621.435359]  [<ffffffff81029569>] ? __switch_to+0xd9/0x4c0
[1001621.713460]  [<ffffffff8168c130>] ? __schedule+0x3b0/0x990
[1001621.992373]  [<ffffffffa1314120>] ? ll_agl_thread+0x4d0/0x4d0 [lustre]
[1001622.322281]  [<ffffffff810b0a4f>] kthread+0xcf/0xe0
[1001622.572155]  [<ffffffff810b0980>] ? kthread_create_on_node+0x140/0x140
[1001622.902516]  [<ffffffff81697698>] ret_from_fork+0x58/0x90
[1001623.176213]  [<ffffffff810b0980>] ? kthread_create_on_node+0x140/0x140
[1001623.505432] Code: 48 85 db 74 04 f0 ff 43 08 48 8b bd a8 fe ff ff e8 bf a6 37 e0 65 48 8b 04 25 c0 cd 00 00 f6 05 3d 82 97 ff 40 8b 80 a4 04 00 00 <89> 83 88 00 00 00 74 6e f6 05 2a 82 97 ff 80 74 65 48 8b 85 f8 
[1001624.452047] RIP  [<ffffffffa1314227>] ll_statahead_thread+0x107/0x2860 [lustre]
[1001624.822254]  RSP <ffff882db974bd20>
[1001625.002284] CR2: 0000000000000088

My crash dump analysis indicates that the crash occurs because statahead newly started thread finds lli_sai == NULL when it is not expected/checked in associated code :

// FIXME: replace the name of this with LL_I to conform to kernel stuff
// static inline struct ll_inode_info *LL_I(struct inode *inode)
static inline struct ll_inode_info *ll_i2info(struct inode *inode)
{
        return container_of(inode, struct ll_inode_info, lli_vfs_inode);
}

/*
 * take refcount of sai if sai for @dir exists, which means statahead is on for
 * this directory.
 */
static inline struct ll_statahead_info *ll_sai_get(struct inode *dir)
{
        struct ll_inode_info *lli = ll_i2info(dir);
        struct ll_statahead_info *sai = NULL;

        spin_lock(&lli->lli_sa_lock);
        sai = lli->lli_sai;
        if (sai != NULL)
                atomic_inc(&sai->sai_refcount);
        spin_unlock(&lli->lli_sa_lock);
        
        return sai;        <<<<< sai can be returned NULL !!!!
}            


/* statahead thread main function */
static int ll_statahead_thread(void *arg)
{
        struct dentry *parent = (struct dentry *)arg;
        struct inode *dir = parent->d_inode;
        struct ll_inode_info *lli = ll_i2info(dir);
        struct ll_sb_info *sbi = ll_i2sbi(dir);
        struct ll_statahead_info *sai;
        struct ptlrpc_thread *sa_thread;
        struct ptlrpc_thread *agl_thread;
        int first = 0;
        struct md_op_data *op_data;
        struct ll_dir_chain chain;
        struct l_wait_info lwi = { 0 };
        struct page *page = NULL;
        __u64 pos = 0;
        int rc = 0;
        ENTRY;

        sai = ll_sai_get(dir); <<<<<<<<<<<<<<< sai can be returned NULL !!!!!
        sa_thread = &sai->sai_thread;
        agl_thread = &sai->sai_agl_thread;
        sa_thread->t_pid = current_pid();   <<<<<<<<<<<<<<<<<<<  BUG()
…………………..

Further crash-dump analysis and concerned source code reading has pointed me to a racy scenario where multiple+concurrent attempts to start statahead thread for the same dir has occurred which can lead to this BUG(), due to the fact start_statahead_thread() routine unconditionally clearing lli_sai upon error (i.e., even if already/previously set by an other starter for ll_statahead_thread!) :

static int start_statahead_thread(struct inode *dir, struct dentry *dentry)
{
        struct ll_inode_info *lli = ll_i2info(dir);
        struct ll_statahead_info *sai = NULL;
        struct dentry *parent = dentry->d_parent;
        struct ptlrpc_thread *thread;
        struct l_wait_info lwi = { 0 };
        struct task_struct *task;
        int rc;
        ENTRY;

        /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
        rc = is_first_dirent(dir, dentry);
        if (rc == LS_NOT_FIRST_DE)
                /* It is not "ls -{a}l" operation, no need statahead for it. */
                GOTO(out, rc = -EFAULT); <<<<<<<<<<<<<<<<<<<<<<<<<

        sai = ll_sai_alloc(parent);
        if (sai == NULL)
                GOTO(out, rc = -ENOMEM); <<<<<<<<<<<<<<<<<<<<<<<<

        sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);

        /* if current lli_opendir_key was deauthorized, or dir re-opened by
         * another process, don't start statahead, otherwise the newly spawned
         * statahead thread won't be notified to quit. */
        spin_lock(&lli->lli_sa_lock);
        if (unlikely(lli->lli_sai != NULL || <<<<<<<<<<<<<<<<<<<<<<<
                     lli->lli_opendir_key == NULL ||
                     lli->lli_opendir_pid != current->pid)) {
                spin_unlock(&lli->lli_sa_lock);
                GOTO(out, rc = -EPERM);
        }
        lli->lli_sai = sai;
        spin_unlock(&lli->lli_sa_lock);
..........................
out:
        /* once we start statahead thread failed, disable statahead so that
         * subsequent stat won't waste time to try it. */
        spin_lock(&lli->lli_sa_lock);
        lli->lli_sa_enabled = 0;
        lli->lli_sai = NULL; <<<<<<<<<<<<<<<<<<<<<<<<<
        spin_unlock(&lli->lli_sa_lock);

        if (sai != NULL)
                ll_sai_free(sai);

        RETURN(rc);
}

So my first guess is that this problem can be fixed simply by checking if (lli->lli_sai == said) before to clear/NULL lli->lli_sai in start_statahead_thread(), I will push a patch soon to implement this.

Details

Description

Attachments

Activity

People

Dates