/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
#include "aoe.h"

static void dummy_timer(ulong);
static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}
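/*
 * Worked example of the minor-number mapping above, assuming the
 * usual AOE_PARTITIONS of 16 and MINORBITS of 20 (so N_DEVS is
 * 65536): with aoe_dyndevs set, devices take the lowest free bit in
 * discovery order, so the first device found gets sysminor 0, the
 * second sysminor 16, and so on, independent of AoE address.  With
 * aoe_dyndevs clear, e1.2 always maps to n = 1 * NPERSHELF + 2 = 18,
 * hence sysminor 18 * 16 = 288, but only NPERSHELF (16) slots per
 * shelf are addressable.
 */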
/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and must be responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */
void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

static void
dummy_timer(ulong vp)
{
	struct aoedev *d;

	d = (struct aoedev *)vp;
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

/* fail the in-process request (d->ip): each bio not yet issued is
 * marked failed, and the remaining-bio count the driver keeps in
 * rq->special is drained so the request itself can be ended
 */
static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct bio *bio;
	unsigned long n;

	aoe_failbuf(d, d->ip.buf);

	rq = d->ip.rq;
	if (rq == NULL)
		return;
	while ((bio = d->ip.nxbio)) {
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
		d->ip.nxbio = bio->bi_next;
		n = (unsigned long) rq->special;
		rq->special = (void *) --n;
	}
	if ((unsigned long) rq->special == 0)
		aoe_end_request(d, rq, 0);
}

static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	struct request *rq;
	int i;

	d->flags &= ~DEVFL_UP;

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* fast fail all pending I/O */
	if (d->blkq) {
		while ((rq = blk_peek_request(d->blkq))) {
			blk_start_request(rq);
			aoe_end_request(d, rq, 1);
		}
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	const char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = kbasename(d->gd->disk_name);
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	del_timer_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_debugfs(d);
		aoedisk_rm_sysfs(d);
		del_gendisk(d->gd);
		put_disk(d->gd);
		blk_cleanup_queue(d->blkq);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);
	if (d->bufpool)
		mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}
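/*
 * Design note on flush(), below: it runs in three passes so that
 * nothing sleeps while devlist_lock is held.  Pass one marks the
 * requested devices with DEVFL_TKILL and takes them down without
 * sleeping.  Pass two drops all locks to call freedev(), which may
 * sleep (del_timer_sync, del_gendisk), and restarts its scan after
 * each call because the list may have changed while unlocked.  Pass
 * three unlinks the entries that freedev() marked DEVFL_FREED and
 * frees their memory.
 */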
enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};

static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_scheduled_work();
	/* pass one: without sleeping, do aoedev_downdev */
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);	/* kfree(NULL) is a no-op */
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}

/* This has been confirmed to occur once with Tms=3*1000 due to the
 * driver changing link and not processing its transmit ring.  The
 * problem is hard enough to solve by returning an error that I'm
 * still punting on "solving" this.
 */
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	/* wait (up to Tms) for any other holder of the skb's data,
	 * such as a NIC's transmit ring, to release its reference
	 */
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}
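/*
 * Illustrative sketch (hypothetical caller, not a call site in this
 * file) of the reference discipline described above aoedev_put():
 *
 *	struct aoedev *d;
 *
 *	d = aoedev_by_aoeaddr(maj, min, 0);	<-- takes a ref on success
 *	if (d == NULL)
 *		return;				<-- absent or being killed
 *	...use the device...
 *	aoedev_put(d);				<-- drop the reference
 */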
/* find it or allocate it */
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d = devlist; d; d = d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	skb_queue_head_init(&d->skbpool);
	init_timer(&d->timer);
	d->timer.data = (ulong) d;
	d->timer.function = dummy_timer;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

void
aoedev_exit(void)
{
	flush_scheduled_work();
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}