diff options
author | Lukas Ertl <le@FreeBSD.org> | 2004-09-18 13:44:43 +0000 |
---|---|---|
committer | Lukas Ertl <le@FreeBSD.org> | 2004-09-18 13:44:43 +0000 |
commit | 67e3ab6ee55dd554915413ed430780e4169df3fa (patch) | |
tree | b26a5245ec9f7555a82415414ac98257151bf50c /sys | |
parent | 54516c29e84f00963a79ba79d96c99c9ec2247e9 (diff) | |
download | src-67e3ab6ee55dd554915413ed430780e4169df3fa.tar.gz src-67e3ab6ee55dd554915413ed430780e4169df3fa.zip |
Re-vamp how I/O is handled in volumes and plexes.
Analogous to the drive level, give each volume and plex a worker thread
that picks up and processes incoming and completed BIOs.
This should fix the data corruption issues that have come up a few
weeks ago and improve performance, especially of RAID5 plexes.
The volume level needs a little work, though.
Notes:
svn path=/head/; revision=135426
Diffstat (limited to 'sys')
-rw-r--r-- | sys/geom/vinum/geom_vinum.h | 1 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_init.c | 2 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_plex.c | 475 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.c | 633 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.h | 63 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_rm.c | 1 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_subr.c | 17 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_var.h | 15 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_volume.c | 228 |
9 files changed, 772 insertions(+), 663 deletions(-)
diff --git a/sys/geom/vinum/geom_vinum.h b/sys/geom/vinum/geom_vinum.h index a507d73e498c..ddbf5cf38d4c 100644 --- a/sys/geom/vinum/geom_vinum.h +++ b/sys/geom/vinum/geom_vinum.h @@ -70,6 +70,7 @@ int gv_is_striped(struct gv_plex *); int gv_is_open(struct g_geom *); void gv_kill_drive_thread(struct gv_drive *); void gv_kill_plex_thread(struct gv_plex *); +void gv_kill_vol_thread(struct gv_volume *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, u_char *, int); const char *gv_roughlength(off_t, int); diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c index 4ad7a031078c..382ea1572b08 100644 --- a/sys/geom/vinum/geom_vinum_init.c +++ b/sys/geom/vinum/geom_vinum_init.c @@ -293,7 +293,7 @@ gv_sync_td(void *arg) * This hack declare this bio as part of an initialization * process, so that the lower levels allow it to get through. */ - bp->bio_caller1 = p; + bp->bio_cflags |= GV_BIO_SYNCREQ; /* Schedule it down ... */ g_io_request(bp, to); diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c index 8cfa6be488cb..494ec2c0ba10 100644 --- a/sys/geom/vinum/geom_vinum_plex.c +++ b/sys/geom/vinum/geom_vinum_plex.c @@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_raid5.h> #include <geom/vinum/geom_vinum.h> +static void gv_plex_completed_request(struct gv_plex *, struct bio *); +static void gv_plex_normal_request(struct gv_plex *, struct bio *); +static void gv_plex_worker(void *); + /* XXX: is this the place to catch dying subdisks? 
*/ static void gv_plex_orphan(struct g_consumer *cp) @@ -76,48 +80,39 @@ gv_plex_orphan(struct g_consumer *cp) g_wither_geom(gp, error); } -static void +void gv_plex_done(struct bio *bp) { - struct g_geom *gp; - struct gv_sd *s; - - gp = bp->bio_to->geom; - - s = bp->bio_caller1; - KASSERT(s != NULL, ("gv_plex_done: NULL s")); - - if (bp->bio_error == 0) - s->initialized += bp->bio_length; - - if (s->initialized >= s->size) { - gv_set_sd_state(s, GV_SD_UP, 0); - s->initialized = 0; - } - - g_std_done(bp); + struct gv_plex *p; + struct gv_bioq *bq; + + p = bp->bio_from->geom->softc; + bp->bio_cflags |= GV_BIO_DONE; + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + wakeup(p); + mtx_unlock(&p->bqueue_mtx); } /* Find the correct subdisk to send the bio to and build a bio to send. */ static int -gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, - caddr_t addr, long bcount, off_t boff) +gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct g_geom *gp; - struct gv_plex *p; struct gv_sd *s; - struct bio *cbp; + struct bio *cbp, *pbp; int i, sdno; - off_t len_left, real_len, real_off, stripeend, stripeno, stripestart; - - s = NULL; - - gp = bp->bio_to->geom; - p = gp->softc; + off_t len_left, real_len, real_off; + off_t stripeend, stripeno, stripestart; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); + s = NULL; + gp = bp->bio_to->geom; + /* * We only handle concatenated and striped plexes here. RAID5 plexes * are handled in build_raid5_request(). 
@@ -190,10 +185,10 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, break; case GV_SD_STALE: - if (bp->bio_caller1 != p) + if (!(bp->bio_cflags & GV_BIO_SYNCREQ)) return (ENXIO); - printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name); + printf("GEOM_VINUM: sd %s is initializing\n", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; @@ -214,103 +209,365 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, cbp->bio_offset = real_off; cbp->bio_length = real_len; cbp->bio_data = addr; - if (bp->bio_caller1 == p) { - cbp->bio_caller1 = s; + cbp->bio_done = g_std_done; + cbp->bio_caller2 = s->consumer; + if ((bp->bio_cflags & GV_BIO_SYNCREQ)) { + cbp->bio_cflags |= GV_BIO_SYNCREQ; cbp->bio_done = gv_plex_done; - } else - cbp->bio_done = g_std_done; - *bp2 = cbp; - *cp = s->consumer; + } + + if (bp->bio_driver1 == NULL) { + bp->bio_driver1 = cbp; + } else { + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + pbp->bio_caller1 = cbp; + } + return (0); } static void gv_plex_start(struct bio *bp) { - struct g_geom *gp; - struct g_consumer *cp; struct gv_plex *p; - struct gv_raid5_packet *wp; - struct bio *bp2; - caddr_t addr; - off_t boff; - long bcount, rcount; - int err; + struct gv_bioq *bq; - gp = bp->bio_to->geom; - p = gp->softc; + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } /* * We cannot handle this request if too many of our subdisks are * inaccessible. */ - if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) { - g_io_deliver(bp, ENXIO); /* XXX: correct way? 
*/ + p = bp->bio_to->geom->softc; + if ((p->state < GV_PLEX_DEGRADED) && + !(bp->bio_cflags & GV_BIO_SYNCREQ)) { + g_io_deliver(bp, ENXIO); return; } - switch(bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + wakeup(p); + mtx_unlock(&p->bqueue_mtx); +} + +static void +gv_plex_worker(void *arg) +{ + struct bio *bp; + struct gv_plex *p; + struct gv_sd *s; + struct gv_bioq *bq; + + p = arg; + KASSERT(p != NULL, ("NULL p")); + + mtx_lock(&p->bqueue_mtx); + for (;;) { + /* We were signaled to exit. */ + if (p->flags & GV_PLEX_THREAD_DIE) + break; + + /* Take the first BIO from our queue. */ + bq = TAILQ_FIRST(&p->bqueue); + if (bq == NULL) { + msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10); + continue; + } + TAILQ_REMOVE(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + + bp = bq->bp; + + /* A completed request. */ + if (bp->bio_cflags & GV_BIO_DONE) { + g_free(bq); + if (bp->bio_cflags & GV_BIO_SYNCREQ) { + s = bp->bio_to->private; + if (bp->bio_error == 0) + s->initialized += bp->bio_length; + if (s->initialized >= s->size) { + g_topology_lock(); + gv_set_sd_state(s, GV_SD_UP, + GV_SETSTATE_CONFIG); + g_topology_unlock(); + s->initialized = 0; + } + g_std_done(bp); + } else + gv_plex_completed_request(p, bp); /* - * We split up the request in smaller packets and hand them - * down to our subdisks. + * A sub-request that was hold back because it interfered with + * another sub-request. */ - wp = NULL; - addr = bp->bio_data; - boff = bp->bio_offset; - for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { - /* - * RAID5 requests usually need to be split up in - * several subrequests. 
- */ - if (p->org == GV_PLEX_RAID5) { - wp = gv_new_raid5_packet(); - wp->bio = bp; - err = gv_build_raid5_req(wp, bp, addr, bcount, - boff); - } else - err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount, - boff); + } else if (bp->bio_cflags & GV_BIO_ONHOLD) { + /* Is it still locked out? */ + if (gv_stripe_active(p, bp)) { + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + } else { + g_free(bq); + bp->bio_cflags &= ~GV_BIO_ONHOLD; + g_io_request(bp, bp->bio_caller2); + } - if (err) { - if (p->org == GV_PLEX_RAID5) - gv_free_raid5_packet(wp); - bp->bio_completed += bcount; - if (bp->bio_error == 0) - bp->bio_error = err; - if (bp->bio_completed == bp->bio_length) - g_io_deliver(bp, bp->bio_error); - return; + /* A normal request to this plex. */ + } else { + g_free(bq); + gv_plex_normal_request(p, bp); + } + + mtx_lock(&p->bqueue_mtx); + } + mtx_unlock(&p->bqueue_mtx); + p->flags |= GV_PLEX_THREAD_DEAD; + wakeup(p); + + kthread_exit(ENXIO); +} + +void +gv_plex_completed_request(struct gv_plex *p, struct bio *bp) +{ + struct bio *cbp, *pbp; + struct gv_bioq *bq, *bq2; + struct gv_raid5_packet *wp; + int i; + + wp = bp->bio_driver1; + + switch (bp->bio_parent->bio_cmd) { + case BIO_READ: + if (wp == NULL) + break; + + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp == bp) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + for (i = 0; i < wp->length; i++) + wp->data[i] ^= bp->bio_data[i]; + break; + } + } + if (TAILQ_EMPTY(&wp->bits)) { + bp->bio_parent->bio_completed += wp->length; + if (wp->lockbase != -1) + TAILQ_REMOVE(&p->packets, wp, list); + g_free(wp); + } + + break; + + case BIO_WRITE: + if (wp == NULL) + break; + + /* Check if we need to handle parity data. 
*/ + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp == bp) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + cbp = wp->parity; + if (cbp != NULL) { + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= + bp->bio_data[i]; + } + break; } - - if (p->org != GV_PLEX_RAID5) { - rcount = bp2->bio_length; - g_io_request(bp2, cp); - - /* - * RAID5 subrequests are queued on a worklist - * and picked up from the worker thread. This - * ensures correct order. - */ + } + + /* Handle parity data. */ + if (TAILQ_EMPTY(&wp->bits)) { + if (wp->waiting != NULL) { + pbp = wp->waiting; + wp->waiting = NULL; + cbp = wp->parity; + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= pbp->bio_data[i]; + g_io_request(pbp, pbp->bio_caller2); + } else if (wp->parity != NULL) { + cbp = wp->parity; + wp->parity = NULL; + g_io_request(cbp, cbp->bio_caller2); } else { - mtx_lock(&p->worklist_mtx); - TAILQ_INSERT_TAIL(&p->worklist, wp, - list); - mtx_unlock(&p->worklist_mtx); - wakeup(&p); - rcount = wp->length; + bp->bio_parent->bio_completed += wp->length; + TAILQ_REMOVE(&p->packets, wp, list); + g_free(wp); } + } + + break; + } - boff += rcount; - addr += rcount; + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + + /* When the original request is finished, we deliver it. */ + pbp->bio_inbed++; + if (pbp->bio_inbed == pbp->bio_children) + g_io_deliver(pbp, pbp->bio_error); + + /* Clean up what we allocated. */ + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + g_destroy_bio(bp); +} + +void +gv_plex_normal_request(struct gv_plex *p, struct bio *bp) +{ + struct bio *cbp, *pbp; + struct gv_bioq *bq, *bq2; + struct gv_raid5_packet *wp, *wp2; + caddr_t addr; + off_t bcount, boff; + int err; + + bcount = bp->bio_length; + addr = bp->bio_data; + boff = bp->bio_offset; + + /* Walk over the whole length of the request, we might split it up. 
*/ + while (bcount > 0) { + wp = NULL; + + /* + * RAID5 plexes need special treatment, as a single write + * request involves several read/write sub-requests. + */ + if (p->org == GV_PLEX_RAID5) { + wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); + wp->bio = bp; + TAILQ_INIT(&wp->bits); + + err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount); + + /* + * Building the sub-request failed, we probably need to + * clean up a lot. + */ + if (err) { + printf("GEOM_VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + } + if (wp->waiting != NULL) { + if (wp->waiting->bio_cflags & + GV_BIO_MALLOC) + g_free(wp->waiting->bio_data); + g_destroy_bio(wp->waiting); + } + if (wp->parity != NULL) { + if (wp->parity->bio_cflags & + GV_BIO_MALLOC) + g_free(wp->parity->bio_data); + g_destroy_bio(wp->parity); + } + g_free(wp); + + TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { + if (wp->bio == bp) { + TAILQ_REMOVE(&p->packets, wp, + list); + TAILQ_FOREACH_SAFE(bq, + &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, + bq, queue); + g_free(bq); + } + g_free(wp); + } + } + + cbp = bp->bio_driver1; + while (cbp != NULL) { + pbp = cbp->bio_caller1; + if (cbp->bio_cflags & GV_BIO_MALLOC) + g_free(cbp->bio_data); + g_destroy_bio(cbp); + cbp = pbp; + } + + g_io_deliver(bp, err); + return; + } + + if (TAILQ_EMPTY(&wp->bits)) + g_free(wp); + else if (wp->lockbase != -1) + TAILQ_INSERT_TAIL(&p->packets, wp, list); + + /* + * Requests to concatenated and striped plexes go straight + * through. + */ + } else { + err = gv_plexbuffer(p, bp, addr, boff, bcount); + + /* Building the sub-request failed. 
*/ + if (err) { + printf("GEOM_VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + cbp = bp->bio_driver1; + while (cbp != NULL) { + pbp = cbp->bio_caller1; + g_destroy_bio(cbp); + cbp = pbp; + } + g_io_deliver(bp, err); + return; + } } - return; + + /* Abuse bio_caller1 as linked list. */ + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + bcount -= pbp->bio_length; + addr += pbp->bio_length; + boff += pbp->bio_length; + } - default: - g_io_deliver(bp, EOPNOTSUPP); - return; + /* Fire off all sub-requests. */ + pbp = bp->bio_driver1; + while (pbp != NULL) { + /* + * RAID5 sub-requests need to come in correct order, otherwise + * we trip over the parity, as it might be overwritten by + * another sub-request. + */ + if (pbp->bio_driver1 != NULL && + gv_stripe_active(p, pbp)) { + pbp->bio_cflags |= GV_BIO_ONHOLD; + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = pbp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + } else + g_io_request(pbp, pbp->bio_caller2); + pbp = pbp->bio_caller1; } } @@ -425,16 +682,12 @@ gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) gp->softc = p; p->geom = gp; - /* RAID5 plexes need a 'worker' thread, where IO is handled. */ - if (p->org == GV_PLEX_RAID5) { - TAILQ_INIT(&p->worklist); - mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL, - MTX_DEF); - p->flags &= ~GV_PLEX_THREAD_DIE; - kthread_create(gv_raid5_worker, gp, NULL, 0, 0, - "gv_raid5"); - p->flags |= GV_PLEX_THREAD_ACTIVE; - } + TAILQ_INIT(&p->packets); + TAILQ_INIT(&p->bqueue); + mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF); + kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s", + p->name); + p->flags |= GV_PLEX_THREAD_ACTIVE; /* Attach a consumer to this provider. 
*/ cp = g_new_consumer(gp); diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c index 8dfe8ab570d2..62fb24685516 100644 --- a/sys/geom/vinum/geom_vinum_raid5.c +++ b/sys/geom/vinum/geom_vinum_raid5.c @@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_raid5.h> #include <geom/vinum/geom_vinum.h> -int gv_raid5_parity(struct gv_raid5_packet *); -int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); - -struct gv_raid5_bit * -gv_new_raid5_bit(void) -{ - struct gv_raid5_bit *r; - r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); - KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); - return (r); -} - -struct gv_raid5_packet * -gv_new_raid5_packet(void) -{ - struct gv_raid5_packet *wp; - - wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); - KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); - wp->state = SETUP; - wp->type = JUNK; - TAILQ_INIT(&wp->bits); - - return (wp); -} - -void -gv_free_raid5_packet(struct gv_raid5_packet *wp) -{ - struct gv_raid5_bit *r, *r2; - - /* Remove all the bits from this work packet. */ - TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) { - TAILQ_REMOVE(&wp->bits, r, list); - if (r->malloc) - g_free(r->buf); - if (r->bio != NULL) - g_destroy_bio(r->bio); - g_free(r); - } - - if (wp->bufmalloc == 1) - g_free(wp->buf); - g_free(wp); -} - /* * Check if the stripe that the work packet wants is already being used by * some other work packet. */ int -gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) -{ - struct gv_raid5_packet *wpa; - - TAILQ_FOREACH(wpa, &sc->worklist, list) { - if (wpa->lockbase == wp->lockbase) { - if (wpa == wp) - return (0); - return (1); - } - } - return (0); -} - -/* - * The "worker" thread that runs through the worklist and fires off the - * "subrequests" needed to fulfill a RAID5 read or write request. 
- */ -void -gv_raid5_worker(void *arg) +gv_stripe_active(struct gv_plex *p, struct bio *bp) { - struct bio *bp; - struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_packet *wp, *wpt; - struct gv_raid5_bit *rbp, *rbpt; - int error, restart; - - gp = arg; - p = gp->softc; - - mtx_lock(&p->worklist_mtx); - for (;;) { - restart = 0; - TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { - /* This request packet is already being processed. */ - if (wp->state == IO) - continue; - /* This request packet is ready for processing. */ - if (wp->state == VALID) { - /* Couldn't get the lock, try again. */ - if ((wp->lockbase != -1) && - gv_stripe_active(wp, p)) - continue; - - wp->state = IO; - mtx_unlock(&p->worklist_mtx); - TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) - g_io_request(rbp->bio, rbp->consumer); - mtx_lock(&p->worklist_mtx); - continue; - } - if (wp->state == FINISH) { - bp = wp->bio; - bp->bio_completed += wp->length; - /* - * Deliver the original request if we have - * finished. - */ - if (bp->bio_completed == bp->bio_length) { - mtx_unlock(&p->worklist_mtx); - g_io_deliver(bp, 0); - mtx_lock(&p->worklist_mtx); - } - TAILQ_REMOVE(&p->worklist, wp, list); - gv_free_raid5_packet(wp); - restart++; - /*break;*/ - } + struct gv_raid5_packet *wp, *owp; + int overlap; + + wp = bp->bio_driver1; + if (wp->lockbase == -1) + return (0); + + overlap = 0; + TAILQ_FOREACH(owp, &p->packets, list) { + if (owp == wp) + break; + if ((wp->lockbase >= owp->lockbase) && + (wp->lockbase <= owp->lockbase + owp->length)) { + overlap++; + break; } - if (!restart) { - /* Self-destruct. */ - if (p->flags & GV_PLEX_THREAD_DIE) - break; - error = msleep(p, &p->worklist_mtx, PRIBIO, "-", - hz/100); + if ((wp->lockbase <= owp->lockbase) && + (wp->lockbase + wp->length >= owp->lockbase)) { + overlap++; + break; } } - mtx_unlock(&p->worklist_mtx); - - g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); - /* Signal our plex that we are dead. 
*/ - p->flags |= GV_PLEX_THREAD_DEAD; - wakeup(p); - kthread_exit(0); -} - -/* Final bio transaction to write out the parity data. */ -int -gv_raid5_parity(struct gv_raid5_packet *wp) -{ - struct bio *bp; - - bp = g_new_bio(); - if (bp == NULL) - return (ENOMEM); - - wp->type = ISPARITY; - bp->bio_cmd = BIO_WRITE; - bp->bio_data = wp->buf; - bp->bio_offset = wp->offset; - bp->bio_length = wp->length; - bp->bio_done = gv_raid5_done; - bp->bio_caller1 = wp; - bp->bio_caller2 = NULL; - g_io_request(bp, wp->parity); - - return (0); -} - -/* We end up here after each subrequest. */ -void -gv_raid5_done(struct bio *bp) -{ - struct bio *obp; - struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_packet *wp; - struct gv_raid5_bit *rbp; - off_t i; - int error; - - wp = bp->bio_caller1; - rbp = bp->bio_caller2; - obp = wp->bio; - gp = bp->bio_from->geom; - p = gp->softc; - - /* One less active subrequest. */ - wp->active--; - - switch (obp->bio_cmd) { - case BIO_READ: - /* Degraded reads need to handle parity data. */ - if (wp->type == DEGRADED) { - for (i = 0; i < wp->length; i++) - wp->buf[i] ^= bp->bio_data[i]; - - /* When we're finished copy back the data we want. */ - if (wp->active == 0) - bcopy(wp->buf, wp->data, wp->length); - } - - break; - - case BIO_WRITE: - /* Handle the parity data, if needed. */ - if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { - for (i = 0; i < wp->length; i++) - wp->buf[i] ^= bp->bio_data[i]; - - /* Write out the parity data we calculated. */ - if (wp->active == 0) { - wp->active++; - error = gv_raid5_parity(wp); - } - } - break; - } - - /* This request group is done. */ - if (wp->active == 0) - wp->state = FINISH; + return (overlap); } /* Build a request group to perform (part of) a RAID5 request. 
*/ int -gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, - long bcount, off_t boff) +gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, + struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_bit *rbp; struct gv_sd *broken, *original, *parity, *s; - int i, psdno, sdno; - off_t len_left, real_off, stripeend, stripeoff, stripestart; + struct gv_bioq *bq; + struct bio *cbp, *pbp; + int i, psdno, sdno, type; + off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart; gp = bp->bio_to->geom; - p = gp->softc; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ - wp->type = NORMAL; +#define REQ_TYPE_NORMAL 0 +#define REQ_TYPE_DEGRADED 1 +#define REQ_TYPE_NOPARITY 2 + + type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* The number of the subdisk containing the parity stripe. */ @@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, /* Our data stripe is missing. */ if (original->state != GV_SD_UP) - wp->type = DEGRADED; + type = REQ_TYPE_DEGRADED; /* Our parity stripe is missing. */ if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ - if (wp->type != NORMAL) + if (type != REQ_TYPE_NORMAL) return (ENXIO); else - wp->type = NOPARITY; + type = REQ_TYPE_NOPARITY; } - /* - * A combined write is necessary when the original data subdisk and the - * parity subdisk are both up, but one of the other subdisks isn't. - */ - if ((broken != NULL) && (broken != parity) && (broken != original)) - wp->type = COMBINED; - - wp->offset = real_off; - wp->length = (bcount <= len_left) ? bcount : len_left; + real_len = (bcount <= len_left) ? 
bcount : len_left; + wp->length = real_len; wp->data = addr; - wp->original = original->consumer; - wp->parity = parity->consumer; - wp->lockbase = stripestart; + wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); @@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * the broken one plus the parity stripe and then recalculate * the desired data. */ - if (wp->type == DEGRADED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) - return (ENOMEM); - wp->bufmalloc = 1; + if (type == REQ_TYPE_DEGRADED) { + bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = s->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. 
*/ } else { - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->buf = addr; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_data = addr; + cbp->bio_done = g_std_done; + cbp->bio_caller2 = original->consumer; + + GV_ENQUEUE(bp, cbp, pbp); } - if (wp->type != COMBINED) - wp->lockbase = -1; + wp->lockbase = -1; + break; case BIO_WRITE: @@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * recalculate the parity from the original data, and then * write the parity stripe back out. */ - if (wp->type == DEGRADED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) - return (ENOMEM); - wp->bufmalloc = 1; - - /* Copy the original data. */ - bcopy(wp->data, wp->buf, wp->length); - + if (type == REQ_TYPE_DEGRADED) { + /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. 
*/ - if ((s == broken) || - (s->consumer == wp->parity)) + if ((s == broken) || (s == parity)) continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = s->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } - /* - * When we don't have the parity stripe we just write out the - * data. - */ - } else if (wp->type == NOPARITY) { - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) + /* Write the parity data. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->bio->bio_cmd = BIO_WRITE; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_data = addr; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + bcopy(addr, cbp->bio_data, real_len); + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + wp->parity = cbp; /* - * A combined write means that our data subdisk and the parity - * subdisks are both up, but another subdisk isn't. We need to - * read all valid stripes including the parity to recalculate - * the data of the stripe that is missing. Then we write our - * original data, and together with the other data stripes - * recalculate the parity again. + * When the parity stripe is missing we just write out the data. */ - } else if (wp->type == COMBINED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) + } else if (type == REQ_TYPE_NOPARITY) { + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - wp->bufmalloc = 1; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_data = addr; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; + cbp->bio_driver1 = wp; - /* Get the data from all subdisks. */ - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* Skip the broken subdisk. 
*/ - if (s == broken) - continue; + GV_ENQUEUE(bp, cbp, pbp); - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } - - /* Write the original data. */ - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->buf = addr; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_WRITE; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - /* - * Insert at the tail, because we want to read the old - * data first. - */ - TAILQ_INSERT_TAIL(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - - /* Get the rest of the data again. */ - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* - * Skip the broken subdisk, the parity, and the - * one we just wrote. 
- */ - if ((s == broken) || - (s->consumer == wp->parity) || - (s->consumer == wp->original)) - continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - /* - * Again, insert at the tail to keep correct - * order. - */ - TAILQ_INSERT_TAIL(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } - + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we @@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * out the parity again. */ } else { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) + /* Read old parity. */ + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - wp->bufmalloc = 1; - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* Skip the parity stripe. */ - if (s->consumer == wp->parity) - continue; + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); + + /* Read old data. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); + + /* Write new data. */ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_data = addr; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - /* - * The data for the original stripe is written, - * the others need to be read in for the parity - * calculation. - */ - if (s->consumer == wp->original) { - rbp->bio->bio_cmd = BIO_WRITE; - rbp->buf = addr; - } else { - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - } - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } + cbp->bio_driver1 = wp; + + /* + * We must not write the new data until the old data + * was read, so hold this BIO back until we're ready + * for it. + */ + wp->waiting = cbp; + + /* The final bio for the parity. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + + /* Remember that this is the BIO for the parity data. */ + wp->parity = cbp; } break; + default: return (EINVAL); } - wp->state = VALID; return (0); } diff --git a/sys/geom/vinum/geom_vinum_raid5.h b/sys/geom/vinum/geom_vinum_raid5.h index 454311fa4a53..8074f4273c10 100644 --- a/sys/geom/vinum/geom_vinum_raid5.h +++ b/sys/geom/vinum/geom_vinum_raid5.h @@ -32,22 +32,23 @@ /* * A single RAID5 request usually needs more than one I/O transaction, * depending on the state of the associated subdisks and the direction of the - * transaction (read or write). Every subrequest of a RAID5 request, - * represented by a gv_raid_packet, is defined by a gv_raid5_bit. + * transaction (read or write). */ -/* A subrequest of a RAID5 read/write operation. */ -struct gv_raid5_bit { - struct bio *bio; /* BIO of this subrequest. */ - caddr_t buf; /* Data buffer of this subrequest. */ - int malloc; /* Flag if data buffer was malloced. */ - struct g_consumer *consumer; /* Consumer to send the BIO to. */ - TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */ -}; +#define GV_ENQUEUE(bp, cbp, pbp) \ + do { \ + if (bp->bio_driver1 == NULL) { \ + bp->bio_driver1 = cbp; \ + } else { \ + pbp = bp->bio_driver1; \ + while (pbp->bio_caller1 != NULL) \ + pbp = pbp->bio_caller1; \ + pbp->bio_caller1 = cbp; \ + } \ + } while (0); -/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */ struct gv_raid5_packet { - caddr_t buf; /* Data buffer of this RAID5 request. */ + caddr_t data; /* Data buffer of this sub-request. */ off_t length; /* Size of data buffer. */ off_t lockbase; /* Deny access to our plex offset. 
*/ off_t offset; /* The drive offset of the subdisk. */ @@ -56,39 +57,17 @@ struct gv_raid5_packet { int rqcount; /* Count of subrequests. */ struct bio *bio; /* Pointer to the original bio. */ - caddr_t data; /* Pointer to the original data. */ - - struct g_consumer *original; /* Consumer to the data stripe. */ - struct g_consumer *parity; /* Consumer to the parity stripe. */ - - /* State of this RAID5 packet. */ - enum { - SETUP, /* Newly created. */ - VALID, /* Ready for processing. */ - IO, /* Currently doing I/O. */ - FINISH /* Packet has finished. */ - } state; - - /* Type of this RAID5 transaction. */ - enum { - JUNK, /* Newly created, not valid. */ - NORMAL, /* Normal read or write. */ - ISPARITY, /* Containing only parity data. */ - NOPARITY, /* Parity stripe not available. */ - DEGRADED, /* Data stripe not available. */ - COMBINED /* Data and parity stripes ok, others not. */ - } type; + struct bio *parity; /* The bio containing the parity data. */ + struct bio *waiting; /* A bio that needs to wait for other bios. */ - TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */ - TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */ + TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */ + TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. 
*/ }; -int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t, - long, off_t); -void gv_free_raid5_packet(struct gv_raid5_packet *); -void gv_raid5_done(struct bio *); +int gv_stripe_active(struct gv_plex *, struct bio *); +int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *, + struct bio *, caddr_t, off_t, off_t); void gv_raid5_worker(void *); -struct gv_raid5_packet *gv_new_raid5_packet(void); -struct gv_raid5_bit *gv_new_raid5_bit(void); +void gv_plex_done(struct bio *); #endif /* !_GEOM_VINUM_RAID5_H_ */ diff --git a/sys/geom/vinum/geom_vinum_rm.c b/sys/geom/vinum/geom_vinum_rm.c index cb2af799afea..d328c502c579 100644 --- a/sys/geom/vinum/geom_vinum_rm.c +++ b/sys/geom/vinum/geom_vinum_rm.c @@ -166,6 +166,7 @@ gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int fl /* Clean up and let our geom fade away. */ LIST_REMOVE(v, volume); + gv_kill_vol_thread(v); g_free(v); if (gp != NULL) { gp->softc = NULL; diff --git a/sys/geom/vinum/geom_vinum_subr.c b/sys/geom/vinum/geom_vinum_subr.c index dedb6c396034..8ebe1355f6ed 100644 --- a/sys/geom/vinum/geom_vinum_subr.c +++ b/sys/geom/vinum/geom_vinum_subr.c @@ -832,12 +832,25 @@ gv_kill_drive_thread(struct gv_drive *d) void gv_kill_plex_thread(struct gv_plex *p) { - if ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_THREAD_ACTIVE)) { + if (p->flags & GV_PLEX_THREAD_ACTIVE) { p->flags |= GV_PLEX_THREAD_DIE; wakeup(p); while (!(p->flags & GV_PLEX_THREAD_DEAD)) tsleep(p, PRIBIO, "gv_die", hz); p->flags &= ~GV_PLEX_THREAD_ACTIVE; - mtx_destroy(&p->worklist_mtx); + mtx_destroy(&p->bqueue_mtx); + } +} + +void +gv_kill_vol_thread(struct gv_volume *v) +{ + if (v->flags & GV_VOL_THREAD_ACTIVE) { + v->flags |= GV_VOL_THREAD_DIE; + wakeup(v); + while (!(v->flags & GV_VOL_THREAD_DEAD)) + tsleep(v, PRIBIO, "gv_die", hz); + v->flags &= ~GV_VOL_THREAD_ACTIVE; + mtx_destroy(&v->bqueue_mtx); } } diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h index 
38b540f81e86..99c1c377cea6 100644 --- a/sys/geom/vinum/geom_vinum_var.h +++ b/sys/geom/vinum/geom_vinum_var.h @@ -111,6 +111,8 @@ #define GV_BIO_DONE 0x01 #define GV_BIO_MALLOC 0x02 #define GV_BIO_ONHOLD 0x04 +#define GV_BIO_SYNCREQ 0x08 +#define GV_BIO_SUCCEED 0x10 /* * hostname is 256 bytes long, but we don't need to shlep multiple copies in @@ -269,8 +271,9 @@ struct gv_plex { off_t synced; /* Count of synced bytes. */ - struct mtx worklist_mtx; /* Mutex for RAID5 worklist. */ - TAILQ_HEAD(,gv_raid5_packet) worklist; /* List of RAID5 work packets. */ + struct mtx bqueue_mtx; /* Lock for the BIO queue. */ + TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */ + TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */ LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */ LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */ @@ -292,6 +295,14 @@ struct gv_volume { #define GV_VOL_DOWN 0 #define GV_VOL_UP 1 + int flags; +#define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */ +#define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */ +#define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */ + + struct mtx bqueue_mtx; /* Lock for the BIO queue. */ + TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */ + LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */ LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. 
*/ diff --git a/sys/geom/vinum/geom_vinum_volume.c b/sys/geom/vinum/geom_vinum_volume.c index a2f262dd88ac..4ace9d2cbd40 100644 --- a/sys/geom/vinum/geom_vinum_volume.c +++ b/sys/geom/vinum/geom_vinum_volume.c @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bio.h> #include <sys/conf.h> #include <sys/kernel.h> +#include <sys/kthread.h> #include <sys/libkern.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -42,6 +43,9 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_var.h> #include <geom/vinum/geom_vinum.h> +static void gv_vol_completed_request(struct gv_volume *, struct bio *); +static void gv_vol_normal_request(struct gv_volume *, struct bio *); + static void gv_volume_orphan(struct g_consumer *cp) { @@ -62,8 +66,10 @@ gv_volume_orphan(struct g_consumer *cp) if (!LIST_EMPTY(&gp->consumer)) return; v = gp->softc; - if (v != NULL) + if (v != NULL) { + gv_kill_vol_thread(v); v->geom = NULL; + } gp->softc = NULL; g_wither_geom(gp, error); } @@ -72,78 +78,185 @@ gv_volume_orphan(struct g_consumer *cp) static void gv_volume_done(struct bio *bp) { - struct g_consumer *cp; - - /* The next plex in this volume. */ - cp = LIST_NEXT(bp->bio_from, consumer); - - switch (bp->bio_cmd) { - case BIO_READ: - /* - * If no error occured on this request, or if we have no plex - * left, finish here... - */ - if ((bp->bio_error == 0) || (cp == NULL)) { - g_std_done(bp); - return; - } + struct gv_volume *v; + struct gv_bioq *bq; + + v = bp->bio_from->geom->softc; + bp->bio_cflags |= GV_BIO_DONE; + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + wakeup(v); + mtx_unlock(&v->bqueue_mtx); +} - /* ... or try to read from the next plex. */ - g_io_request(bp, cp); - return; +static void +gv_volume_start(struct bio *bp) +{ + struct gv_volume *v; + struct gv_bioq *bq; + switch(bp->bio_cmd) { + case BIO_READ: case BIO_WRITE: case BIO_DELETE: - /* No more plexes left. 
*/ - if (cp == NULL) { - /* - * Clear any errors if one of the previous writes - * succeeded. - */ - if (bp->bio_caller1 == (int *)1) - bp->bio_error = 0; - g_std_done(bp); - return; - } - - /* If this write request had no errors, remember that fact... */ - if (bp->bio_error == 0) - bp->bio_caller1 = (int *)1; + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } - /* ... and write to the next plex. */ - g_io_request(bp, cp); + v = bp->bio_to->geom->softc; + if (v->state != GV_VOL_UP) { + g_io_deliver(bp, ENXIO); return; } + + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + wakeup(v); + mtx_unlock(&v->bqueue_mtx); } static void -gv_volume_start(struct bio *bp) +gv_vol_worker(void *arg) { - struct g_geom *gp; - struct bio *bp2; + struct bio *bp; struct gv_volume *v; + struct gv_bioq *bq; + + v = arg; + KASSERT(v != NULL, ("NULL v")); + mtx_lock(&v->bqueue_mtx); + for (;;) { + /* We were signaled to exit. */ + if (v->flags & GV_VOL_THREAD_DIE) + break; + + /* Take the first BIO from our queue. 
*/ + bq = TAILQ_FIRST(&v->bqueue); + if (bq == NULL) { + msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10); + continue; + } + TAILQ_REMOVE(&v->bqueue, bq, queue); + mtx_unlock(&v->bqueue_mtx); - gp = bp->bio_to->geom; - v = gp->softc; - if (v->state != GV_VOL_UP) { - g_io_deliver(bp, ENXIO); - return; + bp = bq->bp; + g_free(bq); + + if (bp->bio_cflags & GV_BIO_DONE) + gv_vol_completed_request(v, bp); + else + gv_vol_normal_request(v, bp); + + mtx_lock(&v->bqueue_mtx); } - switch(bp->bio_cmd) { + mtx_unlock(&v->bqueue_mtx); + v->flags |= GV_VOL_THREAD_DEAD; + wakeup(v); + + kthread_exit(ENXIO); +} + +static void +gv_vol_completed_request(struct gv_volume *v, struct bio *bp) +{ + struct bio *pbp; + struct gv_bioq *bq; + + pbp = bp->bio_parent; + + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + + switch (pbp->bio_cmd) { case BIO_READ: + if (bp->bio_error) { + g_destroy_bio(bp); + pbp->bio_children--; + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = pbp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + mtx_unlock(&v->bqueue_mtx); + return; + } + break; case BIO_WRITE: case BIO_DELETE: - bp2 = g_clone_bio(bp); - if (bp2 == NULL) { + break; + } + + /* When the original request is finished, we deliver it. 
*/ + pbp->bio_inbed++; + if (pbp->bio_inbed == pbp->bio_children) { + pbp->bio_completed = bp->bio_length; + g_io_deliver(pbp, pbp->bio_error); + } + + g_destroy_bio(bp); +} + +static void +gv_vol_normal_request(struct gv_volume *v, struct bio *bp) +{ + struct g_geom *gp; + struct gv_plex *p; + struct bio *cbp, *pbp; + + gp = v->geom; + + switch (bp->bio_cmd) { + case BIO_READ: + cbp = g_clone_bio(bp); + if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } - bp2->bio_done = gv_volume_done; - g_io_request(bp2, LIST_FIRST(&gp->consumer)); - return; - default: - g_io_deliver(bp, EOPNOTSUPP); - return; + cbp->bio_done = gv_volume_done; + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->state >= GV_PLEX_DEGRADED) + break; + } + g_io_request(cbp, p->consumer); + + break; + + case BIO_WRITE: + case BIO_DELETE: + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->state < GV_PLEX_DEGRADED) + continue; + + cbp = g_clone_bio(bp); + if (cbp == NULL) /* XXX */ + g_io_deliver(bp, ENOMEM); + cbp->bio_done = gv_volume_done; + cbp->bio_caller2 = p->consumer; + + if (bp->bio_driver1 == NULL) { + bp->bio_driver1 = cbp; + } else { + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + pbp->bio_caller1 = cbp; + } + } + + /* Fire off all sub-requests. 
*/ + pbp = bp->bio_driver1; + while (pbp != NULL) { + g_io_request(pbp, pbp->bio_caller2); + pbp = pbp->bio_caller1; + } + + break; } } @@ -211,6 +324,11 @@ gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) gp->access = gv_volume_access; gp->softc = v; first++; + TAILQ_INIT(&v->bqueue); + mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF); + kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s", + v->name); + v->flags |= GV_VOL_THREAD_ACTIVE; } else gp = v->geom; @@ -261,9 +379,13 @@ static int gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { + struct gv_volume *v; + g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name); g_topology_assert(); + v = gp->softc; + gv_kill_vol_thread(v); g_wither_geom(gp, ENXIO); return (0); } |