diff options
author | Lukas Ertl <le@FreeBSD.org> | 2004-09-18 13:44:43 +0000 |
---|---|---|
committer | Lukas Ertl <le@FreeBSD.org> | 2004-09-18 13:44:43 +0000 |
commit | 67e3ab6ee55dd554915413ed430780e4169df3fa (patch) | |
tree | b26a5245ec9f7555a82415414ac98257151bf50c /sys | |
parent | 54516c29e84f00963a79ba79d96c99c9ec2247e9 (diff) | |
download | src-67e3ab6ee55dd554915413ed430780e4169df3fa.tar.gz src-67e3ab6ee55dd554915413ed430780e4169df3fa.zip |
Re-vamp how I/O is handled in volumes and plexes.
Analogous to the drive level, give each volume and plex a worker thread
that picks up and processes incoming and completed BIOs.
This should fix the data corruption issues that have come up a few
weeks ago and improve performance, especially of RAID5 plexes.
The volume level needs a little work, though.
Notes:
svn path=/head/; revision=135426
Diffstat (limited to 'sys')
-rw-r--r-- | sys/geom/vinum/geom_vinum.h | 1 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_init.c | 2 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_plex.c | 475 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.c | 633 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.h | 63 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_rm.c | 1 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_subr.c | 17 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_var.h | 15 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_volume.c | 228 |
9 files changed, 772 insertions(+), 663 deletions(-)
diff --git a/sys/geom/vinum/geom_vinum.h b/sys/geom/vinum/geom_vinum.h index a507d73e498c..ddbf5cf38d4c 100644 --- a/sys/geom/vinum/geom_vinum.h +++ b/sys/geom/vinum/geom_vinum.h @@ -70,6 +70,7 @@ int gv_is_striped(struct gv_plex *); int gv_is_open(struct g_geom *); void gv_kill_drive_thread(struct gv_drive *); void gv_kill_plex_thread(struct gv_plex *); +void gv_kill_vol_thread(struct gv_volume *); int gv_object_type(struct gv_softc *, char *); void gv_parse_config(struct gv_softc *, u_char *, int); const char *gv_roughlength(off_t, int); diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c index 4ad7a031078c..382ea1572b08 100644 --- a/sys/geom/vinum/geom_vinum_init.c +++ b/sys/geom/vinum/geom_vinum_init.c @@ -293,7 +293,7 @@ gv_sync_td(void *arg) * This hack declare this bio as part of an initialization * process, so that the lower levels allow it to get through. */ - bp->bio_caller1 = p; + bp->bio_cflags |= GV_BIO_SYNCREQ; /* Schedule it down ... */ g_io_request(bp, to); diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c index 8cfa6be488cb..494ec2c0ba10 100644 --- a/sys/geom/vinum/geom_vinum_plex.c +++ b/sys/geom/vinum/geom_vinum_plex.c @@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_raid5.h> #include <geom/vinum/geom_vinum.h> +static void gv_plex_completed_request(struct gv_plex *, struct bio *); +static void gv_plex_normal_request(struct gv_plex *, struct bio *); +static void gv_plex_worker(void *); + /* XXX: is this the place to catch dying subdisks? 
*/ static void gv_plex_orphan(struct g_consumer *cp) @@ -76,48 +80,39 @@ gv_plex_orphan(struct g_consumer *cp) g_wither_geom(gp, error); } -static void +void gv_plex_done(struct bio *bp) { - struct g_geom *gp; - struct gv_sd *s; - - gp = bp->bio_to->geom; - - s = bp->bio_caller1; - KASSERT(s != NULL, ("gv_plex_done: NULL s")); - - if (bp->bio_error == 0) - s->initialized += bp->bio_length; - - if (s->initialized >= s->size) { - gv_set_sd_state(s, GV_SD_UP, 0); - s->initialized = 0; - } - - g_std_done(bp); + struct gv_plex *p; + struct gv_bioq *bq; + + p = bp->bio_from->geom->softc; + bp->bio_cflags |= GV_BIO_DONE; + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + wakeup(p); + mtx_unlock(&p->bqueue_mtx); } /* Find the correct subdisk to send the bio to and build a bio to send. */ static int -gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, - caddr_t addr, long bcount, off_t boff) +gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct g_geom *gp; - struct gv_plex *p; struct gv_sd *s; - struct bio *cbp; + struct bio *cbp, *pbp; int i, sdno; - off_t len_left, real_len, real_off, stripeend, stripeno, stripestart; - - s = NULL; - - gp = bp->bio_to->geom; - p = gp->softc; + off_t len_left, real_len, real_off; + off_t stripeend, stripeno, stripestart; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); + s = NULL; + gp = bp->bio_to->geom; + /* * We only handle concatenated and striped plexes here. RAID5 plexes * are handled in build_raid5_request(). 
@@ -190,10 +185,10 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, break; case GV_SD_STALE: - if (bp->bio_caller1 != p) + if (!(bp->bio_cflags & GV_BIO_SYNCREQ)) return (ENXIO); - printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name); + printf("GEOM_VINUM: sd %s is initializing\n", s->name); gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); break; @@ -214,103 +209,365 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, cbp->bio_offset = real_off; cbp->bio_length = real_len; cbp->bio_data = addr; - if (bp->bio_caller1 == p) { - cbp->bio_caller1 = s; + cbp->bio_done = g_std_done; + cbp->bio_caller2 = s->consumer; + if ((bp->bio_cflags & GV_BIO_SYNCREQ)) { + cbp->bio_cflags |= GV_BIO_SYNCREQ; cbp->bio_done = gv_plex_done; - } else - cbp->bio_done = g_std_done; - *bp2 = cbp; - *cp = s->consumer; + } + + if (bp->bio_driver1 == NULL) { + bp->bio_driver1 = cbp; + } else { + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + pbp->bio_caller1 = cbp; + } + return (0); } static void gv_plex_start(struct bio *bp) { - struct g_geom *gp; - struct g_consumer *cp; struct gv_plex *p; - struct gv_raid5_packet *wp; - struct bio *bp2; - caddr_t addr; - off_t boff; - long bcount, rcount; - int err; + struct gv_bioq *bq; - gp = bp->bio_to->geom; - p = gp->softc; + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } /* * We cannot handle this request if too many of our subdisks are * inaccessible. */ - if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) { - g_io_deliver(bp, ENXIO); /* XXX: correct way? 
*/ + p = bp->bio_to->geom->softc; + if ((p->state < GV_PLEX_DEGRADED) && + !(bp->bio_cflags & GV_BIO_SYNCREQ)) { + g_io_deliver(bp, ENXIO); return; } - switch(bp->bio_cmd) { - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + wakeup(p); + mtx_unlock(&p->bqueue_mtx); +} + +static void +gv_plex_worker(void *arg) +{ + struct bio *bp; + struct gv_plex *p; + struct gv_sd *s; + struct gv_bioq *bq; + + p = arg; + KASSERT(p != NULL, ("NULL p")); + + mtx_lock(&p->bqueue_mtx); + for (;;) { + /* We were signaled to exit. */ + if (p->flags & GV_PLEX_THREAD_DIE) + break; + + /* Take the first BIO from our queue. */ + bq = TAILQ_FIRST(&p->bqueue); + if (bq == NULL) { + msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10); + continue; + } + TAILQ_REMOVE(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + + bp = bq->bp; + + /* A completed request. */ + if (bp->bio_cflags & GV_BIO_DONE) { + g_free(bq); + if (bp->bio_cflags & GV_BIO_SYNCREQ) { + s = bp->bio_to->private; + if (bp->bio_error == 0) + s->initialized += bp->bio_length; + if (s->initialized >= s->size) { + g_topology_lock(); + gv_set_sd_state(s, GV_SD_UP, + GV_SETSTATE_CONFIG); + g_topology_unlock(); + s->initialized = 0; + } + g_std_done(bp); + } else + gv_plex_completed_request(p, bp); /* - * We split up the request in smaller packets and hand them - * down to our subdisks. + * A sub-request that was hold back because it interfered with + * another sub-request. */ - wp = NULL; - addr = bp->bio_data; - boff = bp->bio_offset; - for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { - /* - * RAID5 requests usually need to be split up in - * several subrequests. 
- */ - if (p->org == GV_PLEX_RAID5) { - wp = gv_new_raid5_packet(); - wp->bio = bp; - err = gv_build_raid5_req(wp, bp, addr, bcount, - boff); - } else - err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount, - boff); + } else if (bp->bio_cflags & GV_BIO_ONHOLD) { + /* Is it still locked out? */ + if (gv_stripe_active(p, bp)) { + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + } else { + g_free(bq); + bp->bio_cflags &= ~GV_BIO_ONHOLD; + g_io_request(bp, bp->bio_caller2); + } - if (err) { - if (p->org == GV_PLEX_RAID5) - gv_free_raid5_packet(wp); - bp->bio_completed += bcount; - if (bp->bio_error == 0) - bp->bio_error = err; - if (bp->bio_completed == bp->bio_length) - g_io_deliver(bp, bp->bio_error); - return; + /* A normal request to this plex. */ + } else { + g_free(bq); + gv_plex_normal_request(p, bp); + } + + mtx_lock(&p->bqueue_mtx); + } + mtx_unlock(&p->bqueue_mtx); + p->flags |= GV_PLEX_THREAD_DEAD; + wakeup(p); + + kthread_exit(ENXIO); +} + +void +gv_plex_completed_request(struct gv_plex *p, struct bio *bp) +{ + struct bio *cbp, *pbp; + struct gv_bioq *bq, *bq2; + struct gv_raid5_packet *wp; + int i; + + wp = bp->bio_driver1; + + switch (bp->bio_parent->bio_cmd) { + case BIO_READ: + if (wp == NULL) + break; + + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp == bp) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + for (i = 0; i < wp->length; i++) + wp->data[i] ^= bp->bio_data[i]; + break; + } + } + if (TAILQ_EMPTY(&wp->bits)) { + bp->bio_parent->bio_completed += wp->length; + if (wp->lockbase != -1) + TAILQ_REMOVE(&p->packets, wp, list); + g_free(wp); + } + + break; + + case BIO_WRITE: + if (wp == NULL) + break; + + /* Check if we need to handle parity data. 
*/ + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + if (bq->bp == bp) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + cbp = wp->parity; + if (cbp != NULL) { + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= + bp->bio_data[i]; + } + break; } - - if (p->org != GV_PLEX_RAID5) { - rcount = bp2->bio_length; - g_io_request(bp2, cp); - - /* - * RAID5 subrequests are queued on a worklist - * and picked up from the worker thread. This - * ensures correct order. - */ + } + + /* Handle parity data. */ + if (TAILQ_EMPTY(&wp->bits)) { + if (wp->waiting != NULL) { + pbp = wp->waiting; + wp->waiting = NULL; + cbp = wp->parity; + for (i = 0; i < wp->length; i++) + cbp->bio_data[i] ^= pbp->bio_data[i]; + g_io_request(pbp, pbp->bio_caller2); + } else if (wp->parity != NULL) { + cbp = wp->parity; + wp->parity = NULL; + g_io_request(cbp, cbp->bio_caller2); } else { - mtx_lock(&p->worklist_mtx); - TAILQ_INSERT_TAIL(&p->worklist, wp, - list); - mtx_unlock(&p->worklist_mtx); - wakeup(&p); - rcount = wp->length; + bp->bio_parent->bio_completed += wp->length; + TAILQ_REMOVE(&p->packets, wp, list); + g_free(wp); } + } + + break; + } - boff += rcount; - addr += rcount; + pbp = bp->bio_parent; + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + + /* When the original request is finished, we deliver it. */ + pbp->bio_inbed++; + if (pbp->bio_inbed == pbp->bio_children) + g_io_deliver(pbp, pbp->bio_error); + + /* Clean up what we allocated. */ + if (bp->bio_cflags & GV_BIO_MALLOC) + g_free(bp->bio_data); + g_destroy_bio(bp); +} + +void +gv_plex_normal_request(struct gv_plex *p, struct bio *bp) +{ + struct bio *cbp, *pbp; + struct gv_bioq *bq, *bq2; + struct gv_raid5_packet *wp, *wp2; + caddr_t addr; + off_t bcount, boff; + int err; + + bcount = bp->bio_length; + addr = bp->bio_data; + boff = bp->bio_offset; + + /* Walk over the whole length of the request, we might split it up. 
*/ + while (bcount > 0) { + wp = NULL; + + /* + * RAID5 plexes need special treatment, as a single write + * request involves several read/write sub-requests. + */ + if (p->org == GV_PLEX_RAID5) { + wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); + wp->bio = bp; + TAILQ_INIT(&wp->bits); + + err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount); + + /* + * Building the sub-request failed, we probably need to + * clean up a lot. + */ + if (err) { + printf("GEOM_VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, bq, queue); + g_free(bq); + } + if (wp->waiting != NULL) { + if (wp->waiting->bio_cflags & + GV_BIO_MALLOC) + g_free(wp->waiting->bio_data); + g_destroy_bio(wp->waiting); + } + if (wp->parity != NULL) { + if (wp->parity->bio_cflags & + GV_BIO_MALLOC) + g_free(wp->parity->bio_data); + g_destroy_bio(wp->parity); + } + g_free(wp); + + TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { + if (wp->bio == bp) { + TAILQ_REMOVE(&p->packets, wp, + list); + TAILQ_FOREACH_SAFE(bq, + &wp->bits, queue, bq2) { + TAILQ_REMOVE(&wp->bits, + bq, queue); + g_free(bq); + } + g_free(wp); + } + } + + cbp = bp->bio_driver1; + while (cbp != NULL) { + pbp = cbp->bio_caller1; + if (cbp->bio_cflags & GV_BIO_MALLOC) + g_free(cbp->bio_data); + g_destroy_bio(cbp); + cbp = pbp; + } + + g_io_deliver(bp, err); + return; + } + + if (TAILQ_EMPTY(&wp->bits)) + g_free(wp); + else if (wp->lockbase != -1) + TAILQ_INSERT_TAIL(&p->packets, wp, list); + + /* + * Requests to concatenated and striped plexes go straight + * through. + */ + } else { + err = gv_plexbuffer(p, bp, addr, boff, bcount); + + /* Building the sub-request failed. 
*/ + if (err) { + printf("GEOM_VINUM: plex request failed for "); + g_print_bio(bp); + printf("\n"); + cbp = bp->bio_driver1; + while (cbp != NULL) { + pbp = cbp->bio_caller1; + g_destroy_bio(cbp); + cbp = pbp; + } + g_io_deliver(bp, err); + return; + } } - return; + + /* Abuse bio_caller1 as linked list. */ + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + bcount -= pbp->bio_length; + addr += pbp->bio_length; + boff += pbp->bio_length; + } - default: - g_io_deliver(bp, EOPNOTSUPP); - return; + /* Fire off all sub-requests. */ + pbp = bp->bio_driver1; + while (pbp != NULL) { + /* + * RAID5 sub-requests need to come in correct order, otherwise + * we trip over the parity, as it might be overwritten by + * another sub-request. + */ + if (pbp->bio_driver1 != NULL && + gv_stripe_active(p, pbp)) { + pbp->bio_cflags |= GV_BIO_ONHOLD; + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = pbp; + mtx_lock(&p->bqueue_mtx); + TAILQ_INSERT_TAIL(&p->bqueue, bq, queue); + mtx_unlock(&p->bqueue_mtx); + } else + g_io_request(pbp, pbp->bio_caller2); + pbp = pbp->bio_caller1; } } @@ -425,16 +682,12 @@ gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) gp->softc = p; p->geom = gp; - /* RAID5 plexes need a 'worker' thread, where IO is handled. */ - if (p->org == GV_PLEX_RAID5) { - TAILQ_INIT(&p->worklist); - mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL, - MTX_DEF); - p->flags &= ~GV_PLEX_THREAD_DIE; - kthread_create(gv_raid5_worker, gp, NULL, 0, 0, - "gv_raid5"); - p->flags |= GV_PLEX_THREAD_ACTIVE; - } + TAILQ_INIT(&p->packets); + TAILQ_INIT(&p->bqueue); + mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF); + kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s", + p->name); + p->flags |= GV_PLEX_THREAD_ACTIVE; /* Attach a consumer to this provider. 
*/ cp = g_new_consumer(gp); diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c index 8dfe8ab570d2..62fb24685516 100644 --- a/sys/geom/vinum/geom_vinum_raid5.c +++ b/sys/geom/vinum/geom_vinum_raid5.c @@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_raid5.h> #include <geom/vinum/geom_vinum.h> -int gv_raid5_parity(struct gv_raid5_packet *); -int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); - -struct gv_raid5_bit * -gv_new_raid5_bit(void) -{ - struct gv_raid5_bit *r; - r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); - KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); - return (r); -} - -struct gv_raid5_packet * -gv_new_raid5_packet(void) -{ - struct gv_raid5_packet *wp; - - wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); - KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); - wp->state = SETUP; - wp->type = JUNK; - TAILQ_INIT(&wp->bits); - - return (wp); -} - -void -gv_free_raid5_packet(struct gv_raid5_packet *wp) -{ - struct gv_raid5_bit *r, *r2; - - /* Remove all the bits from this work packet. */ - TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) { - TAILQ_REMOVE(&wp->bits, r, list); - if (r->malloc) - g_free(r->buf); - if (r->bio != NULL) - g_destroy_bio(r->bio); - g_free(r); - } - - if (wp->bufmalloc == 1) - g_free(wp->buf); - g_free(wp); -} - /* * Check if the stripe that the work packet wants is already being used by * some other work packet. */ int -gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) -{ - struct gv_raid5_packet *wpa; - - TAILQ_FOREACH(wpa, &sc->worklist, list) { - if (wpa->lockbase == wp->lockbase) { - if (wpa == wp) - return (0); - return (1); - } - } - return (0); -} - -/* - * The "worker" thread that runs through the worklist and fires off the - * "subrequests" needed to fulfill a RAID5 read or write request. 
- */ -void -gv_raid5_worker(void *arg) +gv_stripe_active(struct gv_plex *p, struct bio *bp) { - struct bio *bp; - struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_packet *wp, *wpt; - struct gv_raid5_bit *rbp, *rbpt; - int error, restart; - - gp = arg; - p = gp->softc; - - mtx_lock(&p->worklist_mtx); - for (;;) { - restart = 0; - TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { - /* This request packet is already being processed. */ - if (wp->state == IO) - continue; - /* This request packet is ready for processing. */ - if (wp->state == VALID) { - /* Couldn't get the lock, try again. */ - if ((wp->lockbase != -1) && - gv_stripe_active(wp, p)) - continue; - - wp->state = IO; - mtx_unlock(&p->worklist_mtx); - TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) - g_io_request(rbp->bio, rbp->consumer); - mtx_lock(&p->worklist_mtx); - continue; - } - if (wp->state == FINISH) { - bp = wp->bio; - bp->bio_completed += wp->length; - /* - * Deliver the original request if we have - * finished. - */ - if (bp->bio_completed == bp->bio_length) { - mtx_unlock(&p->worklist_mtx); - g_io_deliver(bp, 0); - mtx_lock(&p->worklist_mtx); - } - TAILQ_REMOVE(&p->worklist, wp, list); - gv_free_raid5_packet(wp); - restart++; - /*break;*/ - } + struct gv_raid5_packet *wp, *owp; + int overlap; + + wp = bp->bio_driver1; + if (wp->lockbase == -1) + return (0); + + overlap = 0; + TAILQ_FOREACH(owp, &p->packets, list) { + if (owp == wp) + break; + if ((wp->lockbase >= owp->lockbase) && + (wp->lockbase <= owp->lockbase + owp->length)) { + overlap++; + break; } - if (!restart) { - /* Self-destruct. */ - if (p->flags & GV_PLEX_THREAD_DIE) - break; - error = msleep(p, &p->worklist_mtx, PRIBIO, "-", - hz/100); + if ((wp->lockbase <= owp->lockbase) && + (wp->lockbase + wp->length >= owp->lockbase)) { + overlap++; + break; } } - mtx_unlock(&p->worklist_mtx); - - g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); - /* Signal our plex that we are dead. 
*/ - p->flags |= GV_PLEX_THREAD_DEAD; - wakeup(p); - kthread_exit(0); -} - -/* Final bio transaction to write out the parity data. */ -int -gv_raid5_parity(struct gv_raid5_packet *wp) -{ - struct bio *bp; - - bp = g_new_bio(); - if (bp == NULL) - return (ENOMEM); - - wp->type = ISPARITY; - bp->bio_cmd = BIO_WRITE; - bp->bio_data = wp->buf; - bp->bio_offset = wp->offset; - bp->bio_length = wp->length; - bp->bio_done = gv_raid5_done; - bp->bio_caller1 = wp; - bp->bio_caller2 = NULL; - g_io_request(bp, wp->parity); - - return (0); -} - -/* We end up here after each subrequest. */ -void -gv_raid5_done(struct bio *bp) -{ - struct bio *obp; - struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_packet *wp; - struct gv_raid5_bit *rbp; - off_t i; - int error; - - wp = bp->bio_caller1; - rbp = bp->bio_caller2; - obp = wp->bio; - gp = bp->bio_from->geom; - p = gp->softc; - - /* One less active subrequest. */ - wp->active--; - - switch (obp->bio_cmd) { - case BIO_READ: - /* Degraded reads need to handle parity data. */ - if (wp->type == DEGRADED) { - for (i = 0; i < wp->length; i++) - wp->buf[i] ^= bp->bio_data[i]; - - /* When we're finished copy back the data we want. */ - if (wp->active == 0) - bcopy(wp->buf, wp->data, wp->length); - } - - break; - - case BIO_WRITE: - /* Handle the parity data, if needed. */ - if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { - for (i = 0; i < wp->length; i++) - wp->buf[i] ^= bp->bio_data[i]; - - /* Write out the parity data we calculated. */ - if (wp->active == 0) { - wp->active++; - error = gv_raid5_parity(wp); - } - } - break; - } - - /* This request group is done. */ - if (wp->active == 0) - wp->state = FINISH; + return (overlap); } /* Build a request group to perform (part of) a RAID5 request. 
*/ int -gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, - long bcount, off_t boff) +gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, + struct bio *bp, caddr_t addr, off_t boff, off_t bcount) { struct g_geom *gp; - struct gv_plex *p; - struct gv_raid5_bit *rbp; struct gv_sd *broken, *original, *parity, *s; - int i, psdno, sdno; - off_t len_left, real_off, stripeend, stripeoff, stripestart; + struct gv_bioq *bq; + struct bio *cbp, *pbp; + int i, psdno, sdno, type; + off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart; gp = bp->bio_to->geom; - p = gp->softc; if (p == NULL || LIST_EMPTY(&p->subdisks)) return (ENXIO); /* We are optimistic and assume that this request will be OK. */ - wp->type = NORMAL; +#define REQ_TYPE_NORMAL 0 +#define REQ_TYPE_DEGRADED 1 +#define REQ_TYPE_NOPARITY 2 + + type = REQ_TYPE_NORMAL; original = parity = broken = NULL; /* The number of the subdisk containing the parity stripe. */ @@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, /* Our data stripe is missing. */ if (original->state != GV_SD_UP) - wp->type = DEGRADED; + type = REQ_TYPE_DEGRADED; /* Our parity stripe is missing. */ if (parity->state != GV_SD_UP) { /* We cannot take another failure if we're already degraded. */ - if (wp->type != NORMAL) + if (type != REQ_TYPE_NORMAL) return (ENXIO); else - wp->type = NOPARITY; + type = REQ_TYPE_NOPARITY; } - /* - * A combined write is necessary when the original data subdisk and the - * parity subdisk are both up, but one of the other subdisks isn't. - */ - if ((broken != NULL) && (broken != parity) && (broken != original)) - wp->type = COMBINED; - - wp->offset = real_off; - wp->length = (bcount <= len_left) ? bcount : len_left; + real_len = (bcount <= len_left) ? 
bcount : len_left; + wp->length = real_len; wp->data = addr; - wp->original = original->consumer; - wp->parity = parity->consumer; - wp->lockbase = stripestart; + wp->lockbase = real_off; KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); @@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * the broken one plus the parity stripe and then recalculate * the desired data. */ - if (wp->type == DEGRADED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) - return (ENOMEM); - wp->bufmalloc = 1; + if (type == REQ_TYPE_DEGRADED) { + bzero(wp->data, wp->length); LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken subdisk. */ if (s == broken) continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = s->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } /* A normal read can be fulfilled with the original subdisk. 
*/ } else { - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->buf = addr; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_data = addr; + cbp->bio_done = g_std_done; + cbp->bio_caller2 = original->consumer; + + GV_ENQUEUE(bp, cbp, pbp); } - if (wp->type != COMBINED) - wp->lockbase = -1; + wp->lockbase = -1; + break; case BIO_WRITE: @@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * recalculate the parity from the original data, and then * write the parity stripe back out. */ - if (wp->type == DEGRADED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) - return (ENOMEM); - wp->bufmalloc = 1; - - /* Copy the original data. */ - bcopy(wp->data, wp->buf, wp->length); - + if (type == REQ_TYPE_DEGRADED) { + /* Read all subdisks. */ LIST_FOREACH(s, &p->subdisks, in_plex) { /* Skip the broken and the parity subdisk. 
*/ - if ((s == broken) || - (s->consumer == wp->parity)) + if ((s == broken) || (s == parity)) continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_cmd = BIO_READ; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = s->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); } - /* - * When we don't have the parity stripe we just write out the - * data. - */ - } else if (wp->type == NOPARITY) { - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) + /* Write the parity data. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - rbp->bio->bio_cmd = BIO_WRITE; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_data = addr; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + bcopy(addr, cbp->bio_data, real_len); + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + wp->parity = cbp; /* - * A combined write means that our data subdisk and the parity - * subdisks are both up, but another subdisk isn't. We need to - * read all valid stripes including the parity to recalculate - * the data of the stripe that is missing. Then we write our - * original data, and together with the other data stripes - * recalculate the parity again. + * When the parity stripe is missing we just write out the data. */ - } else if (wp->type == COMBINED) { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) + } else if (type == REQ_TYPE_NOPARITY) { + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - wp->bufmalloc = 1; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_data = addr; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; + cbp->bio_driver1 = wp; - /* Get the data from all subdisks. */ - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* Skip the broken subdisk. 
*/ - if (s == broken) - continue; + GV_ENQUEUE(bp, cbp, pbp); - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } - - /* Write the original data. */ - rbp = gv_new_raid5_bit(); - rbp->consumer = wp->original; - rbp->buf = addr; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_WRITE; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - /* - * Insert at the tail, because we want to read the old - * data first. - */ - TAILQ_INSERT_TAIL(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - - /* Get the rest of the data again. */ - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* - * Skip the broken subdisk, the parity, and the - * one we just wrote. 
- */ - if ((s == broken) || - (s->consumer == wp->parity) || - (s->consumer == wp->original)) - continue; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - /* - * Again, insert at the tail to keep correct - * order. - */ - TAILQ_INSERT_TAIL(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } - + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); /* * A normal write request goes to the original subdisk, then we @@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, * out the parity again. */ } else { - wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); - if (wp->buf == NULL) + /* Read old parity. */ + cbp = g_clone_bio(bp); + if (cbp == NULL) return (ENOMEM); - wp->bufmalloc = 1; - LIST_FOREACH(s, &p->subdisks, in_plex) { - /* Skip the parity stripe. */ - if (s->consumer == wp->parity) - continue; + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); + + /* Read old data. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_cmd = BIO_READ; + cbp->bio_data = g_malloc(real_len, M_WAITOK); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; + cbp->bio_driver1 = wp; + + GV_ENQUEUE(bp, cbp, pbp); + + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = cbp; + TAILQ_INSERT_TAIL(&wp->bits, bq, queue); + + /* Write new data. */ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_data = addr; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = original->consumer; - rbp = gv_new_raid5_bit(); - rbp->consumer = s->consumer; - rbp->bio = g_new_bio(); - if (rbp->bio == NULL) - return (ENOMEM); - /* - * The data for the original stripe is written, - * the others need to be read in for the parity - * calculation. - */ - if (s->consumer == wp->original) { - rbp->bio->bio_cmd = BIO_WRITE; - rbp->buf = addr; - } else { - rbp->bio->bio_cmd = BIO_READ; - rbp->buf = g_malloc(wp->length, - M_NOWAIT | M_ZERO); - if (rbp->buf == NULL) - return (ENOMEM); - rbp->malloc = 1; - } - rbp->bio->bio_data = rbp->buf; - rbp->bio->bio_offset = wp->offset; - rbp->bio->bio_length = wp->length; - rbp->bio->bio_done = gv_raid5_done; - rbp->bio->bio_caller1 = wp; - rbp->bio->bio_caller2 = rbp; - TAILQ_INSERT_HEAD(&wp->bits, rbp, list); - wp->active++; - wp->rqcount++; - } + cbp->bio_driver1 = wp; + + /* + * We must not write the new data until the old data + * was read, so hold this BIO back until we're ready + * for it. + */ + wp->waiting = cbp; + + /* The final bio for the parity. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); + cbp->bio_cflags |= GV_BIO_MALLOC; + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_done = gv_plex_done; + cbp->bio_caller2 = parity->consumer; + cbp->bio_driver1 = wp; + + /* Remember that this is the BIO for the parity data. */ + wp->parity = cbp; } break; + default: return (EINVAL); } - wp->state = VALID; return (0); } diff --git a/sys/geom/vinum/geom_vinum_raid5.h b/sys/geom/vinum/geom_vinum_raid5.h index 454311fa4a53..8074f4273c10 100644 --- a/sys/geom/vinum/geom_vinum_raid5.h +++ b/sys/geom/vinum/geom_vinum_raid5.h @@ -32,22 +32,23 @@ /* * A single RAID5 request usually needs more than one I/O transaction, * depending on the state of the associated subdisks and the direction of the - * transaction (read or write). Every subrequest of a RAID5 request, - * represented by a gv_raid_packet, is defined by a gv_raid5_bit. + * transaction (read or write). */ -/* A subrequest of a RAID5 read/write operation. */ -struct gv_raid5_bit { - struct bio *bio; /* BIO of this subrequest. */ - caddr_t buf; /* Data buffer of this subrequest. */ - int malloc; /* Flag if data buffer was malloced. */ - struct g_consumer *consumer; /* Consumer to send the BIO to. */ - TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */ -}; +#define GV_ENQUEUE(bp, cbp, pbp) \ + do { \ + if (bp->bio_driver1 == NULL) { \ + bp->bio_driver1 = cbp; \ + } else { \ + pbp = bp->bio_driver1; \ + while (pbp->bio_caller1 != NULL) \ + pbp = pbp->bio_caller1; \ + pbp->bio_caller1 = cbp; \ + } \ + } while (0); -/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */ struct gv_raid5_packet { - caddr_t buf; /* Data buffer of this RAID5 request. */ + caddr_t data; /* Data buffer of this sub-request. */ off_t length; /* Size of data buffer. */ off_t lockbase; /* Deny access to our plex offset. 
*/ off_t offset; /* The drive offset of the subdisk. */ @@ -56,39 +57,17 @@ struct gv_raid5_packet { int rqcount; /* Count of subrequests. */ struct bio *bio; /* Pointer to the original bio. */ - caddr_t data; /* Pointer to the original data. */ - - struct g_consumer *original; /* Consumer to the data stripe. */ - struct g_consumer *parity; /* Consumer to the parity stripe. */ - - /* State of this RAID5 packet. */ - enum { - SETUP, /* Newly created. */ - VALID, /* Ready for processing. */ - IO, /* Currently doing I/O. */ - FINISH /* Packet has finished. */ - } state; - - /* Type of this RAID5 transaction. */ - enum { - JUNK, /* Newly created, not valid. */ - NORMAL, /* Normal read or write. */ - ISPARITY, /* Containing only parity data. */ - NOPARITY, /* Parity stripe not available. */ - DEGRADED, /* Data stripe not available. */ - COMBINED /* Data and parity stripes ok, others not. */ - } type; + struct bio *parity; /* The bio containing the parity data. */ + struct bio *waiting; /* A bio that needs to wait for other bios. */ - TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */ - TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */ + TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */ + TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. 
*/ }; -int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t, - long, off_t); -void gv_free_raid5_packet(struct gv_raid5_packet *); -void gv_raid5_done(struct bio *); +int gv_stripe_active(struct gv_plex *, struct bio *); +int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *, + struct bio *, caddr_t, off_t, off_t); void gv_raid5_worker(void *); -struct gv_raid5_packet *gv_new_raid5_packet(void); -struct gv_raid5_bit *gv_new_raid5_bit(void); +void gv_plex_done(struct bio *); #endif /* !_GEOM_VINUM_RAID5_H_ */ diff --git a/sys/geom/vinum/geom_vinum_rm.c b/sys/geom/vinum/geom_vinum_rm.c index cb2af799afea..d328c502c579 100644 --- a/sys/geom/vinum/geom_vinum_rm.c +++ b/sys/geom/vinum/geom_vinum_rm.c @@ -166,6 +166,7 @@ gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int fl /* Clean up and let our geom fade away. */ LIST_REMOVE(v, volume); + gv_kill_vol_thread(v); g_free(v); if (gp != NULL) { gp->softc = NULL; diff --git a/sys/geom/vinum/geom_vinum_subr.c b/sys/geom/vinum/geom_vinum_subr.c index dedb6c396034..8ebe1355f6ed 100644 --- a/sys/geom/vinum/geom_vinum_subr.c +++ b/sys/geom/vinum/geom_vinum_subr.c @@ -832,12 +832,25 @@ gv_kill_drive_thread(struct gv_drive *d) void gv_kill_plex_thread(struct gv_plex *p) { - if ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_THREAD_ACTIVE)) { + if (p->flags & GV_PLEX_THREAD_ACTIVE) { p->flags |= GV_PLEX_THREAD_DIE; wakeup(p); while (!(p->flags & GV_PLEX_THREAD_DEAD)) tsleep(p, PRIBIO, "gv_die", hz); p->flags &= ~GV_PLEX_THREAD_ACTIVE; - mtx_destroy(&p->worklist_mtx); + mtx_destroy(&p->bqueue_mtx); + } +} + +void +gv_kill_vol_thread(struct gv_volume *v) +{ + if (v->flags & GV_VOL_THREAD_ACTIVE) { + v->flags |= GV_VOL_THREAD_DIE; + wakeup(v); + while (!(v->flags & GV_VOL_THREAD_DEAD)) + tsleep(v, PRIBIO, "gv_die", hz); + v->flags &= ~GV_VOL_THREAD_ACTIVE; + mtx_destroy(&v->bqueue_mtx); } } diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h index 
38b540f81e86..99c1c377cea6 100644 --- a/sys/geom/vinum/geom_vinum_var.h +++ b/sys/geom/vinum/geom_vinum_var.h @@ -111,6 +111,8 @@ #define GV_BIO_DONE 0x01 #define GV_BIO_MALLOC 0x02 #define GV_BIO_ONHOLD 0x04 +#define GV_BIO_SYNCREQ 0x08 +#define GV_BIO_SUCCEED 0x10 /* * hostname is 256 bytes long, but we don't need to shlep multiple copies in @@ -269,8 +271,9 @@ struct gv_plex { off_t synced; /* Count of synced bytes. */ - struct mtx worklist_mtx; /* Mutex for RAID5 worklist. */ - TAILQ_HEAD(,gv_raid5_packet) worklist; /* List of RAID5 work packets. */ + struct mtx bqueue_mtx; /* Lock for the BIO queue. */ + TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */ + TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */ LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */ LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */ @@ -292,6 +295,14 @@ struct gv_volume { #define GV_VOL_DOWN 0 #define GV_VOL_UP 1 + int flags; +#define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */ +#define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */ +#define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */ + + struct mtx bqueue_mtx; /* Lock for the BIO queue. */ + TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */ + LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */ LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. 
*/ diff --git a/sys/geom/vinum/geom_vinum_volume.c b/sys/geom/vinum/geom_vinum_volume.c index a2f262dd88ac..4ace9d2cbd40 100644 --- a/sys/geom/vinum/geom_vinum_volume.c +++ b/sys/geom/vinum/geom_vinum_volume.c @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bio.h> #include <sys/conf.h> #include <sys/kernel.h> +#include <sys/kthread.h> #include <sys/libkern.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -42,6 +43,9 @@ __FBSDID("$FreeBSD$"); #include <geom/vinum/geom_vinum_var.h> #include <geom/vinum/geom_vinum.h> +static void gv_vol_completed_request(struct gv_volume *, struct bio *); +static void gv_vol_normal_request(struct gv_volume *, struct bio *); + static void gv_volume_orphan(struct g_consumer *cp) { @@ -62,8 +66,10 @@ gv_volume_orphan(struct g_consumer *cp) if (!LIST_EMPTY(&gp->consumer)) return; v = gp->softc; - if (v != NULL) + if (v != NULL) { + gv_kill_vol_thread(v); v->geom = NULL; + } gp->softc = NULL; g_wither_geom(gp, error); } @@ -72,78 +78,185 @@ gv_volume_orphan(struct g_consumer *cp) static void gv_volume_done(struct bio *bp) { - struct g_consumer *cp; - - /* The next plex in this volume. */ - cp = LIST_NEXT(bp->bio_from, consumer); - - switch (bp->bio_cmd) { - case BIO_READ: - /* - * If no error occured on this request, or if we have no plex - * left, finish here... - */ - if ((bp->bio_error == 0) || (cp == NULL)) { - g_std_done(bp); - return; - } + struct gv_volume *v; + struct gv_bioq *bq; + + v = bp->bio_from->geom->softc; + bp->bio_cflags |= GV_BIO_DONE; + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + wakeup(v); + mtx_unlock(&v->bqueue_mtx); +} - /* ... or try to read from the next plex. */ - g_io_request(bp, cp); - return; +static void +gv_volume_start(struct bio *bp) +{ + struct gv_volume *v; + struct gv_bioq *bq; + switch(bp->bio_cmd) { + case BIO_READ: case BIO_WRITE: case BIO_DELETE: - /* No more plexes left. 
*/ - if (cp == NULL) { - /* - * Clear any errors if one of the previous writes - * succeeded. - */ - if (bp->bio_caller1 == (int *)1) - bp->bio_error = 0; - g_std_done(bp); - return; - } - - /* If this write request had no errors, remember that fact... */ - if (bp->bio_error == 0) - bp->bio_caller1 = (int *)1; + break; + case BIO_GETATTR: + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } - /* ... and write to the next plex. */ - g_io_request(bp, cp); + v = bp->bio_to->geom->softc; + if (v->state != GV_VOL_UP) { + g_io_deliver(bp, ENXIO); return; } + + bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO); + bq->bp = bp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + wakeup(v); + mtx_unlock(&v->bqueue_mtx); } static void -gv_volume_start(struct bio *bp) +gv_vol_worker(void *arg) { - struct g_geom *gp; - struct bio *bp2; + struct bio *bp; struct gv_volume *v; + struct gv_bioq *bq; + + v = arg; + KASSERT(v != NULL, ("NULL v")); + mtx_lock(&v->bqueue_mtx); + for (;;) { + /* We were signaled to exit. */ + if (v->flags & GV_VOL_THREAD_DIE) + break; + + /* Take the first BIO from our queue. 
*/ + bq = TAILQ_FIRST(&v->bqueue); + if (bq == NULL) { + msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10); + continue; + } + TAILQ_REMOVE(&v->bqueue, bq, queue); + mtx_unlock(&v->bqueue_mtx); - gp = bp->bio_to->geom; - v = gp->softc; - if (v->state != GV_VOL_UP) { - g_io_deliver(bp, ENXIO); - return; + bp = bq->bp; + g_free(bq); + + if (bp->bio_cflags & GV_BIO_DONE) + gv_vol_completed_request(v, bp); + else + gv_vol_normal_request(v, bp); + + mtx_lock(&v->bqueue_mtx); } - switch(bp->bio_cmd) { + mtx_unlock(&v->bqueue_mtx); + v->flags |= GV_VOL_THREAD_DEAD; + wakeup(v); + + kthread_exit(ENXIO); +} + +static void +gv_vol_completed_request(struct gv_volume *v, struct bio *bp) +{ + struct bio *pbp; + struct gv_bioq *bq; + + pbp = bp->bio_parent; + + if (pbp->bio_error == 0) + pbp->bio_error = bp->bio_error; + + switch (pbp->bio_cmd) { case BIO_READ: + if (bp->bio_error) { + g_destroy_bio(bp); + pbp->bio_children--; + bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); + bq->bp = pbp; + mtx_lock(&v->bqueue_mtx); + TAILQ_INSERT_TAIL(&v->bqueue, bq, queue); + mtx_unlock(&v->bqueue_mtx); + return; + } + break; case BIO_WRITE: case BIO_DELETE: - bp2 = g_clone_bio(bp); - if (bp2 == NULL) { + break; + } + + /* When the original request is finished, we deliver it. 
*/ + pbp->bio_inbed++; + if (pbp->bio_inbed == pbp->bio_children) { + pbp->bio_completed = bp->bio_length; + g_io_deliver(pbp, pbp->bio_error); + } + + g_destroy_bio(bp); +} + +static void +gv_vol_normal_request(struct gv_volume *v, struct bio *bp) +{ + struct g_geom *gp; + struct gv_plex *p; + struct bio *cbp, *pbp; + + gp = v->geom; + + switch (bp->bio_cmd) { + case BIO_READ: + cbp = g_clone_bio(bp); + if (cbp == NULL) { g_io_deliver(bp, ENOMEM); return; } - bp2->bio_done = gv_volume_done; - g_io_request(bp2, LIST_FIRST(&gp->consumer)); - return; - default: - g_io_deliver(bp, EOPNOTSUPP); - return; + cbp->bio_done = gv_volume_done; + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->state >= GV_PLEX_DEGRADED) + break; + } + g_io_request(cbp, p->consumer); + + break; + + case BIO_WRITE: + case BIO_DELETE: + LIST_FOREACH(p, &v->plexes, in_volume) { + if (p->state < GV_PLEX_DEGRADED) + continue; + + cbp = g_clone_bio(bp); + if (cbp == NULL) /* XXX */ + g_io_deliver(bp, ENOMEM); + cbp->bio_done = gv_volume_done; + cbp->bio_caller2 = p->consumer; + + if (bp->bio_driver1 == NULL) { + bp->bio_driver1 = cbp; + } else { + pbp = bp->bio_driver1; + while (pbp->bio_caller1 != NULL) + pbp = pbp->bio_caller1; + pbp->bio_caller1 = cbp; + } + } + + /* Fire off all sub-requests. 
*/ + pbp = bp->bio_driver1; + while (pbp != NULL) { + g_io_request(pbp, pbp->bio_caller2); + pbp = pbp->bio_caller1; + } + + break; } } @@ -211,6 +324,11 @@ gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) gp->access = gv_volume_access; gp->softc = v; first++; + TAILQ_INIT(&v->bqueue); + mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF); + kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s", + v->name); + v->flags |= GV_VOL_THREAD_ACTIVE; } else gp = v->geom; @@ -261,9 +379,13 @@ static int gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) { + struct gv_volume *v; + g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name); g_topology_assert(); + v = gp->softc; + gv_kill_vol_thread(v); g_wither_geom(gp, ENXIO); return (0); } |