src - FreeBSD source tree

diff options


context:
space:
mode:

author	Greg Lehey <grog@FreeBSD.org>	1999-08-07 08:13:23 +0000
committer	Greg Lehey <grog@FreeBSD.org>	1999-08-07 08:13:23 +0000
commit	780f9fa3e3609eff3bb23c3be65a4112d1a4b52f (patch)
tree	7d69199aceb2cc2e6f4afa136f928a643b968420
parent	b853969e092a040df3a4836c82d7f168d45d7b3c (diff)
download	src-780f9fa3e3609eff3bb23c3be65a4112d1a4b52f.tar.gz src-780f9fa3e3609eff3bb23c3be65a4112d1a4b52f.zip

Import RAID-5 code.

Add Cybernet copyright.
OK'd-by: Chuck Jacobus <chuck@cybernet.com>

logrq: Save device major and minor numbers to compensate for lost dev_t.

launch_requests: Don't issue requests which are marked XFR_BAD_SUBDISK. This may make things easier in bre().

bre: Rearrange.
- Change some comments.
- Recognize holes in plex structure. Formerly this could lead to an incorrect write to the plex. Return REQUEST_DEGRADED on a read request, but carry on to the bitter end on a write request, and mark the requests for the inaccessible subdisks with XFR_BAD_SUBDISK.
- Return REQUEST_EOF if the requested transfer goes beyond the end of the plex. This is not an error, since other plexes may go further into the volume address space.

build_read_request: Handle REQUEST_DEGRADED returned from bre().

sdio: Lock buffer before issuing the requests.

Notes

Notes: svn path=/head/; revision=49486

Diffstat

-rw-r--r--

sys/dev/vinum/vinumrequest.c

241

1 files changed, 155 insertions, 86 deletions

diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c
index ffbc76bd4a80..646fd1d2bfb8 100644
--- a/sys/dev/vinum/vinumrequest.c
+++ b/sys/dev/vinum/vinumrequest.c

@@ -1,6 +1,10 @@

/*-

+ *

+ * Written by Greg Lehey

* This software is distributed under the so-called ``Berkeley

* License'':

@@ -33,7 +37,7 @@

* otherwise) arising in any way out of the use of this software, even if

* advised of the possibility of such damage.

- * $Id: vinumrequest.c,v 1.23 1999/03/20 21:58:38 grog Exp grog $

+ * $Id: vinumrequest.c,v 1.24 1999/07/05 01:53:14 grog Exp grog $

#include <dev/vinum/vinumhdr.h>

@@ -79,6 +83,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)

case loginfo_user_bp:

case loginfo_user_bpl:

bcopy(info.bp, &rqip->info.b, sizeof(struct buf));

+ rqip->devmajor = major(info.bp->b_dev);

+ rqip->devminor = minor(info.bp->b_dev);

break;

case loginfo_iodone:

@@ -86,6 +92,8 @@ logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)

case loginfo_raid5_data:

case loginfo_raid5_parity:

bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));

+ rqip->devmajor = major(info.rqe->b.b_dev);

+ rqip->devminor = minor(info.rqe->b.b_dev);

break;

case loginfo_unused:

@@ -368,7 +376,7 @@ launch_requests(struct request *rq, int reviveok)

rqe = &rqg->rqe[rqno];

if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */

rqg->active--; /* one less active request */

- else {

+ else if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk isn't bad, we can do it */

if ((rqe->b.b_flags & B_READ) == 0)

rqe->b.b_vp->v_numoutput++; /* one more output going */

rqe->b.b_flags |= B_ORDERED; /* XXX chase SCSI driver */

@@ -394,7 +402,6 @@ launch_requests(struct request *rq, int reviveok)

/* fire off the request */

(*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b);

}

- /* XXX Do we need caching? Think about this more */

}

splx(s);

@@ -405,9 +412,9 @@ launch_requests(struct request *rq, int reviveok)

* define the low-level requests needed to perform a

* high-level I/O operation for a specific plex 'plexno'.

- * Return 0 if all subdisks involved in the request are up, 1 if some

- * subdisks are not up, and -1 if the request is at least partially

- * outside the bounds of the subdisks.

+ * Return REQUEST_OK if all subdisks involved in the request are up,

+ * REQUEST_DEGRADED if some subdisks are not up, and REQUEST_EOF if the

+ * request is at least partially outside the bounds of the subdisks.

* Modify the pointer *diskaddr to point to the end address. On

* read, return on the first bad subdisk, so that the caller

@@ -438,6 +445,7 @@ bre(struct request *rq,

daddr_t blockoffset; /* offset in stripe on subdisk */

struct rqelement *rqe; /* point to this request information */

daddr_t diskstart = *diskaddr; /* remember where this transfer starts */

+ enum requeststatus s; /* temp return value */

bp = rq->bp; /* buffer pointer */

status = REQUEST_OK; /* return value: OK until proven otherwise */

@@ -445,17 +453,12 @@ bre(struct request *rq,

switch (plex->organization) {

case plex_concat:

+ sd = NULL; /* (keep compiler quiet) */

for (sdno = 0; sdno < plex->subdisks; sdno++) {

sd = &SD[plex->sdnos[sdno]];

- if ((*diskaddr < (sd->plexoffset + sd->sectors)) /* The request starts before the end of this */

- &&(diskend > sd->plexoffset)) { /* subdisk and ends after the start of this sd */

- if (sd->state != sd_up) {

- enum requeststatus s;

- s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */

- if (s)

- return s; /* XXX get this right */

- }

+ if (*diskaddr < sd->plexoffset) /* we must have a hole, */

+ status = REQUEST_DEGRADED; /* note the fact */

+ if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */

rqg = allocrqg(rq, 1); /* space for the request */

if (rqg == NULL) { /* malloc failed */

bp->b_flags |= B_ERROR;

@@ -468,7 +471,7 @@ bre(struct request *rq,

rqe = &rqg->rqe[0]; /* point to the element */

rqe->rqg = rqg; /* group */

rqe->sdno = sd->sdno; /* put in the subdisk number */

- plexoffset = max(sd->plexoffset, *diskaddr); /* start offset in plex */

+ plexoffset = *diskaddr; /* start offset in plex */

rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */

rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */

rqe->dataoffset = 0;

@@ -479,55 +482,74 @@ bre(struct request *rq,

rqe->buflen = rqe->datalen; /* buffer length is data buffer length */

rqe->flags = 0;

rqe->driveno = sd->driveno;

+ if (sd->state != sd_up) { /* *now* we find the sd is down */

+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */

+ if (s == REQUEST_DOWN) { /* down? */

+ if (rq->bp->b_flags & B_READ) /* read request, */

+ return REQUEST_DEGRADED; /* give up here */

+ /*

+ * If we're writing, don't give up

+ * because of a bad subdisk. Go

+ * through to the bitter end, but note

+ * which ones we can't access.

+ */

+ rqe->flags = XFR_BAD_SUBDISK;

+ status = REQUEST_DEGRADED; /* can't do it all */

+ }

*diskaddr += rqe->datalen; /* bump the address */

- if (build_rq_buffer(rqe, plex)) { /* build the buffer */

- deallocrqg(rqg);

- bp->b_flags |= B_ERROR;

- bp->b_error = ENOMEM;

- biodone(bp);

- return REQUEST_ENOMEM; /* can't do it */

+ if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */

+ /*

+ * We could build the buffer anyway, even if the

+ * subdisk is down, but it's a waste of time and

+ * space.

+ */

+ if (build_rq_buffer(rqe, plex)) { /* build the buffer */

+ deallocrqg(rqg);

+ bp->b_flags |= B_ERROR;

+ bp->b_error = ENOMEM;

+ biodone(bp);

+ return REQUEST_ENOMEM; /* can't do it */

+ }

}

- if (*diskaddr > diskend) /* we're finished, */

+ if (*diskaddr == diskend) /* we're finished, */

break; /* get out of here */

}

+ /*

+ * We've got to the end of the plex. Have we got to the end of

+ * the transfer? It would seem that having an offset beyond the

+ * end of the subdisk is an error, but in fact it can happen if

+ * the volume has another plex of different size. There's a valid

+ * question as to why you would want to do this, but currently

+ * it's allowed.

+ *

+ * In a previous version, I returned REQUEST_DOWN here. I think

+ * REQUEST_EOF is more appropriate now.

+ */

+ if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */

+ status = REQUEST_EOF;

break;

case plex_striped:

{

while (*diskaddr < diskend) { /* until we get it all sorted out */

- /*

- * The offset of the start address from

- * the start of the stripe

- */

+ if (*diskaddr >= plex->length) /* beyond the end of the plex */

+ return REQUEST_EOF; /* can't continue */

+ /* The offset of the start address from the start of the stripe. */

stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);

- /*

- * The plex-relative address of the

- * start of the stripe

- */

+ /* The plex-relative address of the start of the stripe. */

stripebase = *diskaddr - stripeoffset;

- /*

- * The number of the subdisk in which

- * the start is located

- */

+ /* The number of the subdisk in which the start is located. */

sdno = stripeoffset / plex->stripesize;

- /*

- * The offset from the beginning of the stripe

- * on this subdisk

- */

+ /* The offset from the beginning of the stripe on this subdisk. */

blockoffset = stripeoffset % plex->stripesize;

sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */

- if (sd->state != sd_up) {

- enum requeststatus s;

- s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */

- if (s) /* give up? */

- return s; /* yup */

- }

rqg = allocrqg(rq, 1); /* space for the request */

if (rqg == NULL) { /* malloc failed */

bp->b_flags |= B_ERROR;

@@ -551,8 +573,32 @@ bre(struct request *rq,

rqe->sdno = sd->sdno; /* put in the subdisk number */

rqe->driveno = sd->driveno;

- if (rqe->sdoffset >= sd->sectors) { /* starts beyond the end of the subdisk? */

- deallocrqg(rqg);

+ if (sd->state != sd_up) { /* *now* we find the sd is down */

+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */

+ if (s == REQUEST_DOWN) { /* down? */

+ if (rq->bp->b_flags & B_READ) /* read request, */

+ return REQUEST_DEGRADED; /* give up here */

+ /*

+ * If we're writing, don't give up

+ * because of a bad subdisk. Go through

+ * to the bitter end, but note which

+ * ones we can't access.

+ */

+ rqe->flags = XFR_BAD_SUBDISK; /* yup */

+ status = REQUEST_DEGRADED; /* can't do it all */

+ }

+ /*

+ * It would seem that having an offset

+ * beyond the end of the subdisk is an

+ * error, but in fact it can happen if the

+ * volume has another plex of different

+ * size. There's a valid question as to why

+ * you would want to do this, but currently

+ * it's allowed.

+ */

+ if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */

+ rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */

#if VINUMDEBUG

if (debug & DEBUG_EOFINFO) { /* tell on the request */

log(LOG_DEBUG,

@@ -568,19 +614,19 @@ bre(struct request *rq,

blockoffset);

}

#endif

- return REQUEST_EOF;

- } else if (rqe->sdoffset + rqe->datalen > sd->sectors) /* ends beyond the end of the subdisk? */

- rqe->datalen = sd->sectors - rqe->sdoffset; /* yes, truncate */

- if (build_rq_buffer(rqe, plex)) { /* build the buffer */

- deallocrqg(rqg);

- bp->b_flags |= B_ERROR;

- bp->b_error = ENOMEM;

- biodone(bp);

- return REQUEST_ENOMEM; /* can't do it */

+ }

+ if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk OK, */

+ if (build_rq_buffer(rqe, plex)) { /* build the buffer */

+ deallocrqg(rqg);

+ bp->b_flags |= B_ERROR;

+ bp->b_error = ENOMEM;

+ biodone(bp);

+ return REQUEST_ENOMEM; /* can't do it */

+ }

}

*diskaddr += rqe->datalen; /* look at the remainder */

- if (*diskaddr < diskend) { /* didn't finish the request on this stripe */

+ if ((*diskaddr < diskend) /* didn't finish the request on this stripe */

+ &&(*diskaddr < plex->length)) { /* and there's more to come */

plex->multiblock++; /* count another one */

if (sdno == plex->subdisks - 1) /* last subdisk, */

plex->multistripe++; /* another stripe as well */

@@ -589,6 +635,13 @@ bre(struct request *rq,

}

break;

+ /*

+ * RAID5 is complicated enough to have

+ * its own function

+ */

+ case plex_raid5:

+ status = bre5(rq, plexno, diskaddr, diskend);

+ break;

default:

log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);

@@ -617,6 +670,7 @@ build_read_request(struct request *rq, /* request */

off_t oldstart; /* note where we started */

int recovered = 0; /* set if we recover a read */

enum requeststatus status = REQUEST_OK;

+ int plexmask; /* bit mask of plexes, for recovery */

bp = rq->bp; /* buffer pointer */

diskaddr = bp->b_blkno; /* start offset of transfer */

@@ -632,41 +686,42 @@ build_read_request(struct request *rq, /* request */

continue;

case REQUEST_RECOVERED:

+ /*

+ * XXX FIXME if we have more than one plex, and we can

+ * satisfy the request from another, don't use the

+ * recovered request, since it's more expensive.

+ */

recovered = 1;

break;

- case REQUEST_EOF:

case REQUEST_ENOMEM:

return status;

- * if we get here, we have either had a failure or

- * a RAID 5 recovery. We don't want to use the

- * recovery, because it's expensive, so first we

- * check if we have alternatives

+ * If we get here, our request is not complete. Try

+ * to fill in the missing parts from another plex.

+ * This can happen multiple times in this function,

+ * and we reinitialize the plex mask each time, since

+ * we could have a hole in our plexes.

+ case REQUEST_EOF:

case REQUEST_DOWN: /* can't access the plex */

- if (vol != NULL) { /* and this is volume I/O */

- /*

- * Try to satisfy the request

- * from another plex

- */

- for (plexno = 0; plexno < vol->plexes; plexno++) {

- diskaddr = startaddr; /* start at the beginning again */

- oldstart = startaddr; /* and note where that was */

- if (plexno != plexindex) { /* don't try this plex again */

- bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */

- if (diskaddr > oldstart) { /* we satisfied another part */

- recovered = 1; /* we recovered from the problem */

- status = REQUEST_OK; /* don't complain about it */

- break;

- }

+ case REQUEST_DEGRADED: /* can't access the plex */

+ plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */

+ &~(1 << plexindex); /* except for the one we were looking at */

+ for (plexno = 0; plexno < vol->plexes; plexno++) {

+ if (plexmask == 0) /* no plexes left to try */

+ return REQUEST_DOWN; /* failed */

+ diskaddr = startaddr; /* start at the beginning again */

+ oldstart = startaddr; /* and note where that was */

+ if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */

+ bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */

+ if (diskaddr > oldstart) { /* we satisfied another part */

+ recovered = 1; /* we recovered from the problem */

+ status = REQUEST_OK; /* don't complain about it */

+ break;

}

- if (plexno == (vol->plexes - 1)) /* couldn't satisfy the request */

- return REQUEST_DOWN; /* failed */

}

- } else

- return REQUEST_DOWN; /* bad luck */

+ }

}

if (recovered)

vol->recovered_reads += recovered; /* adjust our recovery count */

@@ -757,6 +812,18 @@ build_rq_buffer(struct rqelement *rqe, struct plex *plex)

* finished the transfer

bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;

+ /*

+ * On a recovery read, we perform an XOR of

+ * all blocks to the user buffer. To make

+ * this work, we first clean out the buffer

+ */

+ if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))

+ == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */

+ int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */

+ char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */

+ bzero(data, length); /* clean it out */

+ }

return 0;

}

@@ -838,6 +905,8 @@ sdio(struct buf *bp)

sbp->b.b_data = bp->b_data; /* data buffer */

sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;

sbp->b.b_iodone = sdio_done; /* come here on completion */

+ BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */

+ BUF_LOCK(&sbp->b, LK_EXCLUSIVE); /* and lock it */

sbp->b.b_vp = DRIVE[sd->driveno].vp; /* vnode */

sbp->bp = bp; /* note the address of the original header */