diff options
author | Lukas Ertl <le@FreeBSD.org> | 2004-06-12 21:16:10 +0000 |
---|---|---|
committer | Lukas Ertl <le@FreeBSD.org> | 2004-06-12 21:16:10 +0000 |
commit | 73679edcc78b9bbd516cb52b348121b0dc35f675 (patch) | |
tree | b37e9b83eff28125aba7f626ab2e3bea5b487658 /sys/geom/vinum | |
parent | 359fdba7a70222b3422610f4f73bca1be55e0524 (diff) | |
download | src-73679edcc78b9bbd516cb52b348121b0dc35f675.tar.gz src-73679edcc78b9bbd516cb52b348121b0dc35f675.zip |
Add a first version of a GEOMified vinum.
Notes
Notes:
svn path=/head/; revision=130389
Diffstat (limited to 'sys/geom/vinum')
-rw-r--r-- | sys/geom/vinum/geom_vinum.c | 569 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum.h | 79 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_drive.c | 476 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_init.c | 405 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_list.c | 466 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_plex.c | 456 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.c | 616 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.h | 93 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_rm.c | 346 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_share.c | 651 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_share.h | 62 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_state.c | 289 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_subr.c | 804 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_var.h | 279 | ||||
-rw-r--r-- | sys/geom/vinum/geom_vinum_volume.c | 260 |
15 files changed, 5851 insertions, 0 deletions
diff --git a/sys/geom/vinum/geom_vinum.c b/sys/geom/vinum/geom_vinum.c new file mode 100644 index 000000000000..44a8061c4dfd --- /dev/null +++ b/sys/geom/vinum/geom_vinum.c @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +#if 0 +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, vinum, CTLFLAG_RW, 0, "GEOM_VINUM stuff"); +SYSCTL_UINT(_kern_geom_vinum, OID_AUTO, debug, CTLFLAG_RW, &gv_debug, 0, + "Debug level"); +#endif + +int gv_create(struct g_geom *, struct gctl_req *); +void config_new_drive(struct gv_drive *); + +static void +gv_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + struct gv_softc *sc; + int error; + + g_topology_assert(); + + KASSERT(cp != NULL, ("gv_orphan: null cp")); + gp = cp->geom; + KASSERT(gp != NULL, ("gv_orphan: null gp")); + sc = gp->softc; + + g_trace(G_T_TOPOLOGY, "gv_orphan(%s)", gp->name); + + if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + error = cp->provider->error; + if (error == 0) + error = ENXIO; + g_detach(cp); + g_destroy_consumer(cp); + if (!LIST_EMPTY(&gp->consumer)) + return; + g_free(sc); + g_wither_geom(gp, error); +} + +static void +gv_start(struct bio *bp) +{ + struct bio *bp2; + struct g_geom *gp; + + gp = bp->bio_to->geom; + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + bp2 = g_clone_bio(bp); + bp2->bio_done = g_std_done; + g_io_request(bp2, LIST_FIRST(&gp->consumer)); + return; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + +static int +gv_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + gp = pp->geom; + error = ENXIO; + cp = LIST_FIRST(&gp->consumer); + error = g_access(cp, dr, dw, de); + return (error); +} + +static struct g_geom * +gv_taste(struct g_class 
*mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct gv_softc *sc; + struct gv_hdr *vhdr; + int error, first; + char *buf; + + vhdr = NULL; + buf = NULL; + first = 0; + + g_trace(G_T_TOPOLOGY, "gv_taste(%s, %s)", mp->name, pp->name); + g_topology_assert(); + + if (pp->sectorsize == 0) + return (NULL); + + /* Check if we already have a VINUM geom, or create a new one. */ + if (LIST_EMPTY(&mp->geom)) { + gp = g_new_geomf(mp, "VINUM"); + gp->spoiled = gv_orphan; + gp->orphan = gv_orphan; + gp->access = gv_access; + gp->start = gv_start; + gp->softc = g_malloc(sizeof(struct gv_softc), + M_WAITOK | M_ZERO); + sc = gp->softc; + sc->geom = gp; + LIST_INIT(&sc->drives); + LIST_INIT(&sc->subdisks); + LIST_INIT(&sc->plexes); + LIST_INIT(&sc->volumes); + first++; + } else { + gp = LIST_FIRST(&mp->geom); + sc = gp->softc; + } + + + /* We need a temporary consumer to read the config from. */ + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error) { + g_destroy_consumer(cp); + if (first) { + g_free(sc); + g_destroy_geom(gp); + } + return (NULL); + } + error = g_access(cp, 1, 0, 0); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + if (first) { + g_free(gp->softc); + g_destroy_geom(gp); + } + return (NULL); + } + + g_topology_unlock(); + + /* Check if the provided slice is a valid vinum drive. */ + vhdr = g_read_data(cp, GV_HDR_OFFSET, GV_HDR_LEN, &error); + if (vhdr == NULL || error != 0) { + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + if (first) { + g_free(sc); + g_destroy_geom(gp); + } + return (NULL); + } + + /* This provider has no vinum magic on board. */ + if (vhdr->magic != GV_MAGIC) { + /* Release the temporary consumer, we don't need it anymore. 
*/ + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + + g_free(vhdr); + + /* + * If there is no other VINUM geom yet just take this one; the + * configuration is still empty, but it can be filled by other + * valid vinum drives later. + */ + if (first) + return (gp); + else + return (NULL); + + /* + * We have found a valid vinum drive, now read the on-disk + * configuration. + */ + } else { + g_free(vhdr); + + buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, + &error); + if (buf == NULL || error != 0) { + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + if (first) { + g_free(sc); + g_destroy_geom(gp); + } + return (NULL); + } + + /* Release the temporary consumer, we don't need it anymore. */ + g_topology_lock(); + g_access(cp, -1, 0, 0); + g_detach(cp); + g_destroy_consumer(cp); + + /* We are the first VINUM geom. */ + if (first) { + gv_parse_config(sc, buf, 0); + g_free(buf); + return (gp); + + /* Just merge the configs. */ + } else { + gv_parse_config(sc, buf, 1); + g_free(buf); + return (NULL); + } + } +} + +/* XXX this really belongs somewhere else */ +void +config_new_drive(struct gv_drive *d) +{ + struct gv_hdr *vhdr; + struct gv_freelist *fl; + + KASSERT(d != NULL, ("config_new_drive: NULL d")); + + vhdr = g_malloc(sizeof(*vhdr), M_WAITOK | M_ZERO); + vhdr->magic = GV_MAGIC; + vhdr->config_length = GV_CFG_LEN; + + bcopy(hostname, vhdr->label.sysname, GV_HOSTNAME_LEN); + strncpy(vhdr->label.name, d->name, GV_MAXDRIVENAME); + microtime(&vhdr->label.date_of_birth); + + d->hdr = vhdr; + + LIST_INIT(&d->subdisks); + LIST_INIT(&d->freelist); + + fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO); + fl->offset = GV_DATA_START; + fl->size = d->avail; + LIST_INSERT_HEAD(&d->freelist, fl, freelist); + d->freelist_entries = 1; + +} + +/* Handle userland requests for creating new objects. 
*/ +int +gv_create(struct g_geom *gp, struct gctl_req *req) +{ + struct gv_softc *sc; + struct gv_drive *d, *d2; + struct gv_plex *p, *p2; + struct gv_sd *s, *s2; + struct gv_volume *v, *v2; + struct g_consumer *cp; + struct g_provider *pp; + int error, i, *drives, *plexes, *subdisks, *volumes; + char buf[20], errstr[ERRBUFSIZ]; + + g_topology_assert(); + + sc = gp->softc; + + /* Find out how many of each object have been passed in. */ + volumes = gctl_get_paraml(req, "volumes", sizeof(*volumes)); + plexes = gctl_get_paraml(req, "plexes", sizeof(*plexes)); + subdisks = gctl_get_paraml(req, "subdisks", sizeof(*subdisks)); + drives = gctl_get_paraml(req, "drives", sizeof(*drives)); + + /* First, handle drive definitions ... */ + for (i = 0; i < *drives; i++) { + snprintf(buf, sizeof(buf), "drive%d", i); + d2 = gctl_get_paraml(req, buf, sizeof(*d2)); + d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); + bcopy(d2, d, sizeof(*d)); + + /* XXX */ + pp = g_provider_by_name(d->device); + d->size = pp->mediasize - GV_DATA_START; + d->avail = d->size; + + config_new_drive(d); + + LIST_INSERT_HEAD(&sc->drives, d, drive); + } + + /* ... then volume definitions ... */ + for (i = 0; i < *volumes; i++) { + error = 0; + snprintf(buf, sizeof(buf), "volume%d", i); + v2 = gctl_get_paraml(req, buf, sizeof(*v2)); + + v = gv_find_vol(sc, v2->name); + if (v != NULL) { + gctl_error(req, "volume '%s' is already known", + v->name); + return (-1); + } + + v = g_malloc(sizeof(*v), M_WAITOK | M_ZERO); + bcopy(v2, v, sizeof(*v)); + + v->vinumconf = sc; + LIST_INIT(&v->plexes); + LIST_INSERT_HEAD(&sc->volumes, v, volume); + } + + /* ... then plex definitions ... 
*/ + for (i = 0; i < *plexes; i++) { + error = 0; + snprintf(buf, sizeof(buf), "plex%d", i); + p2 = gctl_get_paraml(req, buf, sizeof(*p2)); + + p = gv_find_plex(sc, p2->name); + if (p != NULL) { + gctl_error(req, "plex '%s' is already known", p->name); + return (-1); + } + + p = g_malloc(sizeof(*p), M_WAITOK | M_ZERO); + bcopy(p2, p, sizeof(*p)); + + /* Find the volume this plex should be attached to. */ + v = gv_find_vol(sc, p->volume); + if (v != NULL) { + if (v->plexcount) + p->flags |= GV_PLEX_ADDED; + p->vol_sc = v; + v->plexcount++; + LIST_INSERT_HEAD(&v->plexes, p, in_volume); + } + + p->vinumconf = sc; + LIST_INIT(&p->subdisks); + LIST_INSERT_HEAD(&sc->plexes, p, plex); + } + + /* ... and finally, subdisk definitions. */ + for (i = 0; i < *subdisks; i++) { + error = 0; + snprintf(buf, sizeof(buf), "sd%d", i); + s2 = gctl_get_paraml(req, buf, sizeof(*s2)); + + s = gv_find_sd(sc, s2->name); + if (s != NULL) { + gctl_error(req, "subdisk '%s' is already known", + s->name); + return (-1); + } + + s = g_malloc(sizeof(*s), M_WAITOK | M_ZERO); + bcopy(s2, s, sizeof(*s)); + + /* Find the drive where this subdisk should be put on. */ + d = gv_find_drive(sc, s->drive); + + /* drive not found - XXX */ + if (d == NULL) { + printf("FOO: drive '%s' not found\n", s->drive); + g_free(s); + continue; + } + + /* Find the plex where this subdisk belongs to. */ + p = gv_find_plex(sc, s->plex); + + /* plex not found - XXX */ + if (p == NULL) { + printf("FOO: plex '%s' not found\n", s->plex); + g_free(s); + continue; + } + + /* + * First we give the subdisk to the drive, to handle autosized + * values ... + */ + error = gv_sd_to_drive(sc, d, s, errstr, sizeof(errstr)); + if (error) { + gctl_error(req, errstr); + g_free(s); + continue; + } + + /* + * Then, we give the subdisk to the plex; we check if the + * given values are correct and maybe adjust them. 
+ */ + error = gv_sd_to_plex(p, s, 1); + if (error) { + printf("FOO: couldn't give sd '%s' to plex '%s'\n", + s->name, p->name); + } + s->flags |= GV_SD_NEWBORN; + + s->vinumconf = sc; + LIST_INSERT_HEAD(&sc->subdisks, s, sd); + } + + LIST_FOREACH(s, &sc->subdisks, sd) + gv_update_sd_state(s); + LIST_FOREACH(p, &sc->plexes, plex) + gv_update_plex_config(p); + LIST_FOREACH(v, &sc->volumes, volume) + gv_update_vol_state(v); + + /* + * Write out the configuration to each drive. If the drive doesn't + * have a valid geom_slice geom yet, attach it temporarily to our VINUM + * geom. + */ + LIST_FOREACH(d, &sc->drives, drive) { + if (d->geom == NULL) { + /* XXX */ + pp = g_provider_by_name(d->device); + cp = g_new_consumer(gp); + g_attach(cp, pp); + gv_save_config(cp, d, sc); + g_detach(cp); + g_destroy_consumer(cp); + } else + gv_save_config(NULL, d, sc); + } + + return (0); +} + +static void +gv_config(struct gctl_req *req, struct g_class *mp, char const *verb) +{ + struct g_geom *gp; + struct gv_softc *sc; + struct sbuf *sb; + char *comment; + + g_topology_assert(); + + gp = LIST_FIRST(&mp->geom); + sc = gp->softc; + + if (!strcmp(verb, "list")) { + gv_list(gp, req); + + /* Save our configuration back to disk. */ + } else if (!strcmp(verb, "saveconfig")) { + + gv_save_config_all(sc); + + /* Return configuration in string form. 
*/ + } else if (!strcmp(verb, "getconfig")) { + comment = gctl_get_param(req, "comment", NULL); + + sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); + gv_format_config(sc, sb, 0, comment); + sbuf_finish(sb); + gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + } else if (!strcmp(verb, "create")) { + gv_create(gp, req); + + } else if (!strcmp(verb, "remove")) { + gv_remove(gp, req); + + } else if (!strcmp(verb, "start")) { + gv_start_obj(gp, req); + + } else + gctl_error(req, "Unknown verb parameter"); +} + +static int +gv_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp) +{ + struct g_geom *gp2; + struct gv_softc *sc; + struct gv_drive *d, *d2; + struct gv_freelist *fl, *fl2; + + g_trace(G_T_TOPOLOGY, "gv_destroy_geom: %s", gp->name); + g_topology_assert(); + + KASSERT(gp != NULL, ("gv_destroy_geom: null gp")); + KASSERT(gp->softc != NULL, ("gv_destroy_geom: null sc")); + + sc = gp->softc; + + /* + * Check if any of our drives is still open; if so, refuse destruction. + */ + LIST_FOREACH(d, &sc->drives, drive) { + gp2 = d->geom; + if (gv_is_open(gp2)) + return (EBUSY); + } + + LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) { + g_free(d->hdr); + d->hdr = NULL; + LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) { + d->freelist_entries--; + LIST_REMOVE(fl, freelist); + g_free(fl); + fl = NULL; + } + LIST_REMOVE(d, drive); + } + + g_free(sc); + sc = NULL; + g_wither_geom(gp, ENXIO); + return (0); +} + +#define VINUM_CLASS_NAME "VINUM" + +static struct g_class g_vinum_class = { + .name = VINUM_CLASS_NAME, + .taste = gv_taste, + .destroy_geom = gv_destroy_geom, + .ctlreq = gv_config, +}; + +DECLARE_GEOM_CLASS(g_vinum_class, g_vinum); diff --git a/sys/geom/vinum/geom_vinum.h b/sys/geom/vinum/geom_vinum.h new file mode 100644 index 000000000000..567c8b6015c4 --- /dev/null +++ b/sys/geom/vinum/geom_vinum.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _GEOM_VINUM_H_ +#define _GEOM_VINUM_H_ + +#define ERRBUFSIZ 1024 + +/* geom_vinum_drive.c */ +void gv_save_config_all(struct gv_softc *); +void gv_save_config(struct g_consumer *, struct gv_drive *, + struct gv_softc *); + +/* geom_vinum_init.c */ +void gv_start_obj(struct g_geom *, struct gctl_req *); + +/* geom_vinum_list.c */ +void gv_ld(struct g_geom *, struct gctl_req *, struct sbuf *); +void gv_lp(struct g_geom *, struct gctl_req *, struct sbuf *); +void gv_ls(struct g_geom *, struct gctl_req *, struct sbuf *); +void gv_lv(struct g_geom *, struct gctl_req *, struct sbuf *); +void gv_list(struct g_geom *, struct gctl_req *); + +/* geom_vinum_rm.c */ +void gv_remove(struct g_geom *, struct gctl_req *); + +/* geom_vinum_state.c */ +int gv_sdstatemap(struct gv_plex *); +int gv_set_drive_state(struct gv_drive *, int, int); +int gv_set_sd_state(struct gv_sd *, int, int); +void gv_update_sd_state(struct gv_sd *); +void gv_update_plex_state(struct gv_plex *); +void gv_update_vol_state(struct gv_volume *); + +/* geom_vinum_subr.c */ +void gv_adjust_freespace(struct gv_sd *, off_t); +struct g_geom *find_vinum_geom(void); +struct gv_drive *gv_find_drive(struct gv_softc *, char *); +struct gv_plex *gv_find_plex(struct gv_softc *, char *); +struct gv_sd *gv_find_sd(struct gv_softc *, char *); +struct gv_volume *gv_find_vol(struct gv_softc *, char *); +void gv_format_config(struct gv_softc *, struct sbuf *, int, char *); +int gv_is_striped(struct gv_plex *); +int gv_is_open(struct g_geom *); +void gv_kill_thread(struct gv_plex *); +int gv_object_type(struct gv_softc *, char *); +void gv_parse_config(struct gv_softc *, u_char *, int); +const char *gv_roughlength(off_t, int); +int gv_sd_to_drive(struct gv_softc *, struct gv_drive *, struct gv_sd *, + char *, int); +int gv_sd_to_plex(struct gv_plex *, struct gv_sd *, int); +void gv_update_plex_config(struct gv_plex *); + +#endif /* !_GEOM_VINUM_H_ */ diff --git 
a/sys/geom/vinum/geom_vinum_drive.c b/sys/geom/vinum/geom_vinum_drive.c new file mode 100644 index 000000000000..161b6aca806b --- /dev/null +++ b/sys/geom/vinum/geom_vinum_drive.c @@ -0,0 +1,476 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/errno.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/sbuf.h> +#include <sys/systm.h> +#include <sys/time.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +void gv_drive_modify(struct gv_drive *); + +void +gv_save_config_all(struct gv_softc *sc) +{ + struct gv_drive *d; + + g_topology_assert(); + + LIST_FOREACH(d, &sc->drives, drive) { + if (d->geom == NULL) + continue; + gv_save_config(NULL, d, sc); + } +} + +/* Save the vinum configuration back to disk. */ +void +gv_save_config(struct g_consumer *cp, struct gv_drive *d, struct gv_softc *sc) +{ + struct g_geom *gp; + struct g_consumer *cp2; + struct gv_hdr *vhdr, *hdr; + struct sbuf *sb; + int error; + + g_topology_assert(); + + KASSERT(d != NULL, ("gv_save_config: null d")); + KASSERT(sc != NULL, ("gv_save_config: null sc")); + + if (cp == NULL) { + gp = d->geom; + KASSERT(gp != NULL, ("gv_save_config: null gp")); + cp2 = LIST_FIRST(&gp->consumer); + KASSERT(cp2 != NULL, ("gv_save_config: null cp2")); + } else + cp2 = cp; + + vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO); + vhdr->magic = GV_MAGIC; + vhdr->config_length = GV_CFG_LEN; + + hdr = d->hdr; + if (hdr == NULL) { + printf("NULL hdr!!!\n"); + g_free(vhdr); + return; + } + microtime(&hdr->label.last_update); + bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label)); + + sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); + gv_format_config(sc, sb, 1, NULL); + sbuf_finish(sb); + + error = g_access(cp2, 0, 1, 0); + if (error) { + printf("g_access failed: %d\n", error); + sbuf_delete(sb); + return; + } + g_topology_unlock(); + + do { + error = g_write_data(cp2, GV_HDR_OFFSET, 
vhdr, GV_HDR_LEN); + if (error) { + printf("writing vhdr failed: %d", error); + break; + } + + error = g_write_data(cp2, GV_CFG_OFFSET, sbuf_data(sb), + GV_CFG_LEN); + if (error) { + printf("writing first config copy failed: %d", error); + break; + } + + error = g_write_data(cp2, GV_CFG_OFFSET + GV_CFG_LEN, + sbuf_data(sb), GV_CFG_LEN); + if (error) + printf("writing second config copy failed: %d", error); + } while (0); + + g_topology_lock(); + g_access(cp2, 0, -1, 0); + sbuf_delete(sb); + g_free(vhdr); + + if (d->geom != NULL) + gv_drive_modify(d); +} + +/* This resembles g_slice_access(). */ +static int +gv_drive_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp2; + struct gv_drive *d; + struct gv_sd *s, *s2; + int error; + + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("gv_drive_access: NULL cp")); + + d = gp->softc; + + s = pp->private; + KASSERT(s != NULL, ("gv_drive_access: NULL s")); + + LIST_FOREACH(s2, &d->subdisks, from_drive) { + if (s == s2) + continue; + if (s->drive_offset + s->size <= s2->drive_offset) + continue; + if (s2->drive_offset + s2->size <= s->drive_offset) + continue; + + /* Overlap. */ + pp2 = s2->provider; + KASSERT(s2 != NULL, ("gv_drive_access: NULL s2")); + if ((pp->acw + dw) > 0 && pp2->ace > 0) { + printf("FOOO: permission denied - e\n"); + return (EPERM); + } + if ((pp->ace + de) > 0 && pp2->acw > 0) { + printf("FOOO: permission denied - w\n"); + return (EPERM); + } + } + + /* On first open, grab an extra "exclusive" bit */ + if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) + de++; + /* ... 
and let go of it on last close */ + if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) + de--; + error = g_access(cp, dr, dw, de); + if (error) { + printf("FOOO: g_access failed: %d\n", error); + } + return (error); +} + +static void +gv_drive_start(struct bio *bp) +{ + struct bio *bp2; + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp; + struct gv_drive *d; + struct gv_sd *s; + + pp = bp->bio_to; + gp = pp->geom; + cp = LIST_FIRST(&gp->consumer); + d = gp->softc; + s = pp->private; + + if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) { + g_io_deliver(bp, ENXIO); + return; + } + + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + if (bp->bio_offset > s->size) { + g_io_deliver(bp, EINVAL); /* XXX: EWHAT ? */ + return; + } + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + if (bp2->bio_offset + bp2->bio_length > s->size) + bp2->bio_length = s->size - bp2->bio_offset; + bp2->bio_done = g_std_done; + bp2->bio_offset += s->drive_offset; + g_io_request(bp2, cp); + return; + + case BIO_GETATTR: + if (!strcmp("GEOM::kerneldump", bp->bio_attribute)) { + struct g_kerneldump *gkd; + + gkd = (struct g_kerneldump *)bp->bio_data; + gkd->offset += s->drive_offset; + if (gkd->length > s->size) + gkd->length = s->size; + /* now, pass it on downwards... 
*/ + } + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = g_std_done; + g_io_request(bp2, cp); + return; + + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + +static void +gv_drive_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + int error; + + g_topology_assert(); + gp = cp->geom; + g_trace(G_T_TOPOLOGY, "gv_drive_orphan(%s)", gp->name); + if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + error = cp->provider->error; + if (error == 0) + error = ENXIO; + g_detach(cp); + g_destroy_consumer(cp); + if (!LIST_EMPTY(&gp->consumer)) + return; + g_free(gp->softc); + g_wither_geom(gp, error); +} + +static struct g_geom * +gv_drive_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp, *gp2; + struct g_consumer *cp; + struct gv_drive *d; + struct gv_sd *s; + struct gv_softc *sc; + struct gv_freelist *fl; + struct gv_hdr *vhdr; + int error; + char errstr[ERRBUFSIZ]; + + vhdr = NULL; + d = NULL; + + g_trace(G_T_TOPOLOGY, "gv_drive_taste(%s, %s)", mp->name, pp->name); + g_topology_assert(); + + if (pp->sectorsize == 0) + return(NULL); + + /* Find the VINUM class and its associated geom. */ + gp2 = find_vinum_geom(); + if (gp2 == NULL) + return (NULL); + sc = gp2->softc; + + gp = g_new_geomf(mp, "%s.vinumdrive", pp->name); + gp->start = gv_drive_start; + gp->spoiled = gv_drive_orphan; + gp->orphan = gv_drive_orphan; + gp->access = gv_drive_access; + gp->start = gv_drive_start; + + cp = g_new_consumer(gp); + g_attach(cp, pp); + error = g_access(cp, 1, 0, 0); + if (error) { + g_detach(cp); + g_destroy_consumer(cp); + g_destroy_geom(gp); + return (NULL); + } + + g_topology_unlock(); + + /* Now check if the provided slice is a valid vinum drive. 
*/ + do { + vhdr = g_read_data(cp, GV_HDR_OFFSET, GV_HDR_LEN, &error); + if (vhdr == NULL || error != 0) + break; + if (vhdr->magic != GV_MAGIC) { + g_free(vhdr); + break; + } + + /* + * We have found a valid vinum drive. Let's see if it is + * already known in the configuration. + */ + g_topology_lock(); + g_access(cp, -1, 0, 0); + + d = gv_find_drive(sc, vhdr->label.name); + + /* We already know about this drive. */ + if (d != NULL) { + bcopy(vhdr, d->hdr, sizeof(*vhdr)); + + /* This is a new drive. */ + } else { + d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO); + + /* Initialize all needed variables. */ + d->size = pp->mediasize - GV_DATA_START; + d->avail = d->size; + d->hdr = vhdr; + strncpy(d->name, vhdr->label.name, GV_MAXDRIVENAME); + LIST_INIT(&d->subdisks); + LIST_INIT(&d->freelist); + + /* We also need a freelist entry. */ + fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); + fl->offset = GV_DATA_START; + fl->size = d->avail; + LIST_INSERT_HEAD(&d->freelist, fl, freelist); + d->freelist_entries = 1; + + /* Save it into the main configuration. */ + LIST_INSERT_HEAD(&sc->drives, d, drive); + } + + gp->softc = d; + d->geom = gp; + strncpy(d->device, pp->name, GV_MAXDRIVENAME); + + /* + * Find out which subdisks belong to this drive and crosslink + * them. + */ + LIST_FOREACH(s, &sc->subdisks, sd) { + if (!strncmp(s->drive, d->name, GV_MAXDRIVENAME)) + /* XXX: errors ignored */ + gv_sd_to_drive(sc, d, s, errstr, + sizeof(errstr)); + } + + /* This drive is now up for sure. */ + gv_set_drive_state(d, GV_DRIVE_UP, 0); + + /* + * If there are subdisks on this drive, we need to create + * providers for them. + */ + if (d->sdcount) + gv_drive_modify(d); + + return (gp); + + } while (0); + + g_topology_lock(); + g_access(cp, -1, 0, 0); + + g_detach(cp); + g_destroy_consumer(cp); + g_free(gp->softc); + g_destroy_geom(gp); + return (NULL); +} + +/* + * Modify the providers for the given drive 'd'. It is assumed that the + * subdisk list of 'd' is already correctly set up. 
+ */ +void +gv_drive_modify(struct gv_drive *d) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp, *pp2; + struct gv_sd *s; + int nsd; + + KASSERT(d != NULL, ("gv_drive_modify: null d")); + gp = d->geom; + KASSERT(gp != NULL, ("gv_drive_modify: null gp")); + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("gv_drive_modify: null cp")); + pp = cp->provider; + KASSERT(pp != NULL, ("gv_drive_modify: null pp")); + + g_topology_assert(); + + nsd = 0; + LIST_FOREACH(s, &d->subdisks, from_drive) { + /* This subdisk already has a provider. */ + if (s->provider != NULL) + continue; + pp2 = g_new_providerf(gp, "gvinum/sd/%s", s->name); + pp2->mediasize = s->size; + pp2->sectorsize = pp->sectorsize; + g_error_provider(pp2, 0); + s->provider = pp2; + pp2->private = s; + } +} + +static int +gv_drive_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp) +{ + /*struct gv_drive *d;*/ + + g_trace(G_T_TOPOLOGY, "gv_drive_destroy_geom: %s", gp->name); + g_topology_assert(); + + /* g_free(sc); */ + g_wither_geom(gp, ENXIO); + return (0); +} + +#define VINUMDRIVE_CLASS_NAME "VINUMDRIVE" + +static struct g_class g_vinum_drive_class = { + .name = VINUMDRIVE_CLASS_NAME, + .taste = gv_drive_taste, + .destroy_geom = gv_drive_destroy_geom +}; + +DECLARE_GEOM_CLASS(g_vinum_drive_class, g_vinum_drive); diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c new file mode 100644 index 000000000000..1eaa63dc78b5 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_init.c @@ -0,0 +1,405 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/libkern.h> +#include <sys/malloc.h> +#include <sys/queue.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +int gv_init_plex(struct gv_plex *); +int gv_init_sd(struct gv_sd *); +void gv_init_td(void *); +void gv_start_plex(struct gv_plex *); +void gv_start_vol(struct gv_volume *); +void gv_sync(struct gv_volume *); +void gv_sync_td(void *); + +struct gv_sync_args { + struct gv_volume *v; + struct gv_plex *from; + struct gv_plex *to; + off_t syncsize; +}; + +void +gv_start_obj(struct g_geom *gp, struct gctl_req *req) +{ + struct gv_softc *sc; + struct gv_volume *v; + struct gv_plex *p; + int *argc, *initsize; + char *argv, buf[20]; + int i, type; + + argc = gctl_get_paraml(req, "argc", sizeof(*argc)); + initsize = 
gctl_get_paraml(req, "initsize", sizeof(*initsize)); + + if (argc == NULL || *argc == 0) { + gctl_error(req, "no arguments given"); + return; + } + + sc = gp->softc; + + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + argv = gctl_get_param(req, buf, NULL); + if (argv == NULL) + continue; + type = gv_object_type(sc, argv); + switch (type) { + case GV_TYPE_VOL: + v = gv_find_vol(sc, argv); + gv_start_vol(v); + break; + + case GV_TYPE_PLEX: + p = gv_find_plex(sc, argv); + gv_start_plex(p); + break; + + case GV_TYPE_SD: + case GV_TYPE_DRIVE: + /* XXX not yet */ + gctl_error(req, "cannot start '%s'", argv); + return; + default: + gctl_error(req, "unknown object '%s'", argv); + return; + } + } +} + +void +gv_start_plex(struct gv_plex *p) +{ + struct gv_volume *v; + + KASSERT(p != NULL, ("gv_start_plex: NULL p")); + + if (p->state == GV_PLEX_UP) + return; + + v = p->vol_sc; + if ((v != NULL) && (v->plexcount > 1)) + gv_sync(v); + else if (p->org == GV_PLEX_RAID5) + gv_init_plex(p); + + return; +} + +void +gv_start_vol(struct gv_volume *v) +{ + struct gv_plex *p; + + KASSERT(v != NULL, ("gv_start_vol: NULL v")); + + if (v->plexcount == 0) + return; + + else if (v->plexcount == 1) { + p = LIST_FIRST(&v->plexes); + KASSERT(p != NULL, ("gv_start_vol: NULL p on %s", v->name)); + if (p->org == GV_PLEX_RAID5) { + switch (p->state) { + case GV_PLEX_DOWN: + gv_init_plex(p); + break; + case GV_PLEX_DEGRADED: /* XXX not yet */ + default: + return; + } + } + } else + gv_sync(v); +} + +void +gv_sync(struct gv_volume *v) +{ + struct gv_softc *sc; + struct gv_plex *p, *up; + struct gv_sync_args *sync; + + KASSERT(v != NULL, ("gv_sync: NULL v")); + sc = v->vinumconf; + KASSERT(sc != NULL, ("gv_sync: NULL sc on %s", v->name)); + + /* Find the plex that's up. */ + up = NULL; + LIST_FOREACH(up, &v->plexes, in_volume) { + if (up->state == GV_PLEX_UP) + break; + } + + /* Didn't find a good plex. 
*/ + if (up == NULL) + return; + + LIST_FOREACH(p, &v->plexes, in_volume) { + if ((p == up) || (p->state == GV_PLEX_UP)) + continue; + sync = g_malloc(sizeof(*sync), M_WAITOK | M_ZERO); + sync->v = v; + sync->from = up; + sync->to = p; + sync->syncsize = GV_DFLT_SYNCSIZE; + kthread_create(gv_sync_td, sync, NULL, 0, 0, "sync_p '%s'", + p->name); + } +} + +int +gv_init_plex(struct gv_plex *p) +{ + struct gv_sd *s; + int err; + + KASSERT(p != NULL, ("gv_init_plex: NULL p")); + + LIST_FOREACH(s, &p->subdisks, in_plex) { + err = gv_init_sd(s); + if (err) + return (err); + } + + return (0); +} + +int +gv_init_sd(struct gv_sd *s) +{ + KASSERT(s != NULL, ("gv_init_sd: NULL s")); + + if (gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE)) + return (-1); + + s->init_size = GV_DFLT_SYNCSIZE; + s->flags &= ~GV_SD_INITCANCEL; + + /* Spawn the thread that does the work for us. */ + kthread_create(gv_init_td, s, NULL, 0, 0, "init_sd %s", s->name); + + return (0); +} + +void +gv_sync_td(void *arg) +{ + struct bio *bp; + struct gv_plex *p; + struct g_consumer *from, *to; + struct gv_sync_args *sync; + u_char *buf; + off_t i; + int error; + + sync = arg; + + from = sync->from->consumer; + to = sync->to->consumer; + + p = sync->to; + p->synced = 0; + p->flags |= GV_PLEX_SYNCING; + + error = 0; + + g_topology_lock(); + error = g_access(from, 1, 0, 0); + if (error) { + g_topology_unlock(); + printf("gvinum: sync from '%s' failed to access consumer: %d\n", + sync->from->name, error); + kthread_exit(error); + } + error = g_access(to, 0, 1, 0); + if (error) { + g_access(from, -1, 0, 0); + g_topology_unlock(); + printf("gvinum: sync to '%s' failed to access consumer: %d\n", + p->name, error); + kthread_exit(error); + } + g_topology_unlock(); + + for (i = 0; i < p->size; i+= sync->syncsize) { + /* Read some bits from the good plex. 
*/ + buf = g_read_data(from, i, sync->syncsize, &error); + if (buf == NULL) { + printf("gvinum: sync read from '%s' failed at offset " + "%jd, errno: %d\n", sync->from->name, i, error); + break; + } + + /* + * Create a bio and schedule it down on the 'bad' plex. We + * cannot simply use g_write_data() because we have to let the + * lower parts know that we are an initialization process and + * not a 'normal' request. + */ + bp = g_new_bio(); + if (bp == NULL) { + printf("gvinum: sync write to '%s' failed at offset " + "%jd, out of memory\n", p->name, i); + g_free(buf); + break; + } + bp->bio_cmd = BIO_WRITE; + bp->bio_offset = i; + bp->bio_length = sync->syncsize; + bp->bio_data = buf; + bp->bio_done = NULL; + + /* + * This hack declare this bio as part of an initialization + * process, so that the lower levels allow it to get through. + */ + bp->bio_caller1 = p; + + /* Schedule it down ... */ + g_io_request(bp, to); + + /* ... and wait for the result. */ + error = biowait(bp, "gwrite"); + g_destroy_bio(bp); + g_free(buf); + if (error) { + printf("gvinum: sync write to '%s' failed at offset " + "%jd, errno: %d\n", p->name, i, error); + break; + } + + /* Note that we have synced a little bit more. */ + p->synced += sync->syncsize; + } + + g_topology_lock(); + g_access(from, -1, 0, 0); + g_access(to, 0, -1, 0); + g_topology_unlock(); + + /* Successful initialization. 
*/ + if (!error) { + p->flags &= ~GV_PLEX_SYNCING; + printf("gvinum: plex '%s': sync finished\n", p->name); + } + + g_free(sync); + kthread_exit(error); +} + +void +gv_init_td(void *arg) +{ + struct gv_sd *s; + struct gv_drive *d; + struct g_geom *gp; + struct g_consumer *cp; + int error; + off_t i, init_size, start, offset, length; + u_char *buf; + + s = arg; + KASSERT(s != NULL, ("gv_init_td: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_init_td: NULL d")); + gp = d->geom; + KASSERT(gp != NULL, ("gv_init_td: NULL gp")); + + cp = LIST_FIRST(&gp->consumer); + KASSERT(cp != NULL, ("gv_init_td: NULL cp")); + + s->init_error = 0; + init_size = s->init_size; + start = s->drive_offset + s->initialized; + offset = s->drive_offset; + length = s->size; + + buf = g_malloc(s->init_size, M_WAITOK | M_ZERO); + + g_topology_lock(); + error = g_access(cp, 0, 1, 0); + if (error) { + s->init_error = error; + g_topology_unlock(); + printf("geom_vinum: init '%s' failed to access consumer: %d\n", + s->name, error); + kthread_exit(error); + } + g_topology_unlock(); + + for (i = start; i < offset + length; i += init_size) { + if (s->flags & GV_SD_INITCANCEL) { + printf("geom_vinum: subdisk '%s' init: cancelled at" + " offset %jd (drive offset %jd)\n", s->name, + (intmax_t)s->initialized, (intmax_t)i); + error = EAGAIN; + break; + } + error = g_write_data(cp, i, buf, init_size); + if (error) { + printf("geom_vinum: subdisk '%s' init: write failed" + " at offset %jd (drive offset %jd)\n", s->name, + (intmax_t)s->initialized, (intmax_t)i); + break; + } + s->initialized += init_size; + } + + g_free(buf); + + g_topology_lock(); + g_access(cp, 0, -1, 0); + g_topology_unlock(); + if (error) { + s->init_error = error; + g_topology_lock(); + gv_set_sd_state(s, GV_SD_STALE, + GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG); + g_topology_unlock(); + } else { + g_topology_lock(); + gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG); + g_topology_unlock(); + s->initialized = 0; + 
printf("geom_vinum: init '%s' finished\n", s->name); + } + kthread_exit(error); +} diff --git a/sys/geom/vinum/geom_vinum_list.c b/sys/geom/vinum/geom_vinum_list.c new file mode 100644 index 000000000000..f70cffb5f842 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_list.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> +#include <sys/malloc.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +void gv_lvi(struct gv_volume *, struct sbuf *, int); +void gv_lpi(struct gv_plex *, struct sbuf *, int); +void gv_lsi(struct gv_sd *, struct sbuf *, int); +void gv_ldi(struct gv_drive *, struct sbuf *, int); + +void +gv_list(struct g_geom *gp, struct gctl_req *req) +{ + struct gv_softc *sc; + struct gv_drive *d; + struct gv_plex *p; + struct gv_sd *s; + struct gv_volume *v; + struct sbuf *sb; + int *argc, i, *flags, type; + char *arg, buf[20], *cmd; + + argc = gctl_get_paraml(req, "argc", sizeof(*argc)); + + if (argc == NULL) { + gctl_error(req, "no arguments given"); + return; + } + + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + + sc = gp->softc; + + sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN); + + /* Figure out which command was given. */ + cmd = gctl_get_param(req, "cmd", NULL); + + /* List specific objects or everything. 
*/ + if (!strcmp(cmd, "list") || !strcmp(cmd, "l")) { + if (*argc) { + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + arg = gctl_get_param(req, buf, NULL); + if (arg == NULL) + continue; + type = gv_object_type(sc, arg); + switch (type) { + case GV_TYPE_VOL: + v = gv_find_vol(sc, arg); + gv_lvi(v, sb, *flags); + break; + case GV_TYPE_PLEX: + p = gv_find_plex(sc, arg); + gv_lpi(p, sb, *flags); + break; + case GV_TYPE_SD: + s = gv_find_sd(sc, arg); + gv_lsi(s, sb, *flags); + break; + case GV_TYPE_DRIVE: + d = gv_find_drive(sc, arg); + gv_ldi(d, sb, *flags); + break; + default: + gctl_error(req, "unknown object '%s'", + arg); + break; + } + } + } else { + gv_ld(gp, req, sb); + sbuf_printf(sb, "\n"); + gv_lv(gp, req, sb); + sbuf_printf(sb, "\n"); + gv_lp(gp, req, sb); + sbuf_printf(sb, "\n"); + gv_ls(gp, req, sb); + } + + /* List drives. */ + } else if (!strcmp(cmd, "ld")) { + if (*argc) { + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + arg = gctl_get_param(req, buf, NULL); + if (arg == NULL) + continue; + type = gv_object_type(sc, arg); + if (type != GV_TYPE_DRIVE) { + gctl_error(req, "'%s' is not a drive", + arg); + continue; + } else { + d = gv_find_drive(sc, arg); + gv_ldi(d, sb, *flags); + } + } + } else + gv_ld(gp, req, sb); + + /* List volumes. */ + } else if (!strcmp(cmd, "lv")) { + if (*argc) { + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + arg = gctl_get_param(req, buf, NULL); + if (arg == NULL) + continue; + type = gv_object_type(sc, arg); + if (type != GV_TYPE_VOL) { + gctl_error(req, "'%s' is not a volume", + arg); + continue; + } else { + v = gv_find_vol(sc, arg); + gv_lvi(v, sb, *flags); + } + } + } else + gv_lv(gp, req, sb); + + /* List plexes. 
*/ + } else if (!strcmp(cmd, "lp")) { + if (*argc) { + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + arg = gctl_get_param(req, buf, NULL); + if (arg == NULL) + continue; + type = gv_object_type(sc, arg); + if (type != GV_TYPE_PLEX) { + gctl_error(req, "'%s' is not a plex", + arg); + continue; + } else { + p = gv_find_plex(sc, arg); + gv_lpi(p, sb, *flags); + } + } + } else + gv_lp(gp, req, sb); + + /* List subdisks. */ + } else if (!strcmp(cmd, "ls")) { + if (*argc) { + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + arg = gctl_get_param(req, buf, NULL); + if (arg == NULL) + continue; + type = gv_object_type(sc, arg); + if (type != GV_TYPE_SD) { + gctl_error(req, "'%s' is not a subdisk", + arg); + continue; + } else { + s = gv_find_sd(sc, arg); + gv_lsi(s, sb, *flags); + } + } + } else + gv_ls(gp, req, sb); + + } else + gctl_error(req, "unknown command '%s'", cmd); + + sbuf_finish(sb); + gctl_set_param(req, "config", sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); +} + +/* List one or more volumes. */ +void +gv_lv(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) +{ + struct gv_softc *sc; + struct gv_volume *v; + int i, *flags; + + sc = gp->softc; + i = 0; + + LIST_FOREACH(v, &sc->volumes, volume) + i++; + + sbuf_printf(sb, "%d volume%s:\n", i, i == 1 ? "" : "s"); + + if (i) { + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + LIST_FOREACH(v, &sc->volumes, volume) + gv_lvi(v, sb, *flags); + } +} + +/* List a single volume. 
*/ +void +gv_lvi(struct gv_volume *v, struct sbuf *sb, int flags) +{ + struct gv_plex *p; + int i; + + if (flags & GV_FLAG_V) { + sbuf_printf(sb, "Volume %s:\tSize: %jd bytes (%jd MB)\n", + v->name, (intmax_t)v->size, (intmax_t)v->size / MEGABYTE); + sbuf_printf(sb, "\t\tState: %s\n", gv_volstate(v->state)); + } else { + sbuf_printf(sb, "V %-21s State: %s\tPlexes: %7d\tSize: %s\n", + v->name, gv_volstate(v->state), v->plexcount, + gv_roughlength(v->size, 0)); + } + + if (flags & GV_FLAG_VV) { + i = 0; + LIST_FOREACH(p, &v->plexes, in_volume) { + sbuf_printf(sb, "\t\tPlex %2d:\t%s\t(%s), %s\n", i, + p->name, gv_plexstate(p->state), + gv_roughlength(p->size, 0)); + i++; + } + } + + if (flags & GV_FLAG_R) { + LIST_FOREACH(p, &v->plexes, in_volume) + gv_lpi(p, sb, flags); + } +} + +/* List one or more plexes. */ +void +gv_lp(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) +{ + struct gv_softc *sc; + struct gv_plex *p; + int i, *flags; + + sc = gp->softc; + i = 0; + + LIST_FOREACH(p, &sc->plexes, plex) + i++; + + sbuf_printf(sb, "%d plex%s:\n", i, i == 1 ? "" : "es"); + + if (i) { + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + LIST_FOREACH(p, &sc->plexes, plex) + gv_lpi(p, sb, *flags); + } +} + +/* List a single plex. 
*/ +void +gv_lpi(struct gv_plex *p, struct sbuf *sb, int flags) +{ + struct gv_sd *s; + int i; + + if (flags & GV_FLAG_V) { + sbuf_printf(sb, "Plex %s:\tSize:\t%9jd bytes (%jd MB)\n", + p->name, (intmax_t)p->size, (intmax_t)p->size / MEGABYTE); + sbuf_printf(sb, "\t\tSubdisks: %8d\n", p->sdcount); + sbuf_printf(sb, "\t\tState: %s\n\t\tOrganization: %s", + gv_plexstate(p->state), gv_plexorg(p->org)); + if (gv_is_striped(p)) { + sbuf_printf(sb, "\tStripe size: %s\n", + gv_roughlength(p->stripesize, 1)); + } + if (p->vol_sc != NULL) { + sbuf_printf(sb, "\t\tPart of volume %s\n", p->volume); + } + } else { + sbuf_printf(sb, "P %-18s %2s State: %s\tSubdisks: %5d" + "\tSize: %s\n", p->name, gv_plexorg_short(p->org), + gv_plexstate(p->state), p->sdcount, + gv_roughlength(p->size, 0)); + } + + if (flags & GV_FLAG_VV) { + i = 0; + LIST_FOREACH(s, &p->subdisks, in_plex) { + sbuf_printf(sb, "\t\tSubdisk %d:\t%s\n", i, s->name); + sbuf_printf(sb, "\t\t state: %s\tsize %11jd " + "(%jd MB)\n", gv_sdstate(s->state), + (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); + if (p->org == GV_PLEX_CONCAT) { + sbuf_printf(sb, "\t\t\toffset %9jd (0x%jx)\n", + (intmax_t)s->plex_offset, + (intmax_t)s->plex_offset); + } + i++; + } + } + + if (flags & GV_FLAG_R) { + LIST_FOREACH(s, &p->subdisks, in_plex) + gv_lsi(s, sb, flags); + } +} + +/* List one or more subdisks. */ +void +gv_ls(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) +{ + struct gv_softc *sc; + struct gv_sd *s; + int i, *flags; + + sc = gp->softc; + i = 0; + + LIST_FOREACH(s, &sc->subdisks, sd) + i++; + + sbuf_printf(sb, "%d subdisk%s:\n", i, i == 1 ? "" : "s"); + + if (i) { + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + LIST_FOREACH(s, &sc->subdisks, sd) + gv_lsi(s, sb, *flags); + } +} + +/* List a single subdisk. 
*/ +void +gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags) +{ + if (flags & GV_FLAG_V) { + sbuf_printf(sb, "Subdisk %s:\n", s->name); + sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", + (intmax_t)s->size, (intmax_t)s->size / MEGABYTE); + sbuf_printf(sb, "\t\tState: %s\n", gv_sdstate(s->state)); + + if (s->state == GV_SD_INITIALIZING) { + sbuf_printf(sb, "\t\tInitialized: %16jd bytes " + "(%d%%)\n", (intmax_t)s->initialized, + (int)((s->initialized * 100) / s->size)); + } + + if (s->plex_sc != NULL) { + sbuf_printf(sb, "\t\tPlex %s at offset %jd (%s)\n", + s->plex, (intmax_t)s->plex_offset, + gv_roughlength(s->plex_offset, 1)); + } + + if (s->state == GV_SD_REVIVING) { + /* XXX */ + } + + sbuf_printf(sb, "\t\tDrive %s (%s) at offset %jd (%s)\n", + s->drive, + s->drive_sc == NULL ? "*missing*" : s->drive_sc->name, + (intmax_t)s->drive_offset, + gv_roughlength(s->drive_offset, 1)); + } else { + /* XXX reviving and initializing... */ + sbuf_printf(sb, "S %-21s State: ", s->name); + if (s->state == GV_SD_INITIALIZING) { + sbuf_printf(sb, "I %d%%\t", + (int)((s->initialized * 100) / s->size)); + } else { + sbuf_printf(sb, "%s\t", gv_sdstate(s->state)); + } + sbuf_printf(sb, "D: %-12s Size: %s\n", s->drive, + gv_roughlength(s->size, 0)); + } +} + +/* List one or more drives. */ +void +gv_ld(struct g_geom *gp, struct gctl_req *req, struct sbuf *sb) +{ + struct gv_softc *sc; + struct gv_drive *d; + int i, *flags; + + sc = gp->softc; + i = 0; + + LIST_FOREACH(d, &sc->drives, drive) + i++; + + sbuf_printf(sb, "%d drive%s:\n", i, i == 1 ? "" : "s"); + + if (i) { + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + LIST_FOREACH(d, &sc->drives, drive) + gv_ldi(d, sb, *flags); + } +} + +/* List a single drive. */ +void +gv_ldi(struct gv_drive *d, struct sbuf *sb, int flags) +{ + struct gv_freelist *fl; + struct gv_sd *s; + + /* Verbose listing. 
*/ + if (flags & GV_FLAG_V) { + sbuf_printf(sb, "Drive %s:\tDevice %s\n", d->name, d->device); + sbuf_printf(sb, "\t\tSize: %16jd bytes (%jd MB)\n", + (intmax_t)d->size, (intmax_t)d->size / MEGABYTE); + sbuf_printf(sb, "\t\tUsed: %16jd bytes (%jd MB)\n", + (intmax_t)d->size - d->avail, + (intmax_t)(d->size - d->avail) / MEGABYTE); + sbuf_printf(sb, "\t\tAvailable: %11jd bytes (%jd MB)\n", + (intmax_t)d->avail, (intmax_t)d->avail / MEGABYTE); + sbuf_printf(sb, "\t\tState: %s\n", gv_drivestate(d->state)); + + /* Be very verbose. */ + if (flags & GV_FLAG_VV) { + sbuf_printf(sb, "\t\tFree list contains %d entries:\n", + d->freelist_entries); + sbuf_printf(sb, "\t\t Offset\t Size\n"); + LIST_FOREACH(fl, &d->freelist, freelist) + sbuf_printf(sb, "\t\t%9jd\t%9jd\n", + (intmax_t)fl->offset, (intmax_t)fl->size); + } + } else { + sbuf_printf(sb, "D %-21s State: %s\t/dev/%s\tA: %jd/%jd MB " + "(%d%%)\n", d->name, gv_drivestate(d->state), d->device, + (intmax_t)d->avail / MEGABYTE, (intmax_t)d->size / MEGABYTE, + (int)((d->avail * 100) / d->size)); + } + + /* Recursive listing. */ + if (flags & GV_FLAG_R) { + LIST_FOREACH(s, &d->subdisks, from_drive) + gv_lsi(s, sb, flags); + } +} diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c new file mode 100644 index 000000000000..a7acf7270d75 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_plex.c @@ -0,0 +1,456 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum_raid5.h> +#include <geom/vinum/geom_vinum.h> + +/* XXX: is this the place to catch dying subdisks? 
*/ +static void +gv_plex_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + struct gv_plex *p; + int error; + + g_topology_assert(); + gp = cp->geom; + g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name); + + if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + error = cp->provider->error; + if (error == 0) + error = ENXIO; + g_detach(cp); + g_destroy_consumer(cp); + if (!LIST_EMPTY(&gp->consumer)) + return; + + p = gp->softc; + gv_kill_thread(p); + g_free(p); + g_wither_geom(gp, error); +} + +static void +gv_plex_done(struct bio *bp) +{ + struct g_geom *gp; + struct gv_sd *s; + + gp = bp->bio_to->geom; + + s = bp->bio_caller1; + KASSERT(s != NULL, ("gv_plex_done: NULL s")); + + if (bp->bio_error == 0) + s->initialized += bp->bio_length; + + if (s->initialized >= s->size) { + gv_set_sd_state(s, GV_SD_UP, 0); + s->initialized = 0; + } + + g_std_done(bp); +} + +/* Find the correct subdisk to send the bio to and build a bio to send. */ +static int +gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp, + caddr_t addr, long bcount, off_t boff) +{ + struct g_geom *gp; + struct gv_plex *p; + struct gv_sd *s; + struct bio *cbp; + int i, sdno; + off_t len_left, real_len, real_off, stripeend, stripeno, stripestart; + + s = NULL; + + gp = bp->bio_to->geom; + p = gp->softc; + + if (p == NULL || LIST_EMPTY(&p->subdisks)) + return (ENXIO); + + /* + * We only handle concatenated and striped plexes here. RAID5 plexes + * are handled in build_raid5_request(). + */ + switch (p->org) { + case GV_PLEX_CONCAT: + /* + * Find the subdisk where this request starts. The subdisks in + * this list must be ordered by plex_offset. + */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (s->plex_offset <= boff && + s->plex_offset + s->size > boff) + break; + } + /* Subdisk not found. */ + if (s == NULL) + return (ENXIO); + + /* Calculate corresponding offsets on disk. 
*/ + real_off = boff - s->plex_offset; + len_left = s->size - real_off; + real_len = (bcount > len_left) ? len_left : bcount; + break; + + case GV_PLEX_STRIPED: + /* The number of the stripe where the request starts. */ + stripeno = boff / p->stripesize; + + /* The number of the subdisk where the stripe resides. */ + sdno = stripeno % p->sdcount; + + /* Find the right subdisk. */ + i = 0; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (i == sdno) + break; + i++; + } + + /* Subdisk not found. */ + if (s == NULL) + return (ENXIO); + + /* The offset of the stripe from the start of the subdisk. */ + stripestart = (stripeno / p->sdcount) * + p->stripesize; + + /* The offset at the end of the stripe. */ + stripeend = stripestart + p->stripesize; + + /* The offset of the request on this subdisk. */ + real_off = boff - (stripeno * p->stripesize) + + stripestart; + + /* The length left in this stripe. */ + len_left = stripeend - real_off; + + real_len = (bcount <= len_left) ? bcount : len_left; + break; + + default: + return (EINVAL); + } + + /* Now check if we can handle the request on this subdisk. */ + switch (s->state) { + case GV_SD_UP: + /* If the subdisk is up, just continue. */ + break; + + case GV_SD_STALE: + if (bp->bio_caller1 != p) + return (ENXIO); + + printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name); + gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE); + break; + + case GV_SD_INITIALIZING: + if (bp->bio_cmd == BIO_READ) + return (ENXIO); + break; + + default: + /* All other subdisk states mean it's not accessible. */ + return (ENXIO); + } + + /* Clone the bio and adjust the offsets and sizes. 
*/ + cbp = g_clone_bio(bp); + if (cbp == NULL) + return (ENOMEM); + cbp->bio_offset = real_off; + cbp->bio_length = real_len; + cbp->bio_data = addr; + if (bp->bio_caller1 == p) { + cbp->bio_caller1 = s; + cbp->bio_done = gv_plex_done; + } else + cbp->bio_done = g_std_done; + *bp2 = cbp; + *cp = s->consumer; + return (0); +} + +static void +gv_plex_start(struct bio *bp) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct gv_plex *p; + struct gv_raid5_packet *wp; + struct bio *bp2; + caddr_t addr; + off_t boff; + long bcount, rcount; + int err; + + gp = bp->bio_to->geom; + p = gp->softc; + + /* + * We cannot handle this request if too many of our subdisks are + * inaccessible. + */ + if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) { + g_io_deliver(bp, ENXIO); /* XXX: correct way? */ + return; + } + + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + /* + * We split up the request in smaller packets and hand them + * down to our subdisks. + */ + wp = NULL; + addr = bp->bio_data; + boff = bp->bio_offset; + for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) { + /* + * RAID5 requests usually need to be split up in + * several subrequests. + */ + if (p->org == GV_PLEX_RAID5) { + wp = gv_new_raid5_packet(); + wp->bio = bp; + err = gv_build_raid5_req(wp, bp, addr, bcount, + boff); + } else + err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount, + boff); + + if (err) { + bp->bio_completed += bcount; + if (bp->bio_error == 0) + bp->bio_error = err; + if (bp->bio_completed == bp->bio_length) + g_io_deliver(bp, bp->bio_error); + return; + } + + if (p->org != GV_PLEX_RAID5) { + rcount = bp2->bio_length; + g_io_request(bp2, cp); + + /* + * RAID5 subrequests are queued on a worklist + * and picked up from the worker thread. This + * ensures correct order. 
+ */ + } else { + mtx_lock(&p->worklist_mtx); + TAILQ_INSERT_TAIL(&p->worklist, wp, + list); + mtx_unlock(&p->worklist_mtx); + wakeup(&p); + rcount = wp->length; + } + + boff += rcount; + addr += rcount; + } + return; + + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + +static int +gv_plex_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp, *cp2; + int error; + + gp = pp->geom; + + error = ENXIO; + LIST_FOREACH(cp, &gp->consumer, consumer) { + error = g_access(cp, dr, dw, de); + if (error) { + LIST_FOREACH(cp2, &gp->consumer, consumer) { + if (cp == cp2) + break; + g_access(cp2, -dr, -dw, -de); + } + return (error); + } + } + return (error); +} + +static struct g_geom * +gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp; + struct g_consumer *cp; + struct g_provider *pp2; + struct gv_plex *p; + struct gv_sd *s; + struct gv_softc *sc; + + g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name); + g_topology_assert(); + + /* We only want to attach to subdisks. */ + if (strcmp(pp->geom->class->name, "VINUMDRIVE")) + return (NULL); + + /* Find the VINUM class and its associated geom. */ + gp = find_vinum_geom(); + if (gp == NULL) + return (NULL); + sc = gp->softc; + KASSERT(sc != NULL, ("gv_plex_taste: NULL sc")); + + /* Find out which subdisk the offered provider corresponds to. */ + s = pp->private; + KASSERT(s != NULL, ("gv_plex_taste: NULL s")); + + /* Now find the correct plex where this subdisk belongs to. */ + p = gv_find_plex(sc, s->plex); + KASSERT(p != NULL, ("gv_plex_taste: NULL p")); + + /* + * Add this subdisk to this plex. Since we trust the on-disk + * configuration, we don't check the given value (should we?). + * XXX: shouldn't be done here + */ + gv_sd_to_plex(p, s, 0); + + /* Now check if there's already a geom for this plex. */ + gp = p->geom; + + /* Yes, there is already a geom, so we just add the consumer. 
*/ + if (gp != NULL) { + /* Need to attach a new consumer to this subdisk. */ + cp = g_new_consumer(gp); + g_attach(cp, pp); + s->consumer = cp; + + /* Adjust the size of the providers this plex has. */ + LIST_FOREACH(pp2, &gp->provider, provider) + pp2->mediasize = p->size; + + return (NULL); + + /* We need to create a new geom. */ + } else { + gp = g_new_geomf(mp, "%s", p->name); + gp->start = gv_plex_start; + gp->orphan = gv_plex_orphan; + gp->access = gv_plex_access; + gp->softc = p; + p->geom = gp; + + /* RAID5 plexes need a 'worker' thread, where IO is handled. */ + if (p->org == GV_PLEX_RAID5) { + TAILQ_INIT(&p->worklist); + mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL, + MTX_DEF); + p->flags &= ~GV_PLEX_THREAD_DIE; + kthread_create(gv_raid5_worker, gp, NULL, 0, 0, + "gv_raid5"); + p->flags |= GV_PLEX_THREAD_ACTIVE; + } + + /* Attach a consumer to this provider. */ + cp = g_new_consumer(gp); + g_attach(cp, pp); + s->consumer = cp; + + /* Create a provider for the outside world. */ + pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name); + pp2->mediasize = p->size; + pp2->sectorsize = pp->sectorsize; + p->provider = pp2; + g_error_provider(pp2, 0); + return (gp); + } +} + +static int +gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp) +{ + struct gv_plex *p; + + g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name); + g_topology_assert(); + + p = gp->softc; + + KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name)); + + /* + * If this is a RAID5 plex, check if its worker thread is still active + * and signal it to self destruct. 
+ */ + gv_kill_thread(p); + mtx_destroy(&p->worklist_mtx); + /* g_free(sc); */ + g_wither_geom(gp, ENXIO); + return (0); +} + +#define VINUMPLEX_CLASS_NAME "VINUMPLEX" + +static struct g_class g_vinum_plex_class = { + .name = VINUMPLEX_CLASS_NAME, + .taste = gv_plex_taste, + .destroy_geom = gv_plex_destroy_geom, +}; + +DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex); diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c new file mode 100644 index 000000000000..0c604fec53a2 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_raid5.c @@ -0,0 +1,616 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum_raid5.h> +#include <geom/vinum/geom_vinum.h> + +int gv_raid5_parity(struct gv_raid5_packet *); +int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); + +struct gv_raid5_bit * +gv_new_raid5_bit(void) +{ + struct gv_raid5_bit *r; + r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); + KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); + return (r); +} + +struct gv_raid5_packet * +gv_new_raid5_packet(void) +{ + struct gv_raid5_packet *wp; + + wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); + KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); + wp->state = SETUP; + wp->type = JUNK; + TAILQ_INIT(&wp->bits); + + return (wp); +} + +/* + * Check if the stripe that the work packet wants is already being used by + * some other work packet. + */ +int +gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) +{ + struct gv_raid5_packet *wpa; + + TAILQ_FOREACH(wpa, &sc->worklist, list) { + if (wpa->lockbase == wp->lockbase) { + if (wpa->bio == wp->bio) + return (0); + return (1); + } + } + return (0); +} + +/* + * The "worker" thread that runs through the worklist and fires off the + * "subrequests" needed to fulfill a RAID5 read or write request. 
+ */ +void +gv_raid5_worker(void *arg) +{ + struct bio *bp; + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_packet *wp, *wpt; + struct gv_raid5_bit *rbp, *rbpt; + int error, restart; + + gp = arg; + p = gp->softc; + + mtx_lock(&p->worklist_mtx); + for (;;) { + restart = 0; + g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan"); + TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { + /* This request packet is already being processed. */ + if (wp->state == IO) + continue; + /* This request packet is ready for processing. */ + if (wp->state == VALID) { + /* Couldn't get the lock, try again. */ + if ((wp->lockbase != -1) && + gv_stripe_active(wp, p)) + continue; + + wp->state = IO; + mtx_unlock(&p->worklist_mtx); + TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) + g_io_request(rbp->bio, rbp->consumer); + mtx_lock(&p->worklist_mtx); + continue; + } + if (wp->state == FINISH) { + bp = wp->bio; + bp->bio_completed += wp->length; + /* + * Deliver the original request if we have + * finished. + */ + if (bp->bio_completed == bp->bio_length) { + mtx_unlock(&p->worklist_mtx); + g_io_deliver(bp, 0); + mtx_lock(&p->worklist_mtx); + } + TAILQ_REMOVE(&p->worklist, wp, list); + if (wp->bufmalloc == 1) + g_free(wp->buf); + g_free(wp); + restart++; + /*break;*/ + } + } + if (!restart) { + /* Self-destruct. */ + if (p->flags & GV_PLEX_THREAD_DIE) + break; + g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep"); + error = msleep(p, &p->worklist_mtx, PRIBIO, "-", + hz/100); + } + } + mtx_unlock(&p->worklist_mtx); + + g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); + + /* Signal our plex that we are dead. */ + p->flags |= GV_PLEX_THREAD_DEAD; + wakeup(p); + kthread_exit(0); +} + +/* Final bio transaction to write out the parity data. 
*/ +int +gv_raid5_parity(struct gv_raid5_packet *wp) +{ + struct bio *bp; + + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + + wp->type = ISPARITY; + bp->bio_cmd = BIO_WRITE; + bp->bio_data = wp->buf; + bp->bio_offset = wp->offset; + bp->bio_length = wp->length; + bp->bio_done = gv_raid5_done; + bp->bio_caller1 = wp; + bp->bio_caller2 = NULL; + g_io_request(bp, wp->parity); + + return (0); +} + +/* We end up here after each subrequest. */ +void +gv_raid5_done(struct bio *bp) +{ + struct bio *obp; + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_packet *wp; + struct gv_raid5_bit *rbp; + off_t i; + int error; + + wp = bp->bio_caller1; + rbp = bp->bio_caller2; + obp = wp->bio; + gp = bp->bio_from->geom; + p = gp->softc; + + /* One less active subrequest. */ + wp->active--; + + switch (obp->bio_cmd) { + case BIO_READ: + /* Degraded reads need to handle parity data. */ + if (wp->type == DEGRADED) { + for (i = 0; i < wp->length; i++) + wp->buf[i] ^= bp->bio_data[i]; + + /* When we're finished copy back the data we want. */ + if (wp->active == 0) + bcopy(wp->buf, wp->data, wp->length); + } + + break; + + case BIO_WRITE: + /* Handle the parity data, if needed. */ + if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { + for (i = 0; i < wp->length; i++) + wp->buf[i] ^= bp->bio_data[i]; + + /* Write out the parity data we calculated. */ + if (wp->active == 0) { + wp->active++; + error = gv_raid5_parity(wp); + } + } + break; + } + + g_destroy_bio(bp); + + if (rbp != NULL) { + if (rbp->malloc == 1) + g_free(rbp->buf); + TAILQ_REMOVE(&wp->bits, rbp, list); + g_free(rbp); + } + + /* This request group is done. */ + if (wp->active == 0) + wp->state = FINISH; +} + +/* Build a request group to perform (part of) a RAID5 request. 
*/ +int +gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, + long bcount, off_t boff) +{ + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_bit *rbp; + struct gv_sd *broken, *original, *parity, *s; + int i, psdno, sdno; + off_t len_left, real_off, stripeend, stripeoff, stripestart; + + gp = bp->bio_to->geom; + p = gp->softc; + + if (p == NULL || LIST_EMPTY(&p->subdisks)) + return (ENXIO); + + /* We are optimistic and assume that this request will be OK. */ + wp->type = NORMAL; + original = parity = broken = NULL; + + /* The number of the subdisk containing the parity stripe. */ + psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % + p->sdcount; + KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); + + /* Offset of the start address from the start of the stripe. */ + stripeoff = boff % (p->stripesize * (p->sdcount - 1)); + KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); + + /* The number of the subdisk where the stripe resides. */ + sdno = stripeoff / p->stripesize; + KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); + + /* At or past parity subdisk. */ + if (sdno >= psdno) + sdno++; + + /* The offset of the stripe on this subdisk. */ + stripestart = (boff - stripeoff) / (p->sdcount - 1); + KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); + + if (stripeoff >= p->stripesize) + stripeoff -= p->stripesize; + + /* The offset of the request on this subdisk. */ + real_off = stripestart + stripeoff; + + stripeend = stripestart + p->stripesize; + len_left = stripeend - real_off; + KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); + + /* Find the right subdisks. */ + i = 0; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (i == sdno) + original = s; + if (i == psdno) + parity = s; + if (s->state != GV_SD_UP) + broken = s; + i++; + } + + if ((original == NULL) || (parity == NULL)) + return (ENXIO); + + /* Our data stripe is missing. 
*/ + if (original->state != GV_SD_UP) + wp->type = DEGRADED; + /* Our parity stripe is missing. */ + if (parity->state != GV_SD_UP) { + /* We cannot take another failure if we're already degraded. */ + if (wp->type != NORMAL) + return (ENXIO); + else + wp->type = NOPARITY; + } + + /* + * A combined write is necessary when the original data subdisk and the + * parity subdisk are both up, but one of the other subdisks isn't. + */ + if ((broken != NULL) && (broken != parity) && (broken != original)) + wp->type = COMBINED; + + wp->offset = real_off; + wp->length = (bcount <= len_left) ? bcount : len_left; + wp->data = addr; + wp->original = original->consumer; + wp->parity = parity->consumer; + wp->lockbase = stripestart; + + KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); + + switch (bp->bio_cmd) { + case BIO_READ: + /* + * For a degraded read we need to read in all stripes except + * the broken one plus the parity stripe and then recalculate + * the desired data. + */ + if (wp->type == DEGRADED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken subdisk. */ + if (s == broken) + continue; + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* A normal read can be fulfilled with the original subdisk. 
*/ + } else { + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->buf = addr; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + if (wp->type != COMBINED) + wp->lockbase = -1; + break; + + case BIO_WRITE: + /* + * A degraded write means we cannot write to the original data + * subdisk. Thus we need to read in all valid stripes, + * recalculate the parity from the original data, and then + * write the parity stripe back out. + */ + if (wp->type == DEGRADED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + + /* Copy the original data. */ + bcopy(wp->data, wp->buf, wp->length); + + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken and the parity subdisk. */ + if ((s == broken) || + (s->consumer == wp->parity)) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* + * When we don't have the parity stripe we just write out the + * data. 
+ */ + } else if (wp->type == NOPARITY) { + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_WRITE; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_data = addr; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + + /* + * A combined write means that our data subdisk and the parity + * subdisks are both up, but another subdisk isn't. We need to + * read all valid stripes including the parity to recalculate + * the data of the stripe that is missing. Then we write our + * original data, and together with the other data stripes + * recalculate the parity again. + */ + } else if (wp->type == COMBINED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + + /* Get the data from all subdisks. */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken subdisk. */ + if (s == broken) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* Write the original data. 
*/ + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->buf = addr; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_WRITE; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + /* + * Insert at the tail, because we want to read the old + * data first. + */ + TAILQ_INSERT_TAIL(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + + /* Get the rest of the data again. */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* + * Skip the broken subdisk, the parity, and the + * one we just wrote. + */ + if ((s == broken) || + (s->consumer == wp->parity) || + (s->consumer == wp->original)) + continue; + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + /* + * Again, insert at the tail to keep correct + * order. + */ + TAILQ_INSERT_TAIL(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + + /* + * A normal write request goes to the original subdisk, then we + * read in all other stripes, recalculate the parity and write + * out the parity again. + */ + } else { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the parity stripe. 
*/ + if (s->consumer == wp->parity) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + /* + * The data for the original stripe is written, + * the others need to be read in for the parity + * calculation. + */ + if (s->consumer == wp->original) { + rbp->bio->bio_cmd = BIO_WRITE; + rbp->buf = addr; + } else { + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + } + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + } + break; + default: + return (EINVAL); + } + + wp->state = VALID; + return (0); +} diff --git a/sys/geom/vinum/geom_vinum_raid5.h b/sys/geom/vinum/geom_vinum_raid5.h new file mode 100644 index 000000000000..c43cb101594c --- /dev/null +++ b/sys/geom/vinum/geom_vinum_raid5.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _GEOM_VINUM_RAID5_H_ +#define _GEOM_VINUM_RAID5_H_ + +/* + * A single RAID5 request usually needs more than one I/O transaction, + * depending on the state of the associated subdisks and the direction of the + * transaction (read or write). Every subrequest of a RAID5 request, + * represented by a gv_raid5_packet, is defined by a gv_raid5_bit. + */ + +/* A subrequest of a RAID5 read/write operation. */ +struct gv_raid5_bit { + struct bio *bio; /* BIO of this subrequest. */ + caddr_t buf; /* Data buffer of this subrequest. */ + int malloc; /* Flag if data buffer was malloced. */ + struct g_consumer *consumer; /* Consumer to send the BIO to. */ + TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */ +}; + +/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */ +struct gv_raid5_packet { + caddr_t buf; /* Data buffer of this RAID5 request. */ + off_t length; /* Size of data buffer. */ + off_t lockbase; /* Stripe start used to serialize packets on one stripe; -1 if no lock is needed. */ + off_t offset; /* The drive offset of the subdisk. */ + int bufmalloc; /* Flag if data buffer was malloced. */ + int active; /* Count of active subrequests. */ + int rqcount; /* Count of subrequests. */ + + struct bio *bio; /* Pointer to the original bio. */ + caddr_t data; /* Pointer to the original data. */ + + struct g_consumer *original; /* Consumer to the data stripe. 
*/ + struct g_consumer *parity; /* Consumer to the parity stripe. */ + + /* State of this RAID5 packet. */ + enum { + SETUP, /* Newly created. */ + VALID, /* Ready for processing. */ + IO, /* Currently doing I/O. */ + FINISH /* Packet has finished. */ + } state; + + /* Type of this RAID5 transaction. */ + enum { + JUNK, /* Newly created, not valid. */ + NORMAL, /* Normal read or write. */ + ISPARITY, /* Containing only parity data. */ + NOPARITY, /* Parity stripe not available. */ + DEGRADED, /* Data stripe not available. */ + COMBINED /* Data and parity stripes ok, others not. */ + } type; + + TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */ + TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */ +}; + +int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t, + long, off_t); +void gv_raid5_done(struct bio *); +void gv_raid5_worker(void *); +struct gv_raid5_packet *gv_new_raid5_packet(void); +struct gv_raid5_bit *gv_new_raid5_bit(void); + +#endif /* !_GEOM_VINUM_RAID5_H_ */ diff --git a/sys/geom/vinum/geom_vinum_rm.c b/sys/geom/vinum/geom_vinum_rm.c new file mode 100644 index 000000000000..181a954f0c21 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_rm.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +static void gv_cleanup_pp(void *, int); +static void gv_free_sd(struct gv_sd *); +static int gv_rm_plex(struct gv_softc *, struct gctl_req *, + struct gv_plex *, int); +static int gv_rm_sd(struct gv_softc *, struct gctl_req *, struct gv_sd *, + int); +static int gv_rm_vol(struct gv_softc *, struct gctl_req *, + struct gv_volume *, int); + +/* General 'remove' routine. 
*/ +void +gv_remove(struct g_geom *gp, struct gctl_req *req) +{ + struct gv_softc *sc; + struct gv_volume *v; + struct gv_plex *p; + struct gv_sd *s; + int *argc, *flags; + char *argv, buf[20]; + int i, type, err; + + argc = gctl_get_paraml(req, "argc", sizeof(*argc)); + flags = gctl_get_paraml(req, "flags", sizeof(*flags)); + + if (argc == NULL || *argc == 0) { + gctl_error(req, "no arguments given"); + return; + } + + sc = gp->softc; + + for (i = 0; i < *argc; i++) { + snprintf(buf, sizeof(buf), "argv%d", i); + argv = gctl_get_param(req, buf, NULL); + if (argv == NULL) + continue; + type = gv_object_type(sc, argv); + switch (type) { + case GV_TYPE_VOL: + v = gv_find_vol(sc, argv); + if (v == NULL) { + gctl_error(req, "unknown volume '%s'", argv); + return; + } + err = gv_rm_vol(sc, req, v, *flags); + if (err) + return; + break; + case GV_TYPE_PLEX: + p = gv_find_plex(sc, argv); + if (p == NULL) { + gctl_error(req, "unknown plex '%s'", argv); + return; + } + err = gv_rm_plex(sc, req, p, *flags); + if (err) + return; + break; + case GV_TYPE_SD: + s = gv_find_sd(sc, argv); + if (s == NULL) { + gctl_error(req, "unknown subdisk '%s'", argv); + return; + } + err = gv_rm_sd(sc, req, s, *flags); + if (err) + return; + break; + default: + gctl_error(req, "unknown object '%s'", argv); + return; + } + } + + gv_save_config_all(sc); +} + +/* Remove a volume. */ +static int +gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int flags) +{ + struct g_geom *gp; + struct gv_plex *p, *p2; + int err; + + g_topology_assert(); + KASSERT(v != NULL, ("gv_rm_vol: NULL v")); + + /* If this volume has plexes, we want a recursive removal. */ + if (!LIST_EMPTY(&v->plexes) && !(flags & GV_FLAG_R)) { + gctl_error(req, "volume '%s' has attached plexes", v->name); + return (-1); + } + + gp = v->geom; + + /* Check if any of our consumers is open. 
*/ + if (gp != NULL && gv_is_open(gp)) { + gctl_error(req, "volume '%s' is busy", v->name); + return (-1); + } + + /* Remove the plexes our volume has. */ + LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2) { + v->plexcount--; + LIST_REMOVE(p, in_volume); + p->vol_sc = NULL; + + err = gv_rm_plex(sc, req, p, flags); + if (err) + return (err); + } + + /* Clean up and let our geom fade away. */ + LIST_REMOVE(v, volume); + g_free(v); + if (gp != NULL) { + gp->softc = NULL; + g_wither_geom(gp, ENXIO); + } + + return (0); +} + +/* Remove a plex. */ +static int +gv_rm_plex(struct gv_softc *sc, struct gctl_req *req, struct gv_plex *p, int flags) +{ + struct g_geom *gp; + struct gv_sd *s, *s2; + int err; + + g_topology_assert(); + + KASSERT(p != NULL, ("gv_rm_plex: NULL p")); + + /* If this plex has subdisks, we want a recursive removal. */ + if (!LIST_EMPTY(&p->subdisks) && !(flags & GV_FLAG_R)) { + gctl_error(req, "plex '%s' has attached subdisks", p->name); + return (-1); + } + + if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) { + gctl_error(req, "plex '%s' is still attached to volume '%s'", + p->name, p->volume); + return (-1); + } + + gp = p->geom; + + /* Check if any of our consumers is open. */ + if (gp != NULL && gv_is_open(gp)) { + gctl_error(req, "plex '%s' is busy", p->name); + return (-1); + } + + /* Remove the subdisks our plex has. */ + LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2) { + p->sdcount--; +#if 0 + LIST_REMOVE(s, in_plex); + s->plex_sc = NULL; +#endif + + err = gv_rm_sd(sc, req, s, flags); + if (err) + return (err); + } + + /* Clean up and let our geom fade away. */ + LIST_REMOVE(p, plex); + if (p->vol_sc != NULL) { + p->vol_sc->plexcount--; + LIST_REMOVE(p, in_volume); + p->vol_sc = NULL; + } + + gv_kill_thread(p); + g_free(p); + + if (gp != NULL) { + gp->softc = NULL; + g_wither_geom(gp, ENXIO); + } + + return (0); +} + +/* Remove a subdisk. 
*/ +static int +gv_rm_sd(struct gv_softc *sc, struct gctl_req *req, struct gv_sd *s, int flags) +{ + struct gv_drive *d; + struct g_geom *gp; + struct g_provider *pp; + + KASSERT(s != NULL, ("gv_rm_sd: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_rm_sd: NULL d")); + gp = d->geom; + KASSERT(gp != NULL, ("gv_rm_sd: NULL gp")); + + pp = s->provider; + + /* Clean up. */ + LIST_REMOVE(s, in_plex); + LIST_REMOVE(s, from_drive); + LIST_REMOVE(s, sd); + gv_free_sd(s); + g_free(s); + + /* If the subdisk has a provider we need to clean up this one too. */ + if (pp != NULL) { + g_orphan_provider(pp, ENXIO); + if (LIST_EMPTY(&pp->consumers)) + g_destroy_provider(pp); + else + /* Schedule this left-over provider for destruction. */ + g_post_event(gv_cleanup_pp, pp, M_WAITOK, pp, NULL); + } + + return (0); +} + +/* + * This function is called from the event queue to clean up left-over subdisk + * providers. + */ +static void +gv_cleanup_pp(void *arg, int flag) +{ + struct g_provider *pp; + + g_topology_assert(); + + if (flag == EV_CANCEL) + return; + + pp = arg; + if (pp == NULL) { + printf("gv_cleanup_pp: provider has gone\n"); + return; + } + + if (!LIST_EMPTY(&pp->consumers)) { + printf("gv_cleanup_pp: provider still not empty\n"); + return; + } + + g_destroy_provider(pp); +} + +static void +gv_free_sd(struct gv_sd *s) +{ + struct gv_drive *d; + struct gv_freelist *fl, *fl2; + + KASSERT(s != NULL, ("gv_free_sd: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_free_sd: NULL d")); + + /* + * First, find the free slot that's immediately before or after this + * subdisk. + */ + fl = NULL; + LIST_FOREACH(fl, &d->freelist, freelist) { + if (fl->offset == s->drive_offset + s->size) + break; + if (fl->offset + fl->size == s->drive_offset) + break; + } + + /* If there is no free slot behind this subdisk, so create one. 
*/ + if (fl == NULL) { + + fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO); + fl->size = s->size; + fl->offset = s->drive_offset; + + if (d->freelist_entries == 0) { + LIST_INSERT_HEAD(&d->freelist, fl, freelist); + } else { + LIST_FOREACH(fl2, &d->freelist, freelist) { + if (fl->offset < fl2->offset) { + LIST_INSERT_BEFORE(fl2, fl, freelist); + break; + } else if (LIST_NEXT(fl2, freelist) == NULL) { + LIST_INSERT_AFTER(fl2, fl, freelist); + break; + } + } + } + + d->freelist_entries++; + + /* Expand the free slot we just found. */ + } else { + fl->size += s->size; + if (fl->offset > s->drive_offset) + fl->offset = s->drive_offset; + } + + d->avail += s->size; +} diff --git a/sys/geom/vinum/geom_vinum_share.c b/sys/geom/vinum/geom_vinum_share.c new file mode 100644 index 000000000000..2c6530ed177f --- /dev/null +++ b/sys/geom/vinum/geom_vinum_share.c @@ -0,0 +1,651 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. 
Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + */ + +/* This file is shared between kernel and userland. */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifdef _KERNEL +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#define iswhite(c) (((c) == ' ') || ((c) == '\t')) +#else +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#define iswhite isspace +#define g_free free +#endif /* _KERNEL */ + +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> + +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum_share.h> + +/* + * Take a blank separated list of tokens and turn it into a list of + * individual nul-delimited strings. Build a list of pointers at + * token, which must have enough space for the tokens. Return the + * number of tokens, or -1 on error (typically a missing string + * delimiter). 
+ */ +int +gv_tokenize(char *cptr, char *token[], int maxtoken) +{ + int tokennr; /* Index of this token. */ + char delim; /* Delimiter for searching for the partner. */ + + for (tokennr = 0; tokennr < maxtoken;) { + + /* Skip leading white space. */ + while (iswhite(*cptr)) + cptr++; + + /* End of line. */ + if ((*cptr == '\0') || (*cptr == '\n') || (*cptr == '#')) + return tokennr; + + delim = *cptr; + token[tokennr] = cptr; /* Point to it. */ + tokennr++; /* One more. */ + + /* Run off the end? */ + if (tokennr == maxtoken) + return tokennr; + + /* Quoted? */ + if ((delim == '\'') || (delim == '"')) { + for (;;) { + cptr++; + + /* Found the partner. */ + if ((*cptr == delim) && (cptr[-1] != '\\')) { + cptr++; + + /* Space after closing quote needed. */ + if (!iswhite(*cptr)) + return -1; + + /* Delimit. */ + *cptr++ = '\0'; + + /* End-of-line? */ + } else if ((*cptr == '\0') || (*cptr == '\n')) + return -1; + } + + /* Not quoted. */ + } else { + while ((*cptr != '\0') && + (!iswhite(*cptr)) && + (*cptr != '\n')) + cptr++; + + /* Not end-of-line; delimit and move to the next. */ + if (*cptr != '\0') + *cptr++ = '\0'; + } + } + + /* Can't get here. */ + return maxtoken; +} + + +/* + * Take a number with an optional scale factor and convert it to a number of + * bytes. + * + * The scale factors are: + * + * s sectors (of 512 bytes) + * b blocks (of 512 bytes). This unit is deprecated, because it's + * confusing, but maintained to avoid confusing Veritas users. + * k kilobytes (1024 bytes) + * m megabytes (of 1024 * 1024 bytes) + * g gigabytes (of 1024 * 1024 * 1024 bytes) + * + * XXX: need a way to signal error + */ +off_t +gv_sizespec(char *spec) +{ + uint64_t size; + char *s; + int sign; + + size = 0; + sign = 1; + if (spec != NULL) { /* we have a parameter */ + s = spec; + if (*s == '-') { /* negative, */ + sign = -1; + s++; /* skip */ + } + + /* It's numeric. */ + if ((*s >= '0') && (*s <= '9')) { + + /* It's numeric. 
*/ + while ((*s >= '0') && (*s <= '9')) + /* Convert it. */ + size = size * 10 + *s++ - '0'; + + switch (*s) { + case '\0': + return size * sign; + + case 'B': + case 'b': + case 'S': + case 's': + return size * sign * 512; + + case 'K': + case 'k': + return size * sign * 1024; + + case 'M': + case 'm': + return size * sign * 1024 * 1024; + + case 'G': + case 'g': + return size * sign * 1024 * 1024 * 1024; + } + } + } + + return (0); +} + +const char * +gv_drivestate(int state) +{ + switch (state) { + case GV_DRIVE_DOWN: + return "down"; + case GV_DRIVE_UP: + return "up"; + default: + return "??"; + } +} + +int +gv_drivestatei(char *buf) +{ + if (!strcmp(buf, "up")) + return (GV_DRIVE_UP); + else + return (GV_DRIVE_DOWN); +} + +/* Translate from a string to a subdisk state. */ +int +gv_sdstatei(char *buf) +{ + if (!strcmp(buf, "up")) + return (GV_SD_UP); + else if (!strcmp(buf, "reviving")) + return (GV_SD_REVIVING); + else if (!strcmp(buf, "stale")) + return (GV_SD_STALE); + else + return (GV_SD_DOWN); +} + +/* Translate from a subdisk state to a string. */ +const char * +gv_sdstate(int state) +{ + switch (state) { + case GV_SD_INITIALIZING: + return "initializing"; + case GV_SD_STALE: + return "stale"; + case GV_SD_DOWN: + return "down"; + case GV_SD_REVIVING: + return "reviving"; + case GV_SD_UP: + return "up"; + default: + return "??"; + } +} + +/* Translate from a string to a plex state. */ +int +gv_plexstatei(char *buf) +{ + if (!strcmp(buf, "up")) + return (GV_PLEX_UP); + else if (!strcmp(buf, "initializing")) + return (GV_PLEX_INITIALIZING); + else if (!strcmp(buf, "degraded")) + return (GV_PLEX_DEGRADED); + else + return (GV_PLEX_DOWN); +} + +/* Translate from a plex state to a string. 
*/ +const char * +gv_plexstate(int state) +{ + switch (state) { + case GV_PLEX_DOWN: + return "down"; + case GV_PLEX_INITIALIZING: + return "initializing"; + case GV_PLEX_DEGRADED: + return "degraded"; + case GV_PLEX_UP: + return "up"; + default: + return "??"; + } +} + +/* Translate from a string to a plex organization. */ +int +gv_plexorgi(char *buf) +{ + if (!strcmp(buf, "concat")) + return (GV_PLEX_CONCAT); + else if (!strcmp(buf, "striped")) + return (GV_PLEX_STRIPED); + else if (!strcmp(buf, "raid5")) + return (GV_PLEX_RAID5); + else + return (GV_PLEX_DISORG); +} + +int +gv_volstatei(char *buf) +{ + if (!strcmp(buf, "up")) + return (GV_VOL_UP); + else + return (GV_VOL_DOWN); +} + +const char * +gv_volstate(int state) +{ + switch (state) { + case GV_VOL_UP: + return "up"; + case GV_VOL_DOWN: + return "down"; + default: + return "??"; + } +} + +/* Translate from a plex organization to a string. */ +const char * +gv_plexorg(int org) +{ + switch (org) { + case GV_PLEX_DISORG: + return "??"; + case GV_PLEX_CONCAT: + return "concat"; + case GV_PLEX_STRIPED: + return "striped"; + case GV_PLEX_RAID5: + return "raid5"; + default: + return "??"; + } +} + +const char * +gv_plexorg_short(int org) +{ + switch (org) { + case GV_PLEX_DISORG: + return "??"; + case GV_PLEX_CONCAT: + return "C"; + case GV_PLEX_STRIPED: + return "S"; + case GV_PLEX_RAID5: + return "R5"; + default: + return "??"; + } +} + +/* Get a new drive object. 
*/ +struct gv_drive * +gv_new_drive(int max, char *token[]) +{ + struct gv_drive *d; + int j, errors; + char *ptr; + + if (token[1] == NULL || *token[1] == '\0') + return (NULL); + +#ifdef _KERNEL + d = g_malloc(sizeof(struct gv_drive), M_WAITOK | M_ZERO); + +#else + d = malloc(sizeof(struct gv_drive)); + if (d == NULL) + return (NULL); + bzero(d, sizeof(struct gv_drive)); +#endif + + errors = 0; + for (j = 1; j < max; j++) { + if (!strcmp(token[j], "state")) { + j++; + if (j >= max) { + errors++; + break; + } + d->state = gv_drivestatei(token[j]); + } else if (!strcmp(token[j], "device")) { + j++; + if (j >= max) { + errors++; + break; + } + ptr = token[j] + strlen(token[j]); + while (ptr != token[j] && *ptr != '/') + ptr--; + ptr++; + strncpy(d->device, ptr, GV_MAXDRIVENAME); + } else { + /* We assume this is the drive name. */ + strncpy(d->name, token[j], GV_MAXDRIVENAME); + } + } + + if (strlen(d->name) == 0 || strlen(d->device) == 0) + errors++; + + if (errors) { + g_free(d); + return (NULL); + } + + return (d); +} + +/* Get a new volume object. */ +struct gv_volume * +gv_new_volume(int max, char *token[]) +{ + struct gv_volume *v; + int j, errors; + + if (token[1] == NULL || *token[1] == '\0') + return (NULL); + +#ifdef _KERNEL + v = g_malloc(sizeof(struct gv_volume), M_WAITOK | M_ZERO); + +#else + v = malloc(sizeof(struct gv_volume)); + if (v == NULL) + return (NULL); + bzero(v, sizeof(struct gv_volume)); +#endif + + errors = 0; + for (j = 1; j < max; j++) { + if (!strcmp(token[j], "state")) { + j++; + if (j >= max) { + errors++; + break; + } + v->state = gv_volstatei(token[j]); + } else { + /* We assume this is the volume name. */ + strncpy(v->name, token[j], GV_MAXVOLNAME); + } + } + + if (strlen(v->name) == 0) + errors++; + + if (errors) { + g_free(v); + return (NULL); + } + + return (v); +} + +/* Get a new plex object. 
*/ +struct gv_plex * +gv_new_plex(int max, char *token[]) +{ + struct gv_plex *p; + int j, errors; + + if (token[1] == NULL || *token[1] == '\0') + return (NULL); + +#ifdef _KERNEL + p = g_malloc(sizeof(struct gv_plex), M_WAITOK | M_ZERO); +#else + p = malloc(sizeof(struct gv_plex)); + if (p == NULL) + return (NULL); + bzero(p, sizeof(struct gv_plex)); +#endif + + errors = 0; + for (j = 1; j < max; j++) { + if (!strcmp(token[j], "name")) { + j++; + if (j >= max) { + errors++; + break; + } + strncpy(p->name, token[j], GV_MAXPLEXNAME); + } else if (!strcmp(token[j], "org")) { + j++; + if (j >= max) { + errors++; + break; + } + p->org = gv_plexorgi(token[j]); + if ((p->org == GV_PLEX_RAID5) || + (p->org == GV_PLEX_STRIPED)) { + j++; + if (j >= max) { + errors++; + break; + } + p->stripesize = gv_sizespec(token[j]); + if (p->stripesize == 0) { + errors++; + break; + } + } + } else if (!strcmp(token[j], "state")) { + j++; + if (j >= max) { + errors++; + break; + } + p->state = gv_plexstatei(token[j]); + } else if (!strcmp(token[j], "vol")) { + j++; + if (j >= max) { + errors++; + break; + } + strncpy(p->volume, token[j], GV_MAXVOLNAME); + } else { + errors++; + break; + } + } + + if (errors) { + g_free(p); + return (NULL); + } + + return (p); +} + +/* Get a new subdisk object. 
*/ +struct gv_sd * +gv_new_sd(int max, char *token[]) +{ + struct gv_sd *s; + int j, errors; + + if (token[1] == NULL || *token[1] == '\0') + return NULL; + +#ifdef _KERNEL + s = g_malloc(sizeof(struct gv_sd), M_WAITOK | M_ZERO); +#else + s = malloc(sizeof(struct gv_sd)); + if (s == NULL) + return NULL; + bzero(s, sizeof(struct gv_sd)); +#endif + + s->plex_offset = -1; + s->size = -1; + s->drive_offset = -1; + errors = 0; + for (j = 1; j < max; j++) { + if (!strcmp(token[j], "name")) { + j++; + if (j >= max) { + errors++; + break; + } + strncpy(s->name, token[j], GV_MAXSDNAME); + } else if (!strcmp(token[j], "drive")) { + j++; + if (j >= max) { + errors++; + break; + } + strncpy(s->drive, token[j], GV_MAXDRIVENAME); + } else if (!strcmp(token[j], "plex")) { + j++; + if (j >= max) { + errors++; + break; + } + strncpy(s->plex, token[j], GV_MAXPLEXNAME); + } else if (!strcmp(token[j], "state")) { + j++; + if (j >= max) { + errors++; + break; + } + s->state = gv_sdstatei(token[j]); + } else if (!strcmp(token[j], "len") || + !strcmp(token[j], "length")) { + j++; + if (j >= max) { + errors++; + break; + } + s->size = gv_sizespec(token[j]); + if (s->size <= 0) { + errors++; + break; + } + } else if (!strcmp(token[j], "driveoffset")) { + j++; + if (j >= max) { + errors++; + break; + } + s->drive_offset = gv_sizespec(token[j]); + if (s->drive_offset != 0 && + s->drive_offset < GV_DATA_START) { + errors++; + break; + } + } else if (!strcmp(token[j], "plexoffset")) { + j++; + if (j >= max) { + errors++; + break; + } + s->plex_offset = gv_sizespec(token[j]); + if (s->plex_offset < 0) { + errors++; + break; + } + } else { + errors++; + break; + } + } + + if (strlen(s->drive) == 0) + errors++; + + if (errors) { + g_free(s); + return (NULL); + } + + return (s); +} diff --git a/sys/geom/vinum/geom_vinum_share.h b/sys/geom/vinum/geom_vinum_share.h new file mode 100644 index 000000000000..177e97156abc --- /dev/null +++ b/sys/geom/vinum/geom_vinum_share.h @@ -0,0 +1,62 @@ +/*- + * 
Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _GEOM_VINUM_SHARE_H_ +#define _GEOM_VINUM_SHARE_H_ + +/* Maximum number of arguments for a single command. 
*/ +#define GV_MAXARGS 64 + +enum { + KILOBYTE = 1024, + MEGABYTE = 1048576, + GIGABYTE = 1073741824 +}; + +off_t gv_sizespec(char *); +int gv_tokenize(char *, char **, int); + +struct gv_drive *gv_new_drive(int, char **); +struct gv_plex *gv_new_plex(int, char **); +struct gv_sd *gv_new_sd(int, char **); +struct gv_volume *gv_new_volume(int, char **); + +int gv_drivestatei(char *); +int gv_plexorgi(char *); +int gv_plexstatei(char *); +int gv_sdstatei(char *); +int gv_volstatei(char *); + +const char *gv_drivestate(int); +const char *gv_plexorg(int); +const char *gv_plexorg_short(int); +const char *gv_plexstate(int); +const char *gv_sdstate(int); +const char *gv_volstate(int); + +#endif /* _GEOM_VINUM_SHARE_H_ */ diff --git a/sys/geom/vinum/geom_vinum_state.c b/sys/geom/vinum/geom_vinum_state.c new file mode 100644 index 000000000000..fe8a88e2eded --- /dev/null +++ b/sys/geom/vinum/geom_vinum_state.c @@ -0,0 +1,289 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/malloc.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +/* + * Set the state of drive d to newstate. Returns 1 if the drive ends up in + * newstate (also when it already was in that state) and 0 if the request is + * refused, which happens when an open drive would be taken down without + * GV_SETSTATE_FORCE. After a change every subdisk on the drive gets its + * state re-evaluated. + */ +int +gv_set_drive_state(struct gv_drive *d, int newstate, int flags) +{ + struct gv_sd *s; + int oldstate; + + KASSERT(d != NULL, ("gv_set_drive_state: NULL d")); + + oldstate = d->state; + + if (newstate == oldstate) + return (1); + + /* We allow to take down an open drive only with force. */ + if ((newstate == GV_DRIVE_DOWN) && gv_is_open(d->geom) && + (!(flags & GV_SETSTATE_FORCE))) + return (0); + + d->state = newstate; + + if (d->state != oldstate) { + LIST_FOREACH(s, &d->subdisks, from_drive) + gv_update_sd_state(s); + } + + return (1); +} +/* + * Set the state of subdisk s to newstate. Returns 0 on success or if the + * subdisk already is in newstate, and -1 if the transition is refused. + * Unless the call bails out early, the plex of the subdisk is updated and, + * with GV_SETSTATE_CONFIG set in flags, gv_save_config_all() is called. + */ +int +gv_set_sd_state(struct gv_sd *s, int newstate, int flags) +{ + struct gv_drive *d; + struct gv_plex *p; + int oldstate, status; + + KASSERT(s != NULL, ("gv_set_sd_state: NULL s")); + + oldstate = s->state; + + /* We are optimistic and assume it will work. */ + status = 0; + + if (newstate == oldstate) + return (0); + + switch (newstate) { + case GV_SD_DOWN: + /* + * If we're attached to a plex, we won't go down without use of + * force. 
+ */ + if ((s->plex_sc != NULL) && !(flags & GV_SETSTATE_FORCE)) + return (-1); + break; + + case GV_SD_UP: + /* We can't bring the subdisk up if our drive is dead. */ + d = s->drive_sc; + if ((d == NULL) || (d->state != GV_DRIVE_UP)) + return (-1); + + /* Check from where we want to be brought up. */ + switch (s->state) { + case GV_SD_REVIVING: + case GV_SD_INITIALIZING: + /* + * The subdisk was reviving or initializing. We allow + * it to be brought up. + */ + break; + + case GV_SD_DOWN: + /* + * The subdisk is currently down. We allow it to be + * brought up if it is not attached to a plex. + */ + p = s->plex_sc; + if (p == NULL) + break; + + /* + * If this subdisk is attached to a plex, we allow it + * to be brought up unless that plex is a RAID5 plex. + * In the RAID5 case the subdisk is made stale instead + * and the call reports failure. + */ + + if (p->org != GV_PLEX_RAID5) + break; + else + s->state = GV_SD_STALE; + + status = -1; + break; + + case GV_SD_STALE: + /* + * A stale subdisk can't be brought up directly, it + * needs to be revived or initialized first. + */ + /* FALLTHROUGH */ + default: + return (-1); + } + break; + + /* Other state transitions are only possible with force. */ + default: + if (!(flags & GV_SETSTATE_FORCE)) + return (-1); + } + + /* We can change the state and do it. */ + if (status == 0) + s->state = newstate; + + /* Update our plex, if we're attached to one. */ + if (s->plex_sc != NULL) + gv_update_plex_state(s->plex_sc); + + /* Save the config back to disk. */ + if (flags & GV_SETSTATE_CONFIG) + gv_save_config_all(s->vinumconf); + + return (status); +} + + +/* Update the state of a subdisk based on its environment. */ +void +gv_update_sd_state(struct gv_sd *s) +{ + struct gv_drive *d; + + KASSERT(s != NULL, ("gv_update_sd_state: NULL s")); + d = s->drive_sc; + KASSERT(d != NULL, ("gv_update_sd_state: NULL d")); + + /* If our drive isn't up we cannot be up either. 
*/ + if (d->state != GV_DRIVE_UP) + s->state = GV_SD_DOWN; + /* + * If this subdisk was just created, we assume it is good. Otherwise a + * subdisk that is up stays up and every other one becomes stale. + */ + else if (s->flags & GV_SD_NEWBORN) { + s->state = GV_SD_UP; + s->flags &= ~GV_SD_NEWBORN; + } else if (s->state != GV_SD_UP) + s->state = GV_SD_STALE; + else + s->state = GV_SD_UP; + /* NOTE(review): the FOO message below looks like leftover debug output. */ + printf("FOO: sd %s is %s\n", s->name, gv_sdstate(s->state)); + /* Update the plex, if we have one. */ + if (s->plex_sc != NULL) + gv_update_plex_state(s->plex_sc); +} + +/* Update the state of a plex based on its environment. */ +void +gv_update_plex_state(struct gv_plex *p) +{ + int sdstates; + + KASSERT(p != NULL, ("gv_update_plex_state: NULL p")); + + /* First, check the state of our subdisks. This also recounts p->sddown. */ + sdstates = gv_sdstatemap(p); + + /* If all subdisks are up, our plex can be up, too. */ + if (sdstates == GV_SD_UPSTATE) + p->state = GV_PLEX_UP; + + /* One or more of our subdisks are down. */ + else if (sdstates & GV_SD_DOWNSTATE) { + /* A RAID5 plex can handle one dead subdisk. */ + if ((p->org == GV_PLEX_RAID5) && (p->sddown == 1)) + p->state = GV_PLEX_DEGRADED; + else + p->state = GV_PLEX_DOWN; + + /* Some of our subdisks are initializing. */ + } else if (sdstates & GV_SD_INITSTATE) { + if (p->flags & GV_PLEX_SYNCING) + p->state = GV_PLEX_DEGRADED; + else + p->state = GV_PLEX_DOWN; + } else + p->state = GV_PLEX_DOWN; + /* NOTE(review): the FOO message below looks like leftover debug output. */ + printf("FOO: plex %s is %s\n", p->name, gv_plexstate(p->state)); + /* Update our volume, if we have one. */ + if (p->vol_sc != NULL) + gv_update_vol_state(p->vol_sc); +} + +/* Update the volume state based on its plexes. */ +void +gv_update_vol_state(struct gv_volume *v) +{ + struct gv_plex *p; + + KASSERT(v != NULL, ("gv_update_vol_state: NULL v")); + + LIST_FOREACH(p, &v->plexes, in_volume) { + /* + * One of our plexes is accessible, and so are we. + * NOTE(review): only states that compare greater than + * GV_PLEX_DEGRADED count here; presumably that is just + * GV_PLEX_UP - confirm against the state definitions. + */ + if (p->state > GV_PLEX_DEGRADED) { + v->state = GV_VOL_UP; + return; + } + } + + /* Not one of our plexes is up, so we can't be either. 
*/ + v->state = GV_VOL_DOWN; +} + +/* + * Return a state map for the subdisks of a plex: the GV_SD_UPSTATE, + * GV_SD_DOWNSTATE and GV_SD_INITSTATE bits of all states found among the + * subdisks, ORed together. As a side effect p->sddown is recounted. + */ +int +gv_sdstatemap(struct gv_plex *p) +{ + struct gv_sd *s; + int statemap; + + KASSERT(p != NULL, ("gv_sdstatemap: NULL p")); + + statemap = 0; + p->sddown = 0; /* No subdisks down yet. */ + + LIST_FOREACH(s, &p->subdisks, in_plex) { + switch (s->state) { + case GV_SD_DOWN: + case GV_SD_STALE: + statemap |= GV_SD_DOWNSTATE; + p->sddown++; /* Another unusable subdisk. */ + break; + + case GV_SD_UP: + statemap |= GV_SD_UPSTATE; + break; + + case GV_SD_INITIALIZING: + statemap |= GV_SD_INITSTATE; + break; + + case GV_SD_REVIVING: + statemap |= GV_SD_INITSTATE; + p->sddown++; /* XXX: Another unusable subdisk? */ + break; + } + } + return (statemap); +} diff --git a/sys/geom/vinum/geom_vinum_subr.c b/sys/geom/vinum/geom_vinum_subr.c new file mode 100644 index 000000000000..55cf583ca21e --- /dev/null +++ b/sys/geom/vinum/geom_vinum_subr.c @@ -0,0 +1,804 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. 
Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/geom_int.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> +#include <geom/vinum/geom_vinum_share.h> + +/* + * Find the class named VINUM and return the first geom on its geom list. + * Returns NULL if there is no such class. + */ +struct g_geom * +find_vinum_geom(void) +{ + struct g_class *mp; + struct g_geom *gp; + + g_topology_assert(); + + gp = NULL; + + LIST_FOREACH(mp, &g_classes, class) { + if (!strcmp(mp->name, "VINUM")) { + gp = LIST_FIRST(&mp->geom); + break; + } + } + + return (gp); +} + +/* + * Parse the vinum config provided in *buf and store it in the softc *sc. + * If parameter 'merge' is non-zero, objects whose name already exists in + * *sc are skipped. 
+ */ +void +gv_parse_config(struct gv_softc *sc, u_char *buf, int merge) +{ + char *aptr, *bptr, *cptr; + struct gv_volume *v, *v2; + struct gv_plex *p, *p2; + struct gv_sd *s, *s2; + int tokens; + char *token[GV_MAXARGS]; + + g_topology_assert(); + + KASSERT(sc != NULL, ("gv_parse_config: NULL softc")); + + /* Until the end of the string *buf. */ + for (aptr = buf; *aptr != '\0'; aptr = bptr) { + bptr = aptr; + cptr = aptr; + + /* Seperate input lines. */ + while (*bptr != '\n') + bptr++; + *bptr = '\0'; + bptr++; + + tokens = gv_tokenize(cptr, token, GV_MAXARGS); + + if (tokens > 0) { + if (!strcmp(token[0], "volume")) { + v = gv_new_volume(tokens, token); + if (v == NULL) { + printf("geom_vinum: failed volume\n"); + break; + } + + if (merge) { + v2 = gv_find_vol(sc, v->name); + if (v2 != NULL) { + g_free(v); + continue; + } + } + + v->vinumconf = sc; + LIST_INIT(&v->plexes); + LIST_INSERT_HEAD(&sc->volumes, v, volume); + + } else if (!strcmp(token[0], "plex")) { + p = gv_new_plex(tokens, token); + if (p == NULL) { + printf("geom_vinum: failed plex\n"); + break; + } + + if (merge) { + p2 = gv_find_plex(sc, p->name); + if (p2 != NULL) { + g_free(p); + continue; + } + } + + p->vinumconf = sc; + LIST_INIT(&p->subdisks); + LIST_INSERT_HEAD(&sc->plexes, p, plex); + + } else if (!strcmp(token[0], "sd")) { + s = gv_new_sd(tokens, token); + + if (s == NULL) { + printf("geom_vinum: failed subdisk\n"); + break; + } + + if (merge) { + s2 = gv_find_sd(sc, s->name); + if (s2 != NULL) { + g_free(s); + continue; + } + } + + s->vinumconf = sc; + LIST_INSERT_HEAD(&sc->subdisks, s, sd); + } + } + } +} + +/* + * Format the vinum configuration properly. If ondisk is non-zero then the + * configuration is intended to be written to disk later. 
+ */ +void +gv_format_config(struct gv_softc *sc, struct sbuf *sb, int ondisk, char *prefix) +{ + struct gv_drive *d; + struct gv_sd *s; + struct gv_plex *p; + struct gv_volume *v; + + g_topology_assert(); + + /* + * We don't need the drive configuration if we're not writing the + * config to disk. + */ + if (!ondisk) { + LIST_FOREACH(d, &sc->drives, drive) { + sbuf_printf(sb, "%sdrive %s device %s\n", prefix, + d->name, d->device); + } + } + + LIST_FOREACH(v, &sc->volumes, volume) { + if (!ondisk) + sbuf_printf(sb, "%s", prefix); + sbuf_printf(sb, "volume %s", v->name); + if (ondisk) + sbuf_printf(sb, " state %s", gv_volstate(v->state)); + sbuf_printf(sb, "\n"); + } + + LIST_FOREACH(p, &sc->plexes, plex) { + if (!ondisk) + sbuf_printf(sb, "%s", prefix); + sbuf_printf(sb, "plex name %s org %s ", p->name, + gv_plexorg(p->org)); + if (gv_is_striped(p)) + sbuf_printf(sb, "%ds ", p->stripesize / 512); + if (p->vol_sc != NULL) + sbuf_printf(sb, "vol %s", p->volume); + if (ondisk) + sbuf_printf(sb, " state %s", gv_plexstate(p->state)); + sbuf_printf(sb, "\n"); + } + + LIST_FOREACH(s, &sc->subdisks, sd) { + if (!ondisk) + sbuf_printf(sb, "%s", prefix); + sbuf_printf(sb, "sd name %s drive %s len %jds driveoffset " + "%jds", s->name, s->drive, s->size / 512, + s->drive_offset / 512); + if (s->plex_sc != NULL) { + sbuf_printf(sb, " plex %s plexoffset %jds", s->plex, + s->plex_offset / 512); + } + if (ondisk) + sbuf_printf(sb, " state %s", gv_sdstate(s->state)); + sbuf_printf(sb, "\n"); + } + + return; +} + +/* + * Take a size in bytes and return a pointer to a string which represents the + * size best. If lj is != 0, return left justified, otherwise in a fixed 10 + * character field suitable for columnar printing. + * + * Note this uses a static string: it's only intended to be used immediately + * for printing. + */ +const char * +gv_roughlength(off_t bytes, int lj) +{ + static char desc[16]; + + /* Gigabytes. 
*/ + if (bytes > (off_t)MEGABYTE * 10000) + snprintf(desc, sizeof(desc), lj ? "%jd GB" : "%10jd GB", + bytes / GIGABYTE); + + /* Megabytes. */ + else if (bytes > KILOBYTE * 10000) + snprintf(desc, sizeof(desc), lj ? "%jd MB" : "%10jd MB", + bytes / MEGABYTE); + + /* Kilobytes. */ + else if (bytes > 10000) + snprintf(desc, sizeof(desc), lj ? "%jd kB" : "%10jd kB", + bytes / KILOBYTE); + + /* Bytes. */ + else + snprintf(desc, sizeof(desc), lj ? "%jd B" : "%10jd B", bytes); + + return (desc); +} + +int +gv_sd_to_plex(struct gv_plex *p, struct gv_sd *s, int check) +{ + struct gv_sd *s2; + + g_topology_assert(); + + /* If this subdisk was already given to this plex, do nothing. */ + if (s->plex_sc == p) + return (0); + + /* Find the correct plex offset for this subdisk, if needed. */ + if (s->plex_offset == -1) { + if (p->sdcount) { + LIST_FOREACH(s2, &p->subdisks, in_plex) { + if (gv_is_striped(p)) + s->plex_offset = p->sdcount * + p->stripesize; + else + s->plex_offset = s2->plex_offset + + s2->size; + } + } else + s->plex_offset = 0; + } + + p->sdcount++; + + /* Adjust the size of our plex. */ + switch (p->org) { + case GV_PLEX_CONCAT: + case GV_PLEX_STRIPED: + p->size += s->size; + break; + + case GV_PLEX_RAID5: + p->size = (p->sdcount - 1) * s->size; + break; + + default: + break; + } + + /* There are no subdisks for this plex yet, just insert it. */ + if (LIST_EMPTY(&p->subdisks)) { + LIST_INSERT_HEAD(&p->subdisks, s, in_plex); + + /* Insert in correct order, depending on plex_offset. 
*/ + } else { + LIST_FOREACH(s2, &p->subdisks, in_plex) { + if (s->plex_offset < s2->plex_offset) { + LIST_INSERT_BEFORE(s2, s, in_plex); + break; + } else if (LIST_NEXT(s2, in_plex) == NULL) { + LIST_INSERT_AFTER(s2, s, in_plex); + break; + } + } + } + + s->plex_sc = p; + + return (0); +} + +void +gv_update_plex_config(struct gv_plex *p) +{ + struct gv_sd *s, *s2; + off_t remainder; + int required_sds, state; + + KASSERT(p != NULL, ("gv_update_plex_config: NULL p")); + + /* This is what we want the plex to be. */ + state = GV_PLEX_UP; + + /* The plex was added to an already running volume. */ + if (p->flags & GV_PLEX_ADDED) + state = GV_PLEX_DOWN; + + switch (p->org) { + case GV_PLEX_STRIPED: + required_sds = 2; + break; + case GV_PLEX_RAID5: + required_sds = 3; + break; + case GV_PLEX_CONCAT: + default: + required_sds = 0; + break; + } + + if (required_sds) { + if (p->sdcount < required_sds) { + state = GV_PLEX_DOWN; + } + + /* + * The subdisks in striped plexes must all have the same size. + */ + s = LIST_FIRST(&p->subdisks); + LIST_FOREACH(s2, &p->subdisks, in_plex) { + if (s->size != s2->size) { + printf("geom_vinum: subdisk size mismatch " + "%s (%jd) <> %s (%jd)\n", s->name, s->size, + s2->name, s2->size); + state = GV_PLEX_DOWN; + } + } + + /* Trim subdisk sizes so that they match the stripe size. */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + remainder = s->size % p->stripesize; + if (remainder) { + printf("gvinum: size of sd %s is not a " + "multiple of plex stripesize, taking off " + "%jd bytes\n", s->name, + (intmax_t)remainder); + gv_adjust_freespace(s, remainder); + } + } + } + + /* Adjust the size of our plex. 
*/ + if (p->sdcount > 0) { + p->size = 0; + switch (p->org) { + case GV_PLEX_CONCAT: + LIST_FOREACH(s, &p->subdisks, in_plex) + p->size += s->size; + break; + + case GV_PLEX_STRIPED: + s = LIST_FIRST(&p->subdisks); + p->size = p->sdcount * s->size; + break; + + case GV_PLEX_RAID5: + s = LIST_FIRST(&p->subdisks); + p->size = (p->sdcount - 1) * s->size; + break; + + default: + break; + } + } + + if (p->sdcount == 0) + state = GV_PLEX_DOWN; + else if ((p->flags & GV_PLEX_ADDED) || (p->org == GV_PLEX_RAID5)) { + LIST_FOREACH(s, &p->subdisks, in_plex) + s->state = GV_SD_STALE; + p->flags &= ~GV_PLEX_ADDED; + p->state = GV_PLEX_DOWN; + } +} + +/* + * Give a subdisk to a drive, check and adjust several parameters, adjust + * freelist. + */ +int +gv_sd_to_drive(struct gv_softc *sc, struct gv_drive *d, struct gv_sd *s, + char *errstr, int errlen) +{ + struct gv_sd *s2; + struct gv_freelist *fl, *fl2; + off_t tmp; + int i; + + g_topology_assert(); + + fl2 = NULL; + + KASSERT(sc != NULL, ("gv_sd_to_drive: NULL softc")); + KASSERT(d != NULL, ("gv_sd_to_drive: NULL drive")); + KASSERT(s != NULL, ("gv_sd_to_drive: NULL subdisk")); + KASSERT(errstr != NULL, ("gv_sd_to_drive: NULL errstr")); + KASSERT(errlen >= ERRBUFSIZ, ("gv_sd_to_drive: short errlen", errlen)); + + /* Check if this subdisk was already given to this drive. */ + if (s->drive_sc == d) + return (0); + + /* Preliminary checks. */ + if (s->size > d->avail || d->freelist_entries == 0) { + snprintf(errstr, errlen, "not enough space on '%s' for '%s'", + d->name, s->name); + return (-1); + } + + /* No size given, autosize it. */ + if (s->size == -1) { + /* Find the largest available slot. */ + LIST_FOREACH(fl, &d->freelist, freelist) { + if (fl->size > s->size) { + s->size = fl->size; + s->drive_offset = fl->offset; + fl2 = fl; + } + } + + /* No good slot found? 
*/ + if (s->size == -1) { + snprintf(errstr, errlen, "couldn't autosize '%s' on " + "'%s'", s->name, d->name); + return (-1); + } + + /* + * Check if we have a free slot that's large enough for the given size. + */ + } else { + i = 0; + LIST_FOREACH(fl, &d->freelist, freelist) { + /* Yes, this subdisk fits. */ + if (fl->size >= s->size) { + i++; + /* Override drive_offset, if given. */ + s->drive_offset = fl->offset; + fl2 = fl; + break; + } + } + + /* Couldn't find a good free slot. */ + if (i == 0) { + snprintf(errstr, errlen, "free slots to small for '%s' " + "on '%s'", s->name, d->name); + return (-1); + } + } + + /* No drive offset given, try to calculate it. */ + if (s->drive_offset == -1) { + + /* Add offsets and sizes from other subdisks on this drive. */ + LIST_FOREACH(s2, &d->subdisks, from_drive) { + s->drive_offset = s2->drive_offset + s2->size; + } + + /* + * If there are no other subdisks yet, then set the default + * offset to GV_DATA_START. + */ + if (s->drive_offset == 0) + s->drive_offset = GV_DATA_START; + + /* Check if we have a free slot at the given drive offset. */ + } else { + i = 0; + LIST_FOREACH(fl, &d->freelist, freelist) { + /* Yes, this subdisk fits. */ + if ((fl->offset <= s->drive_offset) && + (fl->offset + fl->size >= + s->drive_offset + s->size)) { + i++; + fl2 = fl; + break; + } + } + + /* Couldn't find a good free slot. */ + if (i == 0) { + snprintf(errstr, errlen, "given drive_offset for '%s' " + "won't fit on '%s'", s->name, d->name); + return (-1); + } + } + + /* + * Now that all parameters are checked and set up, we can give the + * subdisk to the drive and adjust the freelist. + */ + + /* First, adjust the freelist. */ + LIST_FOREACH(fl, &d->freelist, freelist) { + + /* This is the free slot that we have found before. */ + if (fl == fl2) { + + /* + * The subdisk starts at the beginning of the free + * slot. 
+ */ + if (fl->offset == s->drive_offset) { + fl->offset += s->size; + fl->size -= s->size; + + /* + * The subdisk uses the whole slot, so remove + * it. + */ + if (fl->size == 0) { + d->freelist_entries--; + LIST_REMOVE(fl, freelist); + } + /* + * The subdisk does not start at the beginning of the + * free slot. + */ + } else { + tmp = fl->offset + fl->size; + fl->size = s->drive_offset - fl->offset; + + /* + * The subdisk didn't use the complete rest of + * the free slot, so we need to split it. + */ + if (s->drive_offset + s->size != tmp) { + fl2 = g_malloc(sizeof(*fl2), + M_WAITOK | M_ZERO); + fl2->offset = s->drive_offset + s->size; + fl2->size = tmp - fl2->offset; + LIST_INSERT_AFTER(fl, fl2, freelist); + d->freelist_entries++; + } + } + break; + } + } + + /* + * This is the first subdisk on this drive, just insert it into the + * list. + */ + if (LIST_EMPTY(&d->subdisks)) { + LIST_INSERT_HEAD(&d->subdisks, s, from_drive); + + /* There are other subdisks, so insert this one in correct order. */ + } else { + LIST_FOREACH(s2, &d->subdisks, from_drive) { + if (s->drive_offset < s2->drive_offset) { + LIST_INSERT_BEFORE(s2, s, from_drive); + break; + } else if (LIST_NEXT(s2, from_drive) == NULL) { + LIST_INSERT_AFTER(s2, s, from_drive); + break; + } + } + } + + d->sdcount++; + d->avail -= s->size; + + /* Link back from the subdisk to this drive. */ + s->drive_sc = d; + + return (0); +} +

/*
 * Shrink subdisk 's' by 'remainder' bytes and give those bytes back to the
 * freelist of the drive the subdisk is linked to (s->drive_sc).
 *
 * The bytes are taken from the end of the subdisk.  When a free slot already
 * starts exactly where the subdisk currently ends, that slot is extended
 * downwards.  Otherwise a new slot is allocated and linked in front of the
 * first existing slot with a larger offset (or at the tail of the list).
 */
void
gv_adjust_freespace(struct gv_sd *s, off_t remainder)
{
	struct gv_drive *drv;
	struct gv_freelist *slot, *cur;
	off_t sd_end;

	KASSERT(s != NULL, ("gv_adjust_freespace: NULL s"));
	drv = s->drive_sc;
	KASSERT(drv != NULL, ("gv_adjust_freespace: NULL d"));

	/* First byte behind the subdisk, before it is shrunk. */
	sd_end = s->drive_offset + s->size;

	/*
	 * Search for a free slot directly behind the subdisk.  LIST_FOREACH
	 * leaves 'slot' NULL when the list is exhausted without a match.
	 */
	LIST_FOREACH(slot, &drv->freelist, freelist) {
		if (slot->offset == sd_end)
			break;
	}

	if (slot != NULL) {
		/* Grow the adjacent slot so it also covers the freed tail. */
		slot->offset -= remainder;
		slot->size += remainder;
	} else {
		/* Nothing adjacent: describe the freed tail by a new slot. */
		slot = g_malloc(sizeof(*slot), M_WAITOK | M_ZERO);
		slot->size = remainder;
		slot->offset = sd_end - remainder;

		if (drv->freelist_entries == 0) {
			LIST_INSERT_HEAD(&drv->freelist, slot, freelist);
		} else {
			LIST_FOREACH(cur, &drv->freelist, freelist) {
				if (slot->offset < cur->offset) {
					LIST_INSERT_BEFORE(cur, slot, freelist);
					break;
				}
				if (LIST_NEXT(cur, freelist) == NULL) {
					LIST_INSERT_AFTER(cur, slot, freelist);
					break;
				}
			}
		}

		drv->freelist_entries++;
	}

	s->size -= remainder;
	drv->avail += remainder;
}

/*
 * Return 1 if the organisation of plex 'p' is GV_PLEX_STRIPED or
 * GV_PLEX_RAID5, and 0 for every other organisation.
 */
int
gv_is_striped(struct gv_plex *p)
{
	KASSERT(p != NULL, ("gv_is_striped: NULL p"));

	if (p->org == GV_PLEX_STRIPED || p->org == GV_PLEX_RAID5)
		return (1);
	return (0);
}

/*
 * Look up a volume by name in the configuration 'sc'.  At most GV_MAXVOLNAME
 * characters are compared.  Returns NULL when no volume matches.
 */
struct gv_volume *
gv_find_vol(struct gv_softc *sc, char *name)
{
	struct gv_volume *vol;

	/* The cursor is NULL after the loop unless we broke out on a hit. */
	LIST_FOREACH(vol, &sc->volumes, volume) {
		if (strncmp(vol->name, name, GV_MAXVOLNAME) == 0)
			break;
	}

	return (vol);
}

/*
 * Look up a plex by name in the configuration 'sc'.  At most GV_MAXPLEXNAME
 * characters are compared.  Returns NULL when no plex matches.
 */
struct gv_plex *
gv_find_plex(struct gv_softc *sc, char *name)
{
	struct gv_plex *plex;

	LIST_FOREACH(plex, &sc->plexes, plex) {
		if (strncmp(plex->name, name, GV_MAXPLEXNAME) == 0)
			break;
	}

	return (plex);
}

/*
 * Look up a subdisk by name in the configuration 'sc'.  At most GV_MAXSDNAME
 * characters are compared.  Returns NULL when no subdisk matches.
 */
struct gv_sd *
gv_find_sd(struct gv_softc *sc, char *name)
{
	struct gv_sd *sd;

	LIST_FOREACH(sd, &sc->subdisks, sd) {
		if (strncmp(sd->name, name, GV_MAXSDNAME) == 0)
			break;
	}

	return (sd);
}

/*
 * Look up a drive by name in the configuration 'sc'.  At most
 * GV_MAXDRIVENAME characters are compared.  Returns NULL when no drive
 * matches.
 */
struct gv_drive *
gv_find_drive(struct gv_softc *sc, char *name)
{
	struct gv_drive *drv;

	LIST_FOREACH(drv, &sc->drives, drive) {
		if (strncmp(drv->name, name, GV_MAXDRIVENAME) == 0)
			break;
	}

	return (drv);
}

/*
 * Return 1 if at least one consumer of geom 'gp' has a non-zero read, write
 * or exclusive access count, and 0 otherwise.
 */
int
gv_is_open(struct g_geom *gp)
{
	struct g_consumer *cons;
	int is_open;

	KASSERT(gp != NULL, ("gv_is_open: NULL gp"));

	is_open = 0;
	LIST_FOREACH(cons, &gp->consumer, consumer) {
		if (cons->acr != 0 || cons->acw != 0 || cons->ace != 0) {
			is_open = 1;
			break;
		}
	}

	return (is_open);
}

/*
 * Classify the object called 'name'.  Volumes, plexes, subdisks and drives
 * are searched in that order, so the first kind that knows the name wins.
 * Returns the matching GV_TYPE_* constant, or -1 if nothing has that name.
 */
int
gv_object_type(struct gv_softc *sc, char *name)
{
	if (gv_find_vol(sc, name) != NULL)
		return (GV_TYPE_VOL);
	if (gv_find_plex(sc, name) != NULL)
		return (GV_TYPE_PLEX);
	if (gv_find_sd(sc, name) != NULL)
		return (GV_TYPE_SD);
	if (gv_find_drive(sc, name) != NULL)
		return (GV_TYPE_DRIVE);

	return (-1);
}

/*
 * Ask the worker thread of RAID5 plex 'p' to exit and wait until it has
 * flagged itself as dead.  Does nothing unless the plex is RAID5 and has
 * GV_PLEX_THREAD_ACTIVE set.
 */
void
gv_kill_thread(struct gv_plex *p)
{
	if (p->org != GV_PLEX_RAID5 || (p->flags & GV_PLEX_THREAD_ACTIVE) == 0)
		return;

	p->flags |= GV_PLEX_THREAD_DIE;
	wakeup(p);

	/* Re-check the flag each time the sleep on 'p' returns. */
	while ((p->flags & GV_PLEX_THREAD_DEAD) == 0)
		tsleep(p, PRIBIO, "gv_die", hz);

	p->flags &= ~GV_PLEX_THREAD_ACTIVE;
}
diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h new file mode 100644 index 000000000000..4c38923d7794 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_var.h @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2004 Lukas Ertl + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * Parts written by Greg Lehey. + * + * This software is distributed under the so-called ``Berkeley + * License'': * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $FreeBSD$ + */

#ifndef _GEOM_VINUM_VAR_H_
#define _GEOM_VINUM_VAR_H_

/*
 * Slice header
 *
 * Vinum drives start with this structure:
 *
 *                                              Sector
 * |--------------------------------------|
 * |   PDP-11 memorial boot block         |      0
 * |--------------------------------------|
 * |   Disk label, maybe                  |      1
 * |--------------------------------------|
 * |   Slice definition  (vinum_hdr)      |      8
 * |--------------------------------------|
 * |                                      |
 * |   Configuration info, first copy     |      9
 * |                                      |
 * |--------------------------------------|
 * |                                      |
 * |   Configuration info, second copy    |      9 + size of config
 * |                                      |
 * |--------------------------------------|
 */

/*
 * Sizes and offsets of our information.
 *
 * These are plain numbers; assuming they are byte counts and the sectors in
 * the diagram above are 512 bytes, 4096 and 4608 are sectors 8 and 9.
 */
#define	GV_HDR_OFFSET	4096	/* Offset of vinum header. */
#define	GV_HDR_LEN	512	/* Size of vinum header. */
#define	GV_CFG_OFFSET	4608	/* Offset of first config copy. */
#define	GV_CFG_LEN	65536	/* Size of config copy. */

/*
 * This is where the actual data starts: behind the two config copies that
 * follow GV_CFG_OFFSET.
 */
#define	GV_DATA_START	(GV_CFG_LEN * 2 + GV_CFG_OFFSET)
/* #define GV_DATA_START	(GV_CFG_LEN * 2 + GV_HDR_LEN) */

/* Sizes of the fixed-length name buffers embedded in the structs below. */
#define	GV_MAXDRIVENAME	32	/* Maximum length of a device name. */
#define	GV_MAXSDNAME	64	/* Maximum length of a subdisk name. */
#define	GV_MAXPLEXNAME	64	/* Maximum length of a plex name. */
#define	GV_MAXVOLNAME	64	/* Maximum length of a volume name. */

/* Command line flags (distinct bits, so they can be OR'ed together). */
#define	GV_FLAG_R	0x01
#define	GV_FLAG_S	0x02
#define	GV_FLAG_V	0x04
#define	GV_FLAG_VV	0x08
#define	GV_FLAG_F	0x10

/* Object types. */
#define	GV_TYPE_VOL	1
#define	GV_TYPE_PLEX	2
#define	GV_TYPE_SD	3
#define	GV_TYPE_DRIVE	4

/* State changing flags. */
#define	GV_SETSTATE_FORCE	0x1
#define	GV_SETSTATE_CONFIG	0x2

/* Subdisk state bitmaps for plexes. */
#define	GV_SD_DOWNSTATE		0x01	/* Subdisk is down. */
#define	GV_SD_STALESTATE	0x02	/* Subdisk is stale. */
#define	GV_SD_INITSTATE		0x04	/* Subdisk is initializing. */
#define	GV_SD_UPSTATE		0x08	/* Subdisk is up. */

/* Synchronization/initialization request sizes. */
#define	GV_MIN_SYNCSIZE		512
#define	GV_MAX_SYNCSIZE		MAXPHYS
#define	GV_DFLT_SYNCSIZE	65536

/*
 * hostname is 256 bytes long, but we don't need to shlep multiple copies in
 * vinum.  We use the host name just to identify this system, and 32 bytes
 * should be ample for that purpose.
 */

#define	GV_HOSTNAME_LEN	32

/* Identification of a drive; embedded in struct gv_hdr below. */
struct gv_label {
	char	sysname[GV_HOSTNAME_LEN]; /* System name at creation time. */
	char	name[GV_MAXDRIVENAME];	/* Our name of the drive. */
	struct timeval	date_of_birth;	/* The time it was created ... */
	struct timeval	last_update;	/* ... and the time of last update. */
	off_t		drive_size;	/* Total size incl. headers. */
};

/* The 'header' of each valid vinum drive. */
struct gv_hdr {
	uint64_t	magic;		/* One of the two values below. */
#define GV_MAGIC	22322600044678729LL
#define GV_NOMAGIC	22322600044678990LL

	int		config_length;
	struct gv_label	label;
};

/* A single freelist entry of a drive. */
struct gv_freelist {
	off_t size;				/* Size of this free slot. */
	off_t offset;				/* Offset on the drive. */
	LIST_ENTRY(gv_freelist) freelist;	/* Link in gv_drive.freelist. */
};

/* This struct contains the main vinum config. */
struct gv_softc {
	/*struct mtx config_mtx; XXX not yet */

	/* Linked lists of all objects in our setup. */
	LIST_HEAD(,gv_drive)	drives;		/* All drives. */
	LIST_HEAD(,gv_plex)	plexes;		/* All plexes. */
	LIST_HEAD(,gv_sd)	subdisks;	/* All subdisks. */
	LIST_HEAD(,gv_volume)	volumes;	/* All volumes. */

	struct g_geom	*geom;			/* Pointer to our VINUM geom. */
};

/* softc for a drive. */
struct gv_drive {
	char	name[GV_MAXDRIVENAME];		/* The name of this drive. */
	char	device[GV_MAXDRIVENAME];	/* Associated device. */
	int	state;				/* The state of this drive. */
#define	GV_DRIVE_DOWN	0
#define	GV_DRIVE_UP	1

	off_t	size;				/* Size of this drive. */
	off_t	avail;				/* Available space. */
	int	sdcount;			/* Number of subdisks. */

	struct gv_hdr	*hdr;			/* The drive header. */

	int freelist_entries;			/* Count of freelist entries. */
	LIST_HEAD(,gv_freelist)	freelist;	/* List of freelist entries. */
	LIST_HEAD(,gv_sd)	subdisks;	/* Subdisks on this drive. */
	LIST_ENTRY(gv_drive)	drive;		/* Entry in the vinum config. */

	struct g_geom	*geom;			/* The geom of this drive. */
	struct gv_softc	*vinumconf;		/* Pointer to the vinum conf. */
};

/* softc for a subdisk. */
struct gv_sd {
	char	name[GV_MAXSDNAME];	/* The name of this subdisk. */
	off_t	size;			/* The size of this subdisk. */
	off_t	drive_offset;		/* Offset in the underlying drive. */
	off_t	plex_offset;		/* Offset in the associated plex. */
	int	state;			/* The state of this subdisk. */
#define	GV_SD_DOWN		0
#define	GV_SD_STALE		1
#define	GV_SD_INITIALIZING	2
#define	GV_SD_REVIVING		3
#define	GV_SD_UP		4

	off_t	initialized;		/* Count of initialized bytes. */

	int	init_size;		/* Initialization read/write size. */
	int	init_error;		/* Flag error on initialization. */

	int	flags;			/* Bitmask of the values below. */
#define	GV_SD_NEWBORN		0x01	/* Subdisk was just created. */
#define	GV_SD_INITCANCEL	0x02	/* Cancel initialization process. */

	char drive[GV_MAXDRIVENAME];	/* Name of underlying drive. */
	char plex[GV_MAXPLEXNAME];	/* Name of associated plex. */

	struct gv_drive	*drive_sc;	/* Pointer to underlying drive. */
	struct gv_plex	*plex_sc;	/* Pointer to associated plex. */

	struct g_provider *provider;	/* The provider this sd represents. */
	struct g_consumer *consumer;	/* Consumer attached to our provider. */

	LIST_ENTRY(gv_sd) from_drive;	/* Subdisk list of underlying drive. */
	LIST_ENTRY(gv_sd) in_plex;	/* Subdisk list of associated plex. */
	LIST_ENTRY(gv_sd) sd;		/* Entry in the vinum config. */

	struct gv_softc	*vinumconf;	/* Pointer to the vinum config. */
};

/* softc for a plex. */
struct gv_plex {
	char	name[GV_MAXPLEXNAME];	/* The name of the plex. */
	off_t	size;			/* The size of the plex. */
	int	state;			/* The plex state. */
#define	GV_PLEX_DOWN		0
#define	GV_PLEX_INITIALIZING	1
#define	GV_PLEX_DEGRADED	2
#define	GV_PLEX_UP		3

	/* Note that the organisation values skip 3. */
	int	org;			/* The plex organisation. */
#define	GV_PLEX_DISORG	0
#define	GV_PLEX_CONCAT	1
#define	GV_PLEX_STRIPED	2
#define	GV_PLEX_RAID5	4

	int	stripesize;		/* The stripe size of the plex. */

	char	volume[GV_MAXVOLNAME];	/* Name of associated volume. */
	struct gv_volume *vol_sc;	/* Pointer to associated volume. */

	int	sdcount;		/* Number of subdisks in this plex. */
	int	sddown;			/* Number of subdisks that are down. */
	int	flags;			/* Bitmask of the values below. */
#define	GV_PLEX_ADDED		0x01	/* Added to an existing volume. */
#define	GV_PLEX_SYNCING		0x02	/* Plex is syncing from another plex. */
#define	GV_PLEX_THREAD_ACTIVE	0x04	/* Plex has an active RAID5 thread. */
#define	GV_PLEX_THREAD_DIE	0x08	/* Signal the RAID5 thread to die. */
#define	GV_PLEX_THREAD_DEAD	0x10	/* The RAID5 thread has died. */
#define	GV_PLEX_NEWBORN		0x20	/* The plex was just created. */

	off_t	synced;			/* Count of synced bytes. */

	struct mtx	worklist_mtx;	/* Mutex for RAID5 worklist. */
	TAILQ_HEAD(,gv_raid5_packet)	worklist; /* List of RAID5 work packets. */

	LIST_HEAD(,gv_sd)   subdisks;	/* List of attached subdisks. */
	LIST_ENTRY(gv_plex) in_volume;	/* Plex list of associated volume. */
	LIST_ENTRY(gv_plex) plex;	/* Entry in the vinum config. */

	struct g_provider *provider;	/* The provider this plex represents. */
	struct g_consumer *consumer;	/* Consumer attached to our provider. */

	struct g_geom	*geom;		/* The geom of this plex. */
	struct gv_softc	*vinumconf;	/* Pointer to the vinum config. */
};

/* softc for a volume. */
struct gv_volume {
	char	name[GV_MAXVOLNAME];	/* The name of the volume. */
	off_t	size;			/* The size of the volume. */
	int	plexcount;		/* Number of plexes. */
	int	state;			/* The state of the volume. */
#define	GV_VOL_DOWN	0
#define	GV_VOL_UP	1

	LIST_HEAD(,gv_plex)   plexes;	/* List of attached plexes. */
	LIST_ENTRY(gv_volume) volume;	/* Entry in vinum config. */

	struct g_geom	*geom;		/* The geom of this volume. */
	struct gv_softc	*vinumconf;	/* Pointer to the vinum config. */
};

#endif /* !_GEOM_VINUM_VAR_H_ */
diff --git a/sys/geom/vinum/geom_vinum_volume.c b/sys/geom/vinum/geom_vinum_volume.c new file mode 100644 index 000000000000..c916af462e17 --- /dev/null +++ b/sys/geom/vinum/geom_vinum_volume.c @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum.h> + +static void +gv_volume_orphan(struct g_consumer *cp) +{ + struct g_geom *gp; + int error; + + g_topology_assert(); + gp = cp->geom; + g_trace(G_T_TOPOLOGY, "gv_volume_orphan(%s)", gp->name); + if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0) + g_access(cp, -cp->acr, -cp->acw, -cp->ace); + error = cp->provider->error; + if (error == 0) + error = ENXIO; + g_detach(cp); + g_destroy_consumer(cp); + if (!LIST_EMPTY(&gp->consumer)) + return; + g_free(gp->softc); + g_wither_geom(gp, error); +} + +/* We end up here after the requests to our plexes are done. */ +static void +gv_volume_done(struct bio *bp) +{ + struct g_consumer *cp; + + /* The next plex in this volume. */ + cp = LIST_NEXT(bp->bio_from, consumer); + + switch (bp->bio_cmd) { + case BIO_READ: + /* + * If no error occured on this request, or if we have no plex + * left, finish here... + */ + if ((bp->bio_error == 0) || (cp == NULL)) { + g_std_done(bp); + return; + } + + /* ... or try to read from the next plex. */ + g_io_request(bp, cp); + return; + + case BIO_WRITE: + case BIO_DELETE: + /* No more plexes left. */ + if (cp == NULL) { + /* + * Clear any errors if one of the previous writes + * succeeded. + */ + if (bp->bio_caller1 == (int *)1) + bp->bio_error = 0; + g_std_done(bp); + return; + } + + /* If this write request had no errors, remember that fact... */ + if (bp->bio_error == 0) + bp->bio_caller1 = (int *)1; + + /* ... and write to the next plex. 
*/ + g_io_request(bp, cp); + return; + } +} + +static void +gv_volume_start(struct bio *bp) +{ + struct g_geom *gp; + struct bio *bp2; + struct gv_volume *v; + + gp = bp->bio_to->geom; + v = gp->softc; + if (v->state != GV_VOL_UP) { + g_io_deliver(bp, ENXIO); + return; + } + switch(bp->bio_cmd) { + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + bp2 = g_clone_bio(bp); + if (bp2 == NULL) { + g_io_deliver(bp, ENOMEM); + return; + } + bp2->bio_done = gv_volume_done; + g_io_request(bp2, LIST_FIRST(&gp->consumer)); + return; + default: + g_io_deliver(bp, EOPNOTSUPP); + return; + } +} + +static int +gv_volume_access(struct g_provider *pp, int dr, int dw, int de) +{ + struct g_geom *gp; + struct g_consumer *cp, *cp2; + int error; + + gp = pp->geom; + + error = ENXIO; + LIST_FOREACH(cp, &gp->consumer, consumer) { + error = g_access(cp, dr, dw, de); + if (error) { + LIST_FOREACH(cp2, &gp->consumer, consumer) { + if (cp == cp2) + break; + g_access(cp2, -dr, -dw, -de); + } + return (error); + } + } + return (error); +} + +static struct g_geom * +gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) +{ + struct g_geom *gp; + struct g_provider *pp2; + struct g_consumer *cp; + struct gv_softc *sc; + struct gv_volume *v; + struct gv_plex *p; + int first; + + g_trace(G_T_TOPOLOGY, "gv_volume_taste(%s, %s)", mp->name, pp->name); + g_topology_assert(); + + /* First, find the VINUM class and its associated geom. */ + gp = find_vinum_geom(); + if (gp == NULL) + return (NULL); + + sc = gp->softc; + KASSERT(sc != NULL, ("gv_volume_taste: NULL sc")); + + gp = pp->geom; + + /* We only want to attach to plexes. 
*/ + if (strcmp(gp->class->name, "VINUMPLEX")) + return (NULL); + + first = 0; + p = gp->softc; + v = gv_find_vol(sc, p->volume); + if (v == NULL) + return (NULL); + if (v->geom == NULL) { + gp = g_new_geomf(mp, "%s", p->volume); + gp->start = gv_volume_start; + gp->orphan = gv_volume_orphan; + gp->access = gv_volume_access; + gp->softc = v; + first++; + } else + gp = v->geom; + + cp = g_new_consumer(gp); + g_attach(cp, pp); + p->consumer = cp; + + if (p->vol_sc != v) { + p->vol_sc = v; + v->plexcount++; + LIST_INSERT_HEAD(&v->plexes, p, in_volume); + } + + /* We need to setup a new VINUMVOLUME geom. */ + if (first) { + pp2 = g_new_providerf(gp, "gvinum/%s", v->name); + pp2->mediasize = pp->mediasize; + pp2->sectorsize = pp->sectorsize; + g_error_provider(pp2, 0); + v->size = pp2->mediasize; + v->geom = gp; + return (gp); + } + + return (NULL); +} + +static int +gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp) +{ + g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name); + g_topology_assert(); +/* + if (gp->softc != NULL) + g_free(gp->softc); + gp->softc = NULL; +*/ + g_wither_geom(gp, ENXIO); + return (0); +} + +#define VINUMVOLUME_CLASS_NAME "VINUMVOLUME" + +static struct g_class g_vinum_volume_class = { + .name = VINUMVOLUME_CLASS_NAME, + .taste = gv_volume_taste, + .destroy_geom = gv_volume_destroy_geom, +}; + +DECLARE_GEOM_CLASS(g_vinum_volume_class, g_vinum_volume); |