diff -Naur linux-2.6.0/drivers/md/md.c linux-2.6.0-evms-2.2.2/drivers/md/md.c --- linux-2.6.0/drivers/md/md.c 2004-01-05 09:20:10.000000000 -0600 +++ linux-2.6.0-evms-2.2.2/drivers/md/md.c 2004-01-05 09:18:33.000000000 -0600 @@ -471,6 +471,29 @@ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); }; +static int get_rdev_sbnr(mdp_super_t *sb, mdk_rdev_t * rdev) +{ + mdp_disk_t *desc; + dev_t dev; + int i; + + if (sb->level != LEVEL_MULTIPATH) + return sb->this_disk.number; + + for (i=0; idisks[i]; + dev = MKDEV(desc->major,desc->minor); + if (!dev) + continue; + if (dev != rdev->bdev->bd_dev) + continue; + return i; + } + + MD_BUG(); + return -1; +} + /* * load_super for 0.90.0 */ @@ -528,11 +551,7 @@ rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; - - if (sb->level == MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; + rdev->desc_nr = get_rdev_sbnr(sb,rdev); if (refdev == 0) ret = 1; @@ -608,7 +627,7 @@ if (ev1 < mddev->events) return -EINVAL; } - if (mddev->level != LEVEL_MULTIPATH) { + if (rdev->desc_nr != -1) { rdev->raid_disk = -1; rdev->in_sync = rdev->faulty = 0; desc = sb->disks + rdev->desc_nr; @@ -685,6 +704,8 @@ sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_size; + if (sb->level != LEVEL_MULTIPATH) { + sb->disks[0].state = (1<in_sync && !rdev2->faulty) { + rdev2->desc_nr = next; + rdev2->raid_disk = next; + d = &sb->disks[next]; + d->number = next; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + d->raid_disk = next; + d->state = (1<in_sync && !rdev2->faulty) { + rdev2->desc_nr = next; + d = &sb->disks[next]; + d->number = next; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + d->raid_disk = next; + d->state = 0; + spare++; + working++; + nr_disks++; + next++; + } + } + ITERATE_RDEV(mddev,rdev2,tmp) { + if (rdev2->faulty) { + rdev2->desc_nr = next; + d = &sb->disks[next]; + d->number = next; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + d->raid_disk = next; + d->state = (1<raid_disks = active; + } + sb->nr_disks = nr_disks; sb->active_disks = active; sb->working_disks = working; @@ -1415,11 +1489,6 @@ kick_rdev_from_array(rdev); continue; } - if (mddev->level == LEVEL_MULTIPATH) { - rdev->desc_nr = i++; - rdev->raid_disk = rdev->desc_nr; - rdev->in_sync = 1; - } } @@ -3351,7 +3420,7 @@ struct list_head *rtmp; - dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + dprintk(KERN_INFO "md_check_recovery: thread got woken up ...\n"); if (mddev->ro) return; diff -Naur linux-2.6.0/drivers/md/multipath.c linux-2.6.0-evms-2.2.2/drivers/md/multipath.c --- linux-2.6.0/drivers/md/multipath.c 2004-01-05 09:20:10.000000000 -0600 +++ linux-2.6.0-evms-2.2.2/drivers/md/multipath.c 2004-01-05 09:18:33.000000000 -0600 @@ -30,8 +30,6 @@ #define MD_DRIVER #define MD_PERSONALITY -#define MAX_WORK_PER_DISK 128 - #define NR_RESERVED_BUFS 32 @@ -57,7 +55,8 @@ static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp) { multipath_conf_t *conf = mddev_to_conf(mddev); - int i, disks = conf->raid_disks; + struct multipath_info *p; + int i; /* * Later we do read balancing on the read side @@ -65,11 +64,11 @@ */ spin_lock_irq(&conf->device_lock); - for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = conf->multipaths[i].rdev; - if (rdev && rdev->in_sync) { - *rdevp = rdev; - atomic_inc(&rdev->nr_pending); + for (i = 0; i <= conf->raid_disks; i++) { + p = &conf->multipaths[i]; + if (p && p->rdev && p->rdev->in_sync) { + *rdevp = p->rdev; + atomic_inc(&p->rdev->nr_pending); spin_unlock_irq(&conf->device_lock); return 0; } @@ -83,7 +82,6 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) { unsigned long flags; - mddev_t *mddev = mp_bh->mddev; spin_lock_irqsave(&retry_list_lock, flags); if (multipath_retry_list == NULL) @@ -92,7 +90,7 @@ multipath_retry_tail = &mp_bh->next_mp; mp_bh->next_mp = NULL; spin_unlock_irqrestore(&retry_list_lock, flags); - md_wakeup_thread(mddev->thread); + md_wakeup_thread(mp_bh->mddev->thread); } @@ -114,8 +112,7 @@ { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; + mdk_rdev_t *rdev = mp_bh->rdev; if (bio->bi_size) return 1; @@ -126,11 +123,7 @@ /* * oops, IO error: */ - char b[BDEVNAME_SIZE]; md_error (mp_bh->mddev, rdev); - printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_sector); multipath_reschedule_retry(mp_bh); } atomic_dec(&rdev->nr_pending); @@ -138,21 +131,43 @@ } /* - * This routine returns the disk from which the requested read should - * be done. + * Multipath read balance ... + * + * Returns: + * + * If no active paths + * + * - Error ( -1 ) + * + * If active paths == 1 + * + * - 1st active path encountered + * + * If active paths > 1 + * + * - 1st idle active path encountered + * - else ... the active path doing the least amount of work. */ static int multipath_read_balance (multipath_conf_t *conf) { - int disk; + int i, disk=-1, nr_pending, least_pending=0; + struct multipath_info *p; - for (disk = 0; disk < conf->raid_disks; disk++) { - mdk_rdev_t *rdev = conf->multipaths[disk].rdev; - if (rdev && rdev->in_sync) - return disk; + for (i = 0; i <= conf->raid_disks; i++) { + p = &conf->multipaths[i]; + if (p && p->rdev && p->rdev->in_sync) { + nr_pending = atomic_read(&p->rdev->nr_pending); + if (nr_pending == 0 || conf->working_disks == 1) + return i; + if (least_pending == 0 || + nr_pending < least_pending) { + disk = i; + least_pending = nr_pending; + } + } } - BUG(); - return 0; + return disk; } static int multipath_make_request (request_queue_t *q, struct bio * bio) @@ -161,6 +176,7 @@ multipath_conf_t *conf = mddev_to_conf(mddev); struct multipath_bh * mp_bh; struct multipath_info *multipath; + int disk_idx; mp_bh = mempool_alloc(conf->pool, GFP_NOIO); @@ -171,11 +187,18 @@ * read balancing logic: */ spin_lock_irq(&conf->device_lock); - mp_bh->path = multipath_read_balance(conf); - multipath = conf->multipaths + mp_bh->path; + disk_idx = multipath_read_balance(conf); + if (disk_idx == -1) { + spin_unlock_irq(&conf->device_lock); + printk (KERN_ERR "md_multipath_make_request: no IO paths.\n"); + multipath_end_bh_io (mp_bh, 0); + return 0; + } + multipath = &conf->multipaths[disk_idx]; atomic_inc(&multipath->rdev->nr_pending); spin_unlock_irq(&conf->device_lock); + mp_bh->rdev = multipath->rdev; mp_bh->bio = *bio; mp_bh->bio.bi_bdev = multipath->rdev->bdev; mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); @@ -206,39 +229,42 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) { multipath_conf_t *conf = mddev_to_conf(mddev); + struct multipath_info *p; + char b[BDEVNAME_SIZE]; + int i; - if (conf->working_disks <= 1) { - /* - * Uh oh, we can do nothing if this is our last path, but - * first check if this is a queued request for a device - * which has just failed. - */ - printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); - /* leave it active... it's all we have */ - } else { - /* - * Mark disk as unusable - */ - if (!rdev->faulty) { - char b[BDEVNAME_SIZE]; - rdev->in_sync = 0; - rdev->faulty = 1; - mddev->sb_dirty = 1; - conf->working_disks--; - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path. \n Operation continuing" - " on %d IO paths.\n", - bdevname (rdev->bdev,b), - conf->working_disks); + spin_lock_irq(&conf->device_lock); + if (!rdev->faulty) { + + printk(KERN_INFO "md_multipath_error: IO failure on %s\n", + bdevname(rdev->bdev,b)); + + /* mark disk bad */ + rdev->in_sync = 0; + rdev->faulty = 1; + mddev->sb_dirty = 1; + conf->working_disks--; + + /* activate a spare */ + for (i=0; i<=conf->raid_disks; i++) { + p = &conf->multipaths[i]; + if (p && p->rdev && + !p->rdev->faulty && !p->rdev->in_sync ) { + p->rdev->in_sync = 1; + p->rdev->raid_disk = i; + conf->working_disks++; + break; + } } } + spin_unlock_irq(&conf->device_lock); } static void print_multipath_conf (multipath_conf_t *conf) { int i; struct multipath_info *tmp; + char b[BDEVNAME_SIZE]; printk("MULTIPATH conf printout:\n"); if (!conf) { @@ -248,10 +274,9 @@ printk(" --- wd:%d rd:%d\n", conf->working_disks, conf->raid_disks); - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->multipaths + i; - if (tmp->rdev) + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = &conf->multipaths[i]; + if (tmp && tmp->rdev) printk(" disk%d, o:%d, dev:%s\n", i,!tmp->rdev->faulty, bdevname(tmp->rdev->bdev,b)); @@ -266,10 +291,12 @@ int path; struct multipath_info *p; + printk(KERN_INFO "md_multipath_add_disk: entry\n"); print_multipath_conf(conf); spin_lock_irq(&conf->device_lock); - for (path=0; pathraid_disks; path++) - if ((p=conf->multipaths+path)->rdev == NULL) { + for (path=0; found==0 && pathmultipaths[path]; + if (p->rdev == NULL) { p->rdev = rdev; blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); @@ -284,30 +311,41 @@ mddev->queue->max_sectors > (PAGE_SIZE>>9)) mddev->queue->max_sectors = (PAGE_SIZE>>9); - conf->working_disks++; rdev->raid_disk = path; - rdev->in_sync = 1; + rdev->in_sync = 0; + rdev->faulty = 0; + if (path > conf->raid_disks) + conf->raid_disks = path; found = 1; } + } spin_unlock_irq(&conf->device_lock); print_multipath_conf(conf); + printk(KERN_INFO "md_multipath_add_disk: done\n"); return found; } static int multipath_remove_disk(mddev_t *mddev, int number) { multipath_conf_t *conf = mddev->private; - int err = 1; - struct multipath_info *p = conf->multipaths + number; + int err = 1, i; + struct multipath_info *p=NULL, *mp=NULL; + printk(KERN_INFO "md_multipath_rm_disk: entry\n"); print_multipath_conf(conf); - spin_lock_irq(&conf->device_lock); - if (p->rdev) { + spin_lock_irq(&conf->device_lock); + for (i=0; imultipaths[i]; + if (mp && mp->rdev && mp->rdev->raid_disk==number) { + p = mp; + break; + } + } + if (p) { if (p->rdev->in_sync || atomic_read(&p->rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); err = -EBUSY; goto abort; } @@ -320,6 +358,7 @@ spin_unlock_irq(&conf->device_lock); print_multipath_conf(conf); + printk(KERN_INFO "md_multipath_rm_disk: done\n"); return err; } @@ -328,49 +367,50 @@ /* * This is a kernel thread which: * - * 1. Retries failed read operations on working multipaths. + * 1. Retries failed operations on working multipaths. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. */ - static void multipathd (mddev_t *mddev) { struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; mdk_rdev_t *rdev; + char b[BDEVNAME_SIZE]; md_check_recovery(mddev); for (;;) { - char b[BDEVNAME_SIZE]; spin_lock_irqsave(&retry_list_lock, flags); mp_bh = multipath_retry_list; - if (!mp_bh) + if (!mp_bh) { + spin_unlock_irqrestore(&retry_list_lock, flags); break; + } multipath_retry_list = mp_bh->next_mp; spin_unlock_irqrestore(&retry_list_lock, flags); - mddev = mp_bh->mddev; - bio = &mp_bh->bio; - bio->bi_sector = mp_bh->master_bio->bi_sector; - + mp_bh->bio = *mp_bh->master_bio; + mp_bh->bio.bi_end_io = multipath_end_request; + mp_bh->bio.bi_private = mp_bh; + bio = &mp_bh->bio; + rdev = NULL; - if (multipath_map (mddev, &rdev)<0) { - printk(KERN_ALERT "multipath: %s: unrecoverable IO read" + if (multipath_map (mddev, &rdev)==-1) { + printk(KERN_ERR "md_multipath_D: %s: unrecoverable IO " " error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_sector); multipath_end_bh_io(mp_bh, 0); } else { - printk(KERN_ERR "multipath: %s: redirecting sector %llu" + printk(KERN_INFO "md_multipath_D: %s: redirecting sector %llu" " to another IO path\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_sector); + mp_bh->rdev = rdev; bio->bi_bdev = rdev->bdev; generic_make_request(bio); } } - spin_unlock_irqrestore(&retry_list_lock, flags); } static int multipath_run (mddev_t *mddev) @@ -382,8 +422,8 @@ struct list_head *tmp; if (mddev->level != LEVEL_MULTIPATH) { - printk("multipath: md%d: raid level not set to multipath IO (%d)\n", - mdidx(mddev), mddev->level); + printk("md_multipath_run: raid level not set to multipath IO (%d)\n", + mddev->level); goto out; } /* @@ -396,31 +436,36 @@ mddev->private = conf; if (!conf) { printk(KERN_ERR - "multipath: couldn't allocate memory for md%d\n", + "md_multipath_run: couldn't allocate memory for md%d\n", mdidx(mddev)); goto out; } memset(conf, 0, sizeof(*conf)); - conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, + conf->multipaths = kmalloc(sizeof(struct multipath_info)*MD_SB_DISKS, GFP_KERNEL); if (!conf->multipaths) { printk(KERN_ERR - "multipath: couldn't allocate memory for md%d\n", + "md_multipath_run: couldn't allocate memory for md%d\n", mdidx(mddev)); goto out_free_conf; } - memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); + memset(conf->multipaths, 0, sizeof(struct multipath_info)*MD_SB_DISKS); conf->working_disks = 0; + conf->raid_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { disk_idx = rdev->raid_disk; - if (disk_idx < 0 || - disk_idx >= mddev->raid_disks) + if ( rdev->faulty || + !rdev->in_sync || + disk_idx==-1 ) continue; - disk = conf->multipaths + disk_idx; + disk = &conf->multipaths[disk_idx]; disk->rdev = rdev; + conf->working_disks++; + if (disk_idx > conf->raid_disks) + conf->raid_disks = disk_idx; blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); @@ -431,17 +476,14 @@ mddev->queue->max_sectors > (PAGE_SIZE>>9)) mddev->queue->max_sectors = (PAGE_SIZE>>9); - if (!rdev->faulty) - conf->working_disks++; } - conf->raid_disks = mddev->raid_disks; mddev->sb_dirty = 1; conf->mddev = mddev; conf->device_lock = SPIN_LOCK_UNLOCKED; if (!conf->working_disks) { - printk(KERN_ERR "multipath: no operational IO paths for md%d\n", + printk(KERN_ERR "md_multipath_run: no operational IO paths for md%d\n", mdidx(mddev)); goto out_free_conf; } @@ -452,7 +494,7 @@ NULL); if (conf->pool == NULL) { printk(KERN_ERR - "multipath: couldn't allocate memory for md%d\n", + "md_multipath_run: couldn't allocate memory for md%d\n", mdidx(mddev)); goto out_free_conf; } @@ -462,14 +504,16 @@ mddev->thread = md_register_thread(multipathd, mddev, name); if (!mddev->thread) { - printk(KERN_ERR "multipath: couldn't allocate thread" + printk(KERN_ERR "md_multipath_run: couldn't allocate thread" " for md%d\n", mdidx(mddev)); goto out_free_conf; } } + print_multipath_conf(conf); + printk(KERN_INFO - "multipath: array md%d active with %d out of %d IO paths\n", + "md_multipath_run: array md%d active with %d out of %d IO paths\n", mdidx(mddev), conf->working_disks, mddev->raid_disks); /* * Ok, everything is just fine now @@ -493,6 +537,10 @@ { multipath_conf_t *conf = mddev_to_conf(mddev); + printk(KERN_INFO + "md_multipath_stop: stopping array md%d active with %d out of %d IO paths\n", + mdidx(mddev), conf->working_disks, mddev->raid_disks); + md_unregister_thread(mddev->thread); mempool_destroy(conf->pool); kfree(conf->multipaths); diff -Naur linux-2.6.0/include/linux/raid/multipath.h linux-2.6.0-evms-2.2.2/include/linux/raid/multipath.h --- linux-2.6.0/include/linux/raid/multipath.h 2004-01-05 09:20:10.000000000 -0600 +++ linux-2.6.0-evms-2.2.2/include/linux/raid/multipath.h 2004-01-05 09:19:26.000000000 -0600 @@ -35,7 +35,7 @@ mddev_t *mddev; struct bio *master_bio; struct bio bio; - int path; + mdk_rdev_t *rdev; struct multipath_bh *next_mp; /* next for retry */ }; #endif