| /* |
| md.c : Multiple Devices driver for Linux |
| Copyright (C) 1998, 1999, 2000 Ingo Molnar |
| |
| completely rewritten, based on the MD driver code from Marc Zyngier |
| |
| Changes: |
| |
| - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar |
| - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> |
| - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> |
| - kerneld support by Boris Tobotras <boris@xtalk.msk.su> |
| - kmod support by: Cyrus Durgin |
| - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> |
| - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> |
| |
| - lots of fixes and improvements to the RAID1/RAID5 and generic |
| RAID code (such as request based resynchronization): |
| |
| Neil Brown <neilb@cse.unsw.edu.au>. |
| |
| - persistent bitmap code |
| Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| You should have received a copy of the GNU General Public License |
| (for example /usr/src/linux/COPYING); if not, write to the Free |
| Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| */ |
| |
| #include <linux/kthread.h> |
| #include <linux/blkdev.h> |
| #include <linux/sysctl.h> |
| #include <linux/seq_file.h> |
| #include <linux/mutex.h> |
| #include <linux/buffer_head.h> /* for invalidate_bdev */ |
| #include <linux/poll.h> |
| #include <linux/ctype.h> |
| #include <linux/string.h> |
| #include <linux/hdreg.h> |
| #include <linux/proc_fs.h> |
| #include <linux/random.h> |
| #include <linux/module.h> |
| #include <linux/reboot.h> |
| #include <linux/file.h> |
| #include <linux/compat.h> |
| #include <linux/delay.h> |
| #include <linux/raid/md_p.h> |
| #include <linux/raid/md_u.h> |
| #include <linux/slab.h> |
| #include "md.h" |
| #include "bitmap.h" |
| |
#ifndef MODULE
/* Forward declaration; only declared when MODULE is not defined. */
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

/* Forward declaration; called from the MD_BUG() macro. */
static void md_print_devices(void);

/* NOTE(review): presumably used to wait for resync progress; its users are
 * outside this part of the file.
 */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/* Workqueue on which mddev->flush_work is queued. */
static struct workqueue_struct *md_wq;
/* Workqueue on which mddev->del_work (deferred mddev deletion) is queued. */
static struct workqueue_struct *md_misc_wq;
| |
/*
 * Report an internal inconsistency: print the file and line of the call
 * site, then dump the device state via md_print_devices().  Arguments,
 * if any, are ignored.
 *
 * The body is wrapped in do { } while (0) so that "MD_BUG();" is exactly
 * one statement and can be used safely in an unbraced if/else.
 */
#define MD_BUG(x...) do { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } while (0)
| |
/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */

/* System-wide defaults; speed_min()/speed_max() fall back to these when
 * the array's own sync_speed_min/sync_speed_max is zero.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
| static inline int speed_min(struct mddev *mddev) |
| { |
| return mddev->sync_speed_min ? |
| mddev->sync_speed_min : sysctl_speed_limit_min; |
| } |
| |
| static inline int speed_max(struct mddev *mddev) |
| { |
| return mddev->sync_speed_max ? |
| mddev->sync_speed_max : sysctl_speed_limit_max; |
| } |
| |
/* Registration handle for the sysctl tree below; the registration call
 * itself is outside this part of the file.
 */
static struct ctl_table_header *raid_table_header;

/* Leaf entries: the two integer speed limits, handled by proc_dointvec,
 * readable by everyone and writable by the owner.
 */
static ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

/* "raid" directory holding the entries of raid_table. */
static ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

/* "dev" directory at the root of the tree, parent of "raid". */
static ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

/* Declaration only; the initialised definition is not visible in this
 * part of the file.
 */
static const struct block_device_operations md_fops;

/* NOTE(review): presumably makes arrays start read-only when non-zero;
 * its users are outside this part of the file.
 */
static int start_readonly;
| |
| /* bio_clone_mddev |
| * like bio_clone, but with a local bio set |
| */ |
| |
| static void mddev_bio_destructor(struct bio *bio) |
| { |
| struct mddev *mddev, **mddevp; |
| |
| mddevp = (void*)bio; |
| mddev = mddevp[-1]; |
| |
| bio_free(bio, mddev->bio_set); |
| } |
| |
| struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
| struct mddev *mddev) |
| { |
| struct bio *b; |
| struct mddev **mddevp; |
| |
| if (!mddev || !mddev->bio_set) |
| return bio_alloc(gfp_mask, nr_iovecs); |
| |
| b = bio_alloc_bioset(gfp_mask, nr_iovecs, |
| mddev->bio_set); |
| if (!b) |
| return NULL; |
| mddevp = (void*)b; |
| mddevp[-1] = mddev; |
| b->bi_destructor = mddev_bio_destructor; |
| return b; |
| } |
| EXPORT_SYMBOL_GPL(bio_alloc_mddev); |
| |
| struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, |
| struct mddev *mddev) |
| { |
| struct bio *b; |
| struct mddev **mddevp; |
| |
| if (!mddev || !mddev->bio_set) |
| return bio_clone(bio, gfp_mask); |
| |
| b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, |
| mddev->bio_set); |
| if (!b) |
| return NULL; |
| mddevp = (void*)b; |
| mddevp[-1] = mddev; |
| b->bi_destructor = mddev_bio_destructor; |
| __bio_clone(b, bio); |
| if (bio_integrity(bio)) { |
| int ret; |
| |
| ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); |
| |
| if (ret < 0) { |
| bio_put(b); |
| return NULL; |
| } |
| } |
| |
| return b; |
| } |
| EXPORT_SYMBOL_GPL(bio_clone_mddev); |
| |
/*
 * md_trim_bio - restrict a cloned bio to a sub-range of its data
 * @bio:    the cloned bio, modified in place
 * @offset: number of 512-byte sectors to drop from the front
 * @size:   number of 512-byte sectors to keep
 *
 * Nothing is changed when the requested range is the whole bio.
 */
void md_trim_bio(struct bio *bio, int offset, int size)
{
	/* 'bio' is a cloned bio which we need to trim to match
	 * the given offset and size.
	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
	 */
	int i;
	struct bio_vec *bvec;
	int sofar = 0;

	/* from here on 'size' is in bytes */
	size <<= 9;
	if (offset == 0 && size == bio->bi_size)
		return;

	bio->bi_sector += offset;
	bio->bi_size = size;
	/* from here on 'offset' is in bytes */
	offset <<= 9;
	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

	while (bio->bi_idx < bio->bi_vcnt &&
	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
		/* remove this whole bio_vec */
		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
		bio->bi_idx++;
	}
	/* the remaining offset falls inside this bio_vec: shorten its front */
	if (bio->bi_idx < bio->bi_vcnt) {
		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
	}
	/* avoid any complications with bi_idx being non-zero*/
	if (bio->bi_idx) {
		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
		bio->bi_vcnt -= bio->bi_idx;
		bio->bi_idx = 0;
	}
	/* Make sure vcnt and last bv are not too big */
	bio_for_each_segment(bvec, bio, i) {
		if (sofar + bvec->bv_len > size)
			bvec->bv_len = size - sofar;
		if (bvec->bv_len == 0) {
			/* everything from here on lies beyond 'size' */
			bio->bi_vcnt = i;
			break;
		}
		sofar += bvec->bv_len;
	}
}
EXPORT_SYMBOL_GPL(md_trim_bio);
| |
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
/* Bump the global event count and wake everyone waiting on
 * md_event_waiters.  @mddev is not used by this implementation.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
| |
/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 * It performs the same two steps as md_new_event(): increment the event
 * count and wake the waiters.  @mddev is not used.
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
| |
/*
 * List of all existing md arrays, so that they can be iterated over.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
| |
| |
/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop still owns
 * a reference to the current mddev and must mddev_put it.
 *
 * _mddev is the iteration variable (struct mddev *), _tmp a scratch
 * struct list_head * cursor.  The loop body runs with all_mddevs_lock
 * released; the reference on the previous mddev is dropped only after
 * the reference on the next one has been taken.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
| |
| |
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 *
 * The "refcount" is mddev->active_io; mddev_suspend() waits for it to
 * reach zero.
 */
static void md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	int cpu;
	unsigned int sectors;

	/* No array, no personality, or array not ready: fail the bio. */
	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		/* Sleep on sb_wait until ->suspended is cleared.  The RCU
		 * read-side section is dropped around schedule() and
		 * re-entered before ->suspended is tested again.
		 */
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	mddev->pers->make_request(mddev, bio);

	/* Account one I/O and its sector count against the whole disk. */
	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	/* Last in-flight request while suspending: wake the waiter on sb_wait. */
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
| |
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once ->stop is called and completes, the module will be completely
 * unused.
 *
 * Must not be called on an array that is already suspended (BUG_ON).
 */
void mddev_suspend(struct mddev *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	/* md_make_request() tests ->suspended inside an RCU read-side
	 * section; wait for all such sections that started before the
	 * flag was set.
	 */
	synchronize_rcu();
	/* wait for requests already inside ->make_request to leave it */
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);
| |
/* Undo mddev_suspend(): clear ->suspended, wake submitters sleeping on
 * sb_wait in md_make_request(), un-quiesce the personality and kick the
 * array's threads.
 */
void mddev_resume(struct mddev *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);
| |
/* Report congestion for the array: non-zero exactly while the array is
 * suspended.  @bits is not examined.
 */
int mddev_congested(struct mddev *mddev, int bits)
{
	return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
| |
/*
 * Generic flush handling for md
 */

/* Completion handler for the per-device flush bios sent by
 * submit_flushes().  Drops the pending reference on the member device
 * and, when this was the last outstanding flush, queues flush_work.
 * @err is not examined.
 */
static void md_end_flush(struct bio *bio, int err)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}
| |
static void md_submit_flush_data(struct work_struct *ws);

/* Work handler queued by md_flush_request(): send an empty WRITE_FLUSH
 * bio to every member device that has a raid_disk slot and is not
 * Faulty.
 *
 * flush_work is re-initialised to run md_submit_flush_data() next.
 * flush_pending starts at 1 so that it cannot reach zero while flushes
 * are still being submitted; whoever drops it to zero (here or in
 * md_end_flush()) queues flush_work.
 */
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			/* zero iovecs: the bio carries no data */
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	/* drop the initial count taken above */
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
| |
| static void md_submit_flush_data(struct work_struct *ws) |
| { |
| struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
| struct bio *bio = mddev->flush_bio; |
| |
| if (bio->bi_size == 0) |
| /* an empty barrier - all done */ |
| bio_endio(bio, 0); |
| else { |
| bio->bi_rw &= ~REQ_FLUSH; |
| mddev->pers->make_request(mddev, bio); |
| } |
| |
| mddev->flush_bio = NULL; |
| wake_up(&mddev->sb_wait); |
| } |
| |
/* Start flush handling for @bio.
 *
 * Only one flush is handled at a time: under write_lock, wait on sb_wait
 * until mddev->flush_bio is NULL, then claim the slot for @bio.  The
 * work is then done asynchronously by submit_flushes() on md_wq.
 */
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->write_lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->write_lock, /*nothing*/);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->write_lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);
| |
/* Support for plugging.
 * This mirrors the plugging support in request_queue, but does not
 * require having a whole queue or request structures.
 * We allocate an md_plug_cb for each md device and each thread it gets
 * plugged on.  This links to the private plug_handle structure in the
 * personality data where we keep a count of the number of outstanding
 * plugs so other code can see if a plug is active.
 */
struct md_plug_cb {
	struct blk_plug_cb cb;	/* entry on the task's plug->cb_list */
	struct mddev *mddev;	/* array this callback belongs to */
};
| |
| static void plugger_unplug(struct blk_plug_cb *cb) |
| { |
| struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); |
| if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) |
| md_wakeup_thread(mdcb->mddev->thread); |
| kfree(mdcb); |
| } |
| |
| /* Check that an unplug wakeup will come shortly. |
| * If not, wakeup the md thread immediately |
| */ |
| int mddev_check_plugged(struct mddev *mddev) |
| { |
| struct blk_plug *plug = current->plug; |
| struct md_plug_cb *mdcb; |
| |
| if (!plug) |
| return 0; |
| |
| list_for_each_entry(mdcb, &plug->cb_list, cb.list) { |
| if (mdcb->cb.callback == plugger_unplug && |
| mdcb->mddev == mddev) { |
| /* Already on the list, move to top */ |
| if (mdcb != list_first_entry(&plug->cb_list, |
| struct md_plug_cb, |
| cb.list)) |
| list_move(&mdcb->cb.list, &plug->cb_list); |
| return 1; |
| } |
| } |
| /* Not currently on the callback list */ |
| mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); |
| if (!mdcb) |
| return 0; |
| |
| mdcb->mddev = mddev; |
| mdcb->cb.callback = plugger_unplug; |
| atomic_inc(&mddev->plug_cnt); |
| list_add(&mdcb->cb.list, &plug->cb_list); |
| return 1; |
| } |
| EXPORT_SYMBOL_GPL(mddev_check_plugged); |
| |
/* Take a reference on @mddev (increments ->active) and return it.
 * The reference is dropped with mddev_put().
 */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
| |
| static void mddev_delayed_delete(struct work_struct *ws); |
| |
| static void mddev_put(struct mddev *mddev) |
| { |
| struct bio_set *bs = NULL; |
| |
| if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
| return; |
| if (!mddev->raid_disks && list_empty(&mddev->disks) && |
| mddev->ctime == 0 && !mddev->hold_active) { |
| /* Array is not configured at all, and not held active, |
| * so destroy it */ |
| list_del_init(&mddev->all_mddevs); |
| bs = mddev->bio_set; |
| mddev->bio_set = NULL; |
| if (mddev->gendisk) { |
| /* We did a probe so need to clean up. Call |
| * queue_work inside the spinlock so that |
| * flush_workqueue() after mddev_find will |
| * succeed in waiting for the work to be done. |
| */ |
| INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
| queue_work(md_misc_wq, &mddev->del_work); |
| } else |
| kfree(mddev); |
| } |
| spin_unlock(&all_mddevs_lock); |
| if (bs) |
| bioset_free(bs); |
| } |
| |
| void mddev_init(struct mddev *mddev) |
| { |
| mutex_init(&mddev->open_mutex); |
| mutex_init(&mddev->reconfig_mutex); |
| mutex_init(&mddev->bitmap_info.mutex); |
| INIT_LIST_HEAD(&mddev->disks); |
| INIT_LIST_HEAD(&mddev->all_mddevs); |
| init_timer(&mddev->safemode_timer); |
| atomic_set(&mddev->active, 1); |
| atomic_set(&mddev->openers, 0); |
| atomic_set(&mddev->active_io, 0); |
| atomic_set(&mddev->plug_cnt, 0); |
| spin_lock_init(&mddev->write_lock); |
| atomic_set(&mddev->flush_pending, 0); |
| init_waitqueue_head(&mddev->sb_wait); |
| init_waitqueue_head(&mddev->recovery_wait); |
| mddev->reshape_position = MaxSector; |
| mddev->resync_min = 0; |
| mddev->resync_max = MaxSector; |
| mddev->level = LEVEL_NONE; |
| } |
| EXPORT_SYMBOL_GPL(mddev_init); |
| |
/* Look up the mddev for device number @unit, creating it if necessary.
 *
 * A non-zero @unit whose major is not MD_MAJOR is first rounded down to
 * the start of its group of (1 << MdpMinorShift) minors.
 *
 * With @unit == 0 a new mddev is always created, on an unused MD_MAJOR
 * minor found by scanning from a remembered starting point.
 *
 * The first pass only searches; if nothing is found, a candidate is
 * allocated with all_mddevs_lock dropped and the search is retried so
 * that a concurrently added mddev is noticed.
 *
 * Returns the mddev with a reference held (an existing one via
 * mddev_get(), a new one with the count set by mddev_init()), or NULL
 * on allocation failure or when all minors are in use.
 */
static struct mddev * mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				/* drop the unused candidate (may be NULL) */
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	/* Nothing found and no candidate yet: allocate one outside the
	 * spinlock (GFP_KERNEL), then go round again.
	 */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}
| |
/* Take the array's reconfig_mutex, interruptibly.  Returns whatever
 * mutex_lock_interruptible() returns.
 */
static inline int mddev_lock(struct mddev * mddev)
{
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

/* Report whether reconfig_mutex is currently held (by anyone). */
static inline int mddev_is_locked(struct mddev *mddev)
{
	return mutex_is_locked(&mddev->reconfig_mutex);
}

/* Try to take reconfig_mutex without sleeping.  Returns whatever
 * mutex_trylock() returns.
 */
static inline int mddev_trylock(struct mddev * mddev)
{
	return mutex_trylock(&mddev->reconfig_mutex);
}
| |
static struct attribute_group md_redundancy_group;

/* Release reconfig_mutex, first performing any sysfs attribute-group
 * removal that was deferred via mddev->to_remove, and then wake the
 * array's thread.
 */
static void mddev_unlock(struct mddev * mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			/* No personality, or one without sync_request:
			 * remove the redundancy group and drop the
			 * sysfs_action reference.
			 */
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
| |
| static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) |
| { |
| struct md_rdev *rdev; |
| |
| list_for_each_entry(rdev, &mddev->disks, same_set) |
| if (rdev->desc_nr == nr) |
| return rdev; |
| |
| return NULL; |
| } |
| |
| static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) |
| { |
| struct md_rdev *rdev; |
| |
| list_for_each_entry(rdev, &mddev->disks, same_set) |
| if (rdev->bdev->bd_dev == dev) |
| return rdev; |
| |
| return NULL; |
| } |
| |
| static struct md_personality *find_pers(int level, char *clevel) |
| { |
| struct md_personality *pers; |
| list_for_each_entry(pers, &pers_list, list) { |
| if (level != LEVEL_NONE && pers->level == level) |
| return pers; |
| if (strcmp(pers->name, clevel)==0) |
| return pers; |
| } |
| return NULL; |
| } |
| |
| /* return the offset of the super block in 512byte sectors */ |
| static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) |
| { |
| sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; |
| return MD_NEW_SIZE_SECTORS(num_sectors); |
| } |
| |
| static int alloc_disk_sb(struct md_rdev * rdev) |
| { |
| if (rdev->sb_page) |
| MD_BUG(); |
| |
| rdev->sb_page = alloc_page(GFP_KERNEL); |
| if (!rdev->sb_page) { |
| printk(KERN_ALERT "md: out of memory.\n"); |
| return -ENOMEM; |
| } |
| |
| return 0; |
| } |
| |
| static void free_disk_sb(struct md_rdev * rdev) |
| { |
| if (rdev->sb_page) { |
| put_page(rdev->sb_page); |
| rdev->sb_loaded = 0; |
| rdev->sb_page = NULL; |
| rdev->sb_start = 0; |
| rdev->sectors = 0; |
| } |
| if (rdev->bb_page) { |
| put_page(rdev->bb_page); |
| rdev->bb_page = NULL; |
| } |
| } |
| |
| |
| static void super_written(struct bio *bio, int error) |
| { |
| struct md_rdev *rdev = bio->bi_private; |
| struct mddev *mddev = rdev->mddev; |
| |
| if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
| printk("md: super_written gets error=%d, uptodate=%d\n", |
| error, test_bit(BIO_UPTODATE, &bio->bi_flags)); |
| WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); |
| md_error(mddev, rdev); |
| } |
| |
| if (atomic_dec_and_test(&mddev->pending_writes)) |
| wake_up(&mddev->sb_wait); |
| bio_put(bio); |
| } |
| |
/* Asynchronously write a superblock page.
 * @mddev:  array being updated
 * @rdev:   member device to write to (its meta_bdev is used if set)
 * @sector: start sector of the write
 * @size:   number of bytes of @page to write
 * @page:   data to write
 *
 * Completion is handled by super_written(); use md_super_wait() to wait
 * for all outstanding writes.
 */
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}
| |
| void md_super_wait(struct mddev *mddev) |
| { |
| /* wait for all superblock writes that were scheduled to complete */ |
| DEFINE_WAIT(wq); |
| for(;;) { |
| prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); |
| if (atomic_read(&mddev->pending_writes)==0) |
| break; |
| schedule(); |
| } |
| finish_wait(&mddev->sb_wait, &wq); |
| } |
| |
/* bio completion handler used by sync_page_io(): signals the
 * struct completion stored in bi_private.  @error is not examined; the
 * submitter checks BIO_UPTODATE itself.
 */
static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion*)bio->bi_private);
}
| |
| int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
| struct page *page, int rw, bool metadata_op) |
| { |
| struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); |
| struct completion event; |
| int ret; |
| |
| rw |= REQ_SYNC; |
| |
| bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? |
| rdev->meta_bdev : rdev->bdev; |
| if (metadata_op) |
| bio->bi_sector = sector + rdev->sb_start; |
| else |
| bio->bi_sector = sector + rdev->data_offset; |
| bio_add_page(bio, page, size, 0); |
| init_completion(&event); |
| bio->bi_private = &event; |
| bio->bi_end_io = bi_complete; |
| submit_bio(rw, bio); |
| wait_for_completion(&event); |
| |
| ret = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| bio_put(bio); |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(sync_page_io); |
| |
| static int read_disk_sb(struct md_rdev * rdev, int size) |
| { |
| char b[BDEVNAME_SIZE]; |
| if (!rdev->sb_page) { |
| MD_BUG(); |
| return -EINVAL; |
| } |
| if (rdev->sb_loaded) |
| return 0; |
| |
| |
| if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) |
| goto fail; |
| rdev->sb_loaded = 1; |
| return 0; |
| |
| fail: |
| printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", |
| bdevname(rdev->bdev,b)); |
| return -EINVAL; |
| } |
| |
| static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| return sb1->set_uuid0 == sb2->set_uuid0 && |
| sb1->set_uuid1 == sb2->set_uuid1 && |
| sb1->set_uuid2 == sb2->set_uuid2 && |
| sb1->set_uuid3 == sb2->set_uuid3; |
| } |
| |
| static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| int ret; |
| mdp_super_t *tmp1, *tmp2; |
| |
| tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); |
| tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); |
| |
| if (!tmp1 || !tmp2) { |
| ret = 0; |
| printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); |
| goto abort; |
| } |
| |
| *tmp1 = *sb1; |
| *tmp2 = *sb2; |
| |
| /* |
| * nr_disks is not constant |
| */ |
| tmp1->nr_disks = 0; |
| tmp2->nr_disks = 0; |
| |
| ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); |
| abort: |
| kfree(tmp1); |
| kfree(tmp2); |
| return ret; |
| } |
| |
| |
/*
 * Fold a 32-bit checksum by adding its upper and lower 16-bit halves,
 * twice, so that the carry out of the first addition is folded back in.
 */
static u32 md_csum_fold(u32 csum)
{
	u32 once = (csum >> 16) + (csum & 0xffff);

	return (once >> 16) + (once & 0xffff);
}
| |
| static unsigned int calc_sb_csum(mdp_super_t * sb) |
| { |
| u64 newcsum = 0; |
| u32 *sb32 = (u32*)sb; |
| int i; |
| unsigned int disk_csum, csum; |
| |
| disk_csum = sb->sb_csum; |
| sb->sb_csum = 0; |
| |
| for (i = 0; i < MD_SB_BYTES/4 ; i++) |
| newcsum += sb32[i]; |
| csum = (newcsum & 0xffffffff) + (newcsum>>32); |
| |
| |
| #ifdef CONFIG_ALPHA |
| /* This used to use csum_partial, which was wrong for several |
| * reasons including that different results are returned on |
| * different architectures. It isn't critical that we get exactly |
| * the same return value as before (we always csum_fold before |
| * testing, and that removes any differences). However as we |
| * know that csum_partial always returned a 16bit value on |
| * alphas, do a fold to maximise conformity to previous behaviour. |
| */ |
| sb->sb_csum = md_csum_fold(disk_csum); |
| #else |
| sb->sb_csum = disk_csum; |
| #endif |
| return csum; |
| } |
| |
| |
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;	/* presumably the format's display name */
	struct module	    *owner;	/* module providing this handler */
	/* load and validate the superblock on rdev; see above */
	int		    (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev,
					  int minor_version);
	/* check that rdev is acceptable into mddev; see above */
	int		    (*validate_super)(struct mddev *mddev, struct md_rdev *rdev);
	/* update rdev's in-memory superblock from mddev; see above */
	void		    (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
	/* NOTE(review): not described above; by its name and signature it
	 * handles a change of rdev's size to num_sectors - confirm.
	 */
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
};
| |
| /* |
| * Check that the given mddev has no bitmap. |
| * |
| * This function is called from the run method of all personalities that do not |
| * support bitmaps. It prints an error message and returns non-zero if mddev |
| * has a bitmap. Otherwise, it returns 0. |
| * |
| */ |
| int md_check_no_bitmap(struct mddev *mddev) |
| { |
| if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
| return 0; |
| printk(KERN_ERR "%s: bitmaps are not supported for %s\n", |
| mdname(mddev), mddev->pers->name); |
| return 1; |
| } |
| EXPORT_SYMBOL(md_check_no_bitmap); |
| |
/*
 * load_super for 0.90.0
 *
 * Reads the superblock from the end of @rdev, validates magic, version,
 * raid_disks and checksum, fills in the rdev fields derived from it and,
 * when @refdev is given, checks compatibility with refdev's superblock.
 *
 * Returns 1 if there is no refdev or this superblock has a higher event
 * count than refdev's, 0 if it is compatible but not newer, and a
 * negative errno (-EINVAL, or the error from read_disk_sb()) otherwise.
 * @minor_version is not used here.
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	/* every "goto abort" below reports -EINVAL */
	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	/* only versions 0.90 and 0.91 are accepted */
	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		/* compatible: the one with more events is the fresher */
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* usable size is everything before the superblock */
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that */
	if (rdev->sectors >= (2ULL << 32))
		rdev->sectors = (2ULL << 32) - 2;

	/* sb->size is doubled to compare against a sector count */
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}
| |
| /* |
| * validate_super for 0.90.0 |
| */ |
| static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| mdp_disk_t *desc; |
| mdp_super_t *sb = page_address(rdev->sb_page); |
| __u64 ev1 = md_event(sb); |
| |
| rdev->raid_disk = -1; |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(In_sync, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| |
| if (mddev->raid_disks == 0) { |
| mddev->major_version = 0; |
| mddev->minor_version = sb->minor_version; |
| mddev->patch_version = sb->patch_version; |
| mddev->external = 0; |
| mddev->chunk_sectors = sb->chunk_size >> 9; |
| mddev->ctime = sb->ctime; |
| mddev->utime = sb->utime; |
| mddev->level = sb->level; |
| mddev->clevel[0] = 0; |
| mddev->layout = sb->layout; |
| mddev->raid_disks = sb->raid_disks; |
| mddev->dev_sectors = ((sector_t)sb->size) * 2; |
| mddev->events = ev1; |
| mddev->bitmap_info.offset = 0; |
| mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| |
| if (mddev->minor_version >= 91) { |
| mddev->reshape_position = sb->reshape_position; |
| mddev->delta_disks = sb->delta_disks; |
| mddev->new_level = sb->new_level; |
| mddev->new_layout = sb->new_layout; |
| mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| } else { |
| mddev->reshape_position = MaxSector; |
| mddev->delta_disks = 0; |
| mddev->new_level = mddev->level; |
| mddev->new_layout = mddev->layout; |
| mddev->new_chunk_sectors = mddev->chunk_sectors; |
| } |
| |
| if (sb->state & (1<<MD_SB_CLEAN)) |
| mddev->recovery_cp = MaxSector; |
| else { |
| if (sb->events_hi == sb->cp_events_hi && |
| sb->events_lo == sb->cp_events_lo) { |
| mddev->recovery_cp = sb->recovery_cp; |
| } else |
| mddev->recovery_cp = 0; |
| } |
| |
| memcpy(mddev->uuid+0, &sb->set_uuid0, 4); |
| memcpy(mddev->uuid+4, &sb->set_uuid1, 4); |
| memcpy(mddev->uuid+8, &sb->set_uuid2, 4); |
| memcpy(mddev->uuid+12,&sb->set_uuid3, 4); |
| |
| mddev->max_disks = MD_SB_DISKS; |
| |
| if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
| mddev->bitmap_info.file == NULL) |
| mddev->bitmap_info.offset = |
| mddev->bitmap_info.default_offset; |
| |
| } else if (mddev->pers == NULL) { |
| /* Insist on good event counter while assembling, except |
| * for spares (which don't need an event count) */ |
| ++ev1; |
| if (sb->disks[rdev->desc_nr].state & ( |
| (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
| if (ev1 < mddev->events) |
| return -EINVAL; |
| } else if (mddev->bitmap) { |
| /* if adding to array with a bitmap, then we can accept an |
| * older device ... but not too old. |
| */ |
| if (ev1 < mddev->bitmap->events_cleared) |
| return 0; |
| } else { |
| if (ev1 < mddev->events) |
| /* just a hot-add of a new device, leave raid_disk at -1 */ |
| return 0; |
| } |
| |
| if (mddev->level != LEVEL_MULTIPATH) { |
| desc = sb->disks + rdev->desc_nr; |
| |
| if (desc->state & (1<<MD_DISK_FAULTY)) |
| set_bit(Faulty, &rdev->flags); |
| else if (desc->state & (1<<MD_DISK_SYNC) /* && |
| desc->raid_disk < mddev->raid_disks */) { |
| set_bit(In_sync, &rdev->flags); |
| rdev->raid_disk = desc->raid_disk; |
| } else if (desc->state & (1<<MD_DISK_ACTIVE)) { |
| /* active but not in sync implies recovery up to |
| * reshape position. We don't know exactly where |
| * that is, so set to zero for now */ |
| if (mddev->minor_version >= 91) { |
| rdev->recovery_offset = 0; |
| rdev->raid_disk = desc->raid_disk; |
| } |
| } |
| if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
| set_bit(WriteMostly, &rdev->flags); |
| } else /* MULTIPATH are always insync */ |
| set_bit(In_sync, &rdev->flags); |
| return 0; |
| } |
| |
| /* |
| * sync_super for 0.90.0 |
| */ |
| static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| mdp_super_t *sb; |
| struct md_rdev *rdev2; |
| int next_spare = mddev->raid_disks; |
| |
| |
| /* make rdev->sb match mddev data.. |
| * |
| * 1/ zero out disks |
| * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); |
| * 3/ any empty disks < next_spare become removed |
| * |
| * disks[0] gets initialised to REMOVED because |
| * we cannot be sure from other fields if it has |
| * been initialised or not. |
| */ |
| int i; |
| int active=0, working=0,failed=0,spare=0,nr_disks=0; |
| |
| rdev->sb_size = MD_SB_BYTES; |
| |
| sb = page_address(rdev->sb_page); |
| |
| memset(sb, 0, sizeof(*sb)); |
| |
| sb->md_magic = MD_SB_MAGIC; |
| sb->major_version = mddev->major_version; |
| sb->patch_version = mddev->patch_version; |
| sb->gvalid_words = 0; /* ignored */ |
| memcpy(&sb->set_uuid0, mddev->uuid+0, 4); |
| memcpy(&sb->set_uuid1, mddev->uuid+4, 4); |
| memcpy(&sb->set_uuid2, mddev->uuid+8, 4); |
| memcpy(&sb->set_uuid3, mddev->uuid+12,4); |
| |
| sb->ctime = mddev->ctime; |
| sb->level = mddev->level; |
| sb->size = mddev->dev_sectors / 2; |
| sb->raid_disks = mddev->raid_disks; |
| sb->md_minor = mddev->md_minor; |
| sb->not_persistent = 0; |
| sb->utime = mddev->utime; |
| sb->state = 0; |
| sb->events_hi = (mddev->events>>32); |
| sb->events_lo = (u32)mddev->events; |
| |
| if (mddev->reshape_position == MaxSector) |
| sb->minor_version = 90; |
| else { |
| sb->minor_version = 91; |
| sb->reshape_position = mddev->reshape_position; |
| sb->new_level = mddev->new_level; |
| sb->delta_disks = mddev->delta_disks; |
| sb->new_layout = mddev->new_layout; |
| sb->new_chunk = mddev->new_chunk_sectors << 9; |
| } |
| mddev->minor_version = sb->minor_version; |
| if (mddev->in_sync) |
| { |
| sb->recovery_cp = mddev->recovery_cp; |
| sb->cp_events_hi = (mddev->events>>32); |
| sb->cp_events_lo = (u32)mddev->events; |
| if (mddev->recovery_cp == MaxSector) |
| sb->state = (1<< MD_SB_CLEAN); |
| } else |
| sb->recovery_cp = 0; |
| |
| sb->layout = mddev->layout; |
| sb->chunk_size = mddev->chunk_sectors << 9; |
| |
| if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
| sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
| |
| sb->disks[0].state = (1<<MD_DISK_REMOVED); |
| list_for_each_entry(rdev2, &mddev->disks, same_set) { |
| mdp_disk_t *d; |
| int desc_nr; |
| int is_active = test_bit(In_sync, &rdev2->flags); |
| |
| if (rdev2->raid_disk >= 0 && |
| sb->minor_version >= 91) |
| /* we have nowhere to store the recovery_offset, |
| * but if it is not below the reshape_position, |
| * we can piggy-back on that. |
| */ |
| is_active = 1; |
| if (rdev2->raid_disk < 0 || |
| test_bit(Faulty, &rdev2->flags)) |
| is_active = 0; |
| if (is_active) |
| desc_nr = rdev2->raid_disk; |
| else |
| desc_nr = next_spare++; |
| rdev2->desc_nr = desc_nr; |
| d = &sb->disks[rdev2->desc_nr]; |
| nr_disks++; |
| d->number = rdev2->desc_nr; |
| d->major = MAJOR(rdev2->bdev->bd_dev); |
| d->minor = MINOR(rdev2->bdev->bd_dev); |
| if (is_active) |
| d->raid_disk = rdev2->raid_disk; |
| else |
| d->raid_disk = rdev2->desc_nr; /* compatibility */ |
| if (test_bit(Faulty, &rdev2->flags)) |
| d->state = (1<<MD_DISK_FAULTY); |
| else if (is_active) { |
| d->state = (1<<MD_DISK_ACTIVE); |
| if (test_bit(In_sync, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_SYNC); |
| active++; |
| working++; |
| } else { |
| d->state = 0; |
| spare++; |
| working++; |
| } |
| if (test_bit(WriteMostly, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_WRITEMOSTLY); |
| } |
| /* now set the "removed" and "faulty" bits on any missing devices */ |
| for (i=0 ; i < mddev->raid_disks ; i++) { |
| mdp_disk_t *d = &sb->disks[i]; |
| if (d->state == 0 && d->number == 0) { |
| d->number = i; |
| d->raid_disk = i; |
| d->state = (1<<MD_DISK_REMOVED); |
| d->state |= (1<<MD_DISK_FAULTY); |
| failed++; |
| } |
| } |
| sb->nr_disks = nr_disks; |
| sb->active_disks = active; |
| sb->working_disks = working; |
| sb->failed_disks = failed; |
| sb->spare_disks = spare; |
| |
| sb->this_disk = sb->disks[rdev->desc_nr]; |
| sb->sb_csum = calc_sb_csum(sb); |
| } |
| |
| /* |
| * rdev_size_change for 0.90.0 |
| */ |
| static unsigned long long |
| super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
| { |
| if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| return 0; /* component must fit device */ |
| if (rdev->mddev->bitmap_info.offset) |
| return 0; /* can't move bitmap */ |
| rdev->sb_start = calc_dev_sboffset(rdev); |
| if (!num_sectors || num_sectors > rdev->sb_start) |
| num_sectors = rdev->sb_start; |
| /* Limit to 4TB as metadata cannot record more than that. |
| * 4TB == 2^32 KB, or 2*2^32 sectors. |
| */ |
| if (num_sectors >= (2ULL << 32)) |
| num_sectors = (2ULL << 32) - 2; |
| md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| md_super_wait(rdev->mddev); |
| return num_sectors; |
| } |
| |
| |
| /* |
| * version 1 superblock |
| */ |
| |
| static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) |
| { |
| __le32 disk_csum; |
| u32 csum; |
| unsigned long long newcsum; |
| int size = 256 + le32_to_cpu(sb->max_dev)*2; |
| __le32 *isuper = (__le32*)sb; |
| int i; |
| |
| disk_csum = sb->sb_csum; |
| sb->sb_csum = 0; |
| newcsum = 0; |
| for (i=0; size>=4; size -= 4 ) |
| newcsum += le32_to_cpu(*isuper++); |
| |
| if (size == 2) |
| newcsum += le16_to_cpu(*(__le16*) isuper); |
| |
| csum = (newcsum & 0xffffffff) + (newcsum >> 32); |
| sb->sb_csum = disk_csum; |
| return cpu_to_le32(csum); |
| } |
| |
| static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, |
| int acknowledged); |
| static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) |
| { |
| struct mdp_superblock_1 *sb; |
| int ret; |
| sector_t sb_start; |
| char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
| int bmask; |
| |
| /* |
| * Calculate the position of the superblock in 512byte sectors. |
| * It is always aligned to a 4K boundary and |
| * depeding on minor_version, it can be: |
| * 0: At least 8K, but less than 12K, from end of device |
| * 1: At start of device |
| * 2: 4K from start of device. |
| */ |
| switch(minor_version) { |
| case 0: |
| sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; |
| sb_start -= 8*2; |
| sb_start &= ~(sector_t)(4*2-1); |
| break; |
| case 1: |
| sb_start = 0; |
| break; |
| case 2: |
| sb_start = 8; |
| break; |
| default: |
| return -EINVAL; |
| } |
| rdev->sb_start = sb_start; |
| |
| /* superblock is rarely larger than 1K, but it can be larger, |
| * and it is safe to read 4k, so we do that |
| */ |
| ret = read_disk_sb(rdev, 4096); |
| if (ret) return ret; |
| |
| |
| sb = page_address(rdev->sb_page); |
| |
| if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
| sb->major_version != cpu_to_le32(1) || |
| le32_to_cpu(sb->max_dev) > (4096-256)/2 || |
| le64_to_cpu(sb->super_offset) != rdev->sb_start || |
| (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) |
| return -EINVAL; |
| |
| if (calc_sb_1_csum(sb) != sb->sb_csum) { |
| printk("md: invalid superblock checksum on %s\n", |
| bdevname(rdev->bdev,b)); |
| return -EINVAL; |
| } |
| if (le64_to_cpu(sb->data_size) < 10) { |
| printk("md: data_size too small on %s\n", |
| bdevname(rdev->bdev,b)); |
| return -EINVAL; |
| } |
| |
| rdev->preferred_minor = 0xffff; |
| rdev->data_offset = le64_to_cpu(sb->data_offset); |
| atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
| |
| rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
| bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| if (rdev->sb_size & bmask) |
| rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| |
| if (minor_version |
| && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
| return -EINVAL; |
| |
| if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) |
| rdev->desc_nr = -1; |
| else |
| rdev->desc_nr = le32_to_cpu(sb->dev_number); |
| |
| if (!rdev->bb_page) { |
| rdev->bb_page = alloc_page(GFP_KERNEL); |
| if (!rdev->bb_page) |
| return -ENOMEM; |
| } |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && |
| rdev->badblocks.count == 0) { |
| /* need to load the bad block list. |
| * Currently we limit it to one page. |
| */ |
| s32 offset; |
| sector_t bb_sector; |
| u64 *bbp; |
| int i; |
| int sectors = le16_to_cpu(sb->bblog_size); |
| if (sectors > (PAGE_SIZE / 512)) |
| return -EINVAL; |
| offset = le32_to_cpu(sb->bblog_offset); |
| if (offset == 0) |
| return -EINVAL; |
| bb_sector = (long long)offset; |
| if (!sync_page_io(rdev, bb_sector, sectors << 9, |
| rdev->bb_page, READ, true)) |
| return -EIO; |
| bbp = (u64 *)page_address(rdev->bb_page); |
| rdev->badblocks.shift = sb->bblog_shift; |
| for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { |
| u64 bb = le64_to_cpu(*bbp); |
| int count = bb & (0x3ff); |
| u64 sector = bb >> 10; |
| sector <<= sb->bblog_shift; |
| count <<= sb->bblog_shift; |
| if (bb + 1 == 0) |
| break; |
| if (md_set_badblocks(&rdev->badblocks, |
| sector, count, 1) == 0) |
| return -EINVAL; |
| } |
| } else if (sb->bblog_offset == 0) |
| rdev->badblocks.shift = -1; |
| |
| if (!refdev) { |
| ret = 1; |
| } else { |
| __u64 ev1, ev2; |
| struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
| |
| if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
| sb->level != refsb->level || |
| sb->layout != refsb->layout || |
| sb->chunksize != refsb->chunksize) { |
| printk(KERN_WARNING "md: %s has strangely different" |
| " superblock to %s\n", |
| bdevname(rdev->bdev,b), |
| bdevname(refdev->bdev,b2)); |
| return -EINVAL; |
| } |
| ev1 = le64_to_cpu(sb->events); |
| ev2 = le64_to_cpu(refsb->events); |
| |
| if (ev1 > ev2) |
| ret = 1; |
| else |
| ret = 0; |
| } |
| if (minor_version) |
| rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
| le64_to_cpu(sb->data_offset); |
| else |
| rdev->sectors = rdev->sb_start; |
| if (rdev->sectors < le64_to_cpu(sb->data_size)) |
| return -EINVAL; |
| rdev->sectors = le64_to_cpu(sb->data_size); |
| if (le64_to_cpu(sb->size) > rdev->sectors) |
| return -EINVAL; |
| return ret; |
| } |
| |
| static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
| __u64 ev1 = le64_to_cpu(sb->events); |
| |
| rdev->raid_disk = -1; |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(In_sync, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| |
| if (mddev->raid_disks == 0) { |
| mddev->major_version = 1; |
| mddev->patch_version = 0; |
| mddev->external = 0; |
| mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
| mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); |
| mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); |
| mddev->level = le32_to_cpu(sb->level); |
| mddev->clevel[0] = 0; |
| mddev->layout = le32_to_cpu(sb->layout); |
| mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
| mddev->dev_sectors = le64_to_cpu(sb->size); |
| mddev->events = ev1; |
| mddev->bitmap_info.offset = 0; |
| mddev->bitmap_info.default_offset = 1024 >> 9; |
| |
| mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
| memcpy(mddev->uuid, sb->set_uuid, 16); |
| |
| mddev->max_disks = (4096-256)/2; |
| |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
| mddev->bitmap_info.file == NULL ) |
| mddev->bitmap_info.offset = |
| (__s32)le32_to_cpu(sb->bitmap_offset); |
| |
| if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
| mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
| mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
| mddev->new_level = le32_to_cpu(sb->new_level); |
| mddev->new_layout = le32_to_cpu(sb->new_layout); |
| mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
| } else { |
| mddev->reshape_position = MaxSector; |
| mddev->delta_disks = 0; |
| mddev->new_level = mddev->level; |
| mddev->new_layout = mddev->layout; |
| mddev->new_chunk_sectors = mddev->chunk_sectors; |
| } |
| |
| } else if (mddev->pers == NULL) { |
| /* Insist of good event counter while assembling, except for |
| * spares (which don't need an event count) */ |
| ++ev1; |
| if (rdev->desc_nr >= 0 && |
| rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
| le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) |
| if (ev1 < mddev->events) |
| return -EINVAL; |
| } else if (mddev->bitmap) { |
| /* If adding to array with a bitmap, then we can accept an |
| * older device, but not too old. |
| */ |
| if (ev1 < mddev->bitmap->events_cleared) |
| return 0; |
| } else { |
| if (ev1 < mddev->events) |
| /* just a hot-add of a new device, leave raid_disk at -1 */ |
| return 0; |
| } |
| if (mddev->level != LEVEL_MULTIPATH) { |
| int role; |
| if (rdev->desc_nr < 0 || |
| rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { |
| role = 0xffff; |
| rdev->desc_nr = -1; |
| } else |
| role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
| switch(role) { |
| case 0xffff: /* spare */ |
| break; |
| case 0xfffe: /* faulty */ |
| set_bit(Faulty, &rdev->flags); |
| break; |
| default: |
| if ((le32_to_cpu(sb->feature_map) & |
| MD_FEATURE_RECOVERY_OFFSET)) |
| rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
| else |
| set_bit(In_sync, &rdev->flags); |
| rdev->raid_disk = role; |
| break; |
| } |
| if (sb->devflags & WriteMostly1) |
| set_bit(WriteMostly, &rdev->flags); |
| } else /* MULTIPATH are always insync */ |
| set_bit(In_sync, &rdev->flags); |
| |
| return 0; |
| } |
| |
| static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| struct mdp_superblock_1 *sb; |
| struct md_rdev *rdev2; |
| int max_dev, i; |
| /* make rdev->sb match mddev and rdev data. */ |
| |
| sb = page_address(rdev->sb_page); |
| |
| sb->feature_map = 0; |
| sb->pad0 = 0; |
| sb->recovery_offset = cpu_to_le64(0); |
| memset(sb->pad1, 0, sizeof(sb->pad1)); |
| memset(sb->pad3, 0, sizeof(sb->pad3)); |
| |
| sb->utime = cpu_to_le64((__u64)mddev->utime); |
| sb->events = cpu_to_le64(mddev->events); |
| if (mddev->in_sync) |
| sb->resync_offset = cpu_to_le64(mddev->recovery_cp); |
| else |
| sb->resync_offset = cpu_to_le64(0); |
| |
| sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); |
| |
| sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
| sb->size = cpu_to_le64(mddev->dev_sectors); |
| sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
| sb->level = cpu_to_le32(mddev->level); |
| sb->layout = cpu_to_le32(mddev->layout); |
| |
| if (test_bit(WriteMostly, &rdev->flags)) |
| sb->devflags |= WriteMostly1; |
| else |
| sb->devflags &= ~WriteMostly1; |
| |
| if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
| sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
| sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
| } |
| |
| if (rdev->raid_disk >= 0 && |
| !test_bit(In_sync, &rdev->flags)) { |
| sb->feature_map |= |
| cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
| sb->recovery_offset = |
| cpu_to_le64(rdev->recovery_offset); |
| } |
| |
| if (mddev->reshape_position != MaxSector) { |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
| sb->reshape_position = cpu_to_le64(mddev->reshape_position); |
| sb->new_layout = cpu_to_le32(mddev->new_layout); |
| sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
| sb->new_level = cpu_to_le32(mddev->new_level); |
| sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| } |
| |
| if (rdev->badblocks.count == 0) |
| /* Nothing to do for bad blocks*/ ; |
| else if (sb->bblog_offset == 0) |
| /* Cannot record bad blocks on this device */ |
| md_error(mddev, rdev); |
| else { |
| struct badblocks *bb = &rdev->badblocks; |
| u64 *bbp = (u64 *)page_address(rdev->bb_page); |
| u64 *p = bb->page; |
| sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); |
| if (bb->changed) { |
| unsigned seq; |
| |
| retry: |
| seq = read_seqbegin(&bb->lock); |
| |
| memset(bbp, 0xff, PAGE_SIZE); |
| |
| for (i = 0 ; i < bb->count ; i++) { |
| u64 internal_bb = *p++; |
| u64 store_bb = ((BB_OFFSET(internal_bb) << 10) |
| | BB_LEN(internal_bb)); |
| *bbp++ = cpu_to_le64(store_bb); |
| } |
| bb->changed = 0; |
| if (read_seqretry(&bb->lock, seq)) |
| goto retry; |
| |
| bb->sector = (rdev->sb_start + |
| (int)le32_to_cpu(sb->bblog_offset)); |
| bb->size = le16_to_cpu(sb->bblog_size); |
| } |
| } |
| |
| max_dev = 0; |
| list_for_each_entry(rdev2, &mddev->disks, same_set) |
| if (rdev2->desc_nr+1 > max_dev) |
| max_dev = rdev2->desc_nr+1; |
| |
| if (max_dev > le32_to_cpu(sb->max_dev)) { |
| int bmask; |
| sb->max_dev = cpu_to_le32(max_dev); |
| rdev->sb_size = max_dev * 2 + 256; |
| bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| if (rdev->sb_size & bmask) |
| rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| } else |
| max_dev = le32_to_cpu(sb->max_dev); |
| |
| for (i=0; i<max_dev;i++) |
| sb->dev_roles[i] = cpu_to_le16(0xfffe); |
| |
| list_for_each_entry(rdev2, &mddev->disks, same_set) { |
| i = rdev2->desc_nr; |
| if (test_bit(Faulty, &rdev2->flags)) |
| sb->dev_roles[i] = cpu_to_le16(0xfffe); |
| else if (test_bit(In_sync, &rdev2->flags)) |
| sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
| else if (rdev2->raid_disk >= 0) |
| sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
| else |
| sb->dev_roles[i] = cpu_to_le16(0xffff); |
| } |
| |
| sb->sb_csum = calc_sb_1_csum(sb); |
| } |
| |
| static unsigned long long |
| super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
| { |
| struct mdp_superblock_1 *sb; |
| sector_t max_sectors; |
| if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| return 0; /* component must fit device */ |
| if (rdev->sb_start < rdev->data_offset) { |
| /* minor versions 1 and 2; superblock before data */ |
| max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; |
| max_sectors -= rdev->data_offset; |
| if (!num_sectors || num_sectors > max_sectors) |
| num_sectors = max_sectors; |
| } else if (rdev->mddev->bitmap_info.offset) { |
| /* minor version 0 with bitmap we can't move */ |
| return 0; |
| } else { |
| /* minor version 0; superblock after data */ |
| sector_t sb_start; |
| sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; |
| sb_start &= ~(sector_t)(4*2 - 1); |
| max_sectors = rdev->sectors + sb_start - rdev->sb_start; |
| if (!num_sectors || num_sectors > max_sectors) |
| num_sectors = max_sectors; |
| rdev->sb_start = sb_start; |
| } |
| sb = page_address(rdev->sb_page); |
| sb->data_size = cpu_to_le64(num_sectors); |
| sb->super_offset = rdev->sb_start; |
| sb->sb_csum = calc_sb_1_csum(sb); |
| md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| md_super_wait(rdev->mddev); |
| return num_sectors; |
| } |
| |
| static struct super_type super_types[] = { |
| [0] = { |
| .name = "0.90.0", |
| .owner = THIS_MODULE, |
| .load_super = super_90_load, |
| .validate_super = super_90_validate, |
| .sync_super = super_90_sync, |
| .rdev_size_change = super_90_rdev_size_change, |
| }, |
| [1] = { |
| .name = "md-1", |
| .owner = THIS_MODULE, |
| .load_super = super_1_load, |
| .validate_super = super_1_validate, |
| .sync_super = super_1_sync, |
| .rdev_size_change = super_1_rdev_size_change, |
| }, |
| }; |
| |
| static void sync_super(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| if (mddev->sync_super) { |
| mddev->sync_super(mddev, rdev); |
| return; |
| } |
| |
| BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); |
| |
| super_types[mddev->major_version].sync_super(mddev, rdev); |
| } |
| |
| static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) |
| { |
| struct md_rdev *rdev, *rdev2; |
| |
| rcu_read_lock(); |
| rdev_for_each_rcu(rdev, mddev1) |
| rdev_for_each_rcu(rdev2, mddev2) |
| if (rdev->bdev->bd_contains == |
| rdev2->bdev->bd_contains) { |
| rcu_read_unlock(); |
| return 1; |
| } |
| rcu_read_unlock(); |
| return 0; |
| } |
| |
| static LIST_HEAD(pending_raid_disks); |
| |
| /* |
| * Try to register data integrity profile for an mddev |
| * |
| * This is called when an array is started and after a disk has been kicked |
| * from the array. It only succeeds if all working and active component devices |
| * are integrity capable with matching profiles. |
| */ |
| int md_integrity_register(struct mddev *mddev) |
| { |
| struct md_rdev *rdev, *reference = NULL; |
| |
| if (list_empty(&mddev->disks)) |
| return 0; /* nothing to do */ |
| if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) |
| return 0; /* shouldn't register, or already is */ |
| list_for_each_entry(rdev, &mddev->disks, same_set) { |
| /* skip spares and non-functional disks */ |
| if (test_bit(Faulty, &rdev->flags)) |
| continue; |
| if (rdev->raid_disk < 0) |
| continue; |
| if (!reference) { |
| /* Use the first rdev as the reference */ |
| reference = rdev; |
| continue; |
| } |
| /* does this rdev's profile match the reference profile? */ |
| if (blk_integrity_compare(reference->bdev->bd_disk, |
| rdev->bdev->bd_disk) < 0) |
| return -EINVAL; |
| } |
| if (!reference || !bdev_get_integrity(reference->bdev)) |
| return 0; |
| /* |
| * All component devices are integrity capable and have matching |
| * profiles, register the common profile for the md device. |
| */ |
| if (blk_integrity_register(mddev->gendisk, |
| bdev_get_integrity(reference->bdev)) != 0) { |
| printk(KERN_ERR "md: failed to register integrity for %s\n", |
| mdname(mddev)); |
| return -EINVAL; |
| } |
| printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); |
| if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { |
| printk(KERN_ERR "md: failed to create integrity pool for %s\n", |
| mdname(mddev)); |
| return -EINVAL; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL(md_integrity_register); |
| |
| /* Disable data integrity if non-capable/non-matching disk is being added */ |
| void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) |
| { |
| struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); |
| struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); |
| |
| if (!bi_mddev) /* nothing to do */ |
| return; |
| if (rdev->raid_disk < 0) /* skip spares */ |
| return; |
| if (bi_rdev && blk_integrity_compare(mddev->gendisk, |
| rdev->bdev->bd_disk) >= 0) |
| return; |
| printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); |
| blk_integrity_unregister(mddev->gendisk); |
| } |
| EXPORT_SYMBOL(md_integrity_add_rdev); |
| |
| static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) |
| { |
| char b[BDEVNAME_SIZE]; |
| struct kobject *ko; |
| char *s; |
| int err; |
| |
| if (rdev->mddev) { |
| MD_BUG(); |
| return -EINVAL; |
| } |
| |
| /* prevent duplicates */ |
| if (find_rdev(mddev, rdev->bdev->bd_dev)) |
| return -EEXIST; |
| |
| /* make sure rdev->sectors exceeds mddev->dev_sectors */ |
| if (rdev->sectors && (mddev->dev_sectors == 0 || |
| rdev->sectors < mddev->dev_sectors)) { |
| if (mddev->pers) { |
| /* Cannot change size, so fail |
| * If mddev->level <= 0, then we don't care |
| * about aligning sizes (e.g. linear) |
| */ |
| if (mddev->level > 0) |
| return -ENOSPC; |
| } else |
| mddev->dev_sectors = rdev->sectors; |
| } |
| |
| /* Verify rdev->desc_nr is unique. |
| * If it is -1, assign a free number, else |
| * check number is not in use |
| */ |
| if (rdev->desc_nr < 0) { |
| int choice = 0; |
| if (mddev->pers) choice = mddev->raid_disks; |
| while (find_rdev_nr(mddev, choice)) |
| choice++; |
| rdev->desc_nr = choice; |
| } else { |
| if (find_rdev_nr(mddev, rdev->desc_nr)) |
| return -EBUSY; |
| } |
| if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
| printk(KERN_WARNING "md: %s: array is limited to %d devices\n", |
| mdname(mddev), mddev->max_disks); |
| return -EBUSY; |
| } |
| bdevname(rdev->bdev,b); |
| while ( (s=strchr(b, '/')) != NULL) |
| *s = '!'; |
| |
| rdev->mddev = mddev; |
| printk(KERN_INFO "md: bind<%s>\n", b); |
| |
| if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) |
| goto fail; |
| |
| ko = &part_to_dev(rdev->bdev->bd_part)->kobj; |
| if (sysfs_create_link(&rdev->kobj, ko, "block")) |
| /* failure here is OK */; |
| rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); |
| |
| list_add_rcu(&rdev->same_set, &mddev->disks); |
| bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
| |
| /* May as well allow recovery to be retried once */ |
| mddev->recovery_disabled++; |
| |
| return 0; |
| |
| fail: |
| printk(KERN_WARNING "md: failed to register dev-%s for %s\n", |
| b, mdname(mddev)); |
| return err; |
| } |
| |
| static void md_delayed_delete(struct work_struct *ws) |
| { |
| struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); |
| kobject_del(&rdev->kobj); |
| kobject_put(&rdev->kobj); |
| } |
| |
| static void unbind_rdev_from_array(struct md_rdev * rdev) |
| { |
| char b[BDEVNAME_SIZE]; |
| if (!rdev->mddev) { |
| MD_BUG(); |
| return; |
| } |
| bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
| list_del_rcu(&rdev->same_set); |
| printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
| rdev->mddev = NULL; |
| sysfs_remove_link(&rdev->kobj, "block"); |
| sysfs_put(rdev->sysfs_state); |
| rdev->sysfs_state = NULL; |
| kfree(rdev->badblocks.page); |
| rdev->badblocks.count = 0; |
| rdev->badblocks.page = NULL; |
| /* We need to delay this, otherwise we can deadlock when |
| * writing to 'remove' to "dev/state". We also need |
| * to delay it due to rcu usage. |
| */ |
| synchronize_rcu(); |
| INIT_WORK(&rdev->del_work, md_delayed_delete); |
| kobject_get(&rdev->kobj); |
| queue_work(md_misc_wq, &rdev->del_work); |
| } |
| |
| /* |
| * prevent the device from being mounted, repartitioned or |
| * otherwise reused by a RAID array (or any other kernel |
| * subsystem), by bd_claiming the device. |
| */ |
| static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) |
| { |
| int err = 0; |
| struct block_device *bdev; |
| char b[BDEVNAME_SIZE]; |
| |
| bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
| shared ? (struct md_rdev *)lock_rdev : rdev); |
| if (IS_ERR(bdev)) { |
| printk(KERN_ERR "md: could not open %s.\n", |
| __bdevname(dev, b)); |
| return PTR_ERR(bdev); |
| } |
| rdev->bdev = bdev; |
| return err; |
| } |
| |
| static void unlock_rdev(struct md_rdev *rdev) |
| { |
| struct block_device *bdev = rdev->bdev; |
| rdev->bdev = NULL; |
| if (!bdev) |
| MD_BUG(); |
| blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
| } |
| |
| void md_autodetect_dev(dev_t dev); |
| |
| static void export_rdev(struct md_rdev * rdev) |
| { |
| char b[BDEVNAME_SIZE]; |
| printk(KERN_INFO "md: export_rdev(%s)\n", |
| bdevname(rdev->bdev,b)); |
| if (rdev->mddev) |
| MD_BUG(); |
| free_disk_sb(rdev); |
| #ifndef MODULE |
| if (test_bit(AutoDetected, &rdev->flags)) |
| md_autodetect_dev(rdev->bdev->bd_dev); |
| #endif |
| unlock_rdev(rdev); |
| kobject_put(&rdev->kobj); |
| } |
| |
/*
 * Remove @rdev from its array for good: detach it first, then release it.
 */
static void kick_rdev_from_array(struct md_rdev * rdev)
{
	/* must come first: export_rdev() expects rdev->mddev to be clear */
	unbind_rdev_from_array(rdev);

	export_rdev(rdev);
}
| |
| static void export_array(struct mddev *mddev) |
| { |
| struct md_rdev *rdev, *tmp; |
| |
| rdev_for_each(rdev, tmp, mddev) { |
| if (!rdev->mddev) { |
| MD_BUG(); |
| continue; |
| } |
| kick_rdev_from_array(rdev); |
| } |
| if (!list_empty(&mddev->disks)) |
| MD_BUG(); |
| mddev->raid_disks = 0; |
| mddev->major_version = 0; |
| } |
| |
| static void print_desc(mdp_disk_t *desc) |
| { |
| printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, |
| desc->major,desc->minor,desc->raid_disk,desc->state); |
| } |
| |
| static void print_sb_90(mdp_super_t *sb) |
| { |
| int i; |
| |
| printk(KERN_INFO |
| "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", |
| sb->major_version, sb->minor_version, sb->patch_version, |
| sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, |
| sb->ctime); |
| printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", |
| sb->level, sb->size, sb->nr_disks, sb->raid_disks, |
| sb->md_minor, sb->layout, sb->chunk_size); |
| printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" |
| " FD:%d SD:%d CSUM:%08x E:%08lx\n", |
| sb->utime, sb->state, sb->active_disks, sb->working_disks, |
| sb->failed_disks, sb->spare_disks, |
| sb->sb_csum, (unsigned long)sb->events_lo); |
| |
| printk(KERN_INFO); |
| for (i = 0; i < MD_SB_DISKS; i++) { |
| mdp_disk_t *desc; |
| |
| desc = sb->disks + i; |
| if (desc->number || desc->major || desc->minor || |
| desc->raid_disk || (desc->state && (desc->state != 4))) { |
| printk(" D %2d: ", i); |
| print_desc(desc); |
| } |
| } |
| printk(KERN_INFO "md: THIS: "); |
| print_desc(&sb->this_disk); |
| } |
| |
| static void print_sb_1(struct mdp_superblock_1 *sb) |
| { |
| __u8 *uuid; |
| |
| uuid = sb->set_uuid; |
| printk(KERN_INFO |
| "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" |
| "md: Name: \"%s\" CT:%llu\n", |
| le32_to_cpu(sb->major_version), |
| le32_to_cpu(sb->feature_map), |
| uuid, |
| sb->set_name, |
| (unsigned long long)le64_to_cpu(sb->ctime) |
| & MD_SUPERBLOCK_1_TIME_SEC_MASK); |
| |
| uuid = sb->device_uuid; |
| printk(KERN_INFO |
| "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" |
| " RO:%llu\n" |
| "md: Dev:%08x UUID: %pU\n" |
| "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" |
| "md: (MaxDev:%u) \n", |
| le32_to_cpu(sb->level), |
| (unsigned long long)le64_to_cpu(sb->size), |
| le32_to_cpu(sb->raid_disks), |
| le32_to_cpu(sb->layout), |
| le32_to_cpu(sb->chunksize), |
| (unsigned long long)le64_to_cpu(sb->data_offset), |
| (unsigned long long)le64_to_cpu(sb->data_size), |
| (unsigned long long)le64_to_cpu(sb->super_offset), |
| (unsigned long long)le64_to_cpu(sb->recovery_offset), |
| le32_to_cpu(sb->dev_number), |
| uuid, |
| sb->devflags, |
| (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, |
| (unsigned long long)le64_to_cpu(sb->events), |
| (unsigned long long)le64_to_cpu(sb->resync_offset), |
| le32_to_cpu(sb->sb_csum), |
| le32_to_cpu(sb->max_dev) |
| ); |
| } |
| |
| static void print_rdev(struct md_rdev *rdev, int major_version) |
| { |
| char b[BDEVNAME_SIZE]; |
| printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", |
| bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, |
| test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), |
| rdev->desc_nr); |
| if (rdev->sb_loaded) { |
| printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); |
| switch (major_version) { |
| case 0: |
| print_sb_90(page_address(rdev->sb_page)); |
| break; |
| case 1: |
| print_sb_1(page_address(rdev->sb_page)); |
| break; |
| } |
| } else |
| printk(KERN_INFO "md: no rdev superblock!\n"); |
| } |
| |
| static void md_print_devices(void) |
| { |
| struct list_head *tmp; |
| struct md_rdev *rdev; |
| struct mddev *mddev; |
| char b[BDEVNAME_SIZE]; |
| |
| printk("\n"); |
| printk("md: **********************************\n"); |
| printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); |
| printk("md: **********************************\n"); |
| for_each_mddev(mddev, tmp) { |
| |
| if (mddev->bitmap) |
| bitmap_print_sb(mddev->bitmap); |
| else |
| printk("%s: ", mdname(mddev)); |
| list_for_each_entry(rdev, &mddev->disks, same_set) |
| printk("<%s>", bdevname(rdev->bdev,b)); |
| printk("\n"); |
| |
| list_for_each_entry(rdev, &mddev->disks, same_set) |
| print_rdev(rdev, mddev->major_version); |
| } |
| printk("md: **********************************\n"); |
| printk("\n"); |
| } |
| |
| |
| static void sync_sbs(struct mddev * mddev, int nospares) |
| { |
| /* Update each superblock (in-memory image), but |
| * if we are allowed to, skip spares which already |
| * have the right event counter, or have one earlier |
| * (which would mean they aren't being marked as dirty |
| * with the rest of the array) |
| */ |
| struct md_rdev *rdev; |
| list_for_each_entry(rdev, &mddev->disks, same_set) { |
| if (rdev->sb_events == mddev->events || |
| (nospares && |
| rdev->raid_disk < 0 && |
| rdev->sb_events+1 == mddev->events)) { |
| /* Don't update this superblock */ |
| rdev->sb_loaded = 2; |
| } else { |
| sync_super(mddev, rdev); |
| rdev->sb_loaded = 1; |
| } |
| } |
| } |
| /*
| * Write out the array's superblocks after a metadata change.
| *
| * @mddev: array whose member superblocks are to be updated
| * @force_change: non-zero forces spares to be updated as well (nospares
| * is cleared); it is also set here when MD_CHANGE_DEVS was pending
| *
| * For a non-persistent array nothing is written: the change flags are
| * cleared, waiters are woken and the function returns. Otherwise the
| * event counter is moved (forward, or back by one for a pure
| * dirty<->clean transition), sync_sbs() refreshes the in-memory images
| * under write_lock, and md_super_write() is issued for each eligible
| * device. If in_sync changed or MD_CHANGE_DEVS was set while the writes
| * were in flight, the whole sequence is repeated.
| */
| static void md_update_sb(struct mddev * mddev, int force_change)
| {
| struct md_rdev *rdev;
| int sync_req; /* mddev->in_sync as sampled before the write pass */
| int nospares = 0; /* handed to sync_sbs(): non-zero lets spares be skipped */
| int any_badblocks_changed = 0; /* number of rdevs seen with badblocks.changed set */
| 
| repeat: /* re-entered from the bottom when another write pass is needed */
| /* First make sure individual recovery_offsets are correct */
| list_for_each_entry(rdev, &mddev->disks, same_set) {
| if (rdev->raid_disk >= 0 &&
| mddev->delta_disks >= 0 &&
| !test_bit(In_sync, &rdev->flags) &&
| mddev->curr_resync_completed > rdev->recovery_offset)
| rdev->recovery_offset = mddev->curr_resync_completed;
| 
| }
| if (!mddev->persistent) { /* nothing is written in this case: clear flags, wake waiters */
| clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
| clear_bit(MD_CHANGE_DEVS, &mddev->flags);
| if (!mddev->external) {
| clear_bit(MD_CHANGE_PENDING, &mddev->flags);
| list_for_each_entry(rdev, &mddev->disks, same_set) {
| if (rdev->badblocks.changed) { /* changed bad blocks here fail the device via md_error() */
| rdev->badblocks.changed = 0;
| md_ack_all_badblocks(&rdev->badblocks);
| md_error(mddev, rdev);
| }
| clear_bit(Blocked, &rdev->flags);
| clear_bit(BlockedBadBlocks, &rdev->flags);
| wake_up(&rdev->blocked_wait);
| }
| }
| wake_up(&mddev->sb_wait);
| return;
| }
| 
| spin_lock_irq(&mddev->write_lock);
| 
| mddev->utime = get_seconds();
| 
| if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
| force_change = 1;
| if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
| /* just a clean<-> dirty transition, possibly leave spares alone,
| * though if events isn't the right even/odd, we will have to do
| * spares after all
| */
| nospares = 1;
| if (force_change)
| nospares = 0;
| if (mddev->degraded)
| /* If the array is degraded, then skipping spares is both
| * dangerous and fairly pointless.
| * Dangerous because a device that was removed from the array
| * might have a event_count that still looks up-to-date,
| * so it can be re-added without a resync.
| * Pointless because if there are any spares to skip,
| * then a recovery will happen and soon that array won't
| * be degraded any more and the spare can go back to sleep then.
| */
| nospares = 0;
| 
| sync_req = mddev->in_sync; /* compared against in_sync again after the writes */
| 
| /* If this is just a dirty<->clean transition, and the array is clean
| * and 'events' is odd, we can roll back to the previous clean state */
| if (nospares
| && (mddev->in_sync && mddev->recovery_cp == MaxSector)
| && mddev->can_decrease_events
| && mddev->events != 1) {
| mddev->events--;
| mddev->can_decrease_events = 0;
| } else {
| /* otherwise we have to go forward and ... */
| mddev->events ++;
| mddev->can_decrease_events = nospares;
| }
| 
| if (!mddev->events) {
| /*
| * oops, this 64-bit counter should never wrap.
| * Either we are in around ~1 trillion A.C., assuming
| * 1 reboot per second, or we have a bug:
| */
| MD_BUG();
| mddev->events --; /* step back from 0 so the counter stays non-zero */
| }
| 
| list_for_each_entry(rdev, &mddev->disks, same_set) {
| if (rdev->badblocks.changed)
| any_badblocks_changed++;
| if (test_bit(Faulty, &rdev->flags))
| set_bit(FaultRecorded, &rdev->flags); /* consumed in the final loop to clear Blocked */
| }
| 
| sync_sbs(mddev, nospares);
| spin_unlock_irq(&mddev->write_lock);
| 
| pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
| mdname(mddev), mddev->in_sync);
| 
| bitmap_update_sb(mddev->bitmap);
| list_for_each_entry(rdev, &mddev->disks, same_set) {
| char b[BDEVNAME_SIZE];
| 
| if (rdev->sb_loaded != 1)
| continue; /* no noise on spare devices */
| 
| if (!test_bit(Faulty, &rdev->flags) &&
| rdev->saved_raid_disk == -1) {
| md_super_write(mddev,rdev,
| rdev->sb_start, rdev->sb_size,
| rdev->sb_page);
| pr_debug("md: (write) %s's sb offset: %llu\n",
| bdevname(rdev->bdev, b),
| (unsigned long long)rdev->sb_start);
| rdev->sb_events = mddev->events;
| if (rdev->badblocks.size) { /* also write the bad-block page, then reset its size */
| md_super_write(mddev, rdev,
| rdev->badblocks.sector,
| rdev->badblocks.size << 9,
| rdev->bb_page);
| rdev->badblocks.size = 0;
| }
| 
| } else if (test_bit(Faulty, &rdev->flags))
| pr_debug("md: %s (skipping faulty)\n",
| bdevname(rdev->bdev, b));
| else
| pr_debug("(skipping incremental s/r ");
| 
| if (mddev->level == LEVEL_MULTIPATH)
| /* only need to write one superblock... */
| break;
| }
| md_super_wait(mddev);
| /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
| 
| spin_lock_irq(&mddev->write_lock);
| if (mddev->in_sync != sync_req ||
| test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
| /* have to write it out again */
| spin_unlock_irq(&mddev->write_lock);
| goto repeat;
| }
| clear_bit(MD_CHANGE_PENDING, &mddev->flags);
| spin_unlock_irq(&mddev->write_lock);
| wake_up(&mddev->sb_wait);
| if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
| sysfs_notify(&mddev->kobj, NULL, "sync_completed");
| 
| list_for_each_entry(rdev, &mddev->disks, same_set) {
| if (test_and_clear_bit(FaultRecorded, &rdev->flags))
| clear_bit(Blocked, &rdev->flags);
| 
| if (any_badblocks_changed)
| md_ack_all_badblocks(&rdev->badblocks);
| clear_bit(BlockedBadBlocks, &rdev->flags);
| wake_up(&rdev->blocked_wait);
| }
| }
| |
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 *
 * Returns 1 when @cmd equals @str, or equals @str followed by a single
 * trailing newline; returns 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* walk the common prefix of the two strings */
	for (; *cmd != '\0' && *str != '\0'; cmd++, str++)
		if (*cmd != *str)
			break;

	/* tolerate one newline at the point where cmd stopped matching */
	if (*cmd == '\n')
		cmd++;

	/* a match requires both strings to be fully consumed */
	return *str == '\0' && *cmd == '\0';
}
| |
| struct rdev_sysfs_entry { |
| struct attribute attr; |
| ssize_t (*show)(struct md_rdev *, char *); |
| ssize_t (*store)(struct md_rdev *, const char *, size_t); |
| }; |
| |
| static ssize_t |
| state_show(struct md_rdev *rdev, char *page) |
| { |
| char *sep = ""; |
| size_t len = 0; |
| |
| if (test_bit(Faulty, &rdev->flags) || |
| rdev->badblocks.unacked_exist) { |
| len+= sprintf(page+len, "%sfaulty",sep); |
| sep = ","; |
| } |
| if (test_bit(In_sync, &rdev->flags)) { |
| len += sprintf(page+len, "%sin_sync",sep); |
| sep = ","; |
| } |
| if (test_bit(WriteMostly, &rdev->flags)) { |
| len += sprintf(page+len, "%swrite_mostly",sep); |
| sep = ","; |
| } |
| if (test_bit(Blocked, &rdev->flags) || |
| (rdev->badblocks.unacked_exist |
| && !test_bit(Faulty, &rdev->flags))) { |
| len += sprintf(page+len, "%sblocked", sep); |
| sep = ","; |
| } |
| if (!test_bit(Faulty, &rdev->flags) && |
| !test_bit(In_sync, &rdev->flags)) { |
| len += sprintf(page+len, "%sspare", sep); |
| sep = ","; |
| } |
| if (test_bit(WriteErrorSeen, &rdev->flags)) { |
| len += sprintf(page+len, "%swrite_error", sep); |
| sep = ","; |
| } |
| return len+sprintf(page+len, "\n"); |
| } |
| |
| static ssize_t |
| state_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| /* can write |
| * faulty - simulates an error |
| * remove - disconnects the device |
| * writemostly - sets write_mostly |
| * -writemostly - clears write_mostly |
| * blocked - sets the Blocked flags |
| * -blocked - clears the Blocked and possibly simulates an error |
| * insync - sets Insync providing device isn't active |
| * write_error - sets WriteErrorSeen |
| * -write_error - clears WriteErrorSeen |
| */ |
| int err = -EINVAL; |
| if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
| md_error(rdev->mddev, rdev); |
| if (test_bit(Faulty, &rdev->flags)) |
| err = 0; |
| else |
| err = -EBUSY; |
| } else if (cmd_match(buf, "remove")) { |
| if (rdev->raid_disk >= 0) |
| err = -EBUSY; |
| else { |
| struct mddev *mddev = rdev->mddev; |
| kick_rdev_from_array(rdev); |
| if (mddev->pers) |
| md_update_sb(mddev, 1); |
| md_new_event(mddev); |
| err = 0; |
| } |
| } else if (cmd_match(buf, "writemostly")) { |
| set_bit(WriteMostly, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-writemostly")) { |
| clear_bit(WriteMostly, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "blocked")) { |
| set_bit(Blocked, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-blocked")) { |
| if (!test_bit(Faulty, &rdev->flags) && |
| rdev->badblocks.unacked_exist) { |
| /* metadata handler doesn't understand badblocks, |
| * so we need to fail the device |
| */ |
| md_error(rdev->mddev, rdev); |
| } |
| clear_bit(Blocked, &rdev->flags); |
| clear_bit(BlockedBadBlocks, &rdev->flags); |
| wake_up(&rdev->blocked_wait); |
| set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| md_wakeup_thread(rdev->mddev->thread); |
| |
| err = 0; |
| } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
| set_bit(In_sync, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "write_error")) { |
| set_bit(WriteErrorSeen, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-write_error")) { |
| clear_bit(WriteErrorSeen, &rdev->flags); |
| err = 0; |
| } |
| if (!err) |
| sysfs_notify_dirent_safe(rdev->sysfs_state); |
| return err ? err : len; |
| } |
| static struct rdev_sysfs_entry rdev_state = |
| __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); |
| |
| static ssize_t |
| errors_show(struct md_rdev *rdev, char *page) |
| { |
| return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); |
| } |
| |
| static ssize_t |
| errors_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| char *e; |
| unsigned long n = simple_strtoul(buf, &e, 10); |
| if (*buf && (*e == 0 || *e == '\n')) { |
| atomic_set(&rdev->corrected_errors, n); |
| return len; |
| } |
| return -EINVAL; |
| } |
| static struct rdev_sysfs_entry rdev_errors = |
| __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); |
| |
| static ssize_t |
| slot_show(struct md_rdev *rdev, char *page) |
| { |
| if (rdev->raid_disk < 0) |
| return sprintf(page, "none\n"); |
| else |
| return sprintf(page, "%d\n", rdev->raid_disk); |
| } |
| |
| static ssize_t |
| slot_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| char *e; |
| int err; |
| int slot = simple_strtoul(buf, &e, 10); |
| if (strncmp(buf, "none", 4)==0) |
| slot = -1; |
| else if (e==buf || (*e && *e!= '\n')) |
| return -EINVAL; |
| if (rdev->mddev->pers && slot == -1) { |
| /* Setting 'slot' on an active array requires also |
| * updating the 'rd%d' link, and communicating |
| * with the personality with ->hot_*_disk. |
| * For now we only support removing |
| * failed/spare devices. This normally happens automatically, |
| * but not when the metadata is externally managed. |
| */ |
| if (rdev->raid_disk == -1) |
| return -EEXIST; |
| /* personality does all needed checks */ |
| if (rdev->mddev->pers->hot_remove_disk == NULL) |
| return -EINVAL; |
| err = rdev->mddev->pers-> |
| hot_remove_disk(rdev->mddev, rdev->raid_disk); |
| if (err) |
| return err; |
| sysfs_unlink_rdev(rdev->mddev, rdev); |
| rdev->raid_disk = -1; |
| set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| md_wakeup_thread(rdev->mddev->thread); |
| } else if (rdev->mddev->pers) { |
| struct md_rdev *rdev2; |
| /* Activating a spare .. or possibly reactivating |
| * if we ever get bitmaps working here. |
| */ |
| |
| if (rdev->raid_disk != -1) |
| return -EBUSY; |
| |
| if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) |
| return -EBUSY; |
| |
| if (rdev->mddev->pers->hot_add_disk == NULL) |
| return -EINVAL; |
| |
| list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) |
| if (rdev2->raid_disk == slot) |
| return -EEXIST; |
| |
| if (slot >= rdev->mddev->raid_disks && |
| slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
| return -ENOSPC; |
| |
| rdev->raid_disk = slot; |
| if (test_bit(In_sync, &rdev->flags)) |
| rdev->saved_raid_disk = slot; |
| else |
| rdev->saved_raid_disk = -1; |
| clear_bit(In_sync, &rdev->flags); |
| err = rdev->mddev->pers-> |
| hot_add_disk(rdev->mddev, rdev); |
| if (err) { |
| rdev->raid_disk = -1; |
| return err; |
| } else |
| sysfs_notify_dirent_safe(rdev->sysfs_state); |
| if (sysfs_link_rdev(rdev->mddev, rdev)) |
| /* failure here is OK */; |
| /* don't wakeup anyone, leave that to userspace. */ |
| } else { |
| if (slot >= rdev->mddev->raid_disks && |
| slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
| return -ENOSPC; |
| rdev->raid_disk = slot; |
| /* assume it is working */ |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| set_bit(In_sync, &rdev->flags); |
| sysfs_notify_dirent_safe(rdev->sysfs_state); |
| } |
| return len; |
| } |
| |
| |
| static struct rdev_sysfs_entry rdev_slot = |
| __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); |
| |
| static ssize_t |
| offset_show(struct md_rdev *rdev, char *page) |
| { |
| return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); |
| } |
| |
| static ssize_t |
| offset_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| char *e; |
| unsigned long long offset = simple_strtoull(buf, &e, 10); |
| if (e==buf || (*e && *e != '\n')) |
| return -EINVAL; |
| if (rdev->mddev->pers && rdev->raid_disk >= 0) |
| return -EBUSY; |
| if (rdev->sectors && rdev->mddev->external) |
| /* Must set offset before size, so overlap checks |
| * can be sane */ |
| return -EBUSY; |
| rdev->data_offset = offset; |
| return len; |
| } |
| |
| static struct rdev_sysfs_entry rdev_offset = |
| __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); |
| |
| static ssize_t |
| rdev_size_show(struct md_rdev *rdev, char *page) |
| { |
| return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); |
| } |
| |
| static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) |
| { |
| /* check if two start/length pairs overlap */ |
| if (s1+l1 <= s2) |
| return 0; |
| if (s2+l2 <= s1) |
| return 0; |
| return 1; |
| } |
| |
| static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) |
| { |
| unsigned long long blocks; |
| sector_t new; |
| |
| if (strict_strtoull(buf, 10, &blocks) < 0) |
| return -EINVAL; |
| |
| if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) |
| return -EINVAL; /* sector conversion overflow */ |
| |
| new = blocks * 2; |
| if (new != blocks * 2) |
| return -EINVAL; /* unsigned long long to sector_t overflow */ |
| |
| *sectors = new; |
| return 0; |
| } |
| |
| static ssize_t |
| rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| struct mddev *my_mddev = rdev->mddev; |
| sector_t oldsectors = rdev->sectors; |
| sector_t sectors; |
| |
| if (strict_blocks_to_sectors(buf, §ors) < 0) |
| return -EINVAL; |
| if (my_mddev->pers && rdev->raid_disk >= 0) { |
| if (my_mddev->persistent) { |
| sectors = super_types[my_mddev->major_version]. |
| rdev_size_change(rdev, sectors); |
| if (!sectors) |
| return -EBUSY; |
| } else if (!sectors) |
| sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
| rdev->data_offset; |
| } |
| if (sectors < my_mddev->dev_sectors) |
| return -EINVAL; /* component must fit device */ |
| |
| rdev->sectors = sectors; |
| if (sectors > oldsectors && my_mddev->external) { |
| /* need to check that all other rdevs with the same ->bdev |
| * do not overlap. We need to unlock the mddev to avoid |
| * a deadlock. We have already changed rdev->sectors, and if |
| * we have to change it back, we will have the lock again. |
| */ |
| struct mddev *mddev; |
| int overlap = 0; |
| struct list_head *tmp; |
| |
| mddev_unlock(my_mddev); |
| for_each_mddev(mddev, tmp) { |
| struct md_rdev *rdev2; |
| |
| mddev_lock(mddev); |
| list_for_each_entry(rdev2, &mddev->disks, same_set) |
| if (rdev->bdev == rdev2->bdev && |
| rdev != rdev2 && |
| overlaps(rdev->data_offset, rdev->sectors, |
| rdev2->data_offset, |
| rdev2->sectors)) { |
| overlap = 1; |
| break; |
| } |
| mddev_unlock(mddev); |
| if (overlap) { |
| mddev_put(mddev); |
| break; |
| } |
| } |
| mddev_lock(my_mddev); |
| if (overlap) { |
| /* Someone else could have slipped in a size |
| * change here, but doing so is just silly. |
| * We put oldsectors back because we *know* it is |
| * safe, and trust userspace not to race with |
| * itself |
| */ |
| rdev->sectors = oldsectors; |
| return -EBUSY; |
| } |
| } |
| return len; |
| } |
| |
| static struct rdev_sysfs_entry rdev_size = |
| __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); |
| |
| |
| static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) |
| { |
| unsigned long long recovery_start = rdev->recovery_offset; |
| |
| if (test_bit(In_sync, &rdev->flags) || |
| recovery_start == MaxSector) |
| return sprintf(page, "none\n"); |
| |
| return sprintf(page, "%llu\n", recovery_start); |
| } |
| |
| static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| unsigned long long recovery_start; |
| |
| if (cmd_match(buf, "none")) |
| recovery_start = MaxSector; |
| else if (strict_strtoull(buf, 10, &recovery_start)) |
| return -EINVAL; |
| |
|