diff -Naur linux-2.4.20-dm-10/drivers/md/Config.in linux-2.4.20-evms-2.0.1/drivers/md/Config.in
--- linux-2.4.20-dm-10/drivers/md/Config.in	2003-04-28 12:00:39.000000000 -0500
+++ linux-2.4.20-evms-2.0.1/drivers/md/Config.in	2003-04-28 12:00:19.000000000 -0500
@@ -16,6 +16,7 @@
 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
 if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
    dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
+   dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
 fi
 endmenu
diff -Naur linux-2.4.20-dm-10/drivers/md/Makefile linux-2.4.20-evms-2.0.1/drivers/md/Makefile
--- linux-2.4.20-dm-10/drivers/md/Makefile	2003-04-28 12:00:39.000000000 -0500
+++ linux-2.4.20-evms-2.0.1/drivers/md/Makefile	2003-04-28 12:00:23.000000000 -0500
@@ -25,6 +25,7 @@
 obj-$(CONFIG_BLK_DEV_MD)	+= md.o
 obj-$(CONFIG_BLK_DEV_LVM)	+= lvm-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
+obj-$(CONFIG_BLK_DEV_DM_BBR)	+= dm-bbr.o dm-io.o
 
 include $(TOPDIR)/Rules.make
diff -Naur linux-2.4.20-dm-10/drivers/md/dm-bbr.c linux-2.4.20-evms-2.0.1/drivers/md/dm-bbr.c
--- linux-2.4.20-dm-10/drivers/md/dm-bbr.c	1969-12-31 18:00:00.000000000 -0600
+++ linux-2.4.20-evms-2.0.1/drivers/md/dm-bbr.c	2003-04-28 11:59:53.000000000 -0500
@@ -0,0 +1,1301 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2002
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.c
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * as a result, this software BBR is activated only when all hardware BBR
+ * replacement sectors have been used.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include "dm.h"
+#include "dm-bbr.h"
+#include "syncio.h"
+
+static struct sync_io_handle *syncio_handle = NULL;
+static char *no_mem_msg = "could not allocate memory";
+#define BBR_NO_MEM()	DMERR("%s: %s", __FUNCTION__, no_mem_msg)
+
+/* List of all BBR nodes. */
+static LIST_HEAD(bbr_instances);
+
+/* Data pertaining to the I/O thread. */
+static struct bbr_thread * bbr_io_thread = NULL;
+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(bbr_io_list);
+
+/* Global pools for bbr_io_buf's and bbr_remap's.
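+ * The bbr_remap pool backs each device's in-memory remap tree; the
+ * bbr_io_buf pool provides the anchors used to track requests that are
+ * routed through the BBR I/O thread. Both pools are shared by all BBR
+ * target instances.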
+ */
+kmem_cache_t * bbr_io_buf_slab;
+mempool_t * bbr_io_buf_pool;
+kmem_cache_t * bbr_remap_slab;
+mempool_t * bbr_remap_pool;
+
+/* Function prototypes */
+static void bbr_io_handler(void * void_data);
+static void bbr_free_remap(struct bbr_private * bbr_id);
+
+static void destroy_pools(void)
+{
+	if (bbr_io_buf_pool) {
+		mempool_destroy(bbr_io_buf_pool);
+		bbr_io_buf_pool = NULL;
+	}
+	if (bbr_io_buf_slab) {
+		kmem_cache_destroy(bbr_io_buf_slab);
+		bbr_io_buf_slab = NULL;
+	}
+	if (bbr_remap_pool) {
+		mempool_destroy(bbr_remap_pool);
+		bbr_remap_pool = NULL;
+	}
+	if (bbr_remap_slab) {
+		kmem_cache_destroy(bbr_remap_slab);
+		bbr_remap_slab = NULL;
+	}
+}
+
+static int create_pools(void)
+{
+	/* Create a memory pool for the remap list. */
+	if (!bbr_remap_slab) {
+		bbr_remap_slab = kmem_cache_create("BBR_Remap_Slab",
+						   sizeof(struct bbr_runtime_remap),
+						   0, SLAB_HWCACHE_ALIGN,
+						   NULL, NULL);
+		if (!bbr_remap_slab) {
+			DMERR("Unable to create BBR remap slab.");
+			goto out;
+		}
+	}
+	if (!bbr_remap_pool) {
+		bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
+						mempool_free_slab,
+						bbr_remap_slab);
+		if (!bbr_remap_pool) {
+			DMERR("Unable to create BBR remap pool.");
+			goto out;
+		}
+	}
+
+	/* Create a memory pool for the BBR I/O anchors. */
+	if (!bbr_io_buf_slab) {
+		bbr_io_buf_slab = kmem_cache_create("BBR_IO_Buf_Slab",
+						    sizeof(struct bbr_io_buffer),
+						    0, SLAB_HWCACHE_ALIGN,
+						    NULL, NULL);
+		if (!bbr_io_buf_slab) {
+			DMERR("Unable to create BBR I/O buffer slab.");
+			goto out;
+		}
+	}
+	if (!bbr_io_buf_pool) {
+		bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
+						 mempool_free_slab,
+						 bbr_io_buf_slab);
+		if (!bbr_io_buf_pool) {
+			DMERR("Unable to create BBR I/O buffer pool.");
+			goto out;
+		}
+	}
+
+out:
+	if (!bbr_remap_slab || !bbr_remap_pool ||
+	    !bbr_io_buf_slab || !bbr_io_buf_pool ) {
+		destroy_pools();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static inline void bbr_flush_signals(void)
+{
+	spin_lock(&current->sigmask_lock);
+	flush_signals(current);
+	spin_unlock(&current->sigmask_lock);
+}
+
+static inline void bbr_init_signals(void)
+{
+	current->exit_signal = SIGCHLD;
+	siginitsetinv(&current->blocked, sigmask(SIGKILL));
+}
+
+static int bbr_thread(void *arg)
+{
+	struct bbr_thread *thread = arg;
+	lock_kernel();
+
+	/* Detach thread */
+	daemonize();
+
+	sprintf(current->comm, thread->name);
+	bbr_init_signals();
+	bbr_flush_signals();
+	thread->tsk = current;
+
+	current->policy = SCHED_OTHER;
+#ifdef O1_SCHEDULER
+	set_user_nice(current, -20);
+#else
+	current->nice = -20;
+#endif
+	unlock_kernel();
+
+	complete(thread->event);
+	while (thread->run) {
+		void (*run) (void *data);
+		DECLARE_WAITQUEUE(wait, current);
+
+		add_wait_queue(&thread->wqueue, &wait);
+#ifdef O1_SCHEDULER
+		set_current_state(TASK_INTERRUPTIBLE);
+#else
+		set_task_state(current, TASK_INTERRUPTIBLE);
+#endif
+		if (!test_bit(BBR_THREAD_WAKEUP, &thread->flags)) {
+			schedule();
+		}
+#ifdef O1_SCHEDULER
+		set_current_state(TASK_RUNNING);
+#else
+		current->state = TASK_RUNNING;
+#endif
+		remove_wait_queue(&thread->wqueue, &wait);
+		clear_bit(BBR_THREAD_WAKEUP, &thread->flags);
+
+		run = thread->run;
+		if (run) {
+			run(thread->data);
+			run_task_queue(&tq_disk);
+		}
+		if (signal_pending(current)) {
+			bbr_flush_signals();
+		}
+	}
+	complete(thread->event);
+	return 0;
+}
+
+static void bbr_wakeup_thread(struct bbr_thread *thread)
+{
+	set_bit(BBR_THREAD_WAKEUP, &thread->flags);
+	wake_up(&thread->wqueue);
+}
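+
+/* Note: bbr_wakeup_thread() sets BBR_THREAD_WAKEUP before waking the
+ * thread, and bbr_thread() re-checks the bit before calling schedule(),
+ * so a wakeup that races with the sleep check is not lost.
+ */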
+
+static void bbr_interrupt_thread(struct bbr_thread *thread)
+{
+	if (!thread->tsk) {
+		DMERR("%s: attempted to interrupt an invalid thread!",
+		      __FUNCTION__);
+		return;
+	}
+	send_sig(SIGKILL, thread->tsk, 1);
+}
+
+static struct bbr_thread *
+bbr_register_thread(void (*run) (void *), void *data, const u8 * name)
+{
+	struct bbr_thread *thread;
+	int ret;
+	struct completion event;
+
+	thread = kmalloc(sizeof (struct bbr_thread), GFP_KERNEL);
+	if (!thread) {
+		return NULL;
+	}
+	memset(thread, 0, sizeof (struct bbr_thread));
+	init_waitqueue_head(&thread->wqueue);
+
+	init_completion(&event);
+	thread->event = &event;
+	thread->run = run;
+	thread->data = data;
+	thread->name = name;
+	ret = kernel_thread(bbr_thread, thread, 0);
+	if (ret < 0) {
+		kfree(thread);
+		return NULL;
+	}
+	wait_for_completion(&event);
+	return thread;
+}
+
+static void bbr_unregister_thread(struct bbr_thread *thread)
+{
+	struct completion event;
+
+	init_completion(&event);
+
+	thread->event = &event;
+	thread->run = NULL;
+	thread->name = NULL;
+	bbr_interrupt_thread(thread);
+	wait_for_completion(&event);
+	kfree(thread);
+}
+
+static int bbr_global_init(void)
+{
+	int rc = 0;
+
+	if ( !bbr_remap_pool || !bbr_io_buf_pool ) {
+		if (create_pools())
+			return -ENOMEM;
+	}
+
+	if (!bbr_io_thread) {
+		const char * name = "bbr_io";
+		bbr_io_thread = bbr_register_thread(bbr_io_handler, NULL, name);
+		if (!bbr_io_thread) {
+			destroy_pools();
+			return -ENOMEM;
+		}
+	}
+	return rc;
+}
+
+static struct bbr_private * bbr_alloc_private(void)
+{
+	struct bbr_private *bbr_id;
+
+	if (bbr_global_init())
+		return NULL;
+
+	bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
+	if (!bbr_id) {
+		BBR_NO_MEM();
+		return NULL;
+	}
+	memset(bbr_id, 0, sizeof(*bbr_id));
+	INIT_LIST_HEAD(&bbr_id->bbr_list);
+	bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
+	bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
+	if (syncio_handle == NULL) {
+		if (sync_io_setup(&syncio_handle, "EVMS-BBR")) {
+			kfree(bbr_id);
+			return NULL;
+		}
+	}
+	return bbr_id;
+}
+
+static void bbr_free_private(struct bbr_private *bbr_id)
+{
+	if (bbr_id->remap_root)
+		bbr_free_remap(bbr_id);
+
+	if (bbr_id->bbr_table)
+		kfree(bbr_id->bbr_table);
+
+	list_del(&bbr_id->bbr_list);
+	kfree(bbr_id);
+}
+
+static u32 crc_table[256];
+static u32 crc_table_built = 0;
+
+static void build_crc_table(void)
+{
+	u32 i, j, crc;
+
+	for (i = 0; i <= 255; i++) {
+		crc = i;
+		for (j = 8; j > 0; j--) {
+			if (crc & 1)
+				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+			else
+				crc >>= 1;
+		}
+		crc_table[i] = crc;
+	}
+	crc_table_built = 1;
+}
+
+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
+{
+	unsigned char *current_byte;
+	u32 temp1, temp2, i;
+
+	current_byte = (unsigned char *) buffer;
+	/* Make sure the crc table is available */
+	if (!crc_table_built)
+		build_crc_table();
+	/* Process each byte in the buffer. */
+	for (i = 0; i < buffersize; i++) {
+		temp1 = (crc >> 8) & 0x00FFFFFF;
+		temp2 = crc_table[(crc ^ (u32) *current_byte) & (u32) 0xff];
+		current_byte++;
+		crc = temp1 ^ temp2;
+	}
+	return (crc);
+}
+
+/**
+ * le_bbr_table_sector_to_cpu
+ *
+ * Convert bbr meta data from on-disk (LE) format
+ * to the native cpu endian format.
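+ *
+ * The conversion is done in place: every field keeps its on-disk size,
+ * so the same sector buffer can later be converted back with
+ * cpu_bbr_table_sector_to_le() and rewritten without being re-read.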
+ **/
+static void le_bbr_table_sector_to_cpu(struct bbr_table * p)
+{
+	int i;
+	p->signature = le32_to_cpup(&p->signature);
+	p->crc = le32_to_cpup(&p->crc);
+	p->sequence_number = le32_to_cpup(&p->sequence_number);
+	p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
+	for ( i = 0; i < BBR_ENTRIES_PER_SECT; i++ ) {
+		p->entries[i].bad_sect =
+			le64_to_cpup(&p->entries[i].bad_sect);
+		p->entries[i].replacement_sect =
+			le64_to_cpup(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * cpu_bbr_table_sector_to_le
+ *
+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
+ **/
+static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
+				       struct bbr_table * le)
+{
+	int i;
+	le->signature = cpu_to_le32p(&p->signature);
+	le->crc = cpu_to_le32p(&p->crc);
+	le->sequence_number = cpu_to_le32p(&p->sequence_number);
+	le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
+	for ( i = 0; i < BBR_ENTRIES_PER_SECT; i++ ) {
+		le->entries[i].bad_sect =
+			cpu_to_le64p(&p->entries[i].bad_sect);
+		le->entries[i].replacement_sect =
+			cpu_to_le64p(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * validate_bbr_table_sector
+ *
+ * Check the specified BBR table sector for a valid signature and CRC.
+ **/
+static int validate_bbr_table_sector(struct bbr_table * p)
+{
+	int rc = 0;
+	int org_crc, final_crc;
+
+	if ( le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE ) {
+		DMERR("BBR table signature doesn't match!");
+		DMERR("Sector has (0x%08X) expected(0x%08X)",
+		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
+		rc = -EINVAL;
+	} else {
+		if (p->crc) {
+			org_crc = le32_to_cpup(&p->crc);
+			p->crc = 0;
+			final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
+			if ( final_crc != org_crc ) {
+				DMERR("CRC failed!");
+				DMERR("Sector has (0x%08X) calculated(0x%08X)",
+				      org_crc, final_crc);
+				rc = -EINVAL;
+			}
+			p->crc = cpu_to_le32p(&org_crc);
+		} else {
+			DMERR("BBR table sector has no CRC!");
+			rc = -EINVAL;
+		}
+	}
+	le_bbr_table_sector_to_cpu(p);
+	return rc;
+}
+
+/**
+ * bbr_binary_tree_insert
+ *
+ * Insert a node into the binary tree.
+ **/
+void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
+			    struct bbr_runtime_remap * newnode)
+{
+	struct bbr_runtime_remap ** node = root;
+	while (node && *node) {
+		if ( newnode->remap.bad_sect > (*node)->remap.bad_sect ) {
+			node = &((*node)->right);
+		} else {
+			node = &((*node)->left);
+		}
+	}
+
+	newnode->left = newnode->right = NULL;
+	*node = newnode;
+}
+
+/**
+ * bbr_binary_search
+ *
+ * Search for a node that contains bad_sect = lsn.
+ **/
+struct bbr_runtime_remap * bbr_binary_search(struct bbr_runtime_remap * root,
+					     u64 lsn)
+{
+	struct bbr_runtime_remap * node = root;
+	while (node) {
+		if (node->remap.bad_sect == lsn) {
+			break;
+		}
+		if ( lsn > node->remap.bad_sect ) {
+			node = node->right;
+		} else {
+			node = node->left;
+		}
+	}
+	return node;
+}
+
+/**
+ * bbr_binary_tree_destroy
+ *
+ * Destroy the binary tree.
+ **/
+void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
+			     struct bbr_private * bbr_id)
+{
+	struct bbr_runtime_remap ** link = NULL;
+	struct bbr_runtime_remap * node = root;
+
+	while (node) {
+		if (node->left) {
+			link = &(node->left);
+			node = node->left;
+			continue;
+		}
+		if (node->right) {
+			link = &(node->right);
+			node = node->right;
+			continue;
+		}
+
+		mempool_free(node, bbr_remap_pool);
+		if (node == root) {
+			/* If root is deleted, we're done. */
+			break;
+		}
+
+		/* Back to root.
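+		 * Each pass walks from the root down to a leaf, frees the
+		 * leaf, and clears the parent link recorded on the way down,
+		 * so the tree is torn down without recursion or an explicit
+		 * stack.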
+		 */
+		node = root;
+		*link = NULL;
+	}
+}
+
+static void bbr_free_remap(struct bbr_private * bbr_id)
+{
+	spin_lock_irq(&bbr_id->bbr_id_lock);
+	bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
+	bbr_id->remap_root = NULL;
+	spin_unlock_irq(&bbr_id->bbr_id_lock);
+}
+
+/**
+ * bbr_insert_remap_entry
+ *
+ * Create a new remap entry and add it to the binary tree for this node.
+ **/
+static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
+				  struct bbr_table_entry * new_bbr_entry)
+{
+	struct bbr_runtime_remap * newnode = NULL;
+	int rc;
+
+	newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
+	if (!newnode) {
+		rc = -ENOMEM;
+		DMERR("Could not allocate from remap pool! (rc=%d)\n", rc);
+		return rc;
+	}
+	newnode->remap.bad_sect = new_bbr_entry->bad_sect;
+	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+	spin_lock_irq(&bbr_id->bbr_id_lock);
+	bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
+	spin_unlock_irq(&bbr_id->bbr_id_lock);
+	return 0;
+}
+
+/**
+ * bbr_table_to_remap_list
+ *
+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
+ * improve run time performance, the in memory remap list must be sorted by
+ * the bad sector LBA. This function is called at discovery time to initialize
+ * the remap list. This function assumes that at least one copy of meta data
+ * is valid.
+ **/
+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
+{
+	u32 in_use_blks = 0;
+	int i, j;
+	struct bbr_table *p;
+
+	for ( i = 0, p = bbr_id->bbr_table;
+	      i < bbr_id->nr_sects_bbr_table;
+	      i++, p++ ) {
+		if (!p->in_use_cnt) {
+			break;
+		}
+		in_use_blks += p->in_use_cnt;
+		for ( j = 0; j < p->in_use_cnt; j++ ) {
+			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
+		}
+	}
+	if (in_use_blks)
+		DMWARN("There are %u BBR entries for device %02d:%02d",
+		       in_use_blks, MAJOR(bbr_id->dev->dev),
+		       MINOR(bbr_id->dev->dev) );
+
+	return in_use_blks;
+}
+
+/**
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector. If found, return a pointer to
+ * the table entry. Otherwise, return NULL.
+ **/
+static struct bbr_table_entry * bbr_search_remap_entry(struct bbr_private * bbr_id,
+						       u64 lsn)
+{
+	struct bbr_runtime_remap * p;
+
+	spin_lock_irq(&bbr_id->bbr_id_lock);
+	p = bbr_binary_search(bbr_id->remap_root, lsn);
+	spin_unlock_irq(&bbr_id->bbr_id_lock);
+	if (p) {
+		return (&p->remap);
+	} else {
+		return NULL;
+	}
+}
+
+/**
+ * bbr_remap
+ *
+ * If *lsn is in the remap table, return TRUE and modify *lsn;
+ * otherwise, return FALSE.
+ **/
+static inline int bbr_remap(struct bbr_private * bbr_id,
+			    u64 * lsn)
+{
+	struct bbr_table_entry *e;
+
+	if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
+	     ! (bbr_id->flag & BBR_STOP_REMAP) ) {
+		e = bbr_search_remap_entry(bbr_id, *lsn);
+		if (e) {
+			*lsn = e->replacement_sect;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_remap_probe
+ *
+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
+ * table, return TRUE; otherwise, return FALSE.
+ **/
+static inline int bbr_remap_probe(struct bbr_private * bbr_id,
+				  u64 lsn, u64 nr_sects)
+{
+	u64 tmp, cnt;
+
+	if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
+	     ! (bbr_id->flag & BBR_STOP_REMAP) ) {
+		for ( cnt = 0, tmp = lsn;
+		      cnt < nr_sects;
+		      cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
+			if ( bbr_remap(bbr_id,&tmp) ) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+static int bbr_setup(struct bbr_private *bbr_id)
+{
+	struct bbr_table *table;
+	struct sync_io_job job;
+	int i;
+	int rc = 0;
+
+	bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
+				    GFP_KERNEL);
+	if (!bbr_id->bbr_table) {
+		BBR_NO_MEM();
+		rc = -ENOMEM;
+		goto bad;
+	}
+
+	job.handle = syncio_handle;
+	job.dev = bbr_id->dev->dev;
+	job.rw = READ;
+	job.start_lsn = bbr_id->lba_table1;
+	job.num_lsns = bbr_id->nr_sects_bbr_table;
+	job.data = bbr_id->bbr_table;
+
+	rc = sync_io(&job);
+
+	if (rc && bbr_id->lba_table2) {
+		job.start_lsn = bbr_id->lba_table2;
+		rc = sync_io(&job);
+	}
+	if (rc)
+		goto bad;
+
+	table = bbr_id->bbr_table;
+	for (i=0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
+		if (validate_bbr_table_sector(table)) {
+			DMERR("%s : failed on BBR sector index : %d",
+			      __FUNCTION__, i);
+			rc = -EINVAL;
+			goto bad;
+		}
+	}
+	atomic_set(&bbr_id->in_use_replacement_blks,
+		   bbr_table_to_remap_list(bbr_id));
+bad:
+	if (rc) {
+		DMERR("%s : FAILED, rc=%d, dev(%d,%d)",
+		      __FUNCTION__, rc, MAJOR(bbr_id->dev->dev),
+		      MINOR(bbr_id->dev->dev) );
+
+		DMERR("%s : table1("PFU64"), table2("PFU64"), nr_sects_bbr_table("PFU64"), start_replacement_sect("PFU64"), nr_replacement_blks("PFU64") ",
+		      __FUNCTION__,
+		      bbr_id->lba_table1, bbr_id->lba_table2,
+		      bbr_id->nr_sects_bbr_table,
+		      bbr_id->start_replacement_sect,
+		      bbr_id->nr_replacement_blks );
+	}
+	return rc;
+}
+
+static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
+						  struct buffer_head * bh,
+						  int rw)
+{
+	struct bbr_io_buffer * bbr_io_buf;
+
+	bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
+	if (bbr_io_buf) {
+		memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
+		INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
+		bbr_io_buf->bbr_id = bbr_id;
+		bbr_io_buf->sector = bh->b_rsector;
+		bbr_io_buf->bh = bh;
+		bbr_io_buf->rw = rw;
+	} else {
+		DMWARN("Could not allocate from BBR I/O buffer pool!\n");
+	}
+	return bbr_io_buf;
+}
+
+static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
+{
+	mempool_free(bbr_io_buf, bbr_io_buf_pool);
+}
+
+/**
+ * bbr_io_remap_error
+ * @bbr_id:		Private data for the BBR node.
+ * @rw:			READ or WRITE.
+ * @starting_lsn:	Starting sector of request to remap.
+ * @count:		Number of sectors in the request.
+ * @buffer:		Data buffer for the request.
+ *
+ * For the requested range, try to write each sector individually. For each
+ * sector that fails, find the next available remap location and write the
+ * data to that new location. Then update the table and write both copies
+ * of the table to disk. Finally, update the in-memory mapping and do any
+ * other necessary bookkeeping.
+ **/
+static int bbr_io_remap_error(struct bbr_private * bbr_id,
+			      int rw,
+			      u64 starting_lsn,
+			      u64 count,
+			      char * buffer )
+{
+	struct bbr_table * bbr_table;
+	struct sync_io_job job;
+	unsigned long table_sector_index;
+	unsigned long table_sector_offset;
+	unsigned long index;
+	u64 lsn, new_lsn;
+	int rc;
+
+	if ( rw == READ ) {
+		/* Nothing can be done about read errors. */
+		return -EIO;
+	}
+
+	job.handle = syncio_handle;
+	job.dev = bbr_id->dev->dev;
+
+	/* For each sector in the request.
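+	 * Writes are retried one sector at a time so that a single bad
+	 * sector costs only one relocation, and so the failing sector can
+	 * be identified precisely.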
+	 */
+	for ( lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE ) {
+		job.rw = rw;
+		job.start_lsn = starting_lsn + lsn;
+		job.num_lsns = 1;
+		job.data = buffer;
+		rc = sync_io(&job);
+		while (rc) {
+			if ( bbr_id->flag & BBR_STOP_REMAP ) {
+				/* Can't allow new remaps if the
+				 * engine told us to stop.
+				 */
+				DMERR("Object %02d:%02d: Bad sector ("PFU64"), but remapping is turned off.\n",
+				      MAJOR(bbr_id->dev->dev),
+				      MINOR(bbr_id->dev->dev),
+				      starting_lsn+lsn);
+				return -EIO;
+			}
+
+			/* Find the next available relocation sector. */
+			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
+			if ( new_lsn >= bbr_id->nr_replacement_blks ) {
+				/* No more replacement sectors available. */
+				return -EIO;
+			}
+			new_lsn += bbr_id->start_replacement_sect;
+
+			/* Write the data to its new location. */
+			DMWARN("Object %02d:%02d: Trying to remap bad sector ("PFU64") to sector ("PFU64")\n",
+			       MAJOR(bbr_id->dev->dev),
+			       MINOR(bbr_id->dev->dev),
+			       starting_lsn + lsn,
+			       new_lsn);
+			job.start_lsn = new_lsn;
+			rc = sync_io(&job);
+			if (rc) {
+				/* This replacement sector is bad.
+				 * Try the next one.
+				 */
+				DMERR("Object %02d:%02d: Replacement sector ("PFU64") is bad. Skipping.\n",
+				      MAJOR(bbr_id->dev->dev),
+				      MINOR(bbr_id->dev->dev),
+				      new_lsn);
+				atomic_inc(&bbr_id->in_use_replacement_blks);
+				continue;
+			}
+
+			/* Add this new entry to the on-disk table. */
+			table_sector_index = new_lsn -
+					     bbr_id->start_replacement_sect;
+			table_sector_offset = table_sector_index /
+					      BBR_ENTRIES_PER_SECT;
+			index = table_sector_index % BBR_ENTRIES_PER_SECT;
+
+			bbr_table = &bbr_id->bbr_table[table_sector_offset];
+			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+			bbr_table->entries[index].replacement_sect = new_lsn;
+			bbr_table->in_use_cnt++;
+			bbr_table->sequence_number++;
+			bbr_table->crc = 0;
+			bbr_table->crc = calculate_crc(INITIAL_CRC,
+						       bbr_table,
+						       sizeof(struct bbr_table));
+
+			/* Write the table to disk. */
+			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+			if ( bbr_id->lba_table1 ) {
+				job.rw = WRITE;
+				job.start_lsn = bbr_id->lba_table1 + table_sector_offset;
+				job.num_lsns = 1;
+				job.data = bbr_table;
+				rc = sync_io(&job);
+			}
+			if ( bbr_id->lba_table2 ) {
+				job.start_lsn = bbr_id->lba_table2 + table_sector_offset;
+				rc |= sync_io(&job);
+			}
+			le_bbr_table_sector_to_cpu(bbr_table);
+
+			if (rc) {
+				/* Error writing one of the tables to disk. */
+				DMERR("Object %02d:%02d: Error updating BBR tables on disk.\n",
+				      MAJOR(bbr_id->dev->dev),
+				      MINOR(bbr_id->dev->dev));
+				return rc;
+			}
+
+			/* Insert a new entry in the remapping binary-tree. */
+			rc = bbr_insert_remap_entry(bbr_id,
+						    &bbr_table->entries[index]);
+			if (rc) {
+				DMERR("Object %02d:%02d: Error adding new entry to remap tree.\n",
+				      MAJOR(bbr_id->dev->dev),
+				      MINOR(bbr_id->dev->dev));
+				return rc;
+			}
+
+			atomic_inc(&bbr_id->in_use_replacement_blks);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * bbr_io_process_request
+ *
+ * For each sector in this request, check if the sector has already
+ * been remapped. If so, process all previous sectors in the request,
+ * followed by the remapped sector. Then reset the starting lsn and
+ * count, and keep going with the rest of the request as if it were
+ * a whole new request. If any of the sync_io's return an error,
+ * call the remapper to relocate the bad sector(s).
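+ *
+ * For example (illustrative numbers): with one remap for sector 108, a
+ * four-sector request covering sectors 106-109 is issued as 106-107,
+ * then the remapped copy of 108, and finally 109, which is picked up
+ * by the trailing-sectors check at the end of the function.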
+ **/
+static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
+{
+	struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
+	struct sync_io_job job;
+	u64 starting_lsn = bbr_io_buf->sector;
+	u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
+	u64 lsn, remapped_lsn;
+	char * buffer = bbr_io_buf->bh->b_data;
+	int rc = 0;
+
+	job.handle = syncio_handle;
+	job.dev = bbr_id->dev->dev;
+	job.rw = bbr_io_buf->rw;
+
+	/* For each sector in this request, check if this sector has already
+	 * been remapped. If so, process all previous sectors in this request,
+	 * followed by the remapped sector. Then reset the starting lsn and
+	 * count and keep going with the rest of the request as if it were
+	 * a whole new request.
+	 */
+	for ( lsn = 0; lsn < count && !(bbr_id->flag & BBR_STOP_REMAP); lsn++ ) {
+		remapped_lsn = starting_lsn + lsn;
+		rc = bbr_remap(bbr_id, &remapped_lsn);
+		if (!rc) {
+			/* This sector is fine. */
+			continue;
+		}
+
+		/* Process all sectors in the request up to this one. */
+		if ( lsn > 0 ) {
+			job.start_lsn = starting_lsn;
+			job.num_lsns = lsn;
+			job.data = buffer;
+			rc = sync_io(&job);
+			if (rc) {
+				/* If this I/O failed, then one of the sectors
+				 * in this request needs to be relocated.
+				 */
+				rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw,
+							starting_lsn,
+							lsn, buffer);
+				if (rc) {
+					return rc;
+				}
+			}
+			buffer += (lsn << SECTOR_SHIFT);
+		}
+
+		/* Process the remapped sector. */
+		job.start_lsn = remapped_lsn;
+		job.num_lsns = 1;
+		job.data = buffer;
+		rc = sync_io(&job);
+		if (rc) {
+			/* BUGBUG - Need more processing if this caused an
+			 * error. If this I/O failed, then the existing
+			 * remap is now bad, and we need to find a new remap.
+			 * Can't use bbr_io_remap_error(), because the existing
+			 * map entry needs to be changed, not added again, and
+			 * the original table entry also needs to be changed.
+			 */
+			return rc;
+		}
+
+		buffer += SECTOR_SIZE;
+		starting_lsn += (lsn + 1);
+		count -= (lsn + 1);
+		lsn = -1;
+	}
+
+	/* Check for any remaining sectors after the last split. This could
+	 * potentially be the whole request, but that should be a rare case
+	 * because requests should only be processed by the thread if we know
+	 * an error occurred or they contained one or more remapped sectors.
+	 */
+	if ( count ) {
+		job.start_lsn = starting_lsn;
+		job.num_lsns = count;
+		job.data = buffer;
+		rc = sync_io(&job);
+		if (rc) {
+			/* If this I/O failed, then one of the sectors in this
+			 * request needs to be relocated.
+			 */
+			rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw,
+						starting_lsn, count, buffer);
+			if (rc) {
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * bbr_io_handler
+ *
+ * This is the handler for the bbr_io_thread. It continuously loops,
+ * taking I/O requests off its list and processing them. If nothing
+ * is on the list, the thread goes back to sleep until specifically
+ * woken up.
+ *
+ * I/O requests should only be sent to this thread if we know that:
+ * a) the request contains at least one remapped sector.
+ *    or
+ * b) the request caused an error on the normal I/O path.
+ * This function uses synchronous I/O, so sending a request to this
+ * thread that doesn't need special processing will cause severe
+ * performance degradation.
+ **/
+static void bbr_io_handler(void * void_data)
+{
+	struct bbr_io_buffer * bbr_io_buf;
+	struct buffer_head * bh;
+	unsigned long flags;
+	int rc = 0;
+
+	while (1) {
+		/* Process bbr_io_list, one entry at a time.
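+		 * The lock is held only while an entry is unlinked from the
+		 * list; the request itself is processed after the lock is
+		 * dropped, so new work can still be queued while a slow
+		 * relocation is in flight.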
+		 */
+		spin_lock_irqsave(&bbr_io_list_lock, flags);
+		if (list_empty(&bbr_io_list)) {
+			/* No more items on the list. */
+			spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+			break;
+		}
+		bbr_io_buf = list_entry(bbr_io_list.next,
+					struct bbr_io_buffer, bbr_io_list);
+		list_del_init(&bbr_io_buf->bbr_io_list);
+		spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+
+		rc = bbr_io_process_request(bbr_io_buf);
+
+		/* Clean up and complete the original I/O. */
+		bbr_io_buf->flags |= BBR_IO_HANDLED;
+		bh = bbr_io_buf->bh;
+		if (bh->b_end_io) {
+			/* If this was the bbr_io_buf for an error on the
+			 * normal WRITE, don't free it here. It will be freed
+			 * later in bbr_callback().
+			 */
+			if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
+				free_bbr_io_buf(bbr_io_buf);
+			bh->b_end_io(bh, rc ? 0 : 1);
+		}
+	}
+}
+
+/**
+ * bbr_schedule_io
+ *
+ * Place the specified bbr_io_buf on the thread's processing list.
+ **/
+static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&bbr_io_list_lock, flags);
+	list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
+	spin_unlock_irqrestore(&bbr_io_list_lock, flags);
+	bbr_wakeup_thread(bbr_io_thread);
+}
+
+/**
+ * bbr_read
+ *
+ * If there are any remapped sectors on this object, send this request over
+ * to the thread for processing. Otherwise send it down the stack normally.
+ **/
+static int bbr_read(struct bbr_private *bbr_id,
+		    struct buffer_head * bh)
+{
+	struct bbr_io_buffer * bbr_io_buf;
+
+	if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+	     bbr_id->flag & BBR_STOP_REMAP ||
+	     ! bbr_remap_probe(bbr_id, bh->b_rsector,
+			       bh->b_size >> SECTOR_SHIFT) ) {
+		/* No existing remaps, this request doesn't contain any
+		 * remapped sectors, or the engine told us not to remap.
+		 */
+		bh->b_rdev = bbr_id->dev->dev;
+		return 1;
+	}
+
+	/* This request has at least one remapped sector. */
+	bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
+	if (!bbr_io_buf) {
+		/* Can't get memory to track the I/O. */
+		BBR_NO_MEM();
+		bh->b_end_io(bh, 0);
+		return (-ENOMEM);
+	}
+
+	bbr_schedule_io(bbr_io_buf);
+	return 0;
+}
+
+/**
+ * bbr_callback
+ *
+ * This is the callback for normal write requests. Check for an error
+ * during the I/O, and send to the thread for processing if necessary.
+ **/
+static int bbr_callback(struct dm_target *ti,
+			struct buffer_head * bh,
+			int rw,
+			int error,
+			void *map_context)
+{
+	struct bbr_io_buffer * bbr_io_buf;
+
+	if (map_context == NULL)
+		return error;
+
+	bbr_io_buf = (struct bbr_io_buffer *) map_context;
+
+	/* Will try to relocate the WRITE if:
+	 * - It is an error, and
+	 * - It is not an error of BBR relocation, and
+	 * - no one told us to stop remapping.
+	 */
+	if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)
+	    && !(bbr_io_buf->bbr_id->flag & BBR_STOP_REMAP)) {
+		DMERR("Object %02d:%02d: Write failure on sector (%lu). Scheduling for retry.\n",
+		      MAJOR(bh->b_rdev),
+		      MINOR(bh->b_rdev),
+		      (unsigned long)bbr_io_buf->sector);
+		/* Indicate this bbr_io_buf is for an error on a normal WRITE. */
+		bbr_io_buf->flags |= BBR_IO_RELOCATE;
+		bbr_schedule_io(bbr_io_buf);
+		return (1); /* Returns >0 so that DM will let us retry the I/O. */
+	} else {
+		free_bbr_io_buf(bbr_io_buf);
+	}
+	return (error);
+}
+
+/**
+ * bbr_write
+ *
+ * If there are any remapped sectors on this object, send the request over
+ * to the thread for processing. Otherwise, register for callback
+ * notification, and send the request down normally.
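+ *
+ * Note that a bbr_io_buf is allocated in both cases, because
+ * bbr_callback() needs it to schedule a relocation if the normal
+ * write later fails.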
+ **/
+static int bbr_write(struct bbr_private *bbr_id,
+		     struct buffer_head * bh,
+		     void **map_context)
+{
+	struct bbr_io_buffer * bbr_io_buf;
+
+	bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
+	if (!bbr_io_buf) {
+		/* Can't get memory to track the I/O. */
+		BBR_NO_MEM();
+		bh->b_end_io(bh, 0);
+		return (-ENOMEM);
+	}
+
+	if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+	     bbr_id->flag & BBR_STOP_REMAP ||
+	     ! bbr_remap_probe(bbr_id, bh->b_rsector,
+			       bh->b_size >> SECTOR_SHIFT) ) {
+		/* No existing remaps, this request contains no remapped
+		 * sectors, or the engine said to stop remapping.
+		 */
+		bh->b_rdev = bbr_id->dev->dev;
+		*map_context = bbr_io_buf;
+		return (1);
+	} else {
+		/* This request contains at least one remapped sector. */
+		*map_context = NULL;
+		bbr_schedule_io(bbr_io_buf);
+	}
+	return 0;
+}
+
+/**
+ * Construct a bbr mapping
+ **/
+static int bbr_ctr(struct dm_target *ti, int argc, char **argv)
+{
+	struct bbr_private *bbr_id;
+	unsigned long block_size;
+	char* end;
+
+	if (argc != 8) {
+		ti->error = "dm-bbr requires exactly 8 arguments: "
+			    "device offset lsn_table1 lsn_table2 table_size "
+			    "start_replacement nr_replacement_blks block_size";
+		return -EINVAL;
+	}
+
+	bbr_id = bbr_alloc_private();
+	if (bbr_id == NULL) {
+		BBR_NO_MEM();
+		ti->error = no_mem_msg;
+		return -ENOMEM;
+	}
+
+	if (dm_get_device(ti, argv[0], 0, ti->len,
+			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
+		ti->error = "dm-bbr: Device lookup failed";
+		goto bad;
+	}
+
+	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
+	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
+	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
+	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
+	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
+	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
+	block_size = simple_strtoul(argv[7], &end, 10);
+	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
+
+	if (bbr_setup(bbr_id)) {
+		ti->error = "dm-bbr: Device setup failed";
+		goto bad;
+	}
+
+	list_add(&bbr_id->bbr_list, &bbr_instances);
+
+	ti->private = bbr_id;
+
+	return 0;
+bad:
+	bbr_free_private(bbr_id);
+	return -EINVAL;
+}
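+
+/* Example (illustrative values only): a dmsetup table line for a BBR
+ * target over /dev/sda3 might look like
+ *
+ *	0 2097152 bbr /dev/sda3 0 2097153 2097217 63 2097281 62 1024
+ *
+ * i.e. a 1GB mapping at offset 0, with the primary table at sector
+ * 2097153, the secondary at 2097217, 63 table sectors, the replacement
+ * area starting at 2097281 with 62 replacement blocks, and a 1024-byte
+ * block size. The exact values are normally generated by the EVMS
+ * engine rather than written by hand.
+ */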
+
+static void bbr_dtr(struct dm_target *ti)
+{
+	struct bbr_private *bbr_id = (struct bbr_private *) ti->private;
+
+	dm_put_device(ti, bbr_id->dev);
+	bbr_free_private(bbr_id);
+
+	if (list_empty(&bbr_instances)) {
+		destroy_pools();
+		if (bbr_io_thread) {
+			bbr_unregister_thread(bbr_io_thread);
+			bbr_io_thread = NULL;
+		}
+		if (syncio_handle) {
+			sync_io_cleanup(syncio_handle);
+			syncio_handle = NULL;
+		}
+	}
+}
+
+static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+		   void **map_context)
+{
+	struct bbr_private *bbr_id = (struct bbr_private *) ti->private;
+
+	bh->b_rsector += bbr_id->offset;
+	switch (rw) {
+	case READ:
+	case READA:
+		*map_context = NULL;
+		return bbr_read(bbr_id, bh);
+	case WRITE:
+		return bbr_write(bbr_id, bh, map_context);
+	default:
+		return -EIO;
+	}
+}
+
+static int bbr_status(struct dm_target *ti, status_type_t type,
+		      char *result, int maxlen)
+{
+	struct bbr_private *bbr_id = (struct bbr_private *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
+			 bdevname(bbr_id->dev->dev), bbr_id->offset,
+			 bbr_id->lba_table1, bbr_id->lba_table2,
+			 bbr_id->nr_sects_bbr_table,
+			 bbr_id->start_replacement_sect,
+			 bbr_id->nr_replacement_blks,
+			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
+		DMINFO("dm-bbr status: %s", result);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type bbr_target = {
+	name:	"bbr",
+	module:	THIS_MODULE,
+	ctr:	bbr_ctr,
+	dtr:	bbr_dtr,
+	map:	bbr_map,
+	end_io:	bbr_callback,
+	status:	bbr_status,
+};
+
+int __init dm_bbr_init(void)
+{
+	int r = dm_register_target(&bbr_target);
+
+	if (r < 0)
+		DMERR("dm-bbr: register failed %d", r);
+
+	return r;
+}
+
+void __exit dm_bbr_exit(void)
+{
+	int r = dm_unregister_target(&bbr_target);
+
+	if (r < 0)
+		DMERR("dm-bbr: unregister failed %d", r);
+}
+
+module_init(dm_bbr_init);
+module_exit(dm_bbr_exit);
+MODULE_LICENSE("GPL");
diff -Naur linux-2.4.20-dm-10/drivers/md/dm-bbr.h linux-2.4.20-evms-2.0.1/drivers/md/dm-bbr.h
--- linux-2.4.20-dm-10/drivers/md/dm-bbr.h	1969-12-31 18:00:00.000000000 -0600
+++ linux-2.4.20-evms-2.0.1/drivers/md/dm-bbr.h	2003-04-28 11:59:53.000000000 -0500
@@ -0,0 +1,174 @@
+/*
+ *   Copyright (c) International Business Machines Corp., 2000
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program;  if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/include/linux/dm-bbr.h
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * as a result, this software BBR is activated only when all hardware BBR
+ * replacement sectors have been used.
+ */
+
+#ifndef __DM_BBR__
+#define __DM_BBR__
+
+#define BBR_TABLE_SIGNATURE	0x42627254 /* BbrT */
+#define BBR_ENTRIES_PER_SECT	31
+#define BBR_NR_BUFS		128
+#define INITIAL_CRC		0xFFFFFFFF
+#define CRC_POLYNOMIAL		0xEDB88320L
+#define BBR_STOP_REMAP		(1<<0)
+
+/**
+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
+ * Use these in place of %Ld, %Lu, and %Lx.
+ **/
+#if BITS_PER_LONG > 32
+#define PFU64 "%lu"
+#else
+#define PFU64 "%Lu"
+#endif
+
+/**
+ * struct bbr_table_entry
+ * @bad_sect:		LBA of bad location.
+ * @replacement_sect:	LBA of new location.
+ *
+ * Structure to describe one BBR remap.
+ **/
+struct bbr_table_entry {
+	u64 bad_sect;
+	u64 replacement_sect;
+};
+
+/**
+ * struct bbr_table
+ * @signature:		Signature on each BBR table sector.
+ * @crc:		CRC for this table sector.
+ * @sequence_number:	Used to resolve conflicts when primary and secondary
+ *			tables do not match.
+ * @in_use_cnt:		Number of in-use table entries.
+ * @entries:		Actual table of remaps.
+ *
+ * Structure to describe each sector of the metadata table. Each sector in
+ * this table can describe 31 remapped sectors.
+ **/
+struct bbr_table {
+	u32			signature;
+	u32			crc;
+	u32			sequence_number;
+	u32			in_use_cnt;
+	struct bbr_table_entry	entries[BBR_ENTRIES_PER_SECT];
+};
+
+/**
+ * struct bbr_runtime_remap
+ *
+ * Node in the binary tree used to keep track of remaps.
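+ *
+ * The tree is keyed by remap.bad_sect and is never rebalanced; with the
+ * small number of remaps expected per device, lookup cost is not a
+ * concern.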
+ **/
+struct bbr_runtime_remap {
+	struct bbr_table_entry		remap;
+	struct bbr_runtime_remap	*left;
+	struct bbr_runtime_remap	*right;
+};
+
+/**
+ * struct bbr_private
+ * @bbr_list:			List of all bbr_private structures.
+ * @dev:			Info about underlying device.
+ * @bbr_table:			Copy of metadata table.
+ * @offset:
+ * @lba_table1:			LBA of primary BBR table.
+ * @lba_table2:			LBA of secondary BBR table.
+ * @nr_sects_bbr_table:		Size of each BBR table.
+ * @nr_replacement_blks:	Number of replacement sectors.
+ * @start_replacement_sect:	LBA of start of replacement sectors.
+ * @blksize_in_sects:		Size of each block, in sectors.
+ * @in_use_replacement_blks:	Current number of remaps.
+ * @remap_root:			Binary tree containing all remaps.
+ * @bbr_id_lock:		Lock for the binary tree.
+ * @flag:			BBR_STOP_REMAP
+ **/
+struct bbr_private {
+	struct list_head		bbr_list;
+	struct dm_dev			*dev;
+	struct bbr_table		*bbr_table;
+	u64				offset;
+	u64				lba_table1;
+	u64				lba_table2;
+	u64				nr_sects_bbr_table;
+	u64				nr_replacement_blks;
+	u64				start_replacement_sect;
+	u32				blksize_in_sects;
+	atomic_t			in_use_replacement_blks;
+	struct bbr_runtime_remap	*remap_root;
+	spinlock_t			bbr_id_lock;
+	u32				flag;
+};
+
+#define BBR_IO_HANDLED	(1<<0)
+#define BBR_IO_RELOCATE	(1<<1)
+
+/**
+ * struct bbr_io_buffer
+ * @bbr_io_list:	Thread's list of bbr_io_buf's.
+ * @bbr_id:		Object for this request.
+ * @bh:			Original buffer_head.
+ * @sector:		Original sector.
+ * @flags:		Operation flags.
+ * @rw:			READ or WRITE.
+ * @rc:			Return code from bbr_io_handler.
+ *
+ * Structure used to track each write request.
+ **/
+struct bbr_io_buffer {
+	struct list_head	bbr_io_list;
+	struct bbr_private	*bbr_id;
+	struct buffer_head	*bh;
+	u64			sector;
+	u32			flags;
+	s32			rw;
+	s32			rc;
+};
+
+#define BBR_THREAD_WAKEUP 0
+
+/**
+ * struct bbr_thread
+ * @run:
+ * @data:
+ * @wqueue:	thread wait queue
+ * @flags:	thread attributes
+ * @event:	event completion
+ * @tsk:	task info
+ * @name:	thread name
+ *
+ * data structure for creating/managing a kernel thread
+ **/
+struct bbr_thread {
+	void			(*run) (void *data);
+	void			*data;
+	wait_queue_head_t	wqueue;
+	unsigned long		flags;
+	struct completion	*event;
+	struct task_struct	*tsk;
+	const u8		*name;
+};
+
+#endif
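
A note on the metadata CRC: validate_bbr_table_sector() uses standard CRC-32 (polynomial 0xEDB88320, initial value 0xFFFFFFFF, no final inversion), computed over the 512-byte table sector with its crc field zeroed. The following is a minimal userspace sketch for checking a dumped table sector; it assumes a little-endian host, and the file name is hypothetical:

/* bbr-crc-check.c -- verify one dumped BBR table sector (sketch). */
#include <stdio.h>
#include <stdint.h>

#define BBR_TABLE_SIGNATURE	0x42627254	/* "BbrT" */
#define CRC_POLYNOMIAL		0xEDB88320UL
#define INITIAL_CRC		0xFFFFFFFFUL

struct bbr_table_entry { uint64_t bad_sect, replacement_sect; };
struct bbr_table {			/* 512 bytes, little-endian on disk */
	uint32_t signature, crc, sequence_number, in_use_cnt;
	struct bbr_table_entry entries[31];
};

/* Same algorithm as calculate_crc() in dm-bbr.c. */
static uint32_t calc_crc(uint32_t crc, const void *buf, size_t len)
{
	static uint32_t table[256];
	const unsigned char *p = buf;

	if (!table[1]) {		/* build the table on first use */
		for (uint32_t i = 0; i < 256; i++) {
			uint32_t c = i;
			for (int j = 0; j < 8; j++)
				c = (c & 1) ? (c >> 1) ^ CRC_POLYNOMIAL : c >> 1;
			table[i] = c;
		}
	}
	while (len--)
		crc = (crc >> 8) ^ table[(crc ^ *p++) & 0xff];
	return crc;
}

int main(int argc, char **argv)
{
	struct bbr_table t;
	uint32_t stored, calc;
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <table-sector.bin>\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "rb");
	if (!f || fread(&t, sizeof(t), 1, f) != 1) {
		perror(argv[1]);
		return 1;
	}
	fclose(f);
	if (t.signature != BBR_TABLE_SIGNATURE) {
		fprintf(stderr, "bad signature 0x%08X\n", t.signature);
		return 1;
	}
	stored = t.crc;
	t.crc = 0;			/* CRC is computed with the crc field zeroed */
	calc = calc_crc(INITIAL_CRC, &t, sizeof(t));
	printf("stored 0x%08X calculated 0x%08X -> %s\n",
	       stored, calc, stored == calc ? "OK" : "MISMATCH");
	return stored != calc;
}

Compile with "cc -std=c99 bbr-crc-check.c -o bbr-crc-check" and feed it one 512-byte sector extracted with dd, e.g. "dd if=/dev/sda3 bs=512 skip=<lba_table1> count=1 of=sector.bin" (device and sector number are illustrative).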