diff -ruN linux-2.6.20.3/fs/ext3cow/acl.c linux-2.6.20.3-ext3cow/fs/ext3cow/acl.c --- linux-2.6.20.3/fs/ext3cow/acl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/acl.c 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,551 @@ +/* + * linux/fs/ext3cow/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext3cow_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext3cow_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext3cow_acl_header *)value)->a_version != + cpu_to_le32(EXT3COW_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext3cow_acl_header); + count = ext3cow_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + ext3cow_acl_entry *entry = + (ext3cow_acl_entry *)value; + if ((char *)value + sizeof(ext3cow_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext3cow_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext3cow_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext3cow_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext3cow_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext3cow_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext3cow_acl_header) + acl->a_count * + sizeof(ext3cow_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT3COW_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext3cow_acl_header); + for (n=0; n < acl->a_count; n++) { + ext3cow_acl_entry *entry = (ext3cow_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext3cow_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext3cow_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +static inline struct posix_acl * +ext3cow_iget_acl(struct inode *inode, struct posix_acl **i_acl) +{ + struct posix_acl *acl = EXT3COW_ACL_NOT_CACHED; + + spin_lock(&inode->i_lock); + if (*i_acl != EXT3COW_ACL_NOT_CACHED) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + +static inline void +ext3cow_iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != EXT3COW_ACL_NOT_CACHED) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext3cow_get_acl(struct inode *inode, int type) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + switch(type) { + case ACL_TYPE_ACCESS: + acl = ext3cow_iget_acl(inode, &ei->i_acl); + if (acl != EXT3COW_ACL_NOT_CACHED) + return acl; + name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + acl = ext3cow_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT3COW_ACL_NOT_CACHED) + return acl; + name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + + default: + return ERR_PTR(-EINVAL); + } + retval = ext3cow_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext3cow_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext3cow_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) { + switch(type) { + case ACL_TYPE_ACCESS: + ext3cow_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext3cow_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext3cow_new_inode + */ +static int +ext3cow_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch(type) { + case ACL_TYPE_ACCESS: + name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + ext3cow_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext3cow_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext3cow_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) { + switch(type) { + case ACL_TYPE_ACCESS: + ext3cow_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext3cow_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return error; +} + +static int +ext3cow_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = ext3cow_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int +ext3cow_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, ext3cow_check_acl); +} + +/* + * Initialize the ACLs of a new inode. Called from ext3cow_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext3cow_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext3cow_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext3cow_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext3cow_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext3cow_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext3cow_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext3cow_journal_start(inode, + EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext3cow_std_error(inode->i_sb, error); + goto out; + } + error = ext3cow_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext3cow_journal_stop(handle); + if (error == -ENOSPC && + ext3cow_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext3cow_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext3cow_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext3cow_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext3cow_get_acl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext3cow_xattr_get_acl_access(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3cow_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +ext3cow_xattr_get_acl_default(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3cow_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +ext3cow_xattr_set_acl(struct inode *inode, int type, const void *value, + size_t size) +{ + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext3cow_journal_start(inode, EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext3cow_set_acl(handle, inode, type, acl); + ext3cow_journal_stop(handle); + if (error == -ENOSPC && ext3cow_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +static int +ext3cow_xattr_set_acl_access(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3cow_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +ext3cow_xattr_set_acl_default(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext3cow_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +struct xattr_handler ext3cow_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = ext3cow_xattr_list_acl_access, + .get = ext3cow_xattr_get_acl_access, + .set = ext3cow_xattr_set_acl_access, +}; + +struct xattr_handler ext3cow_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = ext3cow_xattr_list_acl_default, + .get = ext3cow_xattr_get_acl_default, + .set = ext3cow_xattr_set_acl_default, +}; diff -ruN linux-2.6.20.3/fs/ext3cow/acl.h linux-2.6.20.3-ext3cow/fs/ext3cow/acl.h --- linux-2.6.20.3/fs/ext3cow/acl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/acl.h 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,81 @@ +/* + File: fs/ext3cow/acl.h + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +#define EXT3COW_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext3cow_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext3cow_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext3cow_acl_header; + +static inline size_t ext3cow_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext3cow_acl_header) + + count * sizeof(ext3cow_acl_entry_short); + } else { + return sizeof(ext3cow_acl_header) + + 4 * sizeof(ext3cow_acl_entry_short) + + (count - 4) * sizeof(ext3cow_acl_entry); + } +} + +static inline int ext3cow_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext3cow_acl_header); + s = size - 4 * sizeof(ext3cow_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext3cow_acl_entry_short)) + return -1; + return size / sizeof(ext3cow_acl_entry_short); + } else { + if (s % sizeof(ext3cow_acl_entry)) + return -1; + return s / sizeof(ext3cow_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + +/* Value for inode->u.ext3cow_i.i_acl and inode->u.ext3cow_i.i_default_acl + if the ACL has not been cached */ +#define EXT3COW_ACL_NOT_CACHED ((void *)-1) + +/* acl.c */ +extern int ext3cow_permission (struct inode *, int, struct nameidata *); +extern int ext3cow_acl_chmod (struct inode *); +extern int ext3cow_init_acl (handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT3COW_FS_POSIX_ACL */ +#include +#define ext3cow_permission NULL + +static inline int +ext3cow_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext3cow_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT3COW_FS_POSIX_ACL */ + diff -ruN linux-2.6.20.3/fs/ext3cow/balloc.c linux-2.6.20.3-ext3cow/fs/ext3cow/balloc.c --- linux-2.6.20.3/fs/ext3cow/balloc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/balloc.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,1823 @@ +/* + * linux/fs/ext3cow/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3cow_read_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/** + * ext3cow_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext3cow_group_desc * ext3cow_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext3cow_group_desc * desc; + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext3cow_error (sb, "ext3cow_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT3COW_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT3COW_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext3cow_error (sb, "ext3cow_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext3cow_group_desc *) sbi->s_group_desc[group_desc]->b_data; + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc + offset; +} + +/** + * read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group, reading into the specified + * slot in the superblock's bitmap cache. + * + * Return buffer_head on success or NULL in case of failure. + */ +static struct buffer_head * +read_block_bitmap(struct super_block *sb, unsigned int block_group) +{ + struct ext3cow_group_desc * desc; + struct buffer_head * bh = NULL; + + desc = ext3cow_get_group_desc (sb, block_group, NULL); + if (!desc) + goto error_out; + bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap)); + if (!bh) + ext3cow_error (sb, "read_block_bitmap", + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %u", + block_group, le32_to_cpu(desc->bg_block_bitmap)); +error_out: + return bh; +} +/* + * The reservation window structure operations + * -------------------------------------------- + * Operations include: + * dump, find, add, remove, is_empty, find_next_reservable_window, etc. + * + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map + * + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. + */ +#if 1 +static void __rsv_window_dump(struct rb_root *root, int verbose, + const char *fn) +{ + struct rb_node *n; + struct ext3cow_reserve_window_node *rsv, *prev; + int bad; + +restart: + n = rb_first(root); + bad = 0; + prev = NULL; + + printk("Block Allocation Reservation Windows Map (%s):\n", fn); + while (n) { + rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); + if (verbose) + printk("reservation window 0x%p " + "start: %lu, end: %lu\n", + rsv, rsv->rsv_start, rsv->rsv_end); + if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { + printk("Bad reservation %p (start >= end)\n", + rsv); + bad = 1; + } + if (prev && prev->rsv_end >= rsv->rsv_start) { + printk("Bad reservation %p (prev->end >= start)\n", + rsv); + bad = 1; + } + if (bad) { + if (!verbose) { + printk("Restarting reservation walk in verbose mode\n"); + verbose = 1; + goto restart; + } + } + n = rb_next(n); + prev = rsv; + } + printk("Window map complete.\n"); + if (bad) + BUG(); +} +#define rsv_window_dump(root, verbose) \ + __rsv_window_dump((root), (verbose), __FUNCTION__) +#else +#define rsv_window_dump(root, verbose) do {} while (0) +#endif + +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ +static int +goal_in_my_reservation(struct ext3cow_reserve_window *rsv, ext3cow_grpblk_t grp_goal, + unsigned int group, struct super_block * sb) +{ + ext3cow_fsblk_t group_first_block, group_last_block; + + group_first_block = ext3cow_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); + + if ((rsv->_rsv_start > group_last_block) || + (rsv->_rsv_end < group_first_block)) + return 0; + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) + return 0; + return 1; +} + +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * + * Find the reserved window which includes the goal, or the previous one + * if the goal is not in any window. + * Returns NULL if there are no windows or if all windows start after the goal. + */ +static struct ext3cow_reserve_window_node * +search_reserve_window(struct rb_root *root, ext3cow_fsblk_t goal) +{ + struct rb_node *n = root->rb_node; + struct ext3cow_reserve_window_node *rsv; + + if (!n) + return NULL; + + do { + rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); + + if (goal < rsv->rsv_start) + n = n->rb_left; + else if (goal > rsv->rsv_end) + n = n->rb_right; + else + return rsv; + } while (n); + /* + * We've fallen off the end of the tree: the goal wasn't inside + * any particular node. OK, the previous node must be to one + * side of the interval containing the goal. If it's the RHS, + * we need to back up one. + */ + if (rsv->rsv_start > goal) { + n = rb_prev(&rsv->rsv_node); + rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); + } + return rsv; +} + +/** + * ext3cow_rsv_window_add() -- Insert a window to the block reservation rb tree. + * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock hold. + */ +void ext3cow_rsv_window_add(struct super_block *sb, + struct ext3cow_reserve_window_node *rsv) +{ + struct rb_root *root = &EXT3COW_SB(sb)->s_rsv_window_root; + struct rb_node *node = &rsv->rsv_node; + ext3cow_fsblk_t start = rsv->rsv_start; + + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct ext3cow_reserve_window_node *this; + + while (*p) + { + parent = *p; + this = rb_entry(parent, struct ext3cow_reserve_window_node, rsv_node); + + if (start < this->rsv_start) + p = &(*p)->rb_left; + else if (start > this->rsv_end) + p = &(*p)->rb_right; + else { + rsv_window_dump(root, 1); + BUG(); + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); +} + +/** + * ext3cow_rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock hold. + */ +static void rsv_window_remove(struct super_block *sb, + struct ext3cow_reserve_window_node *rsv) +{ + rsv->rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_alloc_hit = 0; + rb_erase(&rsv->rsv_node, &EXT3COW_SB(sb)->s_rsv_window_root); +} + +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED. + */ +static inline int rsv_is_empty(struct ext3cow_reserve_window *rsv) +{ + /* a valid reservation end block could not be 0 */ + return rsv->_rsv_end == EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; +} + +/** + * ext3cow_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext3cow inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext3cow inode the first time the open file + * needs a new block. So, before every ext3cow_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext3cow_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to call this function. + */ +void ext3cow_init_block_alloc_info(struct inode *inode) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + struct ext3cow_block_alloc_info *block_i = ei->i_block_alloc_info; + struct super_block *sb = inode->i_sb; + + block_i = kmalloc(sizeof(*block_i), GFP_NOFS); + if (block_i) { + struct ext3cow_reserve_window_node *rsv = &block_i->rsv_window_node; + + rsv->rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + rsv->rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + + /* + * if filesystem is mounted with NORESERVATION, the goal + * reservation window size is set to zero to indicate + * block reservation is off + */ + if (!test_opt(sb, RESERVATION)) + rsv->rsv_goal_size = 0; + else + rsv->rsv_goal_size = EXT3COW_DEFAULT_RESERVE_BLOCKS; + rsv->rsv_alloc_hit = 0; + block_i->last_alloc_logical_block = 0; + block_i->last_alloc_physical_block = 0; + } + ei->i_block_alloc_info = block_i; +} + +/** + * ext3cow_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext3cow_release_file(): last writer close the file + * ext3cow_clear_inode(): last iput(), when nobody link to this file. + * ext3cow_truncate(): when the block indirect map is about to change. + * + */ +void ext3cow_discard_reservation(struct inode *inode) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + struct ext3cow_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3cow_reserve_window_node *rsv; + spinlock_t *rsv_lock = &EXT3COW_SB(inode->i_sb)->s_rsv_window_lock; + + if (!block_i) + return; + + rsv = &block_i->rsv_window_node; + if (!rsv_is_empty(&rsv->rsv_window)) { + spin_lock(rsv_lock); + if (!rsv_is_empty(&rsv->rsv_window)) + rsv_window_remove(inode->i_sb, rsv); + spin_unlock(rsv_lock); + } +} + +/** + * ext3cow_free_blocks_sb() -- Free given blocks and update quota + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to free + * @count: number of blocks to free + * @pdquot_freed_blocks: pointer to quota + */ +void ext3cow_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext3cow_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + unsigned long block_group; + ext3cow_grpblk_t bit; + unsigned long i; + unsigned long overflow; + struct ext3cow_group_desc * desc; + struct ext3cow_super_block * es; + struct ext3cow_sb_info *sbi; + int err = 0, ret; + ext3cow_grpblk_t group_freed; + + *pdquot_freed_blocks = 0; + sbi = EXT3COW_SB(sb); + es = sbi->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { + ext3cow_error (sb, "ext3cow_free_blocks", + "Freeing blocks not in datazone - " + "block = "E3FSBLK", count = %lu", block, count); + goto error_return; + } + + //TODO: Remove: + printk(KERN_INFO "freeing block(s) %lu-%lu\n", block, block + count - 1); + ext3cow_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); + +do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3COW_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3COW_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT3COW_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3COW_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext3cow_get_group_desc (sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), + sbi->s_itb_per_group)) + ext3cow_error (sb, "ext3cow_free_blocks", + "Freeing blocks in system zones - " + "Block = "E3FSBLK", count = %lu", + block, count); + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3cow_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + jbd_lock_bh_state(bitmap_bh); + + for (i = 0, group_freed = 0; i < count; i++) { + /* + * An HJ special. This is expensive... + */ +#ifdef CONFIG_JBD_DEBUG + jbd_unlock_bh_state(bitmap_bh); + { + struct buffer_head *debug_bh; + debug_bh = sb_find_get_block(sb, block + i); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No commited data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); +#endif + if (need_resched()) { + jbd_unlock_bh_state(bitmap_bh); + cond_resched(); + jbd_lock_bh_state(bitmap_bh); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3cow_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, + bh2jh(bitmap_bh)->b_committed_data); + + /* + * We clear the bit in the bitmap after setting the committed + * data bit, because this is the reverse order to that which + * the allocator uses. + */ + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3cow_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + jbd_unlock_bh_state(bitmap_bh); + ext3cow_error(sb, __FUNCTION__, + "bit already cleared for block "E3FSBLK, + block + i); + jbd_lock_bh_state(bitmap_bh); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + group_freed++; + } + } + jbd_unlock_bh_state(bitmap_bh); + + spin_lock(sb_bgl_lock(sbi, block_group)); + desc->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + + group_freed); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3cow_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3cow_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + *pdquot_freed_blocks += group_freed; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3cow_std_error(sb, err); + return; +} + +/** + * ext3cow_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + */ +void ext3cow_free_blocks(handle_t *handle, struct inode *inode, + ext3cow_fsblk_t block, unsigned long count) +{ + struct super_block * sb; + unsigned long dquot_freed_blocks; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3cow_free_blocks: nonexistent device"); + return; + } + ext3cow_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + if (dquot_freed_blocks) + DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + return; +} + +/** + * ext3cow_test_allocatable() + * @nr: given allocation block group + * @bh: bufferhead contains the bitmap of the given block group + * + * For ext3cow allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3cow_test_allocatable(ext3cow_grpblk_t nr, struct buffer_head *bh) +{ + int ret; + struct journal_head *jh = bh2jh(bh); + + if (ext3cow_test_bit(nr, bh->b_data)) + return 0; + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) + ret = 1; + else + ret = !ext3cow_test_bit(nr, jh->b_committed_data); + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward alternately through the actual + * bitmap on disk and the last-committed copy in journal, until we find a + * bit free in both bitmaps. + */ +static ext3cow_grpblk_t +bitmap_search_next_usable_block(ext3cow_grpblk_t start, struct buffer_head *bh, + ext3cow_grpblk_t maxblocks) +{ + ext3cow_grpblk_t next; + struct journal_head *jh = bh2jh(bh); + + while (start < maxblocks) { + next = ext3cow_find_next_zero_bit(bh->b_data, maxblocks, start); + if (next >= maxblocks) + return -1; + if (ext3cow_test_allocatable(next, bh)) + return next; + jbd_lock_bh_state(bh); + if (jh->b_committed_data) + start = ext3cow_find_next_zero_bit(jh->b_committed_data, + maxblocks, next); + jbd_unlock_bh_state(bh); + } + return -1; +} + +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We honor both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ +static ext3cow_grpblk_t +find_next_usable_block(ext3cow_grpblk_t start, struct buffer_head *bh, + ext3cow_grpblk_t maxblocks) +{ + ext3cow_grpblk_t here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3COW_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + ext3cow_grpblk_t end_goal = (start + 63) & ~63; + if (end_goal > maxblocks) + end_goal = maxblocks; + here = ext3cow_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3cow_test_allocatable(here, bh)) + return here; + ext3cow_debug("Bit not found near goal\n"); + } + + here = start; + if (here < 0) + here = 0; + + p = ((char *)bh->b_data) + (here >> 3); + r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); + next = (r - ((char *)bh->b_data)) << 3; + + if (next < maxblocks && next >= start && ext3cow_test_allocatable(next, bh)) + return next; + + /* + * The bitmap search --- search forward alternately through the actual + * bitmap and the last-committed copy until we find a bit free in + * both + */ + here = bitmap_search_next_usable_block(here, bh, maxblocks); + return here; +} + +/** + * claim_block() + * @block: the free block (group relative) to allocate + * @bh: the bufferhead containts the block group bitmap + * + * We think we can allocate this block in this bitmap. Try to set the bit. + * If that succeeds then check that nobody has allocated and then freed the + * block since we saw that is was not marked in b_committed_data. If it _was_ + * allocated and freed then clear the bit in the bitmap again and return + * zero (failure). + */ +static inline int +claim_block(spinlock_t *lock, ext3cow_grpblk_t block, struct buffer_head *bh) +{ + struct journal_head *jh = bh2jh(bh); + int ret; + + if (ext3cow_set_bit_atomic(lock, block, bh->b_data)) + return 0; + jbd_lock_bh_state(bh); + if (jh->b_committed_data && ext3cow_test_bit(block,jh->b_committed_data)) { + ext3cow_clear_bit_atomic(lock, block, bh->b_data); + ret = 0; + } else { + ret = 1; + } + jbd_unlock_bh_state(bh); + return ret; +} + +/** + * ext3cow_try_to_allocate() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) from the + * file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, ends at + * the block group's last block. + * + * If we failed to allocate the desired block then we may end up crossing to a + * new bitmap. In that case we must release write access to the old one via + * ext3cow_journal_release_buffer(), else we'll run out of credits. + */ +static ext3cow_grpblk_t +ext3cow_try_to_allocate(struct super_block *sb, handle_t *handle, int group, + struct buffer_head *bitmap_bh, ext3cow_grpblk_t grp_goal, + unsigned long *count, struct ext3cow_reserve_window *my_rsv) +{ + ext3cow_fsblk_t group_first_block; + ext3cow_grpblk_t start, end; + unsigned long num = 0; + + /* we do allocation within the reservation window if we have a window */ + if (my_rsv) { + group_first_block = ext3cow_group_first_block_no(sb, group); + if (my_rsv->_rsv_start >= group_first_block) + start = my_rsv->_rsv_start - group_first_block; + else + /* reservation window cross group boundary */ + start = 0; + end = my_rsv->_rsv_end - group_first_block + 1; + if (end > EXT3COW_BLOCKS_PER_GROUP(sb)) + /* reservation window crosses group boundary */ + end = EXT3COW_BLOCKS_PER_GROUP(sb); + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; + else + grp_goal = -1; + } else { + if (grp_goal > 0) + start = grp_goal; + else + start = 0; + end = EXT3COW_BLOCKS_PER_GROUP(sb); + } + + BUG_ON(start > EXT3COW_BLOCKS_PER_GROUP(sb)); + +repeat: + if (grp_goal < 0 || !ext3cow_test_allocatable(grp_goal, bitmap_bh)) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) + goto fail_access; + if (!my_rsv) { + int i; + + for (i = 0; i < 7 && grp_goal > start && + ext3cow_test_allocatable(grp_goal - 1, + bitmap_bh); + i++, grp_goal--) + ; + } + } + start = grp_goal; + + if (!claim_block(sb_bgl_lock(EXT3COW_SB(sb), group), + grp_goal, bitmap_bh)) { + /* + * The block was allocated by another thread, or it was + * allocated and then freed by another thread + */ + start++; + grp_goal++; + if (start >= end) + goto fail_access; + goto repeat; + } + num++; + grp_goal++; + while (num < *count && grp_goal < end + && ext3cow_test_allocatable(grp_goal, bitmap_bh) + && claim_block(sb_bgl_lock(EXT3COW_SB(sb), group), + grp_goal, bitmap_bh)) { + num++; + grp_goal++; + } + *count = num; + return grp_goal - num; +fail_access: + *count = num; + return -1; +} + +/** + * find_next_reservable_window(): + * find a reservable space within the given range. + * It does not allocate the reservation window for now: + * alloc_new_reservation() will do the work later. + * + * @search_head: the head of the searching list; + * This is not necessarily the list head of the whole filesystem + * + * We have both head and start_block to assist the search + * for the reservable space. The list starts from head, + * but we will shift to the place where start_block is, + * then start from there, when looking for a reservable space. + * + * @size: the target new reservation window size + * + * @group_first_block: the first block we consider to start + * the real search from + * + * @last_block: + * the maximum block number that our goal reservable space + * could start from. This is normally the last block in this + * group. The search will end when we found the start of next + * possible reservable space is out of this boundary. + * This could handle the cross boundary reservation window + * request. + * + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. + * + */ +static int find_next_reservable_window( + struct ext3cow_reserve_window_node *search_head, + struct ext3cow_reserve_window_node *my_rsv, + struct super_block * sb, + ext3cow_fsblk_t start_block, + ext3cow_fsblk_t last_block) +{ + struct rb_node *next; + struct ext3cow_reserve_window_node *rsv, *prev; + ext3cow_fsblk_t cur; + int size = my_rsv->rsv_goal_size; + + /* TODO: make the start of the reservation window byte-aligned */ + /* cur = *start_block & ~7;*/ + cur = start_block; + rsv = search_head; + if (!rsv) + return -1; + + while (1) { + if (cur <= rsv->rsv_end) + cur = rsv->rsv_end + 1; + + /* TODO? + * in the case we could not find a reservable space + * that is what is expected, during the re-search, we could + * remember what's the largest reservable space we could have + * and return that one. + * + * For now it will fail if we could not find the reservable + * space with expected-size (or more)... + */ + if (cur > last_block) + return -1; /* fail */ + + prev = rsv; + next = rb_next(&rsv->rsv_node); + rsv = rb_entry(next,struct ext3cow_reserve_window_node,rsv_node); + + /* + * Reached the last reservation, we can just append to the + * previous one. + */ + if (!next) + break; + + if (cur + size <= rsv->rsv_start) { + /* + * Found a reserveable space big enough. We could + * have a reservation across the group boundary here + */ + break; + } + } + /* + * we come here either : + * when we reach the end of the whole list, + * and there is empty reservable space after last entry in the list. + * append it to the end of the list. + * + * or we found one reservable space in the middle of the list, + * return the reservation window that we could append to. + * succeed. + */ + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole avaliable window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. + */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3cow_rsv_window_add(sb, my_rsv); + + return 0; +} + +/** + * alloc_new_reservation()--allocate a new reservation window + * + * To make a new reservation, we search part of the filesystem + * reservation list (the list that inside the group). We try to + * allocate a new reservation window near the allocation goal, + * or the beginning of the group, if there is no goal. + * + * We first find a reservable space after the goal, then from + * there, we check the bitmap for the first free block after + * it. If there is no free block until the end of group, then the + * whole group is full, we failed. Otherwise, check if the free + * block is inside the expected reservable space, if so, we + * succeed. + * If the first free block is outside the reservable space, then + * start from the first free block, we search for next available + * space, and go on. + * + * on succeed, a new reservation will be found and inserted into the list + * It contains at least one free block, and it does not overlap with other + * reservation windows. + * + * failed: we failed to find a reservation window in this group + * + * @rsv: the reservation + * + * @grp_goal: The goal (group-relative). It is where the search for a + * free reservable space should start from. + * if we have a grp_goal(grp_goal >0 ), then start from there, + * no grp_goal(grp_goal = -1), we start from the first block + * of the group. + * + * @sb: the super block + * @group: the group we are trying to allocate in + * @bitmap_bh: the block group block bitmap + * + */ +static int alloc_new_reservation(struct ext3cow_reserve_window_node *my_rsv, + ext3cow_grpblk_t grp_goal, struct super_block *sb, + unsigned int group, struct buffer_head *bitmap_bh) +{ + struct ext3cow_reserve_window_node *search_head; + ext3cow_fsblk_t group_first_block, group_end_block, start_block; + ext3cow_grpblk_t first_free_block; + struct rb_root *fs_rsv_root = &EXT3COW_SB(sb)->s_rsv_window_root; + unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3COW_SB(sb)->s_rsv_window_lock; + + group_first_block = ext3cow_group_first_block_no(sb, group); + group_end_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); + + if (grp_goal < 0) + start_block = group_first_block; + else + start_block = grp_goal + group_first_block; + + size = my_rsv->rsv_goal_size; + + if (!rsv_is_empty(&my_rsv->rsv_window)) { + /* + * if the old reservation is cross group boundary + * and if the goal is inside the old reservation window, + * we will come here when we just failed to allocate from + * the first part of the window. We still have another part + * that belongs to the next group. In this case, there is no + * point to discard our window and try to allocate a new one + * in this group(which will fail). we should + * keep the reservation window, just simply move on. + * + * Maybe we could shift the start block of the reservation + * window to the first block of next group. + */ + + if ((my_rsv->rsv_start <= group_end_block) && + (my_rsv->rsv_end > group_end_block) && + (start_block >= my_rsv->rsv_start)) + return -1; + + if ((my_rsv->rsv_alloc_hit > + (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { + /* + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window + */ + size = size * 2; + if (size > EXT3COW_MAX_RESERVE_BLOCKS) + size = EXT3COW_MAX_RESERVE_BLOCKS; + my_rsv->rsv_goal_size= size; + } + } + + spin_lock(rsv_lock); + /* + * shift the search start to the window near the goal block + */ + search_head = search_reserve_window(fs_rsv_root, start_block); + + /* + * find_next_reservable_window() simply finds a reservable window + * inside the given range(start_block, group_end_block). + * + * To make sure the reservation window has a free bit inside it, we + * need to check the bitmap after we found a reservable window. + */ +retry: + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + + /* + * On success, find_next_reservable_window() returns the + * reservation window where there is a reservable space after it. + * Before we reserve this reservable space, we need + * to make sure there is at least a free block inside this region. + * + * searching the first free bit on the block bitmap and copy of + * last committed bitmap alternatively, until we found a allocatable + * block. Search start from the start block of the reservable space + * we just found. + */ + spin_unlock(rsv_lock); + first_free_block = bitmap_search_next_usable_block( + my_rsv->rsv_start - group_first_block, + bitmap_bh, group_end_block - group_first_block + 1); + + if (first_free_block < 0) { + /* + * no free block left on the bitmap, no point + * to reserve the space. return failed. + */ + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ + } + + start_block = first_free_block + group_first_block; + /* + * check if the first free block is within the + * free space we just reserved + */ + if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + return 0; /* success */ + /* + * if the first free bit we found is out of the reservable space + * continue search for next reservable space, + * start from where the free block is, + * we also shift the list head to where we stopped last time + */ + search_head = my_rsv; + spin_lock(rsv_lock); + goto retry; +} + +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext3cow_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext3cow_new_blocks() tries to allocate blocks, + */ +static void try_to_extend_reservation(struct ext3cow_reserve_window_node *my_rsv, + struct super_block *sb, int size) +{ + struct ext3cow_reserve_window_node *next_rsv; + struct rb_node *next; + spinlock_t *rsv_lock = &EXT3COW_SB(sb)->s_rsv_window_lock; + + if (!spin_trylock(rsv_lock)) + return; + + next = rb_next(&my_rsv->rsv_node); + + if (!next) + my_rsv->rsv_end += size; + else { + next_rsv = rb_entry(next, struct ext3cow_reserve_window_node, rsv_node); + + if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) + my_rsv->rsv_end += size; + else + my_rsv->rsv_end = next_rsv->rsv_start - 1; + } + spin_unlock(rsv_lock); +} + +/** + * ext3cow_try_to_allocate_with_rsv() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * @errp: pointer to store the error code + * + * This is the main function used to allocate a new block and its reservation + * window. + * + * Each time when a new block allocation is need, first try to allocate from + * its own reservation. If it does not have a reservation window, instead of + * looking for a free bit on bitmap first, then look up the reservation list to + * see if it is inside somebody else's reservation window, we try to allocate a + * reservation window for it starting from the goal first. Then do the block + * allocation within the reservation window. + * + * This will avoid keeping on searching the reservation list again and + * again when somebody is looking for a free block (without + * reservation), and there are lots of free blocks, but they are all + * being reserved. + * + * We use a red-black tree for the per-filesystem reservation list. + * + */ +static ext3cow_grpblk_t +ext3cow_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, + unsigned int group, struct buffer_head *bitmap_bh, + ext3cow_grpblk_t grp_goal, + struct ext3cow_reserve_window_node * my_rsv, + unsigned long *count, int *errp) +{ + ext3cow_fsblk_t group_first_block, group_last_block; + ext3cow_grpblk_t ret = 0; + int fatal; + unsigned long num = *count; + + *errp = 0; + + /* + * Make sure we use undo access for the bitmap, because it is critical + * that we do the frozen_data COW on bitmap buffers in all cases even + * if the buffer is in BJ_Forget state in the committing transaction. + */ + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + fatal = ext3cow_journal_get_undo_access(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + + /* + * we don't deal with reservation when + * filesystem is mounted without reservation + * or the file is not a regular file + * or last attempt to allocate a block with reservation turned on failed + */ + if (my_rsv == NULL ) { + ret = ext3cow_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, count, NULL); + goto out; + } + /* + * grp_goal is a group relative block number (if there is a goal) + * 0 <= grp_goal < EXT3COW_BLOCKS_PER_GROUP(sb) + * first block is a filesystem wide block number + * first block is the block number of the first block in this group + */ + group_first_block = ext3cow_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); + + /* + * Basically we will allocate a new block from inode's reservation + * window. + * + * We need to allocate a new reservation window, if: + * a) inode does not have a reservation window; or + * b) last attempt to allocate a block from existing reservation + * failed; or + * c) we come here with a goal and with a reservation window + * + * We do not need to allocate a new reservation window if we come here + * at the beginning with a goal and the goal is inside the window, or + * we don't have a goal but already have a reservation window. + * then we could go to allocate from the reservation window directly. + */ + while (1) { + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { + if (my_rsv->rsv_goal_size < *count) + my_rsv->rsv_goal_size = *count; + ret = alloc_new_reservation(my_rsv, grp_goal, sb, + group, bitmap_bh); + if (ret < 0) + break; /* failed */ + + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal >= 0) { + int curr = my_rsv->rsv_end - + (grp_goal + group_first_block) + 1; + + if (curr < *count) + try_to_extend_reservation(my_rsv, sb, + *count - curr); + } + + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT3COW_SB(sb)->s_rsv_window_root, 1); + BUG(); + } + ret = ext3cow_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, &num, &my_rsv->rsv_window); + if (ret >= 0) { + my_rsv->rsv_alloc_hit += num; + *count = num; + break; /* succeed */ + } + num = *count; + } +out: + if (ret >= 0) { + BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " + "bitmap block"); + fatal = ext3cow_journal_dirty_metadata(handle, bitmap_bh); + if (fatal) { + *errp = fatal; + return -1; + } + return ret; + } + + BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); + ext3cow_journal_release_buffer(handle, bitmap_bh); + return ret; +} + +/** + * ext3cow_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ +static int ext3cow_has_free_blocks(struct ext3cow_sb_info *sbi) +{ + ext3cow_fsblk_t free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/** + * ext3cow_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext3cow_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or commiting transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext3cow_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext3cow_has_free_blocks(EXT3COW_SB(sb)) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return journal_force_commit_nested(EXT3COW_SB(sb)->s_journal); +} + +/** + * ext3cow_new_blocks() -- core block(s) allocation function + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext3cow_new_blocks uses a goal block to assist allocation. It tries to + * allocate block(s) from the block group contains the goal block first. If that + * fails, it will try to allocate block(s) from other block groups without + * any specific goal block. + * + */ +ext3cow_fsblk_t ext3cow_new_blocks(handle_t *handle, struct inode *inode, + ext3cow_fsblk_t goal, unsigned long *count, int *errp) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gdp_bh; + int group_no; + int goal_group; + ext3cow_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext3cow_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext3cow_fsblk_t ret_block; /* filesyetem-wide allocated block */ + int bgi; /* blockgroup iteration index */ + int fatal = 0, err; + int performed_allocation = 0; + ext3cow_grpblk_t free_blocks; /* number of free blocks in a group */ + struct super_block *sb; + struct ext3cow_group_desc *gdp; + struct ext3cow_super_block *es; + struct ext3cow_sb_info *sbi; + struct ext3cow_reserve_window_node *my_rsv = NULL; + struct ext3cow_block_alloc_info *block_i; + unsigned short windowsz = 0; +#ifdef EXT3COWFS_DEBUG + static int goal_hits, goal_attempts; +#endif + unsigned long ngroups; + unsigned long num = *count; + + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk("ext3cow_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. + */ + if (DQUOT_ALLOC_BLOCK(inode, num)) { + *errp = -EDQUOT; + return 0; + } + + sbi = EXT3COW_SB(sb); + es = EXT3COW_SB(sb)->s_es; + ext3cow_debug("goal=%lu.\n", goal); + /* + * Allocate a block from reservation only when + * filesystem is mounted with reservation(default,-o reservation), and + * it's a regular file, and + * the desired window size is greater than 0 (One could use ioctl + * command EXT3COW_IOC_SETRSVSZ to set the window size to 0 to turn off + * reservation on that particular file) + */ + block_i = EXT3COW_I(inode)->i_block_alloc_info; + if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) + my_rsv = &block_i->rsv_window_node; + + if (!ext3cow_has_free_blocks(sbi)) { + *errp = -ENOSPC; + goto out; + } + + /* + * First, test whether the goal block is free. + */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + group_no = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3COW_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: + gdp = ext3cow_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * if there is not enough free blocks to make a new resevation + * turn off reservation for this allocation + */ + if (my_rsv && (free_blocks < windowsz) + && (rsv_is_empty(&my_rsv->rsv_window))) + my_rsv = NULL; + + if (free_blocks > 0) { + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3COW_BLOCKS_PER_GROUP(sb)); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + grp_alloc_blk = ext3cow_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, grp_target_blk, + my_rsv, &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + + ngroups = EXT3COW_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ + for (bgi = 0; bgi < ngroups; bgi++) { + group_no++; + if (group_no >= ngroups) + group_no = 0; + gdp = ext3cow_get_group_desc(sb, group_no, &gdp_bh); + if (!gdp) + goto io_error; + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + /* + * skip this group if the number of + * free blocks is less than half of the reservation + * window size. + */ + if (free_blocks <= (windowsz/2)) + continue; + + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, group_no); + if (!bitmap_bh) + goto io_error; + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext3cow_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, -1, my_rsv, + &num, &fatal); + if (fatal) + goto out; + if (grp_alloc_blk >= 0) + goto allocated; + } + /* + * We may end up a bogus ealier ENOSPC error due to + * filesystem is "full" of reservations, but + * there maybe indeed free blocks avaliable on disk + * In this case, we just forget about the reservations + * just do block allocation as without reservations. + */ + if (my_rsv) { + my_rsv = NULL; + windowsz = 0; + group_no = goal_group; + goto retry_alloc; + } + /* No space left on the device */ + *errp = -ENOSPC; + goto out; + +allocated: + + ext3cow_debug("using block group %d(%d)\n", + group_no, gdp->bg_free_blocks_count); + + BUFFER_TRACE(gdp_bh, "get_write_access"); + fatal = ext3cow_journal_get_write_access(handle, gdp_bh); + if (fatal) + goto out; + + ret_block = grp_alloc_blk + ext3cow_group_first_block_no(sb, group_no); + + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), + EXT3COW_SB(sb)->s_itb_per_group) || + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + EXT3COW_SB(sb)->s_itb_per_group)) + ext3cow_error(sb, "ext3cow_new_block", + "Allocating block in system zone - " + "blocks from "E3FSBLK", length %lu", + ret_block, num); + + performed_allocation = 1; + +#ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = sb_find_get_block(sb, ret_block); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); + brelse(debug_bh); + } + } + jbd_lock_bh_state(bitmap_bh); + spin_lock(sb_bgl_lock(sbi, group_no)); + if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { + int i; + + for (i = 0; i < num; i++) { + if (ext3cow_test_bit(grp_alloc_blk+i, + bh2jh(bitmap_bh)->b_committed_data)) { + printk("%s: block was unexpectedly set in " + "b_committed_data\n", __FUNCTION__); + } + } + } + ext3cow_debug("found bit %d\n", grp_alloc_blk); + spin_unlock(sb_bgl_lock(sbi, group_no)); + jbd_unlock_bh_state(bitmap_bh); +#endif + + if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { + ext3cow_error(sb, "ext3cow_new_block", + "block("E3FSBLK") >= blocks count(%d) - " + "block_group = %d, es == %p ", ret_block, + le32_to_cpu(es->s_blocks_count), group_no, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + ext3cow_debug("allocating block %lu. Goal hits %d of %d.\n", + ret_block, goal_hits, goal_attempts); + + spin_lock(sb_bgl_lock(sbi, group_no)); + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_mod(&sbi->s_freeblocks_counter, -num); + + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); + err = ext3cow_journal_dirty_metadata(handle, gdp_bh); + if (!fatal) + fatal = err; + + sb->s_dirt = 1; + if (fatal) + goto out; + + *errp = 0; + brelse(bitmap_bh); + DQUOT_FREE_BLOCK(inode, *count-num); + *count = num; + return ret_block; + +io_error: + *errp = -EIO; +out: + if (fatal) { + *errp = fatal; + ext3cow_std_error(sb, fatal); + } + /* + * Undo the block allocation + */ + if (!performed_allocation) + DQUOT_FREE_BLOCK(inode, *count); + brelse(bitmap_bh); + return 0; +} + +ext3cow_fsblk_t ext3cow_new_block(handle_t *handle, struct inode *inode, + ext3cow_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + + return ext3cow_new_blocks(handle, inode, goal, &count, errp); +} + +/** + * ext3cow_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. + */ +ext3cow_fsblk_t ext3cow_count_free_blocks(struct super_block *sb) +{ + ext3cow_fsblk_t desc_count; + struct ext3cow_group_desc *gdp; + int i; + unsigned long ngroups = EXT3COW_SB(sb)->s_groups_count; +#ifdef EXT3COWFS_DEBUG + struct ext3cow_super_block *es; + ext3cow_fsblk_t bitmap_count; + unsigned long x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3COW_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3cow_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext3cow_count_free(bitmap_bh, sb->s_blocksize); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3cow_count_free_blocks: stored = "E3FSBLK + ", computed = "E3FSBLK", "E3FSBLK"\n", + le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext3cow_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int +block_in_use(ext3cow_fsblk_t block, struct super_block *sb, unsigned char *map) +{ + return ext3cow_test_bit ((block - + le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block)) % + EXT3COW_BLOCKS_PER_GROUP(sb), map); +} + +static inline int test_root(int a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext3cow_group_sparse(int group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext3cow_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext3cow_bg_has_super(struct super_block *sb, int group) +{ + if (EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext3cow_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext3cow_bg_num_gdb_meta(struct super_block *sb, int group) +{ + unsigned long metagroup = group / EXT3COW_DESC_PER_BLOCK(sb); + unsigned long first = metagroup * EXT3COW_DESC_PER_BLOCK(sb); + unsigned long last = first + EXT3COW_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext3cow_bg_num_gdb_nometa(struct super_block *sb, int group) +{ + if (EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext3cow_group_sparse(group)) + return 0; + return EXT3COW_SB(sb)->s_gdb_count; +} + +/** + * ext3cow_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext3cow_bg_num_gdb(struct super_block *sb, int group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT3COW_DESC_PER_BLOCK(sb); + + if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb,EXT3COW_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext3cow_bg_num_gdb_nometa(sb,group); + + return ext3cow_bg_num_gdb_meta(sb,group); + +} diff -ruN linux-2.6.20.3/fs/ext3cow/bitmap.c linux-2.6.20.3-ext3cow/fs/ext3cow/bitmap.c --- linux-2.6.20.3/fs/ext3cow/bitmap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/bitmap.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,32 @@ +/* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include + +#ifdef EXT3COWFS_DEBUG + +static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext3cow_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT3COWFS_DEBUG */ + diff -ruN linux-2.6.20.3/fs/ext3cow/CHANGELOG linux-2.6.20.3-ext3cow/fs/ext3cow/CHANGELOG --- linux-2.6.20.3/fs/ext3cow/CHANGELOG 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/CHANGELOG 2008-03-09 11:27:12.000000000 -0400 @@ -0,0 +1,12 @@ +3-9-08 +- Fixed a bug that resulted in the first block in a newly allocated indirect block to be allocated over and over again. +- Fixed a bug that resulted in COW bitmaps not to be reset after truncate. +- Bug e2fsprogs that caused aborting journal fixed. + +6-20-97 +- Finished the rollback code for inode chains in case of error. + +6-18-07 +- Added support for 32-bit uid's and gid's back in again +- Took out support for block fragmentation +- Hopefully fixed the non-sticking uid/gid bug. \ No newline at end of file diff -ruN linux-2.6.20.3/fs/ext3cow/dir.c linux-2.6.20.3-ext3cow/fs/ext3cow/dir.c --- linux-2.6.20.3/fs/ext3cow/dir.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/dir.c 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,732 @@ +/* + * linux/fs/ext3cow/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3cow directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static unsigned char ext3cow_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext3cow_readdir(struct file *, void *, filldir_t); +static int ext3cow_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir); +static int ext3cow_release_dir (struct inode * inode, + struct file * filp); + +const struct file_operations ext3cow_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext3cow_readdir, /* we take BKL. needed?*/ + .ioctl = ext3cow_ioctl, /* BKL held */ +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3cow_compat_ioctl, +#endif + .fsync = ext3cow_sync_file, /* BKL held */ +#ifdef CONFIG_EXT3COW_INDEX + .release = ext3cow_release_dir, +#endif +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT3COW_FT_MAX)) + return DT_UNKNOWN; + + return (ext3cow_filetype_table[filetype]); +} + +static int ext3cow_readversions(struct file * filp, void * dirent, + filldir_t filldir) +{ + int error = 0; + unsigned long offset; + int i, stored; + struct buffer_head *bh; + struct ext3cow_dir_entry_2 * de; + struct super_block * sb; + int err; + struct inode *dir = filp->f_dentry->d_inode; + char *at; + unsigned long ino; + int ref_len = filp->f_dentry->d_name.len -1; + + sb = dir->i_sb; + + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); + + at = strrchr(filp->f_dentry->d_name.name, EXT3COW_FLUX_TOKEN); + + while (!error && !stored && filp->f_pos < dir->i_size) { + unsigned long blk = (filp->f_pos) >> EXT3COW_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + + bh = NULL; + map_bh.b_state = 0; + err = ext3cow_get_blocks_handle(NULL, dir, blk, 1, + &map_bh, 0, 0); + if (err > 0) { + page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, + filp, + map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - dir->i_blkbits), + 1); + bh = ext3cow_bread(NULL, dir, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + ext3cow_error (sb, "ext3cow_versions", + "directory #%lu contains a hole at offset %lu", + dir->i_ino, (unsigned long)filp->f_pos); + /* corrupt size? Maybe no more blocks to read */ + if (filp->f_pos > dir->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + ver_revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != dir->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3cow_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + EXT3COW_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = dir->i_version; + } + + while (!error && filp->f_pos < dir->i_size + && offset < sb->s_blocksize) { + de = (struct ext3cow_dir_entry_2 *) (bh->b_data + offset); + if (!ext3cow_check_dir_entry ("ext3cow_readversions", dir, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + return stored; + } + offset += le16_to_cpu(de->rec_len); + + if (le32_to_cpu(de->inode)){ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + + if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, + EXT3COW_FEATURE_INCOMPAT_FILETYPE) + && de->file_type < EXT3COW_FT_MAX) + d_type = + ext3cow_filetype_table[de->file_type]; + if (de->name_len == ref_len + && strncmp(filp->f_dentry->d_name.name, de->name, ref_len)==0) { + + struct inode * inde; + char * name; + + name = kmalloc(EXT3COW_NAME_LEN, GFP_KERNEL); + strncpy(name, de->name, de->name_len); + inde = iget(dir->i_sb, de->inode); + + if (de->death_epoch!=0 && de->birth_epoch!=de->death_epoch) { + name[de->name_len]='\0'; + sprintf(name,"%s@%d",name, de->death_epoch); + error = filldir(dirent, name, + strlen(name), + filp->f_pos, + le32_to_cpu(inde->i_ino), + d_type); + stored++; + } + + while (EXT3COW_I(inde)->i_next_inode!=0) { + name[de->name_len]='\0'; + sprintf(name,"%s@%d",name, EXT3COW_I_EPOCHNUMBER(inde)); + error = filldir(dirent, name, + strlen(name), + filp->f_pos, + le32_to_cpu(inde->i_ino), + d_type); + ino = EXT3COW_I(inde)->i_next_inode; + iput(inde); + inde = iget(dir->i_sb, ino); + stored++; + } + + kfree(name); + iput(inde); + + if (error) + break; + + if (!stored && + EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(dir))) { + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + d_type); + } + + if (error) + break; + if (version != filp->f_version) + goto ver_revalidate; + stored ++; + } + } + + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse (bh); + } + return 0; +} + + +int ext3cow_check_dir_entry (const char * function, struct inode * dir, + struct ext3cow_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char * error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + unsigned int current_epoch = EXT3COW_S_EPOCHNUMBER(dir->i_sb); + + if (rlen < EXT3COW_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < EXT3COW_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(EXT3COW_SB(dir->i_sb)->s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + /* Some bounds on versioned entries -znjp*/ + else if (le32_to_cpu(de->death_epoch) != EXT3COW_DIRENT_ALIVE && + le32_to_cpu(de->birth_epoch) > le32_to_cpu(de->death_epoch)) + error_msg = "entry died before it was born"; + else if (le32_to_cpu(de->birth_epoch) > current_epoch) + error_msg = "entry was born in the future"; + else if (le32_to_cpu(de->death_epoch) > current_epoch) + error_msg = "entry has already died in the future"; + + if (error_msg != NULL) + ext3cow_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d, " + "birth_epoch=%d death_epoch=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len, de->birth_epoch, de->death_epoch); + return error_msg == NULL ? 1 : 0; +} + +static int ext3cow_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset; + int i, stored; + struct ext3cow_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + + /* is this a version listing? */ + if (filp->f_dentry->d_name.name[filp->f_dentry->d_name.len-1] == + EXT3COW_FLUX_TOKEN) + return ext3cow_readversions(filp, dirent, filldir); + + sb = inode->i_sb; + +#ifdef CONFIG_EXT3COW_INDEX + if (EXT3COW_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3COW_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3COW_I(inode)->i_flags & EXT3COW_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + + err = ext3cow_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT3COW_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3COW_INDEX_FL; + } +#endif + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + unsigned long blk = filp->f_pos >> EXT3COW_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext3cow_get_blocks_handle(NULL, inode, blk, 1, + &map_bh, 0, 0); + if (err > 0) { + page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, + filp, + map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits), + 1); + bh = ext3cow_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + ext3cow_error (sb, "ext3cow_readdir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, (unsigned long)filp->f_pos); + /* corrupt size? Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3cow_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + EXT3COW_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3cow_dir_entry_2 *) (bh->b_data + offset); + if (!ext3cow_check_dir_entry ("ext3cow_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + ret = stored; + goto out; + } + offset += le16_to_cpu(de->rec_len); + /* + printk("Inode %ld Epoch number %u: is + dir %d -> %s be %d de %d scoped? %d\n", + dir->i_ino, + EXT3COW_I_EPOCHNUMBER(dir), + de->inode, + de->name, + de->birth_epoch, + de->death_epoch, + EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(dir))); + */ + + /* Only add scoped dirents - znjp */ + if (le32_to_cpu(de->inode) && + EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(inode))) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse (bh); + } +out: + return ret; +} + +#ifdef CONFIG_EXT3COW_INDEX +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if ((n)->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname * old = fname; + fname = fname->next; + kfree (old); + } + if (!parent) + root->rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + root->rb_node = NULL; +} + + +static struct dir_private_info *create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->root.rb_node = NULL; + p->curr_node = NULL; + p->extra_fname = NULL; + p->last_pos = 0; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + p->next_hash = 0; + return p; +} + +void ext3cow_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext3cow_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3cow_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname * fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext3cow_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file * filp, void * dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block * sb; + int error; + + sb = inode->i_sb; + + printk(KERN_INFO, "Got %s\n", filp->f_path.dentry->d_name.name); + + if (!fname) { + printk("call_filldir: called with null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname->next; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext3cow_dx_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT3COW_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname && + call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + + if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext3cow_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT3COW_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3COW_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext3cow_release_dir (struct inode * inode, struct file * filp) +{ + if (filp->private_data) + ext3cow_htree_free_dir_info(filp->private_data); + + return 0; +} + +#endif diff -ruN linux-2.6.20.3/fs/ext3cow/ext3cow_jbd.c linux-2.6.20.3-ext3cow/fs/ext3cow/ext3cow_jbd.c --- linux-2.6.20.3/fs/ext3cow/ext3cow_jbd.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/ext3cow_jbd.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,59 @@ +/* + * Interface between ext3cow and JBD + */ + +#include + +int __ext3cow_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_undo_access(handle, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} + +int __ext3cow_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_get_write_access(handle, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} + +int __ext3cow_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_forget(handle, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} + +int __ext3cow_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) +{ + int err = journal_revoke(handle, blocknr, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} + +int __ext3cow_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_get_create_access(handle, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} + +int __ext3cow_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; +} diff -ruN linux-2.6.20.3/fs/ext3cow/file.c linux-2.6.20.3-ext3cow/fs/ext3cow/file.c --- linux-2.6.20.3/fs/ext3cow/file.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/file.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,147 @@ +/* + * linux/fs/ext3cow/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3cow fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext3cow_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext3cow_release_file (struct inode * inode, struct file * filp) +{ + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + { + mutex_lock(&EXT3COW_I(inode)->truncate_mutex); + ext3cow_discard_reservation(inode); + mutex_unlock(&EXT3COW_I(inode)->truncate_mutex); + } + if (is_dx(inode) && filp->private_data) + ext3cow_htree_free_dir_info(filp->private_data); + + return 0; +} + +static ssize_t +ext3cow_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + struct inode *dir = file->f_path.dentry->d_parent->d_inode; + ssize_t ret = 0; + int err = 0; + + /* This is the place where we create a new version on write -znjp */ + if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ + err = ext3cow_dup_inode(dir, inode); + if(err) + return err; + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + /* + * Skip flushing if there was an error, or if nothing was written. + */ + if (ret <= 0) + return ret; + + /* + * If the inode is IS_SYNC, or is O_SYNC and we are doing data + * journalling then we need to make sure that we force the transaction + * to disk to keep all metadata uptodate synchronously. + */ + if (file->f_flags & O_SYNC) { + /* + * If we are non-data-journaled, then the dirty data has + * already been flushed to backing store by generic_osync_inode, + * and the inode has been flushed too if there have been any + * modifications other than mere timestamp updates. + * + * Open question --- do we care about flushing timestamps too + * if the inode is IS_SYNC? + */ + if (!ext3cow_should_journal_data(inode)) + return ret; + + goto force_commit; + } + + /* + * So we know that there has been no forced data flush. If the inode + * is marked IS_SYNC, we need to force one ourselves. + */ + if (!IS_SYNC(inode)) + return ret; + + /* + * Open question #2 --- should we force data to disk here too? If we + * don't, the only impact is that data=writeback filesystems won't + * flush data to disk automatically on IS_SYNC, only metadata (but + * historically, that is what ext2 has done.) + */ + +force_commit: + err = ext3cow_force_commit(inode->i_sb); + if (err) + return err; + return ret; +} + +const struct file_operations ext3cow_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext3cow_file_write, + .ioctl = ext3cow_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3cow_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = generic_file_open, + .release = ext3cow_release_file, + .fsync = ext3cow_sync_file, + .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +struct inode_operations ext3cow_file_inode_operations = { + .truncate = ext3cow_truncate, + .setattr = ext3cow_setattr, +#ifdef CONFIG_EXT3COW_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3cow_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3cow_permission, +}; + diff -ruN linux-2.6.20.3/fs/ext3cow/fsync.c linux-2.6.20.3-ext3cow/fs/ext3cow/fsync.c --- linux-2.6.20.3/fs/ext3cow/fsync.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/fsync.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,88 @@ +/* + * linux/fs/ext3cow/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3cowfs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * akpm: A new design for ext3cow_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + */ + +int ext3cow_sync_file(struct file * file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int ret = 0; + + J_ASSERT(ext3cow_journal_current_handle() == 0); + + /* + * data=writeback: + * The caller's filemap_fdatawrite()/wait will sync the data. + * sync_inode() will sync the metadata + * + * data=ordered: + * The caller's filemap_fdatawrite() will write the data and + * sync_inode() will write the inode if it is dirty. Then the caller's + * filemap_fdatawait() will wait on the pages. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext3cow_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext3cow_should_journal_data(inode)) { + ret = ext3cow_force_commit(inode->i_sb); + goto out; + } + + /* + * The VFS has written the file data. If the inode is unaltered + * then we need not start a commit. + */ + if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* sys_fsync did this */ + }; + ret = sync_inode(inode, &wbc); + } +out: + return ret; +} diff -ruN linux-2.6.20.3/fs/ext3cow/hash.c linux-2.6.20.3-ext3cow/fs/ext3cow/hash.c --- linux-2.6.20.3/fs/ext3cow/hash.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/hash.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,152 @@ +/* + * linux/fs/ext3cow/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include +#include +#include +#include +#include + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while(--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash (const char *name, int len) +{ + __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + while (len--) { + __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + + if (hash & 0x80000000) hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return (hash0 << 1); +} + +static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext3cowfs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i=0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY: + hash = dx_hack_hash(name, len); + break; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA: + p = name; + while (len > 0) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT3COW_HTREE_EOF << 1)) + hash = (EXT3COW_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff -ruN linux-2.6.20.3/fs/ext3cow/ialloc.c linux-2.6.20.3-ext3cow/fs/ext3cow/ialloc.c --- linux-2.6.20.3/fs/ext3cow/ialloc.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/ialloc.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,764 @@ +/* + * linux/fs/ext3cow/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +read_inode_bitmap(struct super_block * sb, unsigned long block_group) +{ + struct ext3cow_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext3cow_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); + if (!bh) + ext3cow_error(sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %u", + block_group, le32_to_cpu(desc->bg_inode_bitmap)); +error_out: + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3cow_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + unsigned long block_group; + unsigned long bit; + struct ext3cow_group_desc * gdp; + struct ext3cow_super_block * es; + struct ext3cow_sb_info *sbi; + int fatal = 0, err; + + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3cow_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3cow_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3cow_free_inode: inode on nonexistent device\n"); + return; + } + sbi = EXT3COW_SB(sb); + + ino = inode->i_ino; + ext3cow_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); + ext3cow_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + clear_inode (inode); + + es = EXT3COW_SB(sb)->s_es; + if (ino < EXT3COW_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3cow_error (sb, "ext3cow_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3COW_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext3cow_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3cow_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext3cow_error (sb, "ext3cow_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3cow_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3cow_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + gdp->bg_free_inodes_count = cpu_to_le16( + le16_to_cpu(gdp->bg_free_inodes_count) + 1); + if (is_directory) + gdp->bg_used_dirs_count = cpu_to_le16( + le16_to_cpu(gdp->bg_used_dirs_count) - 1); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + } + BUFFER_TRACE(bh2, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext3cow_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent) +{ + int ngroups = EXT3COW_SB(sb)->s_groups_count; + unsigned int freei, avefreei; + struct ext3cow_group_desc *desc, *best_desc = NULL; + struct buffer_head *bh; + int group, best_group = -1; + + freei = percpu_counter_read_positive(&EXT3COW_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext3cow_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + } + } + return best_group; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is prefered, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3COW_I(parent)->i_block_group; + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + struct ext3cow_super_block *es = sbi->s_es; + int ngroups = sbi->s_groups_count; + int inodes_per_group = EXT3COW_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext3cow_fsblk_t freeb, avefreeb; + ext3cow_fsblk_t blocks_per_dir; + unsigned int ndirs; + int max_debt, max_dirs, min_inodes; + ext3cow_grpblk_t min_blocks; + int group = -1, i; + struct ext3cow_group_desc *desc; + struct buffer_head *bh; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT3COW_I(parent)->i_flags & EXT3COW_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + int best_group = -1; + + get_random_bytes(&group, sizeof(group)); + parent_group = (unsigned)group % ngroups; + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + best_group = group; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (best_group >= 0) + return best_group; + goto fallback; + } + + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT3COW_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT3COW_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3cow_fsblk_t)BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return group; + } + +fallback: + for (i = 0; i < ngroups; i++) { + group = (parent_group + i) % ngroups; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return group; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent) +{ + int parent_group = EXT3COW_I(parent)->i_block_group; + int ngroups = EXT3COW_SB(sb)->s_groups_count; + struct ext3cow_group_desc *desc; + struct buffer_head *bh; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + group = (group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return group; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext3cow_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return group; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext3cow_new_inode(handle_t *handle, struct inode * dir, int mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + int group; + unsigned long ino = 0; + struct inode * inode; + struct ext3cow_group_desc * gdp = NULL; + struct ext3cow_super_block * es; + struct ext3cow_inode_info *ei; + struct ext3cow_sb_info *sbi; + int err = 0; + struct inode *ret; + int i; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3COW_I(inode); + + sbi = EXT3COW_SB(sb); + es = sbi->s_es; + if (S_ISDIR(mode)) { + if (test_opt (sb, OLDALLOC)) + group = find_group_dir(sb, dir); + else + group = find_group_orlov(sb, dir); + } else + group = find_group_other(sb, dir); + + err = -ENOSPC; + if (group == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext3cow_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext3cow_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3COW_INODES_PER_GROUP(sb), ino); + if (ino < EXT3COW_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + if (!ext3cow_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); + + if (++ino < EXT3COW_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + ino += group * EXT3COW_INODES_PER_GROUP(sb) + 1; + if (ino < EXT3COW_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3cow_error (sb, "ext3cow_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d, inode=%lu", group, ino); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, bh2); + if (err) goto fail; + spin_lock(sb_bgl_lock(sbi, group)); + gdp->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + if (S_ISDIR(mode)) { + gdp->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + } + spin_unlock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(bh2, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + sb->s_dirt = 1; + + inode->i_uid = current->fsuid; + if (test_opt (sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = ino; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + + /* For versioning -znjp */ + ei->i_cow_bitmap = 0x0000; + ei->i_epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); + ei->i_next_inode = 0; + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + ei->i_flags = EXT3COW_I(dir)->i_flags & ~EXT3COW_INDEX_FL; + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT3COW_IMMUTABLE_FL|EXT3COW_APPEND_FL); + /* dirsync only applies to directories */ + if (!S_ISDIR(mode)) + ei->i_flags &= ~EXT3COW_DIRSYNC_FL; +#ifdef EXT3COW_FRAGMENTS + /* Taken out for versioning -znjp */ + //ei->i_faddr = 0; + //ei->i_frag_no = 0; + //ei->i_frag_size = 0; +#endif + ei->i_file_acl = 0; + ei->i_dir_acl = 0; + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; + + ext3cow_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state = EXT3COW_STATE_NEW; + ei->i_extra_isize = + (EXT3COW_INODE_SIZE(inode->i_sb) > EXT3COW_GOOD_OLD_INODE_SIZE) ? + sizeof(struct ext3cow_inode) - EXT3COW_GOOD_OLD_INODE_SIZE : 0; + + ret = inode; + if(DQUOT_ALLOC_INODE(inode)) { + err = -EDQUOT; + goto fail_drop; + } + + err = ext3cow_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext3cow_init_security(handle,inode, dir); + if (err) + goto fail_free_drop; + + err = ext3cow_mark_inode_dirty(handle, inode); + if (err) { + ext3cow_std_error(sb, err); + goto fail_free_drop; + } + + ext3cow_debug("allocating inode %lu\n", inode->i_ino); + goto really_out; +fail: + ext3cow_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail_free_drop: + DQUOT_FREE_INODE(inode); + +fail_drop: + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3cow_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + struct buffer_head *bitmap_bh = NULL; + struct inode *inode = NULL; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3cow_warning(sb, __FUNCTION__, + "bad orphan ino %lu! e2fsck was run?", ino); + goto out; + } + + block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3COW_INODES_PER_GROUP(sb); + bitmap_bh = read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext3cow_warning(sb, __FUNCTION__, + "inode bitmap error for orphan %lu", ino); + goto out; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3cow_test_bit(bit, bitmap_bh->b_data) || + !(inode = iget(sb, ino)) || is_bad_inode(inode) || + NEXT_ORPHAN(inode) > max_ino) { + ext3cow_warning(sb, __FUNCTION__, + "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_NOTICE "ext3cow_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext3cow_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + } + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode && inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + inode = NULL; + } +out: + brelse(bitmap_bh); + return inode; +} + +unsigned long ext3cow_count_free_inodes (struct super_block * sb) +{ + unsigned long desc_count; + struct ext3cow_group_desc *gdp; + int i; +#ifdef EXT3COWFS_DEBUG + struct ext3cow_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT3COW_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { + gdp = ext3cow_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext3cow_count_free(bitmap_bh, EXT3COW_INODES_PER_GROUP(sb) / 8); + printk("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk("ext3cow_count_free_inodes: stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { + gdp = ext3cow_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext3cow_count_dirs (struct super_block * sb) +{ + unsigned long count = 0; + int i; + + for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { + struct ext3cow_group_desc *gdp = ext3cow_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff -ruN linux-2.6.20.3/fs/ext3cow/inode.c linux-2.6.20.3-ext3cow/fs/ext3cow/inode.c --- linux-2.6.20.3/fs/ext3cow/inode.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/inode.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,3502 @@ +/* + * linux/fs/ext3cow/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3cow_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +static int ext3cow_writepage_trans_blocks(struct inode *inode); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext3cow_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT3COW_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext3cow forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext3cow_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3cow_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3cow_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + return ext3cow_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3cow_journal_revoke"); + err = ext3cow_journal_revoke(handle, blocknr, bh); + if (err) + ext3cow_abort(inode->i_sb, __FUNCTION__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + unsigned long needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext3cow to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT3COW_MAX_TRANS_DATA) + needed = EXT3COW_MAX_TRANS_DATA; + + return EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext3cow_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext3cow_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT3COW_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext3cow_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext3cow_journal_test_restart(handle_t *handle, struct inode *inode) +{ + jbd_debug(2, "restarting handle %p\n", handle); + return ext3cow_journal_restart(handle, blocks_for_truncate(inode)); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext3cow_delete_inode (struct inode * inode) +{ + handle_t *handle; + + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext3cow_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3cow_truncate(inode); + /* + * Kill off the orphan record which ext3cow_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3cow_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3cow_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3cow_orphan_del(handle, inode); + EXT3COW_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3cow_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext3cow_free_inode(handle, inode); + ext3cow_journal_stop(handle); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +//TODO: Delete at some point +/* znjp - used for bitmap testing */ + + static void printbin(u32 val, int size) { + u32 mask; + + mask=(1UL << (size-1)); + while (mask) { + if (mask & val) + printk("1"); + else + printk("0"); + mask /= 2; + } + printk("\n"); + + } + + +/** + * ext3cow_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext3cow uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext3cow_block_to_path(struct inode *inode, + long i_block, int offsets[4], int *boundary) +{ + /* TODO: Check for efficientcy -znjp */ + int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); + const long direct_blocks = EXT3COW_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (ptrs * ptrs); + //double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext3cow_warning (inode->i_sb, "ext3cow_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3COW_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3COW_DIND_BLOCK; + offsets[n++] = (i_block/ptrs); //i_block >> ptrs_bits; + offsets[n++] = (i_block%ptrs); //i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks)/(double_blocks)) < ptrs) { + // } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3COW_TIND_BLOCK; + offsets[n++] = (i_block/double_blocks); //i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block/double_blocks)%ptrs; //(i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block%ptrs; //i_block & (ptrs - 1); + final = ptrs; + } else { + ext3cow_warning(inode->i_sb, "ext3cow_block_to_path", "block > big"); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext3cow_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * If this is COW we set the cow field to 1. We know if it's COW + * because there will already be a key. We need this field so we + * zero out the data already in the buffer. + * The create flag let's us know if were just looking for a block + * to read, or a block to write. We only set the bitmap when + * we're looking for a block to write, either on new allocation + * or on COWing. -znjp + */ +static Indirect *ext3cow_get_branch(struct inode *inode, int depth, + int *offsets, + Indirect chain[4], int *err, int *cow, + int create) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh = NULL; + u32 *bitmap_w = NULL; + int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); + int nbitsperword = (sizeof(u32) * 8); + + *err = 0; + *cow = 0; + + + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT3COW_I(inode)->i_data + *offsets); + if (!p->key){ + /* Set the bitmap on allocation - znjp */ + if(create){ + EXT3COW_I(inode)->i_cow_bitmap |= (1UL << *offsets); + } + goto no_block; + } + + /* Are we writing and COWing any direct blocks? -znjp */ + if(create && !(EXT3COW_I(inode)->i_cow_bitmap & (1UL << *offsets))){ + //printk(KERN_INFO "COWing direct block\n"); + *(p->p) = 0; + p->key = 0; + /* Set the bitamp when COWing -znjp */ + EXT3COW_I(inode)->i_cow_bitmap |= (1UL << *offsets); + *cow = 1; + goto no_block; + } + + while (--depth) { + + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + /* Reader: end */ + /* Find correct bitmap word */ + bitmap_w = (u32*)bh->b_data + ptrs + (*offsets/nbitsperword); + if (!p->key){ + /* Set the bitmap when allocating -znjp */ + if(create){ + *bitmap_w |= (u32)(1UL << (int)(*offsets%nbitsperword)); + } + goto no_block; + } + + /* Are we COWing any indirect blocks? -znjp */ + if(create && !(*bitmap_w & (1UL << (int)(*offsets%nbitsperword)))){ + //printk(KERN_INFO "COWing indirect block\n"); + *(p->p) = 0; + p->key = 0; + /* Set the bitmap -znjp */ + *bitmap_w |= (u32)(1UL << (int)(*offsets%nbitsperword)); + *cow = 1; + goto no_block; + } + } + return NULL; + +changed: + brelse(bh); + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3cow_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the prefered place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext3cow_fsblk_t ext3cow_find_near(struct inode *inode, Indirect *ind) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *p; + ext3cow_fsblk_t bg_start; + ext3cow_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext3cow_group_first_block_no(inode->i_sb, ei->i_block_group); + colour = (current->pid % 16) * + (EXT3COW_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour; +} + +/** + * ext3cow_find_goal - find a prefered place for allocation. + * @inode: owner + * @block: block we want + * @chain: chain of indirect blocks + * @partial: pointer to the last triple within a chain + * @goal: place to store the result. + * + * Normally this function find the prefered place for block allocation, + * stores it in *@goal and returns zero. + */ + +static ext3cow_fsblk_t ext3cow_find_goal(struct inode *inode, long block, + Indirect chain[4], Indirect *partial) +{ + struct ext3cow_block_alloc_info *block_i; + + block_i = EXT3COW_I(inode)->i_block_alloc_info; + + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block_i && (block == block_i->last_alloc_logical_block + 1) + && (block_i->last_alloc_physical_block != 0)) { + return block_i->last_alloc_physical_block + 1; + } + + return ext3cow_find_near(inode, partial); +} + +/** + * ext3cow_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext3cow_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext3cow_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext3cow_alloc_blocks(handle_t *handle, struct inode *inode, + ext3cow_fsblk_t goal, int indirect_blks, int blks, + ext3cow_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0; + int index = 0; + ext3cow_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + target = blks + indirect_blks; + + while (1) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext3cow_new_blocks(handle,inode,goal,&count,err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + + if (count > 0) + break; + } + + /* save the new block number for the first direct block */ + new_blocks[index] = current_block; + + /* total number of blocks allocated for direct blocks */ + ret = count; + *err = 0; + return ret; +failed_out: + for (i = 0; i key). Upon the exit we have the same + * picture as after the successful ext3cow_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3cow_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext3cow_alloc_branch(handle_t *handle, struct inode *inode, + int indirect_blks, int *blks, ext3cow_fsblk_t goal, + int *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext3cow_fsblk_t new_blocks[4]; + ext3cow_fsblk_t current_block; + + u32 *bitmap_w = NULL; + int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); + int nbitsperword = (sizeof(u32) * 8); + + num = ext3cow_alloc_blocks(handle, inode, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3cow_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + /* Mark the cow bitmap for each new indirect block allocated. + * We had to put this here, because get_branch was insufficient + * when allocating an indirect block. -znjp + */ + bitmap_w = (u32*)bh->b_data + ptrs + (offsets[n]/nbitsperword); + *bitmap_w |= (u32)(1UL << (int)(offsets[n]%nbitsperword)); + + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if ( n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3cow_journal_forget(handle, branch[i].bh); + } + for (i = 0; i i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext3cow_splice_branch(handle_t *handle, struct inode *inode, + long block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + struct ext3cow_block_alloc_info *block_i; + ext3cow_fsblk_t current_block; + + block_i = EXT3COW_I(inode)->i_block_alloc_info; + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i ) = cpu_to_le32(current_block++); + } + + /* + * update the most recently allocated logical & physical block + * in i_block_alloc_info, to assist find the proper goal block for next + * allocation + */ + if (block_i) { + block_i->last_alloc_logical_block = block + blks - 1; + block_i->last_alloc_physical_block = + le32_to_cpu(where[num].key) + blks - 1; + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME_SEC; + ext3cow_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3cow_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3cow_journal_forget(handle, where[i].bh); + ext3cow_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + } + ext3cow_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * The BKL may not be held on entry here. Be sure to take it early. + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + */ +int ext3cow_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext3cow_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + int count = 0; + ext3cow_fsblk_t first_block = 0; + int cow = 0; /* To determine wether we clear the buffer of not -znjp */ + + + J_ASSERT(handle != NULL || create == 0); + depth = ext3cow_block_to_path(inode,iblock,offsets,&blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext3cow_get_branch(inode, depth, offsets, + chain, &err, &cow, create); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + if(!cow) /* Don't clear the buffer if it's a COW allocation -znjp */ + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext3cow_fsblk_t blk; + + if (!verify_chain(chain, partial)) { + /* + * Indirect block might be removed by + * truncate while we were reading it. + * Handling of that case: forget what we've + * got now. Flag the err as EAGAIN, so it + * will reread. + */ + err = -EAGAIN; + count = 0; + break; + } + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + if (err != -EAGAIN) + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + mutex_lock(&ei->truncate_mutex); + + /* + * If the indirect block is missing while we are reading + * the chain(ext3cow_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext3cow_get_branch(inode, depth, offsets, + chain, &err, &cow, create); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + /* Don't clear the buffer if we're COWing it -znjp */ + if(!cow) + clear_buffer_new(bh_result); + goto got_it; + } + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) + ext3cow_init_block_alloc_info(inode); + + goal = ext3cow_find_goal(inode, iblock, chain, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext3cow_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * Block out ext3cow_truncate while we alter the tree + */ + err = ext3cow_alloc_branch(handle, inode, indirect_blks, &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext3cow_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext3cow_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + /* + * i_disksize growing is protected by truncate_mutex. Don't forget to + * protect it if you're about to implement concurrent + * ext3cow_get_block() -bzzz + */ + if (!err && extend_disksize && inode->i_size > ei->i_disksize) + ei->i_disksize = inode->i_size; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; +} + +#define DIO_CREDITS (EXT3COW_RESERVE_TRANS_BLOCKS + 32) + +static int ext3cow_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = journal_current_handle(); + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + if (!create) + goto get_block; /* A read */ + + if (max_blocks == 1) + goto get_block; /* A single block get */ + + if (handle->h_transaction->t_state == T_LOCKED) { + /* + * Huge direct-io writes can hold off commits for long + * periods of time. Let this commit run. + */ + ext3cow_journal_stop(handle); + handle = ext3cow_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + goto get_block; + } + + if (handle->h_buffer_credits <= EXT3COW_RESERVE_TRANS_BLOCKS) { + /* + * Getting low on buffer credits... + */ + ret = ext3cow_journal_extend(handle, DIO_CREDITS); + if (ret > 0) { + /* + * Couldn't extend the transaction. Start a new one. + */ + ret = ext3cow_journal_restart(handle, DIO_CREDITS); + } + } + +get_block: + if (ret == 0) { + ret = ext3cow_get_blocks_handle(handle, inode, iblock, + max_blocks, bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + } + return ret; +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3cow_getblk(handle_t *handle, struct inode *inode, + long block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext3cow_get_blocks_handle(handle, inode, block, 1, + &dummy, create, 1); + /* + * ext3cow_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext3cow_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3cow_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data,0,inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext3cow_bread(handle_t *handle, struct inode *inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + + bh = ext3cow_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3cow_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3cow_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3cow_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3cow can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + return ext3cow_journal_get_write_access(handle, bh); +} + +/* + * The idea of this helper function is following: + * if prepare_write has allocated some blocks, but not all of them, the + * transaction must include the content of the newly allocated blocks. + * This content is expected to be set to zeroes by block_prepare_write(). + * 2006/10/14 SAW + */ +static int ext3cow_prepare_failure(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct address_space *mapping; + struct buffer_head *bh, *head, *next; + unsigned block_start, block_end; + unsigned blocksize; + int ret; + handle_t *handle = ext3cow_journal_current_handle(); + + mapping = page->mapping; + if (ext3cow_should_writeback_data(mapping->host)) { + /* optimization: no constraints about data */ +skip: + return ext3cow_journal_stop(handle); + } + + head = page_buffers(page); + blocksize = head->b_size; + for ( bh = head, block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from) + continue; + if (block_start >= to) { + block_start = to; + break; + } + if (!buffer_mapped(bh)) + /* prepare_write failed on this bh */ + break; + if (ext3cow_should_journal_data(mapping->host)) { + ret = do_journal_get_write_access(handle, bh); + if (ret) { + ext3cow_journal_stop(handle); + return ret; + } + } + /* + * block_start here becomes the first block where the current iteration + * of prepare_write failed. + */ + } + if (block_start <= from) + goto skip; + + /* commit allocated and zeroed buffers */ + return mapping->a_ops->commit_write(file, page, from, block_start); +} + +/* Used to quickly unmap all buffers in a page for COWing -znjp */ +static int ext3cow_clear_buffer_mapped(handle_t *handle, + struct buffer_head *bh) +{ + clear_buffer_mapped(bh); + return 0; +} + +static int ext3cow_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret, ret2; + int needed_blocks = ext3cow_writepage_trans_blocks(inode); + handle_t *handle; + int retries = 0; + +retry: + handle = ext3cow_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) + return PTR_ERR(handle); + /* Unset the BH_Mapped flag so get_block is always called -znjp */ + if(page_has_buffers(page)) + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext3cow_clear_buffer_mapped); + + if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) + ret = nobh_prepare_write(page, from, to, ext3cow_get_block); + else + ret = block_prepare_write(page, from, to, ext3cow_get_block); + if (ret) + goto failure; + + if (ext3cow_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + if (ret) + /* fatal error, just put the handle and return */ + journal_stop(handle); + } + return ret; + +failure: + ret2 = ext3cow_prepare_failure(file, page, from, to); + if (ret2 < 0) + return ret2; + if (ret == -ENOSPC && ext3cow_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + /* retry number exceeded, or other error like -EDQUOT */ + return ret; +} + +int ext3cow_journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + ext3cow_journal_abort_handle(__FUNCTION__, __FUNCTION__, + bh, handle,err); + return err; +} + +/* For commit_write() in data=journal mode */ +static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext3cow_journal_dirty_metadata(handle, bh); +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext3cow never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext3cow_ordered_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3cow_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext3cow_journal_dirty_data); + + if (ret == 0) { + /* + * generic_commit_write() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + loff_t new_i_size; + + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > EXT3COW_I(inode)->i_disksize) + EXT3COW_I(inode)->i_disksize = new_i_size; + ret = generic_commit_write(file, page, from, to); + } + ret2 = ext3cow_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +static int ext3cow_writeback_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3cow_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + loff_t new_i_size; + + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > EXT3COW_I(inode)->i_disksize) + EXT3COW_I(inode)->i_disksize = new_i_size; + + if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) + ret = nobh_commit_write(file, page, from, to); + else + ret = generic_commit_write(file, page, from, to); + + ret2 = ext3cow_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +static int ext3cow_journalled_commit_write(struct file *file, + struct page *page, unsigned from, unsigned to) +{ + handle_t *handle = ext3cow_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + int partial = 0; + loff_t pos; + + /* + * Here we duplicate the generic_commit_write() functionality + */ + pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, commit_write_fn); + if (!partial) + SetPageUptodate(page); + if (pos > inode->i_size) + i_size_write(inode, pos); + EXT3COW_I(inode)->i_state |= EXT3COW_STATE_JDATA; + if (inode->i_size > EXT3COW_I(inode)->i_disksize) { + EXT3COW_I(inode)->i_disksize = inode->i_size; + ret2 = ext3cow_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + ret2 = ext3cow_journal_stop(handle); + if (!ret) + ret = ret2; + return ret; +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3cow data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext3cow_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_JDATA) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3COW_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_JDATA; + journal = EXT3COW_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3cow_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +{ + if (buffer_mapped(bh)) + return ext3cow_journal_dirty_data(handle, bh); + return 0; +} + +/* + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext3cow_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext3cow_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext3cow_writepage() + * + * Similar for: + * + * ext3cow_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext3cow_get_block(). We will deadlock on various things like + * lock_journal and i_truncate_mutex. + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. + * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). + * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to bith write() and writepage() in the same + * transction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + * + * AKPM2: if all the page's buffers are mapped to disk and !data=journal, + * we don't need to open a transaction here. + */ +static int ext3cow_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + J_ASSERT(PageLocked(page)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (ext3cow_journal_current_handle()) + goto out_fail; + + handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + + ret = block_write_full_page(page, ext3cow_get_block, wbc); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_bufs is + * safe due to elevated refcount. + */ + + /* + * And attach them to the current transaction. But only if + * block_write_full_page() succeeded. Otherwise they are unmapped, + * and generally junk. + */ + if (ret == 0) { + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, journal_dirty_data_fn); + if (!ret) + ret = err; + } + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3cow_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + if (ext3cow_journal_current_handle()) + goto out_fail; + + handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) + ret = nobh_writepage(page, ext3cow_get_block, wbc); + else + ret = block_write_full_page(page, ext3cow_get_block, wbc); + + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext3cow_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + if (ext3cow_journal_current_handle()) + goto no_write; + + handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } + + if (!page_has_buffers(page) || PageChecked(page)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext3cow_get_block); + if (ret != 0) { + ext3cow_journal_stop(handle); + goto out_unlock; + } + ret = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, commit_write_fn); + if (ret == 0) + ret = err; + EXT3COW_I(inode)->i_state |= EXT3COW_STATE_JDATA; + unlock_page(page); + } else { + /* + * It may be a page full of checkpoint-mode buffers. We don't + * really know unless we go poke around in the buffer_heads. + * But block_write_full_page will do the right thing. + */ + ret = block_write_full_page(page, ext3cow_get_block, wbc); + } + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; +out: + return ret; + +no_write: + redirty_page_for_writepage(wbc, page); +out_unlock: + unlock_page(page); + goto out; +} + +static int ext3cow_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext3cow_get_block); +} + +static int +ext3cow_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext3cow_get_block); +} + +static void ext3cow_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT3COW_JOURNAL(page->mapping->host); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + journal_invalidatepage(journal, page, offset); +} + +static int ext3cow_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT3COW_JOURNAL(page->mapping->host); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. + */ +static ssize_t ext3cow_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + handle_t *handle = NULL; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + + if (rw == WRITE) { + loff_t final_size = offset + count; + + handle = ext3cow_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + if (final_size > inode->i_size) { + ret = ext3cow_orphan_add(handle, inode); + if (ret) + goto out_stop; + orphan = 1; + ei->i_disksize = inode->i_size; + } + } + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext3cow_get_block, NULL); + + /* + * Reacquire the handle: ext3cow_get_block() can restart the transaction + */ + handle = journal_current_handle(); + +out_stop: + if (handle) { + int err; + + if (orphan && inode->i_nlink) + ext3cow_orphan_del(handle, inode); + if (orphan && ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext3cow_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext3cow_mark_inode_dirty(handle, inode); + } + } + err = ext3cow_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext3cow's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext3cow_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext3cow_ordered_aops = { + .readpage = ext3cow_readpage, + .readpages = ext3cow_readpages, + .writepage = ext3cow_ordered_writepage, + .sync_page = block_sync_page, + .prepare_write = ext3cow_prepare_write, + .commit_write = ext3cow_ordered_commit_write, + .bmap = ext3cow_bmap, + .invalidatepage = ext3cow_invalidatepage, + .releasepage = ext3cow_releasepage, + .direct_IO = ext3cow_direct_IO, + .migratepage = buffer_migrate_page, +}; + +static const struct address_space_operations ext3cow_writeback_aops = { + .readpage = ext3cow_readpage, + .readpages = ext3cow_readpages, + .writepage = ext3cow_writeback_writepage, + .sync_page = block_sync_page, + .prepare_write = ext3cow_prepare_write, + .commit_write = ext3cow_writeback_commit_write, + .bmap = ext3cow_bmap, + .invalidatepage = ext3cow_invalidatepage, + .releasepage = ext3cow_releasepage, + .direct_IO = ext3cow_direct_IO, + .migratepage = buffer_migrate_page, +}; + +static const struct address_space_operations ext3cow_journalled_aops = { + .readpage = ext3cow_readpage, + .readpages = ext3cow_readpages, + .writepage = ext3cow_journalled_writepage, + .sync_page = block_sync_page, + .prepare_write = ext3cow_prepare_write, + .commit_write = ext3cow_journalled_commit_write, + .set_page_dirty = ext3cow_journalled_set_page_dirty, + .bmap = ext3cow_bmap, + .invalidatepage = ext3cow_invalidatepage, + .releasepage = ext3cow_releasepage, +}; + +void ext3cow_set_aops(struct inode *inode) +{ + if (ext3cow_should_order_data(inode)) + inode->i_mapping->a_ops = &ext3cow_ordered_aops; + else if (ext3cow_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext3cow_writeback_aops; + else + inode->i_mapping->a_ops = &ext3cow_journalled_aops; +} + +/* + * ext3cow_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext3cow_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from) +{ + ext3cow_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct buffer_head *bh; + int err = 0; + void *kaddr; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + /* + * For "nobh" option, we can only work if we don't need to + * read-in the page - otherwise we create buffers to do the IO. + */ + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext3cow_should_writeback_data(inode) && PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext3cow_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3cow_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3cow_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3cow_should_journal_data(inode)) { + err = ext3cow_journal_dirty_metadata(handle, bh); + } else { + if (ext3cow_should_order_data(inode)) + err = ext3cow_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3cow_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3cow_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3cow_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is refered + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3cow_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3cow_find_shared(struct inode *inode, int depth, + int offsets[4], Indirect chain[4], __le32 *top) +{ + Indirect *partial, *p; + int k, err, cow; + + *top = 0; + /* Make k index the deepest non-null offest + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3cow_get_branch(inode, k, offsets, chain, &err, &cow, 0); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3cow. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void ext3cow_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext3cow_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, bh); + } + ext3cow_mark_inode_dirty(handle, inode); + ext3cow_journal_test_restart(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ext3cow_journal_get_write_access(handle, bh); + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! + */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = sb_find_get_block(inode->i_sb, nr); + ext3cow_forget(handle, 0, inode, bh, nr); + } + } + + ext3cow_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3cow_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks refered from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3cow_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext3cow_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext3cow_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3cow_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3cow_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, this_bh); + } +} + +/** + * ext3cow_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks refered from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext3cow_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext3cow_fsblk_t nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); + u32 *bitmap_word = NULL, *first_block = NULL; + unsigned int count = 0, cur = 0, bcount = 0; + int i = 0; + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3cow_error(inode->i_sb, "ext3cow_free_branches", + "Read failure, inode=%lu, block="E3FSBLK, + inode->i_ino, nr); + continue; + } + /* Only free the branches that have been newly allocated - znjp */ + /* Also, set the bits back to 0 in the bitmap -znjp */ + cur = 0; + count = 0; + bitmap_word = (u32*)bh->b_data + addr_per_block; + + for(bcount = 0; bcount < EXT3COW_COWBITMAPS_PER_IBLOCK(inode->i_sb); + bcount++){ + for(i = 0; i < EXT3COW_COWBITMAP_SIZE; i++, cur++){ + if(cur >= addr_per_block) + goto free; + if(le32_to_cpu(*bitmap_word) & (1UL << i)){ + if(count == 0){ + first_block = (u32*)bh->b_data + cur; + count = 1; + }else if((u32*)first_block + count == (u32*)bh->b_data + cur){ + count++; + }else{ + BUFFER_TRACE(bh, "free child branches"); + ext3cow_free_branches(handle, inode, bh, (u32*)first_block, + (u32*)first_block + count, depth); + first_block = (u32*)bh->b_data + cur; + count = 1; + } + /* Set the bit in the bitmap back to 0 */ + *bitmap_word ^= (1UL << i); + } + } + (u32*)bitmap_word++; + } + free: + if(count){ + BUFFER_TRACE(bh, "free child branches"); + ext3cow_free_branches(handle, inode, bh, (u32*)first_block, + (u32*)first_block + count, depth); + } + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3cow_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. + * + * If this block has already been committed to the + * journal, a revoke record will be written. And + * revoke records must be emitted *before* clearing + * this block's bit in the bitmaps. + */ + ext3cow_forget(handle, 1, inode, bh, bh->b_blocknr); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3cow_mark_inode_dirty(handle, inode); + ext3cow_journal_test_restart(handle, inode); + } + + ext3cow_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3cow_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3cow_free_data(handle, inode, parent_bh, first, last); + } +} + +/* + * ext3cow_truncate() + * + * We block out ext3cow_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3cow_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3cow_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3cow_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3cow filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3cow_truncate() run will find them and release them. + */ +void ext3cow_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + long last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + struct page *page; + + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext3cow_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode) || + EXT3COW_IS_UNCHANGEABLE(inode)) /* znjp */ + return; + + /* If the inode needs to be dup'd, then there are no blocks + * to truncate; they all are part of the previous version. + * - znjp */ + if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ + ext3cow_dup_inode(NULL, inode); + return; + } + + /* + * We have to lock the EOF page here, because lock_page() nests + * outside journal_start(). + */ + if ((inode->i_size & (blocksize - 1)) == 0) { + /* Block boundary? Nothing to do */ + page = NULL; + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + return; /* AKPM: return what? */ + } + + last_block = (inode->i_size + blocksize-1) + >> EXT3COW_BLOCK_SIZE_BITS(inode->i_sb); + + if (page) + ext3cow_block_truncate_page(handle, page, mapping, inode->i_size); + + n = ext3cow_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3cow_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3cow *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + /* + * From here we block out all ext3cow_get_block() callers who want to + * modify the block allocation tree. + */ + mutex_lock(&ei->truncate_mutex); + + if (n == 1) { /* direct blocks */ + unsigned int count = 0; + unsigned long block_to_free = 0; + unsigned long b = 0; + + /* We only want to remove blocks that were allocated in this + * epoch, i.e., have 1 bit in the bitmap. -znjp */ + /* If we're going to truncate a block, we should its + * corresponding bit in the bitmap back to 0, meaning, + * it needs to be allocated - znjp */ + for(b = offsets[0]; b < EXT3COW_NDIR_BLOCKS; b++){ + if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << b)){ + if(count == 0){ + block_to_free = b; + count = 1; + }else if(b == block_to_free + count){ + count++; + }else{ + ext3cow_free_data(handle, inode, NULL, i_data + (int)block_to_free, + i_data + (int)(block_to_free + count)); + block_to_free = b; + count = 1; + } + /* Turn off the bit in the bitmap */ + EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << b); + } + } + if(count > 0) + ext3cow_free_data(handle, inode, NULL, i_data+(int)block_to_free, + i_data + (int)(block_to_free + count)); + goto do_indirects; + } + + partial = ext3cow_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3cow_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext3cow_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3cow_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + /* Unless we don't have to. If the indirect block has a 0 bit + * then all of the children do too, so we can skip the branch - znjp + */ + switch (offsets[0]) { + default: + if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_IND_BLOCK)){ + nr = i_data[EXT3COW_IND_BLOCK]; + if (nr) { + ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT3COW_IND_BLOCK] = 0; + } + /* And set bitmap back to 0 */ + EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_IND_BLOCK); + } + case EXT3COW_IND_BLOCK: + if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_DIND_BLOCK)){ + nr = i_data[EXT3COW_DIND_BLOCK]; + if (nr) { + ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT3COW_DIND_BLOCK] = 0; + } + EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_DIND_BLOCK); + } + case EXT3COW_DIND_BLOCK: + if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_TIND_BLOCK)){ + nr = i_data[EXT3COW_TIND_BLOCK]; + if (nr) { + ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT3COW_TIND_BLOCK] = 0; + } + EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_TIND_BLOCK); + } + case EXT3COW_TIND_BLOCK: + ; + } + + ext3cow_discard_reservation(inode); + + mutex_unlock(&ei->truncate_mutex); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ext3cow_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3cow_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3cow_orphan_del(handle, inode); + + ext3cow_journal_stop(handle); +} + +static ext3cow_fsblk_t ext3cow_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3cow_iloc *iloc) +{ + unsigned long desc, group_desc, block_group; + unsigned long offset; + ext3cow_fsblk_t block; + struct buffer_head *bh; + struct ext3cow_group_desc * gdp; + + if (!ext3cow_valid_inum(sb, ino)) { + /* + * This error is already checked for in namei.c unless we are + * looking at an NFS filehandle, in which case no error + * report is needed + */ + return 0; + } + + block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); + if (block_group >= EXT3COW_SB(sb)->s_groups_count) { + ext3cow_error(sb,"ext3cow_get_inode_block","group >= groups count"); + return 0; + } + smp_rmb(); + group_desc = block_group >> EXT3COW_DESC_PER_BLOCK_BITS(sb); + desc = block_group & (EXT3COW_DESC_PER_BLOCK(sb) - 1); + bh = EXT3COW_SB(sb)->s_group_desc[group_desc]; + if (!bh) { + ext3cow_error (sb, "ext3cow_get_inode_block", + "Descriptor not loaded"); + return 0; + } + + gdp = (struct ext3cow_group_desc *)bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((ino - 1) % EXT3COW_INODES_PER_GROUP(sb)) * + EXT3COW_INODE_SIZE(sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT3COW_BLOCK_SIZE_BITS(sb)); + + iloc->block_group = block_group; + iloc->offset = offset & (EXT3COW_BLOCK_SIZE(sb) - 1); + return block; +} + +/* + * ext3cow_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext3cow_get_inode_loc(struct inode *inode, + struct ext3cow_iloc *iloc, int in_mem) +{ + ext3cow_fsblk_t block; + struct buffer_head *bh; + + block = ext3cow_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (!block) + return -EIO; + + bh = sb_getblk(inode->i_sb, block); + if (!bh) { + ext3cow_error (inode->i_sb, "ext3cow_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + struct ext3cow_group_desc *desc; + int inodes_per_buffer; + int inode_offset, i; + int block_group; + int start; + + block_group = (inode->i_ino - 1) / + EXT3COW_INODES_PER_GROUP(inode->i_sb); + inodes_per_buffer = bh->b_size / + EXT3COW_INODE_SIZE(inode->i_sb); + inode_offset = ((inode->i_ino - 1) % + EXT3COW_INODES_PER_GROUP(inode->i_sb)); + start = inode_offset & ~(inodes_per_buffer - 1); + + /* Is the inode bitmap in cache? */ + desc = ext3cow_get_group_desc(inode->i_sb, + block_group, NULL); + if (!desc) + goto make_io; + + bitmap_bh = sb_getblk(inode->i_sb, + le32_to_cpu(desc->bg_inode_bitmap)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_buffer; i++) { + if (i == inode_offset) + continue; + if (ext3cow_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_buffer) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext3cow_error(inode->i_sb, "ext3cow_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext3cow_get_inode_loc(struct inode *inode, struct ext3cow_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext3cow_get_inode_loc(inode, iloc, + !(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)); +} + +void ext3cow_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT3COW_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT3COW_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3COW_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT3COW_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3COW_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3COW_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +void ext3cow_read_inode(struct inode * inode) +{ + struct ext3cow_iloc iloc; + struct ext3cow_inode *raw_inode; + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + struct buffer_head *bh; + int block; + +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + ei->i_acl = EXT3COW_ACL_NOT_CACHED; + ei->i_default_acl = EXT3COW_ACL_NOT_CACHED; +#endif + ei->i_block_alloc_info = NULL; + + if (__ext3cow_get_inode_loc(inode, &iloc, 0)) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext3cow_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + ei->i_state = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3COW_SB(inode->i_sb)->s_mount_state & EXT3COW_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + /* For versioning -znjp */ + ei->i_cow_bitmap = le32_to_cpu(raw_inode->i_cowbitmap); + ei->i_epoch_number = le32_to_cpu(raw_inode->i_epch_number); + ei->i_next_inode = le32_to_cpu(raw_inode->i_nxt_inode); + +#ifdef EXT3COW_FRAGMENTS + /* Taken out for versioning -znjp */ + //ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + //ei->i_frag_no = raw_inode->i_frag; + //ei->i_frag_size = raw_inode->i_fsize; +#endif + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3COW_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + if (inode->i_ino >= EXT3COW_FIRST_INO(inode->i_sb) + 1 && + EXT3COW_INODE_SIZE(inode->i_sb) > EXT3COW_GOOD_OLD_INODE_SIZE) { + /* + * When mke2fs creates big inodes it does not zero out + * the unused bytes above EXT3COW_GOOD_OLD_INODE_SIZE, + * so ignore those first few inodes. + */ + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT3COW_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT3COW_INODE_SIZE(inode->i_sb)) + goto bad_inode; + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext3cow_inode) - + EXT3COW_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT3COW_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT3COW_XATTR_MAGIC)) + ei->i_state |= EXT3COW_STATE_XATTR; + } + } else + ei->i_extra_isize = 0; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3cow_file_inode_operations; + inode->i_fop = &ext3cow_file_operations; + ext3cow_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3cow_dir_inode_operations; + inode->i_fop = &ext3cow_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext3cow_inode_is_fast_symlink(inode)) + inode->i_op = &ext3cow_fast_symlink_inode_operations; + else { + inode->i_op = &ext3cow_symlink_inode_operations; + ext3cow_set_aops(inode); + } + } else { + inode->i_op = &ext3cow_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse (iloc.bh); + ext3cow_set_inode_flags(inode); + return; + +bad_inode: + make_bad_inode(inode); + return; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext3cow_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3cow_iloc *iloc) +{ + struct ext3cow_inode *raw_inode = ext3cow_raw_inode(iloc); + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT3COW_STATE_NEW) + memset(raw_inode, 0, EXT3COW_SB(inode->i_sb)->s_inode_size); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + + + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + + /* Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + + if(!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); + /* For versioning -znjp */ + raw_inode->i_cowbitmap = cpu_to_le16(EXT3COW_I(inode)->i_cow_bitmap); + raw_inode->i_epch_number = cpu_to_le32(EXT3COW_I(inode)->i_epoch_number); + raw_inode->i_nxt_inode = cpu_to_le32(EXT3COW_I(inode)->i_next_inode); + +#ifdef EXT3COW_FRAGMENTS + /* Taken out for versioning -znjp */ + //raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + //raw_inode->i_frag = ei->i_frag_no; + //raw_inode->i_fsize = ei->i_frag_size; +#endif + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(ei->i_disksize >> 32); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3COW_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3COW_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + err = ext3cow_journal_get_write_access(handle, + EXT3COW_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext3cow_update_dynamic_rev(sb); + EXT3COW_SET_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3cow_journal_dirty_metadata(handle, + EXT3COW_SB(sb)->s_sbh); + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT3COW_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + if (ei->i_extra_isize) + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + rc = ext3cow_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ei->i_state &= ~EXT3COW_STATE_NEW; + +out_brelse: + brelse (bh); + ext3cow_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3cow_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3cow_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext3cow_write_inode(struct inode *inode, int wait) +{ + if (current->flags & PF_MEMALLOC) + return 0; + + if (ext3cow_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (!wait) + return 0; + + return ext3cow_force_commit(inode->i_sb); +} + +/* + * ext3cow_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Called with inode->sem down. + */ +int ext3cow_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + /* For versioning -znjp */ + if(is_unchangeable(inode, dentry)){ + error = -EROFS; + goto err_out; + } + + if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ + error = ext3cow_dup_inode(dentry->d_parent->d_inode, inode); + if(error) + goto err_out; + } + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext3cow_journal_start(inode, 2*(EXT3COW_QUOTA_INIT_BLOCKS(inode->i_sb)+ + EXT3COW_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + if (error) { + ext3cow_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext3cow_mark_inode_dirty(handle, inode); + ext3cow_journal_stop(handle); + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3cow_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3cow_orphan_add(handle, inode); + EXT3COW_I(inode)->i_disksize = attr->ia_size; + rc = ext3cow_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3cow_journal_stop(handle); + } + + rc = inode_setattr(inode, attr); + + /* If inode_setattr's call to ext3cow_truncate failed to get a + * transaction handle at all, we need to clean up the in-core + * orphan list manually. */ + if (inode->i_nlink) + ext3cow_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext3cow_acl_chmod(inode); + +err_out: + ext3cow_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + + +/* + * How many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3COW_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3COW_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +static int ext3cow_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3cow_journal_blocks_per_page(inode); + int indirects = (EXT3COW_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3cow_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + 2; + +#ifdef CONFIG_QUOTA + /* We know that structure was already allocated during DQUOT_INIT so + * we will be updating only the data blocks + inodes */ + ret += 2*EXT3COW_QUOTA_TRANS_BLOCKS(inode->i_sb); +#endif + + return ret; +} + +/* + * The caller must have previously called ext3cow_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext3cow_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext3cow_iloc *iloc) +{ + int err = 0; + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext3cow_do_update_inode() does journal_dirty_metadata */ + err = ext3cow_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext3cow_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3cow_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3cow_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3cow_std_error(inode->i_sb, err); + return err; +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext3cow_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3cow_iloc iloc; + int err; + + if(EXT3COW_IS_FAKEINODE(inode)) + return 0; + + might_sleep(); + err = ext3cow_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext3cow_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3cow_dirty_inode(struct inode *inode) +{ + handle_t *current_handle = ext3cow_journal_current_handle(); + handle_t *handle; + + handle = ext3cow_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __FUNCTION__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3cow_mark_inode_dirty(handle, inode); + } + ext3cow_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3cow_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext3cow_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3cow_iloc iloc; + + int err = 0; + if (handle) { + err = ext3cow_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3cow_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3cow_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3cow_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3COW_JOURNAL(inode); + if (is_journal_aborted(journal) || IS_RDONLY(inode)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT3COW_I(inode)->i_flags |= EXT3COW_JOURNAL_DATA_FL; + else + EXT3COW_I(inode)->i_flags &= ~EXT3COW_JOURNAL_DATA_FL; + ext3cow_set_aops(inode); + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3cow_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3cow_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3cow_journal_stop(handle); + ext3cow_std_error(inode->i_sb, err); + + return err; +} diff -ruN linux-2.6.20.3/fs/ext3cow/ioctl.c linux-2.6.20.3-ext3cow/fs/ext3cow/ioctl.c --- linux-2.6.20.3/fs/ext3cow/ioctl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/ioctl.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,312 @@ +/* + * linux/fs/ext3cow/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int ext3cow_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + unsigned int flags; + unsigned short rsv_window_size; + + ext3cow_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + /* Some IOCTLs for version */ + case EXT3COW_IOC_TAKESNAPSHOT: + return (unsigned int)ext3cow_take_snapshot(inode->i_sb); + case EXT3COW_IOC_GETEPOCH: + return (unsigned int)EXT3COW_S_EPOCHNUMBER(inode->i_sb); + case EXT3COW_IOC_GETFLAGS: + flags = ei->i_flags & EXT3COW_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT3COW_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3cow_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT3COW_DIRSYNC_FL; + + mutex_lock(&inode->i_mutex); + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3COW_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3COW_APPEND_FL | EXT3COW_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + return -EPERM; + } + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3COW_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&inode->i_mutex); + return -EPERM; + } + } + + + handle = ext3cow_journal_start(inode, 1); + if (IS_ERR(handle)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(handle); + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3cow_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3COW_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3COW_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext3cow_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME_SEC; + + err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3cow_journal_stop(handle); + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + + if ((jflag ^ oldflags) & (EXT3COW_JOURNAL_DATA_FL)) + err = ext3cow_change_inode_journal_flag(inode, jflag); + mutex_unlock(&inode->i_mutex); + return err; + } + case EXT3COW_IOC_GETVERSION: + case EXT3COW_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT3COW_IOC_SETVERSION: + case EXT3COW_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3cow_iloc iloc; + __u32 generation; + int err; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(generation, (int __user *) arg)) + return -EFAULT; + + handle = ext3cow_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext3cow_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); + } + ext3cow_journal_stop(handle); + return err; + } +#ifdef CONFIG_JBD_DEBUG + case EXT3COW_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. + */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT3COW_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT3COW_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT3COW_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT3COW_IOC_GETRSVSZ: + if (test_opt(inode->i_sb, RESERVATION) + && S_ISREG(inode->i_mode) + && ei->i_block_alloc_info) { + rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; + return put_user(rsv_window_size, (int __user *)arg); + } + return -ENOTTY; + case EXT3COW_IOC_SETRSVSZ: { + + if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) + return -ENOTTY; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + + if (get_user(rsv_window_size, (int __user *)arg)) + return -EFAULT; + + if (rsv_window_size > EXT3COW_MAX_RESERVE_BLOCKS) + rsv_window_size = EXT3COW_MAX_RESERVE_BLOCKS; + + /* + * need to allocate reservation structure for this inode + * before set the window size + */ + mutex_lock(&ei->truncate_mutex); + if (!ei->i_block_alloc_info) + ext3cow_init_block_alloc_info(inode); + + if (ei->i_block_alloc_info){ + struct ext3cow_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; + rsv->rsv_goal_size = rsv_window_size; + } + mutex_unlock(&ei->truncate_mutex); + return 0; + } + case EXT3COW_IOC_GROUP_EXTEND: { + ext3cow_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + + err = ext3cow_group_extend(sb, EXT3COW_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3COW_SB(sb)->s_journal); + journal_flush(EXT3COW_SB(sb)->s_journal); + journal_unlock_updates(EXT3COW_SB(sb)->s_journal); + + return err; + } + case EXT3COW_IOC_GROUP_ADD: { + struct ext3cow_new_group_data input; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, (struct ext3cow_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + err = ext3cow_group_add(sb, &input); + journal_lock_updates(EXT3COW_SB(sb)->s_journal); + journal_flush(EXT3COW_SB(sb)->s_journal); + journal_unlock_updates(EXT3COW_SB(sb)->s_journal); + + return err; + } + + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext3cow_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int ret; + + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT3COW_IOC32_GETFLAGS: + cmd = EXT3COW_IOC_GETFLAGS; + break; + case EXT3COW_IOC32_SETFLAGS: + cmd = EXT3COW_IOC_SETFLAGS; + break; + case EXT3COW_IOC32_GETVERSION: + cmd = EXT3COW_IOC_GETVERSION; + break; + case EXT3COW_IOC32_SETVERSION: + cmd = EXT3COW_IOC_SETVERSION; + break; + case EXT3COW_IOC32_GROUP_EXTEND: + cmd = EXT3COW_IOC_GROUP_EXTEND; + break; + case EXT3COW_IOC32_GETVERSION_OLD: + cmd = EXT3COW_IOC_GETVERSION_OLD; + break; + case EXT3COW_IOC32_SETVERSION_OLD: + cmd = EXT3COW_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD_DEBUG + case EXT3COW_IOC32_WAIT_FOR_READONLY: + cmd = EXT3COW_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT3COW_IOC32_GETRSVSZ: + cmd = EXT3COW_IOC_GETRSVSZ; + break; + case EXT3COW_IOC32_SETRSVSZ: + cmd = EXT3COW_IOC_SETRSVSZ; + break; + case EXT3COW_IOC_GROUP_ADD: + break; + default: + return -ENOIOCTLCMD; + } + lock_kernel(); + ret = ext3cow_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); + unlock_kernel(); + return ret; +} +#endif diff -ruN linux-2.6.20.3/fs/ext3cow/Makefile linux-2.6.20.3-ext3cow/fs/ext3cow/Makefile --- linux-2.6.20.3/fs/ext3cow/Makefile 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/Makefile 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,12 @@ +# +# Makefile for the linux ext3cow-filesystem routines. +# + +obj-$(CONFIG_EXT3COW_FS) += ext3cow.o + +ext3cow-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o ext3cow_jbd.o + +ext3cow-$(CONFIG_EXT3COW_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext3cow-$(CONFIG_EXT3COW_FS_POSIX_ACL) += acl.o +ext3cow-$(CONFIG_EXT3COW_FS_SECURITY) += xattr_security.o diff -ruN linux-2.6.20.3/fs/ext3cow/namei.c linux-2.6.20.3-ext3cow/fs/ext3cow/namei.c --- linux-2.6.20.3/fs/ext3cow/namei.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/namei.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,2979 @@ +/* + * linux/fs/ext3cow/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "namei.h" +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +/* is the inode marked unchangeable or does the name + contain an epoch less than the current system epoch -znjp */ +int is_unchangeable(struct inode *inode, struct dentry *dentry){ + + char *at = NULL; + + if (inode && (EXT3COW_IS_UNCHANGEABLE(inode) || IS_IMMUTABLE(inode))) + return 1; + if(dentry) + at = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); + if(at && (simple_strtol(&at[1], (char **)NULL, 10) > 0)) + return 1; + + return 0; +} + +static struct buffer_head *ext3cow_append(handle_t *handle, + struct inode *inode, + u32 *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + if ((bh = ext3cow_bread(handle, inode, *block, 1, err))) { + inode->i_size += inode->i_sb->s_blocksize; + EXT3COW_I(inode)->i_disksize = inode->i_size; + ext3cow_journal_get_write_access(handle,bh); + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifndef swap +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u32 offs; +}; + +#ifdef CONFIG_EXT3COW_INDEX +static inline unsigned dx_get_block (struct dx_entry *entry); +static void dx_set_block (struct dx_entry *entry, unsigned value); +static inline unsigned dx_get_hash (struct dx_entry *entry); +static void dx_set_hash (struct dx_entry *entry, unsigned value); +static unsigned dx_get_count (struct dx_entry *entries); +static unsigned dx_get_limit (struct dx_entry *entries); +static void dx_set_count (struct dx_entry *entries, unsigned value); +static void dx_set_limit (struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit (struct inode *dir, unsigned infosize); +static unsigned dx_node_limit (struct inode *dir); +static struct dx_frame *dx_probe(struct dentry *dentry, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release (struct dx_frame *frames); +static int dx_make_map (struct ext3cow_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext3cow_dir_entry_2 *dx_move_dirents (char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext3cow_dir_entry_2* dx_pack_dirents (char *base, int size); +static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); +static int ext3cow_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext3cow_dx_find_entry(struct dentry *dentry, + struct ext3cow_dir_entry_2 **res_dir, int *err); +static int ext3cow_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline unsigned dx_get_block (struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block (struct dx_entry *entry, unsigned value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash (struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit (struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3COW_DIR_REC_LEN(1) - + EXT3COW_DIR_REC_LEN(2) - infosize; + return 0? 20: entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit (struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT3COW_DIR_REC_LEN(0); + return 0? 22: entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index (char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk("%s index ", label); + for (i = 0; i < n; i++) + { + printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3cow_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext3cowfs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT3COW_DIR_REC_LEN(de->name_len); + names++; + } + de = (struct ext3cow_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext3cow_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext3cow_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse (bh); + } + if (bcount) + printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", + names, space/bcount,(space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(struct dentry *dentry, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (dentry) + dir = dentry->d_parent->d_inode; + if (!(bh = ext3cow_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3cow_warning(dir->i_sb, __FUNCTION__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + hinfo->seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; + if (dentry) + ext3cowfs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext3cow_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext3cow_warning(dir->i_sb, __FUNCTION__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + assert(dx_get_limit(entries) == dx_root_limit(dir, + root->info.info_length)); + dxtrace (printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + assert (count && count <= dx_get_limit(entries)); + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext3cow_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + assert (dx_get_limit(entries) == dx_node_limit (dir)); + frame++; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext3cow_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext3cow_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse (p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext3cow_dir_entry_2 *ext3cow_next_entry(struct ext3cow_dir_entry_2 *p) +{ + return (struct ext3cow_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); +} + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, int block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext3cow_dir_entry_2 *de, *top; + int err, count = 0; + + dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); + if (!(bh = ext3cow_bread (NULL, dir, block, 0, &err))) + return err; + + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + top = (struct ext3cow_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT3COW_DIR_REC_LEN(0)); + for (; de < top; de = ext3cow_next_entry(de)) { + if (!ext3cow_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + (block<i_sb)) + +((char *)de - bh->b_data))) { + /* On error, skip the f_pos to the next block. */ + dir_file->f_pos = (dir_file->f_pos | + (dir->i_sb->s_blocksize - 1)) + 1; + brelse (bh); + return count; + } + ext3cowfs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if ((err = ext3cow_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de)) != 0) { + brelse(bh); + return err; + } + count++; + } + brelse(bh); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext3cow_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext3cow_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + int block, err; + int count = 0; + int ret; + __u32 hashval; + + dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, + start_minor_hash)); + dir = dir_file->f_path.dentry->d_inode; + if (!(EXT3COW_I(dir)->i_flags & EXT3COW_INDEX_FL)) { + hinfo.hash_version = EXT3COW_SB(dir->i_sb)->s_def_hash_version; + hinfo.seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + if (!frame) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3cow_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext3cow_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext3cow_dir_entry_2 *) frames[0].bh->b_data; + de = ext3cow_next_entry(de); + if ((err = ext3cow_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext3cow_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +static int dx_make_map (struct ext3cow_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + size) + { + if (de->name_len && de->inode) { + ext3cowfs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u32) ((char *) de - base); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = (struct ext3cow_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + } + return count; +} + +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) + { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) + { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} +#endif + + +static void ext3cow_update_dx_flag(struct inode *inode) +{ + if (!EXT3COW_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3COW_FEATURE_COMPAT_DIR_INDEX)) + EXT3COW_I(inode)->i_flags &= ~EXT3COW_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext3cow_match returns 1 for success, 0 for failure. + * + * `len <= EXT3COW_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext3cow_match (int len, const char * const name, + struct ext3cow_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +/* For versioning - this is the function used when looking for + * names. We now handle names which include the flux token, + * strip it off and continue looking -znjp */ +static inline int search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct dentry *dentry, + unsigned long offset, + struct ext3cow_dir_entry_2 ** res_dir) +{ + struct ext3cow_dir_entry_2 * de; + char * dlimit, * flux = NULL; + int de_len; + char name[EXT3COW_NAME_LEN]; + int namelen = dentry->d_name.len; + unsigned int epoch_number = EXT3COW_I_EPOCHNUMBER(dir); + + /* Get the name for the dentry */ + memcpy(name, dentry->d_name.name, namelen); + name[namelen] = '\0'; + + /* Check to see if the flux token is in the name */ + flux = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); + if(NULL != flux){ + /* If we're here, the name we want is in the past. */ + int new_namelen = strlen(dentry->d_name.name) - strlen(flux); + /* Get the epoch number */ + epoch_number = simple_strtol(&flux[1], (char **)NULL, 10) - 1; + /* If there's a valid epoch number or if we're version listing + * we need the name seperately, otherwise the FLUX_TOKEN exists + * in the file name */ + if(epoch_number + 1 == 0 && (strlen(flux) > 1)){ + /* EXT3COW_FLUX_TOKEN exists in the file name */ + epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); + }else{ + /* Grab the correct name and length */ + memcpy(name, dentry->d_name.name, new_namelen); + name[new_namelen] = '\0'; + namelen = strlen(name); + } + } + + + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + /* Can't just return first entry of something; + * may exist twice if died and same name appears again. - znjp + */ + if ((char *) de + namelen <= dlimit && + ext3cow_match (namelen, name, de) && + EXT3COW_IS_DIRENT_SCOPED(de, epoch_number)) { + /* found a match - just to be sure, do a full check */ + if (!ext3cow_check_dir_entry("ext3cow_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3cow_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext3cow_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext3cow_find_entry (struct dentry *dentry, + struct ext3cow_dir_entry_2 ** res_dir) +{ + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; + int namelen; + const u8 *name; + unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; + blocksize = sb->s_blocksize; + namelen = dentry->d_name.len; + name = dentry->d_name.name; + if (namelen > EXT3COW_NAME_LEN) + return NULL; +#ifdef CONFIG_EXT3COW_INDEX + if (is_dx(dir)) { + bh = ext3cow_dx_find_entry(dentry, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk("ext3cow_find_entry: dx failed, falling back\n")); + } +#endif + nblocks = dir->i_size >> EXT3COW_BLOCK_SIZE_BITS(sb); + start = EXT3COW_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3cow_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext3cow_error(sb, __FUNCTION__, "reading directory #%lu " + "offset %lu", dir->i_ino, block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, dentry, + block << EXT3COW_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT3COW_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3COW_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; +} + +#ifdef CONFIG_EXT3COW_INDEX +static struct buffer_head * ext3cow_dx_find_entry(struct dentry *dentry, + struct ext3cow_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; + struct dx_frame frames[2], *frame; + struct ext3cow_dir_entry_2 *de, *top; + struct buffer_head *bh; + unsigned long block; + int retval; + int namelen = dentry->d_name.len; + const u8 *name = dentry->d_name.name; + struct inode *dir = dentry->d_parent->d_inode; + + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ + if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ + } + hash = hinfo.hash; + do { + block = dx_get_block(frame->at); + if (!(bh = ext3cow_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + top = (struct ext3cow_dir_entry_2 *) ((char *) de + sb->s_blocksize - + EXT3COW_DIR_REC_LEN(0)); + for (; de < top; de = ext3cow_next_entry(de)) + if (ext3cow_match (namelen, name, de)) { + if (!ext3cow_check_dir_entry("ext3cow_find_entry", + dir, de, bh, + (block<b_data))) { + brelse (bh); + goto errout; + } + *res_dir = de; + dx_release (frames); + return bh; + } + brelse (bh); + /* Check to see if we should continue to search */ + retval = ext3cow_htree_next_block(dir, hash, frame, + frames, NULL); + if (retval < 0) { + ext3cow_warning(sb, __FUNCTION__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk("%s not found\n", name)); + dx_release (frames); + return NULL; +} +#endif + +/* ext3cow_lookup: One the key functions of this versioning file sytem, + * allowing people to return to the past. + * + * Two policies for inode chains: + * 1) If it's the head of the list, it's the most current inode + * and always changable. The inode number is static. + * 2) If it's any inode in the chain that's not the head, + * than it's an inode in the past and unchangeable. The inode + * number may change. + */ +static struct dentry *ext3cow_lookup(struct inode * dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode * inode = NULL; + struct ext3cow_dir_entry_2 * de = NULL; + struct buffer_head * bh = NULL; + unsigned int epoch_number = 0; + char * flux = NULL; + + if (dentry->d_name.len > EXT3COW_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + /* Find the epoch number to scope with -znjp + * if the parent is unchangeable, so is the inode + */ + if(EXT3COW_IS_UNCHANGEABLE(dir)) + epoch_number = EXT3COW_I_EPOCHNUMBER(dir); + else + epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); + + bh = ext3cow_find_entry(dentry, &de); + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + if (!ext3cow_valid_inum(dir->i_sb, ino)) { + ext3cow_error(dir->i_sb, "ext3cow_lookup", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(dir->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + + /* Is this a version listing ? */ + if ((char)dentry->d_name.name[dentry->d_name.len - 1] == + EXT3COW_FLUX_TOKEN) { + /* prevent going round in circles */ + if (dentry->d_parent && + dentry->d_parent->d_name.name[dentry->d_parent->d_name.len - 1] == + EXT3COW_FLUX_TOKEN) { + return NULL; + } + /* we fake a directory using the directory inode instead of + * the file one and subsequently force a call to ext3cow_readdir */ + iput(inode); + inode = ext3cow_fake_inode(dir, EXT3COW_S_EPOCHNUMBER(dir->i_sb)); + EXT3COW_I(inode)->i_next_inode = EXT3COW_I(dir)->i_next_inode; + d_splice_alias(inode, dentry); + + return NULL; + } + + /* Is the user time-shifting to the past? */ + flux = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); + if(NULL != flux){ + + if(strnicmp(&flux[1], "onehour", 8) == 0){ + epoch_number = get_seconds() - ONEHOUR; + printk(KERN_INFO "ONEHOUR!\n"); + }else if(strnicmp(&flux[1], "yesterday", 10) == 0 || + strnicmp(&flux[1], "oneday", 7) == 0){ + epoch_number = get_seconds() - YESTERDAY; + }else if(strnicmp(&flux[1], "oneweek", 8) == 0){ + epoch_number = get_seconds() - ONEWEEK; + }else if(strnicmp(&flux[1], "onemonth", 9) == 0){ + epoch_number = get_seconds() - ONEMONTH; + }else if(strnicmp(&flux[1], "oneyear", 8) == 0){ + epoch_number = get_seconds() - ONEYEAR; + }else + epoch_number = simple_strtol(&flux[1], (char **)NULL, 10) - 1; + + /* No future epochs */ + if(epoch_number + 1 > EXT3COW_S_EPOCHNUMBER(dir->i_sb)) + return ERR_PTR(-ENOENT); + + /* Move to present + if(epoch_number + 1 == 0) + epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); + */ + } + + /* Find correct inode in chain */ + while(EXT3COW_I_EPOCHNUMBER(inode) > epoch_number){ + + printk(KERN_INFO "Looking for %u with epoch %u\n", epoch_number, + EXT3COW_I_EPOCHNUMBER(inode)); + + ino = EXT3COW_I(inode)->i_next_inode; + if(ino == 0){ + ext3cow_warning(dir->i_sb, "ext3cow_lookup", + "Next inode is 0 in lookup."); + iput(inode); + return ERR_PTR(-ENOENT); + } + iput(inode); /* for correct usage count (i_count) */ + inode = iget(dir->i_sb, ino); + + if (!inode){ + ext3cow_warning(dir->i_sb, "ext3cow_lookup", + "Could not access inode number %lu", + ino); + return ERR_PTR(-EACCES); + } + } + + /* If we're in the past, fake the inode for scoping and "unchangability" */ + if(flux || (epoch_number != EXT3COW_S_EPOCHNUMBER(dir->i_sb))){ + printk(KERN_INFO "Faking %s\n", dentry->d_name.name); + inode = ext3cow_fake_inode(inode, epoch_number); + } + + if (!inode) + return ERR_PTR(-EACCES); + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext3cow_get_parent(struct dentry *child) +{ + unsigned long ino; + struct dentry *parent; + struct inode *inode; + struct dentry dotdot; + struct ext3cow_dir_entry_2 * de; + struct buffer_head *bh; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_parent = child; /* confusing, isn't it! */ + + bh = ext3cow_find_entry(&dotdot, &de); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext3cow_valid_inum(child->d_inode->i_sb, ino)) { + ext3cow_error(child->d_inode->i_sb, "ext3cow_get_parent", + "bad inode number: %lu", ino); + inode = NULL; + } else + inode = iget(child->d_inode->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + return parent; +} + +#define S_SHIFT 12 +static unsigned char ext3cow_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT3COW_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT3COW_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT3COW_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT3COW_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT3COW_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT3COW_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT3COW_FT_SYMLINK, +}; + +static inline void ext3cow_set_de_type(struct super_block *sb, + struct ext3cow_dir_entry_2 *de, + umode_t mode) { + if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3cow_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +#ifdef CONFIG_EXT3COW_INDEX +static struct ext3cow_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext3cow_dir_entry_2 *de = (struct ext3cow_dir_entry_2 *) (from + map->offs); + rec_len = EXT3COW_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext3cow_dir_entry_2 *) to)->rec_len = + cpu_to_le16(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext3cow_dir_entry_2 *) (to - rec_len); +} + +static struct ext3cow_dir_entry_2* dx_pack_dirents(char *base, int size) +{ + struct ext3cow_dir_entry_2 *next, *to, *prev, *de = (struct ext3cow_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + size) { + next = (struct ext3cow_dir_entry_2 *) ((char *) de + + le16_to_cpu(de->rec_len)); + if (de->inode && de->name_len) { + rec_len = EXT3COW_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = cpu_to_le16(rec_len); + prev = to; + to = (struct ext3cow_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +static struct ext3cow_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + u32 newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split; + struct ext3cow_dir_entry_2 *de = NULL, *de2; + int err; + + bh2 = ext3cow_append (handle, dir, &newblock, error); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, *bh); + if (err) { + journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext3cow_std_error(dir->i_sb, err); + goto errout; + } + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map ((struct ext3cow_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + split = count/2; // need to adjust to actual middle + dx_sort_map (map, count); + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1,blocksize); + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext3cow_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext3cow_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block (frame, hash2 + continued, newblock); + err = ext3cow_journal_dirty_metadata (handle, bh2); + if (err) + goto journal_error; + err = ext3cow_journal_dirty_metadata (handle, frame->bh); + if (err) + goto journal_error; + brelse (bh2); + dxtrace(dx_show_index ("frame", frame->entries)); +errout: + return de; +} +#endif + + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext3cow_dir_entry_2 *de, + struct buffer_head * bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT3COW_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext3cow_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3cow_check_dir_entry("ext3cow_add_entry", dir, de, + bh, offset)) { + brelse (bh); + ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); + return -EIO; + } + /* If name exists and it's still alive, no add. But if it's a new + * name in this scope, ok to add. -znjp */ + if (ext3cow_match (namelen, name, de) && EXT3COW_IS_DIRENT_ALIVE(de)) { + brelse (bh); + return -EEXIST; + } + nlen = EXT3COW_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext3cow_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, bh); + if (err) { + ext3cow_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT3COW_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); + if (de->inode) { + struct ext3cow_dir_entry_2 *de1 = (struct ext3cow_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = cpu_to_le16(rlen - nlen); + de->rec_len = cpu_to_le16(nlen); + de = de1; + } + de->file_type = EXT3COW_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3cow_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + /* For versioning -znjp */ + de->birth_epoch = cpu_to_le32(EXT3COW_S_EPOCHNUMBER(dir->i_sb)); + de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3cow_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + ext3cow_update_dx_flag(dir); + dir->i_version++; + ext3cow_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + err = ext3cow_journal_dirty_metadata(handle, bh); + if (err) + ext3cow_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +#ifdef CONFIG_EXT3COW_INDEX +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext3cow_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + u32 block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk("Creating index\n")); + retval = ext3cow_journal_get_write_access(handle, bh); + if (retval) { + ext3cow_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + bh2 = ext3cow_append (handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT3COW_I(dir)->i_flags |= EXT3COW_INDEX_FL; + data1 = bh2->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3cow_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); + len = ((char *) root) + blocksize - (char *) de; + memcpy (data1, de, len); + de = (struct ext3cow_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) + de = de2; + de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext3cow_dir_entry_2 *) (&root->dotdot); + de->rec_len = cpu_to_le16(blocksize - EXT3COW_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT3COW_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block (entries, 1); + dx_set_count (entries, 1); + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + hinfo.seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; + ext3cowfs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + dx_release (frames); + if (!(de)) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} +#endif + +/* + * ext3cow_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3cow_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext3cow_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + unsigned long offset; + struct buffer_head * bh; + struct ext3cow_dir_entry_2 *de; + struct super_block * sb; + int retval; +#ifdef CONFIG_EXT3COW_INDEX + int dx_fallback=0; +#endif + unsigned blocksize; + u32 block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + /* No additions in the past -znjp */ + if(is_unchangeable(dir, dentry)) + return -EROFS; + + if(EXT3COW_S_EPOCHNUMBER(sb) > EXT3COW_I_EPOCHNUMBER(dir)){ + if(ext3cow_dup_inode(dentry->d_parent->d_parent->d_inode, dir)) + return -1; + } + +#ifdef CONFIG_EXT3COW_INDEX + if (is_dx(dir)) { + retval = ext3cow_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)){ + ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); + return retval; + } + EXT3COW_I(dir)->i_flags &= ~EXT3COW_INDEX_FL; + dx_fallback++; + ext3cow_mark_inode_dirty(handle, dir); + } +#endif + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext3cow_bread(handle, dir, block, 0, &retval); + if(!bh){ + ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); + return retval; + } + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + +#ifdef CONFIG_EXT3COW_INDEX + if (blocks == 1 && !dx_fallback && + EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); +#endif + brelse(bh); + } + + bh = ext3cow_append(handle, dir, &block, &retval); + if (!bh){ + ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); + return retval; + } + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(blocksize); + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +#ifdef CONFIG_EXT3COW_INDEX +/* + * Returns 0 for success, or a negative error value + */ +static int ext3cow_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head * bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block * sb = dir->i_sb; + struct ext3cow_dir_entry_2 *de; + int err; + + frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext3cow_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk("using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + u32 newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext3cow_warning(sb, __FUNCTION__, + "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext3cow_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext3cow_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy ((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count (entries, icount1); + dx_set_count (entries2, icount2); + dx_set_limit (entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block (frames + 0, hash2, newblock); + dxtrace(dx_show_index ("node", frames[1].entries)); + dxtrace(dx_show_index ("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext3cow_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk("Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext3cow_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + ext3cow_journal_dirty_metadata(handle, frames[0].bh); + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext3cow_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} +#endif + +/* + * ext3cow_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext3cow_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3cow_dir_entry_2 * de_del, + struct buffer_head * bh, + struct dentry *dentry) +{ + struct ext3cow_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3cow_check_dir_entry("ext3cow_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + /* Can't delete an already dead entry - znjp */ + if(!EXT3COW_IS_DIRENT_ALIVE(de)) + return 0; + + if(EXT3COW_S_EPOCHNUMBER(dir->i_sb) > EXT3COW_I_EPOCHNUMBER(dir)){ + if(ext3cow_dup_inode(dentry->d_parent->d_parent->d_inode, dir)) + return -1; + } + + BUFFER_TRACE(bh, "get_write_access"); + ext3cow_journal_get_write_access(handle, bh); + /* There used to be code here to adjust the rec_len + * but since names really never go away, the code was deleted + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + */ + /* Mark it dead - znjp */ + de->death_epoch = cpu_to_le32(EXT3COW_I_EPOCHNUMBER(dir)); + dir->i_version++; + BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, bh); + return 0; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ext3cow_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + return -ENOENT; +} + +/* + * ext3cow_mark_inode_dirty is somewhat expensive, so unlike ext2 we + * do not perform it in these functions. We perform it at the call site, + * if it is needed. + */ +static inline void ext3cow_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); +} + +static inline void ext3cow_dec_count(handle_t *handle, struct inode *inode) +{ + drop_nlink(inode); +} + +static int ext3cow_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext3cow_add_entry(handle, dentry, inode); + if (!err) { + ext3cow_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + return 0; + } + ext3cow_dec_count(handle, inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext3cow_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode * inode; + int err, retries = 0; + + /* Can't create in the past -znjp */ + if(is_unchangeable(dir, dentry)) + return -EROFS; + +retry: + handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3cow_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3cow_file_inode_operations; + inode->i_fop = &ext3cow_file_operations; + ext3cow_set_aops(inode); + err = ext3cow_add_nondir(handle, dentry, inode); + } + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3cow_mknod (struct inode * dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + +retry: + handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3cow_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT3COW_FS_XATTR + inode->i_op = &ext3cow_special_inode_operations; +#endif + err = ext3cow_add_nondir(handle, dentry, inode); + } + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3cow_mkdir(struct inode * dir, struct dentry * dentry, int mode) +{ + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block; + struct ext3cow_dir_entry_2 * de; + int err, retries = 0; + + if (dir->i_nlink >= EXT3COW_LINK_MAX) + return -EMLINK; + /* No mkdirs in the past -znjp */ + if(is_unchangeable(dir, dentry)) + return -EROFS; + + +retry: + handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3cow_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext3cow_dir_inode_operations; + inode->i_fop = &ext3cow_dir_operations; + inode->i_size = EXT3COW_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext3cow_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + drop_nlink(inode); /* is this nlink == 0? */ + ext3cow_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + BUFFER_TRACE(dir_block, "get_write_access"); + ext3cow_journal_get_write_access(handle, dir_block); + de = (struct ext3cow_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = cpu_to_le16(EXT3COW_DIR_REC_LEN(de->name_len)); + /* For versioning -znjp */ + de->birth_epoch = cpu_to_le32(EXT3COW_S_EPOCHNUMBER(dir->i_sb)); + de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); + strcpy (de->name, "."); + ext3cow_set_de_type(dir->i_sb, de, S_IFDIR); + de = (struct ext3cow_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3COW_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3cow_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + /* For versioning -znjp */ + de->birth_epoch = cpu_to_le32(EXT3COW_I_EPOCHNUMBER(dir)); + de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); + BUFFER_TRACE(dir_block, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); + ext3cow_mark_inode_dirty(handle, inode); + err = ext3cow_add_entry (handle, dentry, inode); + if (err) { + inode->i_nlink = 0; + ext3cow_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + inc_nlink(dir); + ext3cow_update_dx_flag(dir); + ext3cow_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +out_stop: + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir (struct inode * inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ext3cow_dir_entry_2 * de, * de1; + struct super_block * sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT3COW_DIR_REC_LEN(1) + EXT3COW_DIR_REC_LEN(2) || + !(bh = ext3cow_bread (NULL, inode, 0, 0, &err))) { + if (err) + ext3cow_error(inode->i_sb, __FUNCTION__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext3cow_warning(inode->i_sb, __FUNCTION__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + de1 = (struct ext3cow_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3cow_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ext3cow_dir_entry_2 *) + ((char *) de1 + le16_to_cpu(de1->rec_len)); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse (bh); + bh = ext3cow_bread (NULL, inode, + offset >> EXT3COW_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + if (err) + ext3cow_error(sb, __FUNCTION__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext3cow_dir_entry_2 *) bh->b_data; + } + if (!ext3cow_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext3cow_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + /* Can remove a dir only if all dirents are out of scope -znjp */ + if (le32_to_cpu(de->inode) && + EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(inode))) { + brelse (bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3cow_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return 1; +} + +/* ext3cow_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3cow_orphan_cleanup(). + */ +int ext3cow_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext3cow_iloc iloc; + int err = 0, rc; + + lock_super(sb); + if (!list_empty(&EXT3COW_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3cow_orphan_add(). We block + * here (on lock_super()), so race with ext3cow_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT3COW_SB(sb)->s_sbh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext3cow_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3COW_SB(sb)->s_es->s_last_orphan); + EXT3COW_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); + rc = ext3cow_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&EXT3COW_I(inode)->i_orphan, &EXT3COW_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + unlock_super(sb); + ext3cow_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3cow_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext3cow_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext3cow_inode_info *ei = EXT3COW_I(inode); + struct ext3cow_sb_info *sbi; + unsigned long ino_next; + struct ext3cow_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); + if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT3COW_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3cow_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3cow_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3cow_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3cow_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext3cow_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext3cow_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3cow_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext3cow_std_error(inode->i_sb, err); +out: + unlock_super(inode->i_sb); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext3cow_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3cow_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext3cow_journal_start(dir, EXT3COW_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3cow_find_entry (dentry, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + + /* Can't rmdir in the past -znjp */ + retval = -EROFS; + if(is_unchangeable(inode, dentry)) + goto end_rmdir; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3cow_delete_entry(handle, dir, de, bh, dentry); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3cow_warning (inode->i_sb, "ext3cow_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version++; + + /* We only delete things that were created in the same epoch -znjp */ + if(de->birth_epoch == de->death_epoch){ + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext3cow_orphan_add(handle, inode); + drop_nlink(dir); + } + EXT3COW_I(inode)->i_flags |= EXT3COW_UNCHANGEABLE_FL; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3cow_mark_inode_dirty(handle, inode); + ext3cow_update_dx_flag(dir); + ext3cow_mark_inode_dirty(handle, dir); + +end_rmdir: + ext3cow_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3cow_unlink(struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3cow_dir_entry_2 * de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext3cow_journal_start(dir, EXT3COW_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3cow_find_entry (dentry, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + /* Can't unlink in the past -znjp */ + retval = -EROFS; + if(is_unchangeable(inode, dentry)) + goto end_unlink; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3cow_warning (inode->i_sb, "ext3cow_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3cow_delete_entry(handle, dir, de, bh, dentry); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + ext3cow_update_dx_flag(dir); + ext3cow_mark_inode_dirty(handle, dir); + + /* If the file should be deleted here, don't actually delete it + * but mark it unchangeable, i.e. it's now in the past. -znjp */ + + /* If file was created in this epoch, then we actually unlink it, + * if not, then it belongs to the past, so mark it unchangeable -znjp */ + if(de->birth_epoch == de->death_epoch){ + drop_nlink(inode); + if (!inode->i_nlink){ + ext3cow_orphan_add(handle, inode); + } + }else{ + if(!(inode->i_nlink - 1)) + EXT3COW_I(inode)->i_flags |= EXT3COW_UNCHANGEABLE_FL; + } + inode->i_ctime = dir->i_ctime; + ext3cow_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext3cow_journal_stop(handle); + brelse (bh); + return retval; +} + +static int ext3cow_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) +{ + handle_t *handle; + struct inode * inode; + int l, err, retries = 0; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +retry: + handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 5 + + 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext3cow_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > sizeof (EXT3COW_I(inode)->i_data)) { + inode->i_op = &ext3cow_symlink_inode_operations; + ext3cow_set_aops(inode); + /* + * page_symlink() calls into ext3cow_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). + */ + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + if (err) { + ext3cow_dec_count(handle, inode); + ext3cow_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + } else { + inode->i_op = &ext3cow_fast_symlink_inode_operations; + memcpy((char*)&EXT3COW_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } + EXT3COW_I(inode)->i_disksize = inode->i_size; + err = ext3cow_add_nondir(handle, dentry, inode); +out_stop: + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext3cow_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (inode->i_nlink >= EXT3COW_LINK_MAX) + return -EMLINK; + +retry: + handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME_SEC; + ext3cow_inc_count(handle, inode); + atomic_inc(&inode->i_count); + + err = ext3cow_add_nondir(handle, dentry, inode); + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + ((struct ext3cow_dir_entry_2 *) ((char *) buffer + \ + le16_to_cpu(((struct ext3cow_dir_entry_2 *) buffer)->rec_len)))->inode + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext3cow_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) +{ + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3cow_dir_entry_2 * old_de, * new_de; + int retval; + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + DQUOT_INIT(new_dentry->d_inode); + handle = ext3cow_journal_start(old_dir, 2 * + EXT3COW_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3cow_find_entry (old_dentry, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext3cow_find_entry (new_dentry, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } + } + + /* can't move something into the past -znjp */ + retval = -EROFS; + if(is_unchangeable(new_inode, new_dentry)) + goto end_rename; + /* can't some move from the past -znjp */ + if(is_unchangeable(old_inode, old_dentry)) + goto end_rename; + + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3cow_bread (handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3COW_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3cow_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + ext3cow_journal_get_write_access(handle, new_bh); + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT3COW_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3COW_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + BUFFER_TRACE(new_bh, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, new_bh); + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ext3cow_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext3cow_delete_entry(handle, old_dir, + old_de, old_bh, new_dentry)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext3cow_dir_entry_2 *old_de2; + + old_bh2 = ext3cow_find_entry(old_dentry, &old_de2); + if (old_bh2) { + retval = ext3cow_delete_entry(handle, old_dir, + old_de2, old_bh2, new_dentry); + brelse(old_bh2); + } + } + if (retval) { + ext3cow_warning(old_dir->i_sb, "ext3cow_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME_SEC; + } + if(!is_unchangeable(old_inode, old_dentry)) + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + ext3cow_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3cow_journal_get_write_access(handle, dir_bh); + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3cow_journal_dirty_metadata"); + ext3cow_journal_dirty_metadata(handle, dir_bh); + if (!new_inode) { + inc_nlink(new_dir); + ext3cow_update_dx_flag(new_dir); + ext3cow_mark_inode_dirty(handle, new_dir); + } + } + ext3cow_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3cow_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3cow_orphan_add(handle, new_inode); + } + retval = 0; + +end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3cow_journal_stop(handle); + return retval; +} + +/* ext3cow_fake_inode: This function creates a VFS-only inode + * used for properly scoping views into the past file system - znjp + */ +struct inode *ext3cow_fake_inode(struct inode *inode, + unsigned int epoch_number) +{ + struct inode * fake_inode = NULL; + struct ext3cow_inode_info * ini = NULL; + struct ext3cow_inode_info * fake_ini = NULL; + static unsigned int last_ino = UINT_MAX; + int err = 0; + int block = -1; + + if(NULL == inode){ + printk(KERN_ERR "Trying to duplicate a NULL inode.\n"); + return NULL; + } + + if(EXT3COW_IS_FAKEINODE(inode)){ + printk(KERN_ERR "Trying to fake a fake inode.\n"); + return inode; + } + + printk(KERN_INFO "** faking inode %lu\n", inode->i_ino); + + ini = EXT3COW_I(inode); + + /* Create a new VFS-only inode */ + fake_inode = new_inode(inode->i_sb); + err = PTR_ERR(fake_inode); + if(!IS_ERR(fake_inode)){ + + fake_ini = EXT3COW_I(fake_inode); + + printk(KERN_INFO "** got inode %lu setting with %u\n", fake_inode->i_ino, + last_ino); + + /* When inode is a directory, we can fake the inode number */ + //if(S_ISDIR(inode->i_mode)) + fake_inode->i_ino = --last_ino; + + fake_inode->i_mode = inode->i_mode; + fake_inode->i_uid = inode->i_uid; + fake_inode->i_gid = inode->i_gid; + + atomic_set(&fake_inode->i_count, 1); + + fake_inode->i_nlink = inode->i_nlink; + fake_inode->i_size = inode->i_size; + fake_inode->i_atime.tv_sec = inode->i_atime.tv_sec; + fake_inode->i_ctime.tv_sec = inode->i_ctime.tv_sec; + fake_inode->i_mtime.tv_sec = inode->i_mtime.tv_sec; + fake_inode->i_atime.tv_nsec = inode->i_atime.tv_nsec; + fake_inode->i_ctime.tv_nsec = inode->i_ctime.tv_nsec; + fake_inode->i_mtime.tv_nsec = inode->i_mtime.tv_nsec; + + fake_ini->i_state = ini->i_state; + fake_ini->i_dir_start_lookup = ini->i_dir_start_lookup; + fake_ini->i_dtime = ini->i_dtime; + + fake_inode->i_blocks = inode->i_blocks; + fake_ini->i_flags = ini->i_flags; +#ifdef EXT3COW_FRAGMENTS + /* Taken out for versioning -znjp */ + //fake_ini->i_faddr = ini->i_faddr; + //fake_ini->i_frag_no = ini->i_frag_no; + //fake_ini->i_frag_size = ini->i_frag_size; +#endif + fake_ini->i_file_acl = ini->i_file_acl; + if (!S_ISREG(fake_inode->i_mode)) { + fake_ini->i_dir_acl = ini->i_dir_acl; + } + fake_ini->i_disksize = inode->i_size; + fake_inode->i_generation = inode->i_generation; + //TODO: This could be wrong. + //fake_ini->i_block_group = ini->i_block_group; //iloc.block_group; + + for (block = 0; block < EXT3COW_N_BLOCKS; block++) + fake_ini->i_data[block] = ini->i_data[block]; + + fake_ini->i_extra_isize = ini->i_extra_isize; + + /* set copy-on-write bitmap to 0 */ + fake_ini->i_cow_bitmap = 0x0000; + + /* Mark fake inode unchangeable, etc. */ + fake_ini->i_flags |= EXT3COW_UNCHANGEABLE_FL; + fake_ini->i_flags |= EXT3COW_UNVERSIONABLE_FL; + fake_ini->i_flags |= EXT3COW_FAKEINODE_FL; + fake_ini->i_flags |= EXT3COW_IMMUTABLE_FL; + + /* Make sure we get the right operations */ + if (S_ISREG(fake_inode->i_mode)) { + fake_inode->i_op = &ext3cow_file_inode_operations; + fake_inode->i_fop = &ext3cow_file_operations; + ext3cow_set_aops(fake_inode); + } else if (S_ISDIR(fake_inode->i_mode)) { + fake_inode->i_op = &ext3cow_dir_inode_operations; + fake_inode->i_fop = &ext3cow_dir_operations; + } else if (S_ISLNK(fake_inode->i_mode)) { + //if (ext3cow_inode_is_fast_symlink(cow_inode)) + if((S_ISLNK(fake_inode->i_mode) && fake_inode->i_blocks - + (EXT3COW_I(fake_inode)->i_file_acl ? + (fake_inode->i_sb->s_blocksize >> 9) : 0))) + fake_inode->i_op = &ext3cow_fast_symlink_inode_operations; + else { + fake_inode->i_op = &ext3cow_symlink_inode_operations; + ext3cow_set_aops(fake_inode); + } + } else { + fake_inode->i_op = &ext3cow_special_inode_operations; + } + + fake_ini->i_epoch_number = epoch_number; + fake_ini->i_next_inode = 0; + + iput(inode); /* dec i_count */ + + return fake_inode; + }else + ext3cow_warning(inode->i_sb, "ext3cow_fake_inode", + "Could not create fake inode."); + + return NULL; +} + +/* + * ext3cow_dup_inode: This function creates a new inode, + * copies all the metadata from the passed in inode, + * and adds it to the version chain, creating a new version. + * The head of the chain never changes; it is always the most current version. + * Similar in nature to ext3cow_creat and ext3cow_read_inode. -znjp + */ +int ext3cow_dup_inode(struct inode *dir, struct inode *inode){ + + struct inode *cow_inode = NULL; + struct inode *parent = NULL; + struct ext3cow_inode_info *ini = NULL; + struct ext3cow_inode_info *cow_ini = NULL; + handle_t *handle = NULL; + int err = 0; + int block = -1; + unsigned int epoch_number_temp = 0; + int retries = 0; + + printk(KERN_INFO "** duping inode %lu\n", inode->i_ino); + + if(EXT3COW_IS_UNVERSIONABLE(inode)) + return 0; + + if(NULL == inode){ + printk(KERN_ERR "Trying to duplicate a NULL inode.\n"); + return -1; + } + + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT3COW_SB(inode->i_sb)->s_mount_state & EXT3COW_ORPHAN_FS)) { + /* this inode is deleted */ + return -1; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + + ini = EXT3COW_I(inode); + + /* This is for truncate, which can't pass in a parent */ + if(NULL == dir) + parent = inode; + else + parent = dir; + + retry: + handle = ext3cow_journal_start(parent, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(parent)) + handle->h_sync = 1; + + cow_inode = ext3cow_new_inode (handle, parent, inode->i_mode); + err = PTR_ERR(cow_inode); + if (!IS_ERR(cow_inode)) { + + printk(KERN_INFO " ** Allocated new inode %lu\n", cow_inode->i_ino); + + cow_ini = EXT3COW_I(cow_inode); + + cow_inode->i_mode = inode->i_mode; + cow_inode->i_uid = inode->i_uid; + cow_inode->i_gid = inode->i_gid; + + cow_inode->i_nlink = inode->i_nlink; + cow_inode->i_size = inode->i_size; + cow_inode->i_atime.tv_sec = inode->i_atime.tv_sec; + cow_inode->i_ctime.tv_sec = inode->i_ctime.tv_sec; + cow_inode->i_mtime.tv_sec = inode->i_mtime.tv_sec; + cow_inode->i_atime.tv_nsec = inode->i_atime.tv_nsec; + cow_inode->i_ctime.tv_nsec = inode->i_ctime.tv_nsec; + cow_inode->i_mtime.tv_nsec = inode->i_mtime.tv_nsec; + + cow_ini->i_state = ini->i_state; + cow_ini->i_dir_start_lookup = ini->i_dir_start_lookup; + cow_ini->i_dtime = ini->i_dtime; + + cow_inode->i_blocks = inode->i_blocks; + cow_ini->i_flags = ini->i_flags; +#ifdef EXT3COW_FRAGMENTS + /* Taken out for versioning -znjp */ + //cow_ini->i_faddr = ini->i_faddr; + //cow_ini->i_frag_no = ini->i_frag_no; + //cow_ini->i_frag_size = ini->i_frag_size; +#endif + cow_ini->i_file_acl = ini->i_file_acl; + if (!S_ISREG(cow_inode->i_mode)) { + cow_ini->i_dir_acl = ini->i_dir_acl; + } + cow_ini->i_disksize = inode->i_size; + cow_inode->i_generation = inode->i_generation; + //TODO: This could be wrong. + cow_ini->i_block_group = ini->i_block_group; //iloc.block_group; + + for (block = 0; block < EXT3COW_N_BLOCKS; block++) + cow_ini->i_data[block] = ini->i_data[block]; + + //TODO: This could be wrong + //cow_ini->i_orphan = NULL; //INIT_LIST_HEAD(&ei->i_orphan); + + cow_ini->i_extra_isize = ini->i_extra_isize; + + /* Make sure we get the right operations */ + if (S_ISREG(cow_inode->i_mode)) { + cow_inode->i_op = &ext3cow_file_inode_operations; + cow_inode->i_fop = &ext3cow_file_operations; + ext3cow_set_aops(cow_inode); + } else if (S_ISDIR(cow_inode->i_mode)) { + cow_inode->i_op = &ext3cow_dir_inode_operations; + cow_inode->i_fop = &ext3cow_dir_operations; + } else if (S_ISLNK(cow_inode->i_mode)) { + //if (ext3cow_inode_is_fast_symlink(cow_inode)) + if((S_ISLNK(cow_inode->i_mode) && cow_inode->i_blocks - + (EXT3COW_I(cow_inode)->i_file_acl ? + (cow_inode->i_sb->s_blocksize >> 9) : 0))) + cow_inode->i_op = &ext3cow_fast_symlink_inode_operations; + else { + cow_inode->i_op = &ext3cow_symlink_inode_operations; + ext3cow_set_aops(cow_inode); + } + } else { + cow_inode->i_op = &ext3cow_special_inode_operations; + /* + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + */ + } + + /* Dup in the direct cow bitmap */ + cow_ini->i_cow_bitmap = ini->i_cow_bitmap; + ini->i_cow_bitmap = 0x0000; + /* Mark new inode unchangeable */ + cow_ini->i_flags |= EXT3COW_UNCHANGEABLE_FL; + /* Switch epoch numbers */ + epoch_number_temp = ini->i_epoch_number; + ini->i_epoch_number = cow_ini->i_epoch_number; + cow_ini->i_epoch_number = epoch_number_temp; + /* Chain Inodes together */ + cow_ini->i_next_inode = ini->i_next_inode; + ini->i_next_inode = cow_inode->i_ino; + + ext3cow_mark_inode_dirty(handle, cow_inode); + ext3cow_mark_inode_dirty(handle, inode); + + iput(cow_inode); /* dec i_count */ + + err = 0; + } + ext3cow_journal_stop(handle); + if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; + +} + +/* ext3cow_reclaim_dup_inode: rolls back a recently dup'd inode + * on error, including epoch number and bitmaps. Should not + * be used for removing versions. */ +int ext3cow_reclaim_dup_inode(struct inode *dir, struct inode *inode) +{ + handle_t *handle = NULL; + int err = 0; + struct inode *old_inode = NULL; + struct inode *parent = dir; + + if(!parent) + parent = inode; + + if(is_bad_inode(inode)) + return -1; + + handle = ext3cow_journal_start(parent, + EXT3COW_DELETE_TRANS_BLOCKS(parent->i_sb)); + if(IS_ERR(handle)) + return PTR_ERR(handle); + + if(IS_DIRSYNC(parent)) + handle->h_sync = 1; + + old_inode = iget(parent->i_sb, EXT3COW_I_NEXT_INODE(inode)); + err = PTR_ERR(old_inode); + if (!IS_ERR(old_inode)){ + + EXT3COW_I(inode)->i_epoch_number = EXT3COW_I_EPOCHNUMBER(old_inode); + EXT3COW_I(inode)->i_cow_bitmap = EXT3COW_I(old_inode)->i_cow_bitmap; + EXT3COW_I(inode)->i_next_inode = EXT3COW_I(old_inode)->i_next_inode; + old_inode->i_nlink = 0; + + iput(old_inode); + ext3cow_mark_inode_dirty(handle, inode); + }else + ext3cow_error(inode->i_sb, "ext3cow_reclaim_dup_inode", + "Couldn't remove dup'd inode."); + + ext3cow_journal_stop(handle); + + return 0; +} + +/* + * directories can handle most operations... + */ +struct inode_operations ext3cow_dir_inode_operations = { + .create = ext3cow_create, + .lookup = ext3cow_lookup, + .link = ext3cow_link, + .unlink = ext3cow_unlink, + .symlink = ext3cow_symlink, + .mkdir = ext3cow_mkdir, + .rmdir = ext3cow_rmdir, + .mknod = ext3cow_mknod, + .rename = ext3cow_rename, + .setattr = ext3cow_setattr, +#ifdef CONFIG_EXT3COW_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3cow_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3cow_permission, +}; + +struct inode_operations ext3cow_special_inode_operations = { + .setattr = ext3cow_setattr, +#ifdef CONFIG_EXT3COW_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3cow_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext3cow_permission, +}; diff -ruN linux-2.6.20.3/fs/ext3cow/namei.h linux-2.6.20.3-ext3cow/fs/ext3cow/namei.h --- linux-2.6.20.3/fs/ext3cow/namei.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/namei.h 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,8 @@ +/* linux/fs/ext3cow/namei.h + * + * Copyright (C) 2005 Simtec Electronics + * Ben Dooks + * +*/ + +extern struct dentry *ext3cow_get_parent(struct dentry *child); diff -ruN linux-2.6.20.3/fs/ext3cow/resize.c linux-2.6.20.3-ext3cow/fs/ext3cow/resize.c --- linux-2.6.20.3/fs/ext3cow/resize.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/resize.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,1042 @@ +/* + * linux/fs/ext3cow/resize.c + * + * Support for resizing an ext3cow filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT3COWFS_DEBUG + +#include +#include +#include + +#include +#include + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3cow_new_group_data *input) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + struct ext3cow_super_block *es = sbi->s_es; + ext3cow_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3cow_fsblk_t end = start + input->blocks_count; + unsigned group = input->group; + ext3cow_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext3cow_bg_has_super(sb, group) ? + (1 + ext3cow_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext3cow_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext3cow_grpblk_t free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3COW-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3cow_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3cow_warning(sb, __FUNCTION__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3COW_BLOCKS_PER_GROUP(sb)) + ext3cow_warning(sb, __FUNCTION__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3cow_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3cow_warning(sb, __FUNCTION__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3cow_warning(sb, __FUNCTION__, + "Cannot read last block ("E3FSBLK")", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3cow_warning(sb, __FUNCTION__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3cow_warning(sb, __FUNCTION__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3cow_warning(sb, __FUNCTION__, + "Inode table not in group (blocks %u-"E3FSBLK")", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3cow_warning(sb, __FUNCTION__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3cow_warning(sb, __FUNCTION__, + "Block bitmap (%u) in inode table (%u-"E3FSBLK")", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3cow_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3cow_warning(sb, __FUNCTION__, + "Block bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3cow_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3cow_warning(sb, __FUNCTION__, + "Inode table (%u-"E3FSBLK") overlaps" + "GDT table ("E3FSBLK"-"E3FSBLK")", + input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext3cow_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext3cow_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3cow_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3cow_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3cow_new_group_data *input) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + ext3cow_fsblk_t start = ext3cow_group_first_block_no(sb, input->group); + int reserved_gdb = ext3cow_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3cow_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext3cow_fsblk_t block; + ext3cow_grpblk_t bit; + int i; + int err = 0, err2; + + handle = ext3cow_journal_start_sb(sb, reserved_gdb + gdblocks + + 2 + sbi->s_itb_per_group); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3cow_bg_has_super(sb, input->group)) { + ext3cow_debug("mark backup superblock %#04lx (+0)\n", start); + ext3cow_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3cow_debug("update backup group %#04lx (+%d)\n", block, bit); + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext3cow_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(bh); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(bh); + ext3cow_journal_dirty_metadata(handle, gdb); + ext3cow_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3cow_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(bh); + goto exit_bh; + } + ext3cow_journal_dirty_metadata(handle, gdb); + ext3cow_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3cow_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3cow_set_bit(input->block_bitmap - start, bh->b_data); + ext3cow_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3cow_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3cow_debug("clear inode block %#04lx (+%d)\n", block, bit); + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + ext3cow_journal_dirty_metadata(handle, it); + brelse(it); + ext3cow_set_bit(bit, bh->b_data); + } + mark_bitmap_end(input->blocks_count, EXT3COW_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3cow_journal_dirty_metadata(handle, bh); + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3cow_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + mark_bitmap_end(EXT3COW_INODES_PER_GROUP(sb), EXT3COW_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3cow_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + unlock_super(sb); + if ((err2 = ext3cow_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3cow filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext3cow_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext3cow_fsblk_t blk = primary->b_blocknr; + const unsigned long end = EXT3COW_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3cow_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3COW_BLOCKS_PER_GROUP(sb) + blk){ + ext3cow_warning(sb, __FUNCTION__, + "reserved GDT "E3FSBLK + " missing grp %d ("E3FSBLK")", + blk, grp, + grp * EXT3COW_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3COW_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3cow_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3COW_DESC_PER_BLOCK(sb); + ext3cow_fsblk_t gdblock = EXT3COW_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3cow_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3COW-fs: ext3cow_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3COW_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block)) { + ext3cow_warning(sb, __FUNCTION__, + "won't resize using backup superblock at %llu", + (unsigned long long)EXT3COW_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3COW_I(inode)->i_data + EXT3COW_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3COW_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3cow_warning(sb, __FUNCTION__, + "new group %u GDT block "E3FSBLK" not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3cow_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3cow_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3cow_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3cow_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); + if (!n_group_desc) { + err = -ENOMEM; + ext3cow_warning (sb, __FUNCTION__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT3COW_ADDR_PER_BLOCK(sb)] = 0; + ext3cow_journal_dirty_metadata(handle, dind); + brelse(dind); + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext3cow_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + ext3cow_journal_dirty_metadata(handle, *primary); + + o_group_desc = EXT3COW_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3COW_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3COW_SB(sb)->s_group_desc = n_group_desc; + EXT3COW_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + es->s_reserved_gdt_blocks = + cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); + ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); + + return 0; + +exit_inode: + //ext3cow_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3cow_journal_release_buffer(handle, dind); +exit_primary: + //ext3cow_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3cow_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3cow_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3cow_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3COW_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3cow_iloc iloc; + ext3cow_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + if (!primary) + return -ENOMEM; + + data = EXT3COW_I(inode)->i_data + EXT3COW_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3COW_SB(sb)->s_sbh->b_blocknr + 1 + EXT3COW_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + EXT3COW_SB(sb)->s_gdb_count; + end = (__le32 *)dind->b_data + EXT3COW_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3cow_warning(sb, __FUNCTION__, + "reserved block "E3FSBLK + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3cow_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3cow_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3cow_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3COW_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3cow_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3cow_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3cow metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need lock_super() for this, because these blocks are not + * otherwise touched by the filesystem code when it is mounted. We don't + * need to worry about last changing from sbi->s_groups_count, because the + * worst that can happen is that we do not copy the full number of backups + * at this time. The resize which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3COW_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3cow_journal_start_sb(sb, EXT3COW_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3cow_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3cow_journal_extend(handle, EXT3COW_MAX_TRANS_DATA) && + (err = ext3cow_journal_restart(handle, EXT3COW_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext3cow_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext3cow_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ext3cow_journal_dirty_metadata(handle, bh); + brelse(bh); + } + if ((err2 = ext3cow_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3cow_warning(sb, __FUNCTION__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT3COW_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT3COW_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3cow_group_add(struct super_block *sb, struct ext3cow_new_group_data *input) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + struct ext3cow_super_block *es = sbi->s_es; + int reserved_gdb = ext3cow_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3cow_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3COW_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3COW_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3cow_warning(sb, __FUNCTION__, + "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < + le32_to_cpu(es->s_blocks_count)) { + ext3cow_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT3COW_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext3cow_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3COW_HAS_COMPAT_FEATURE(sb, + EXT3COW_FEATURE_COMPAT_RESIZE_INODE)){ + ext3cow_warning(sb, __FUNCTION__, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = iget(sb, EXT3COW_RESIZE_INO); + if (!inode || is_bad_inode(inode)) { + ext3cow_warning(sb, __FUNCTION__, + "Error opening resize inode"); + iput(inode); + return -ENOENT; + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3cow_journal_start_sb(sb, + ext3cow_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + ext3cow_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext3cow_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3cow_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3cow_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3cow_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3COW_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) + + input->blocks_count); + es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + + EXT3COW_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold lock_super + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold lock_super() over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + + ext3cow_journal_dirty_metadata(handle, primary); + + /* Update the reserved block counts only once the new group is + * active. */ + es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_mod(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_mod(&sbi->s_freeinodes_counter, + EXT3COW_INODES_PER_GROUP(sb)); + + ext3cow_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; + +exit_journal: + unlock_super(sb); + if ((err2 = ext3cow_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3cow_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3cow_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3cow_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext3cow_group_extend(struct super_block *sb, struct ext3cow_super_block *es, + ext3cow_fsblk_t n_blocks_count) +{ + ext3cow_fsblk_t o_blocks_count; + unsigned long o_groups_count; + ext3cow_grpblk_t last; + ext3cow_grpblk_t add; + struct buffer_head * bh; + handle_t *handle; + int err; + unsigned long freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking lock_super() below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + o_groups_count = EXT3COW_SB(sb)->s_groups_count; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3COW-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3COW-fs: filesystem on %s:" + " too large to resize to %lu blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext3cow_warning(sb, __FUNCTION__, + "CONFIG_LBD not enabled\n"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext3cow_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3COW_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3cow_warning(sb, __FUNCTION__, + "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT3COW_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext3cow_warning(sb, __FUNCTION__, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3cow_warning(sb, __FUNCTION__, + "will only finish group ("E3FSBLK + " blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3cow_warning(sb, __FUNCTION__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3cow_free_blocks(). + */ + handle = ext3cow_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3cow_warning(sb, __FUNCTION__, "error %d on journal start",err); + goto exit_put; + } + + lock_super(sb); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3cow_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!"); + unlock_super(sb); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3cow_journal_get_write_access(handle, + EXT3COW_SB(sb)->s_sbh))) { + ext3cow_warning(sb, __FUNCTION__, + "error %d on journal write access", err); + unlock_super(sb); + ext3cow_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); + sb->s_dirt = 1; + unlock_super(sb); + ext3cow_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, + o_blocks_count + add); + ext3cow_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3cow_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext3cow_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3COW-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3COW_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3cow_super_block)); +exit_put: + return err; +} /* ext3cow_group_extend */ diff -ruN linux-2.6.20.3/fs/ext3cow/super.c linux-2.6.20.3-ext3cow/fs/ext3cow/super.c --- linux-2.6.20.3/fs/ext3cow/super.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/super.c 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,2808 @@ +/* + * linux/fs/ext3cow/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xattr.h" +#include "acl.h" +#include "namei.h" + +static int ext3cow_load_journal(struct super_block *, struct ext3cow_super_block *, + unsigned long journal_devnum); +static int ext3cow_create_journal(struct super_block *, struct ext3cow_super_block *, + unsigned int); +static void ext3cow_commit_super (struct super_block * sb, + struct ext3cow_super_block * es, + int sync); +static void ext3cow_mark_recovery_complete(struct super_block * sb, + struct ext3cow_super_block * es); +static void ext3cow_clear_journal_err(struct super_block * sb, + struct ext3cow_super_block * es); +static int ext3cow_sync_fs(struct super_block *sb, int wait); +static const char *ext3cow_decode_error(struct super_block * sb, int errno, + char nbuf[16]); +static int ext3cow_remount (struct super_block * sb, int * flags, char * data); +static int ext3cow_statfs (struct dentry * dentry, struct kstatfs * buf); +static void ext3cow_unlockfs(struct super_block *sb); +static void ext3cow_write_super (struct super_block * sb); +static void ext3cow_write_super_lockfs(struct super_block *sb); + +/* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +handle_t *ext3cow_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT3COW_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext3cow_abort(sb, __FUNCTION__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext3cow_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext3cow_std_error(sb, where, err); + return err; +} + +void ext3cow_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext3cow_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "%s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + journal_abort_handle(handle); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3cow, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will compain about + * that error until we've noted it down and cleared it. + */ + +static void ext3cow_handle_error(struct super_block *sb) +{ + struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; + + EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3COW_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt (sb, ERRORS_CONT)) { + journal_t *journal = EXT3COW_SB(sb)->s_journal; + + EXT3COW_SB(sb)->s_mount_opt |= EXT3COW_MOUNT_ABORT; + if (journal) + journal_abort(journal, -EIO); + } + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + ext3cow_commit_super(sb, es, 1); + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3COW-fs (device %s): panic forced after error\n", + sb->s_id); +} + +void ext3cow_error (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT3COW-fs error (device %s): %s: ",sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext3cow_handle_error(sb); +} + +static const char *ext3cow_decode_error(struct super_block * sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3COW_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext3cow_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext3cow_std_error (struct super_block * sb, const char * function, + int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext3cow_decode_error(sb, errno, nbuf); + printk (KERN_CRIT "EXT3COW-fs error (device %s) in %s: %s\n", + sb->s_id, function, errstr); + + ext3cow_handle_error(sb); +} + +/* + * ext3cow_abort is a much stronger failure handler than ext3cow_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext3cow_abort (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + printk (KERN_CRIT "ext3cow_abort called.\n"); + + va_start(args, fmt); + printk(KERN_CRIT "EXT3COW-fs error (device %s): %s: ",sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT3COW-fs panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + printk(KERN_CRIT "Remounting filesystem read-only\n"); + EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; + sb->s_flags |= MS_RDONLY; + EXT3COW_SB(sb)->s_mount_opt |= EXT3COW_MOUNT_ABORT; + journal_abort(EXT3COW_SB(sb)->s_journal, -EIO); +} + +void ext3cow_warning (struct super_block * sb, const char * function, + const char * fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "EXT3COW-fs warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +void ext3cow_update_dynamic_rev(struct super_block *sb) +{ + struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3COW_GOOD_OLD_REV) + return; + + ext3cow_warning(sb, __FUNCTION__, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT3COW_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3COW_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3COW_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3COW_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext3cow_blkdev_get(dev_t dev) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + printk(KERN_ERR "EXT3COW: failed to open journal device %s: %ld\n", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext3cow_blkdev_put(struct block_device *bdev) +{ + bd_release(bdev); + return blkdev_put(bdev); +} + +static int ext3cow_blkdev_remove(struct ext3cow_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3cow_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext3cow_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext3cow_sb_info *sbi) +{ + struct list_head *l; + + printk(KERN_ERR "sb orphan head is %d\n", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext3cow_put_super (struct super_block * sb) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + struct ext3cow_super_block *es = sbi->s_es; + int i; + + ext3cow_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3cow_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev, 0); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev, 0); + ext3cow_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi); + return; +} + +static struct kmem_cache *ext3cow_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext3cow_alloc_inode(struct super_block *sb) +{ + struct ext3cow_inode_info *ei; + + ei = kmem_cache_alloc(ext3cow_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + ei->i_acl = EXT3COW_ACL_NOT_CACHED; + ei->i_default_acl = EXT3COW_ACL_NOT_CACHED; +#endif + ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; + return &ei->vfs_inode; +} + +static void ext3cow_destroy_inode(struct inode *inode) +{ + kmem_cache_free(ext3cow_inode_cachep, EXT3COW_I(inode)); +} + +static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags) +{ + struct ext3cow_inode_info *ei = (struct ext3cow_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3COW_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + mutex_init(&ei->truncate_mutex); + inode_init_once(&ei->vfs_inode); + } +} + +static int init_inodecache(void) +{ + ext3cow_inode_cachep = kmem_cache_create("ext3cow_inode_cache", + sizeof(struct ext3cow_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once, NULL); + if (ext3cow_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext3cow_inode_cachep); +} + +static void ext3cow_clear_inode(struct inode *inode) +{ + struct ext3cow_block_alloc_info *rsv = EXT3COW_I(inode)->i_block_alloc_info; +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + if (EXT3COW_I(inode)->i_acl && + EXT3COW_I(inode)->i_acl != EXT3COW_ACL_NOT_CACHED) { + posix_acl_release(EXT3COW_I(inode)->i_acl); + EXT3COW_I(inode)->i_acl = EXT3COW_ACL_NOT_CACHED; + } + if (EXT3COW_I(inode)->i_default_acl && + EXT3COW_I(inode)->i_default_acl != EXT3COW_ACL_NOT_CACHED) { + posix_acl_release(EXT3COW_I(inode)->i_default_acl); + EXT3COW_I(inode)->i_default_acl = EXT3COW_ACL_NOT_CACHED; + } +#endif + ext3cow_discard_reservation(inode); + EXT3COW_I(inode)->i_block_alloc_info = NULL; + if (unlikely(rsv)) + kfree(rsv); +} + +static inline void ext3cow_show_quota_options(struct seq_file *seq, struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + + if (sbi->s_jquota_fmt) + seq_printf(seq, ",jqfmt=%s", + (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif +} + +static int ext3cow_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + + if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA) + seq_puts(seq, ",data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA) + seq_puts(seq, ",data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA) + seq_puts(seq, ",data=writeback"); + + ext3cow_show_quota_options(seq, sb); + + return 0; +} + + +static struct dentry *ext3cow_get_dentry(struct super_block *sb, void *vobjp) +{ + __u32 *objp = vobjp; + unsigned long ino = objp[0]; + __u32 generation = objp[1]; + struct inode *inode; + struct dentry *result; + + if (ino < EXT3COW_FIRST_INO(sb) && ino != EXT3COW_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext3cow_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = iget(sb, ino); + if (inode == NULL) + return ERR_PTR(-ENOMEM); + if (is_bad_inode(inode) || + (generation && inode->i_generation != generation)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + /* now to find a dentry. + * If possible, get a well-connected one + */ + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext3cow_dquot_initialize(struct inode *inode, int type); +static int ext3cow_dquot_drop(struct inode *inode); +static int ext3cow_write_dquot(struct dquot *dquot); +static int ext3cow_acquire_dquot(struct dquot *dquot); +static int ext3cow_release_dquot(struct dquot *dquot); +static int ext3cow_mark_dquot_dirty(struct dquot *dquot); +static int ext3cow_write_info(struct super_block *sb, int type); +static int ext3cow_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext3cow_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext3cow_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext3cow_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static struct dquot_operations ext3cow_quota_operations = { + .initialize = ext3cow_dquot_initialize, + .drop = ext3cow_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = ext3cow_write_dquot, + .acquire_dquot = ext3cow_acquire_dquot, + .release_dquot = ext3cow_release_dquot, + .mark_dirty = ext3cow_mark_dquot_dirty, + .write_info = ext3cow_write_info +}; + +static struct quotactl_ops ext3cow_qctl_operations = { + .quota_on = ext3cow_quota_on, + .quota_off = vfs_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk +}; +#endif + +static struct super_operations ext3cow_sops = { + .alloc_inode = ext3cow_alloc_inode, + .destroy_inode = ext3cow_destroy_inode, + .read_inode = ext3cow_read_inode, + .write_inode = ext3cow_write_inode, + .dirty_inode = ext3cow_dirty_inode, + .delete_inode = ext3cow_delete_inode, + .put_super = ext3cow_put_super, + .write_super = ext3cow_write_super, + .sync_fs = ext3cow_sync_fs, + .write_super_lockfs = ext3cow_write_super_lockfs, + .unlockfs = ext3cow_unlockfs, + .statfs = ext3cow_statfs, + .remount_fs = ext3cow_remount, + .clear_inode = ext3cow_clear_inode, + .show_options = ext3cow_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext3cow_quota_read, + .quota_write = ext3cow_quota_write, +#endif +}; + +static struct export_operations ext3cow_export_ops = { + .get_parent = ext3cow_get_parent, + .get_dentry = ext3cow_get_dentry, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota +}; + +static match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_nocheck, "nocheck"}, + {Opt_nocheck, "check=none"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_err, NULL}, + {Opt_resize, "resize"}, +}; + +static ext3cow_fsblk_t get_sb_block(void **data) +{ + ext3cow_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + /*todo: use simple_strtoll with >32bit ext3cow */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk("EXT3COW-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +static int parse_options (char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext3cow_fsblk_t *n_blocks_count, int is_remount) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + char * p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qtype; + char *qname; +#endif + + if (!options) + return 1; + + while ((p = strsep (&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt (sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt (sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_RO); + set_opt (sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt (sbi->s_mount_opt, ERRORS_PANIC); + set_opt (sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; + case Opt_debug: + set_opt (sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt (sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt (sbi->s_mount_opt, OLDALLOC); + break; +#ifdef CONFIG_EXT3COW_FS_XATTR + case Opt_user_xattr: + set_opt (sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt (sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + printk("EXT3COW (no)user_xattr options not supported\n"); + break; +#endif +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + printk("EXT3COW (no)acl options not supported\n"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. */ + if (is_remount) { + printk(KERN_ERR "EXT3COW-fs: cannot specify " + "journal on remount\n"); + return 0; + } + set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + printk(KERN_ERR "EXT3COW-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_journal_dev: + if (is_remount) { + printk(KERN_ERR "EXT3COW-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT3COW_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT3COW_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT3COW_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if ((sbi->s_mount_opt & EXT3COW_MOUNT_DATA_FLAGS) + != data_opt) { + printk(KERN_ERR + "EXT3COW-fs: cannot change data " + "mode on remount\n"); + return 0; + } + } else { + sbi->s_mount_opt &= ~EXT3COW_MOUNT_DATA_FLAGS; + sbi->s_mount_opt |= data_opt; + } + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + qtype = USRQUOTA; + goto set_qf_name; + case Opt_grpjquota: + qtype = GRPQUOTA; +set_qf_name: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR + "EXT3COW-fs: Cannot change journalled " + "quota options when quota turned on.\n"); + return 0; + } + qname = match_strdup(&args[0]); + if (!qname) { + printk(KERN_ERR + "EXT3COW-fs: not enough memory for " + "storing quotafile name.\n"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + printk(KERN_ERR + "EXT3COW-fs: %s quota file already " + "specified.\n", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + printk(KERN_ERR + "EXT3COW-fs: quotafile must be on " + "filesystem root.\n"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + break; + case Opt_offusrjquota: + qtype = USRQUOTA; + goto clear_qf_name; + case Opt_offgrpjquota: + qtype = GRPQUOTA; +clear_qf_name: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR "EXT3COW-fs: Cannot change " + "journalled quota options when " + "quota turned on.\n"); + return 0; + } + /* + * The space will be released later when all options + * are confirmed to be correct + */ + sbi->s_qf_names[qtype] = NULL; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR "EXT3COW-fs: Cannot change quota " + "options when quota turned on.\n"); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + printk(KERN_ERR + "EXT3COW-fs: journalled quota options not " + "supported.\n"); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + printk("EXT3COW-fs: resize option only available " + "for remount\n"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; + default: + printk (KERN_ERR + "EXT3COW-fs: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if ((sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA) && + sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi->s_mount_opt, USRQUOTA); + + if ((sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA) && + sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi->s_mount_opt, GRPQUOTA); + + if ((sbi->s_qf_names[USRQUOTA] && + (sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA)) || + (sbi->s_qf_names[GRPQUOTA] && + (sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA))) { + printk(KERN_ERR "EXT3COW-fs: old and new quota " + "format mixing.\n"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + printk(KERN_ERR "EXT3COW-fs: journalled quota format " + "not specified.\n"); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + printk(KERN_ERR "EXT3COW-fs: journalled quota format " + "specified with no journalling " + "enabled.\n"); + return 0; + } + } +#endif + return 1; +} + +static int ext3cow_setup_super(struct super_block *sb, struct ext3cow_super_block *es, + int read_only) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3COW_MAX_SUPP_REV) { + printk (KERN_ERR "EXT3COW-fs warning: revision level too high, " + "forcing read-only mode\n"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3COW_VALID_FS)) + printk (KERN_WARNING "EXT3COW-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); + else if ((sbi->s_mount_state & EXT3COW_ERROR_FS)) + printk (KERN_WARNING + "EXT3COW-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + printk (KERN_WARNING + "EXT3COW-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + printk (KERN_WARNING + "EXT3COW-fs warning: checktime reached, " + "running e2fsck is recommended\n"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! :) */ + es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3COW_VALID_FS); +#endif + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT3COW_DFL_MAX_MNT_COUNT); + es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext3cow_update_dynamic_rev(sb); + EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + + ext3cow_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT3COW FS bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT3COW_BLOCKS_PER_GROUP(sb), + EXT3COW_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + printk(KERN_INFO "EXT3COW FS on %s, ", sb->s_id); + if (EXT3COW_SB(sb)->s_journal->j_inode == NULL) { + char b[BDEVNAME_SIZE]; + + printk("external journal on %s\n", + bdevname(EXT3COW_SB(sb)->s_journal->j_dev, b)); + } else { + printk("internal journal\n"); + } + return res; +} + +/* Called at mount-time, super-block is locked */ +static int ext3cow_check_descriptors (struct super_block * sb) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + ext3cow_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext3cow_fsblk_t last_block; + struct ext3cow_group_desc * gdp = NULL; + int desc_block = 0; + int i; + + ext3cow_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) + { + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); + + if ((i % EXT3COW_DESC_PER_BLOCK(sb)) == 0) + gdp = (struct ext3cow_group_desc *) + sbi->s_group_desc[desc_block++]->b_data; + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) + { + ext3cow_error (sb, "ext3cow_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) + { + ext3cow_error (sb, "ext3cow_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group > + last_block) + { + ext3cow_error (sb, "ext3cow_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + first_block += EXT3COW_BLOCKS_PER_GROUP(sb); + gdp++; + } + + sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3cow_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3cow_count_free_inodes(sb)); + return 1; +} + + +/* ext3cow_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3cow_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext3cow_orphan_cleanup (struct super_block * sb, + struct ext3cow_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + printk(KERN_ERR "EXT3COW-fs: write access " + "unavailable, skipping orphan cleanup.\n"); + return; + } + + if (EXT3COW_SB(sb)->s_mount_state & EXT3COW_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3COW-fs: %s: orphan cleanup on readonly fs\n", + sb->s_id); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT3COW_SB(sb)->s_qf_names[i]) { + int ret = ext3cow_quota_on_mount(sb, i); + if (ret < 0) + printk(KERN_ERR + "EXT3COW-fs: Cannot turn on journalled " + "quota: error %d\n", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + if (!(inode = + ext3cow_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3COW_I(inode)->i_orphan, &EXT3COW_SB(sb)->s_orphan); + DQUOT_INIT(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %lu to %Ld bytes\n", + __FUNCTION__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3cow_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %lu\n", + __FUNCTION__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT3COW-fs: %s: %d orphan inode%s deleted\n", + sb->s_id, PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT3COW-fs: %s: %d truncate%s cleaned up\n", + sb->s_id, PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + vfs_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ +static loff_t ext3cow_max_size(int bits) +{ + loff_t res = EXT3COW_NDIR_BLOCKS; + /* This constant is calculated to be the largest file size for a + * dense, 4k-blocksize file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^32. */ + const loff_t upper_limit = 0x1ff7fffd000LL; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + return res; +} + +static ext3cow_fsblk_t descriptor_loc(struct super_block *sb, + ext3cow_fsblk_t logic_sb_block, + int nr) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + unsigned long bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return (logic_sb_block + nr + 1); + bg = sbi->s_desc_per_block * nr; + if (ext3cow_bg_has_super(sb, bg)) + has_super = 1; + return (has_super + ext3cow_group_first_block_no(sb, bg)); +} + + +static int ext3cow_fill_super (struct super_block *sb, void *data, int silent) +{ + struct buffer_head * bh; + struct ext3cow_super_block *es = NULL; + struct ext3cow_sb_info *sbi; + ext3cow_fsblk_t block; + ext3cow_fsblk_t sb_block = get_sb_block(&data); + ext3cow_fsblk_t logic_sb_block; + unsigned long offset = 0; + unsigned int journal_inum = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + __le32 features; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3COW_DEF_RESUID; + sbi->s_resgid = EXT3COW_DEF_RESGID; + + unlock_kernel(); + + blocksize = sb_min_blocksize(sb, EXT3COW_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "EXT3COW-fs: unable to set blocksize\n"); + goto out_fail; + } + + /* + * The ext3cow superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT3COW_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3COW_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3COW_MIN_BLOCK_SIZE) % blocksize; + } else { + logic_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logic_sb_block))) { + printk (KERN_ERR "EXT3COW-fs: unable to read superblock\n"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3cow macro-instructions depend on its value + */ + es = (struct ext3cow_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3COW_SUPER_MAGIC) + goto cantfind_ext3cow; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT3COW_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT3COW_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT3COW_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT3COW_FS_XATTR + if (def_mount_opts & EXT3COW_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + if (def_mount_opts & EXT3COW_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_DATA) + sbi->s_mount_opt |= EXT3COW_MOUNT_JOURNAL_DATA; + else if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_ORDERED) + sbi->s_mount_opt |= EXT3COW_MOUNT_ORDERED_DATA; + else if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_WBACK) + sbi->s_mount_opt |= EXT3COW_MOUNT_WRITEBACK_DATA; + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT3COW_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3COW_ERRORS_RO) + set_opt(sbi->s_mount_opt, ERRORS_RO); + else + set_opt(sbi->s_mount_opt, ERRORS_CONT); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT3COW_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT3COW_GOOD_OLD_REV && + (EXT3COW_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3COW_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3COW_HAS_INCOMPAT_FEATURE(sb, ~0U))) + printk(KERN_WARNING + "EXT3COW-fs warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended\n"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT3COW_HAS_INCOMPAT_FEATURE(sb, ~EXT3COW_FEATURE_INCOMPAT_SUPP); + if (features) { + printk(KERN_ERR "EXT3COW-fs: %s: couldn't mount because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + features = EXT3COW_HAS_RO_COMPAT_FEATURE(sb, ~EXT3COW_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "EXT3COW-fs: %s: couldn't mount RDWR because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT3COW_MIN_BLOCK_SIZE || + blocksize > EXT3COW_MAX_BLOCK_SIZE) { + printk(KERN_ERR + "EXT3COW-fs: Unsupported filesystem blocksize %d on %s.\n", + blocksize, sb->s_id); + goto failed_mount; + } + + hblock = bdev_hardsect_size(sb->s_bdev); + if (sb->s_blocksize != blocksize) { + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. + */ + if (blocksize < hblock) { + printk(KERN_ERR "EXT3COW-fs: blocksize %d too small for " + "device blocksize %d.\n", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + sb_set_blocksize(sb, blocksize); + logic_sb_block = (sb_block * EXT3COW_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3COW_MIN_BLOCK_SIZE) % blocksize; + bh = sb_bread(sb, logic_sb_block); + if (!bh) { + printk(KERN_ERR + "EXT3COW-fs: Can't read superblock on 2nd try.\n"); + goto failed_mount; + } + es = (struct ext3cow_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT3COW_SUPER_MAGIC)) { + printk (KERN_ERR + "EXT3COW-fs: Magic mismatch, very weird !\n"); + goto failed_mount; + } + } + + sb->s_maxbytes = ext3cow_max_size(sb->s_blocksize_bits); + + if (le32_to_cpu(es->s_rev_level) == EXT3COW_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3COW_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3COW_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT3COW_GOOD_OLD_INODE_SIZE) || + (sbi->s_inode_size & (sbi->s_inode_size - 1)) || + (sbi->s_inode_size > blocksize)) { + printk (KERN_ERR + "EXT3COW-fs: unsupported inode size: %d\n", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3COW_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + printk(KERN_ERR + "EXT3COW-fs: fragsize %lu != blocksize %u (unsupported)\n", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT3COW_INODE_SIZE(sb) == 0) + goto cantfind_ext3cow; + sbi->s_inodes_per_block = blocksize / EXT3COW_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext3cow; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3cow_group_desc); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT3COW_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT3COW_DESC_PER_BLOCK(sb)); + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + + /* Epoch number for versioning -znjp */ + sbi->s_epoch_number = le32_to_cpu(es->s_epoch_number); + printk(KERN_INFO "EXT3COW-fs: System epoch number: %u\n", + sbi->s_epoch_number); + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3COW-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3COW-fs: #fragments per group too big: %lu\n", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3COW-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (le32_to_cpu(es->s_blocks_count) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3COW-fs: filesystem on %s:" + " too large to mount safely\n", sb->s_id); + if (sizeof(sector_t) < 8) + printk(KERN_WARNING "EXT3COW-fs: CONFIG_LBD not " + "enabled\n"); + goto failed_mount; + } + + if (EXT3COW_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext3cow; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT3COW_BLOCKS_PER_GROUP(sb)) + 1; + db_count = (sbi->s_groups_count + EXT3COW_DESC_PER_BLOCK(sb) - 1) / + EXT3COW_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + printk (KERN_ERR "EXT3COW-fs: not enough memory\n"); + goto failed_mount; + } + + bgl_lock_init(&sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logic_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + printk (KERN_ERR "EXT3COW-fs: " + "can't read group descriptor %d\n", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3cow_check_descriptors (sb)) { + printk(KERN_ERR "EXT3COW-fs: group descriptors corrupted!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + percpu_counter_init(&sbi->s_freeblocks_counter, + ext3cow_count_free_blocks(sb)); + percpu_counter_init(&sbi->s_freeinodes_counter, + ext3cow_count_free_inodes(sb)); + percpu_counter_init(&sbi->s_dirs_counter, + ext3cow_count_dirs(sb)); + + /* per fileystem reservation list head & lock */ + spin_lock_init(&sbi->s_rsv_window_lock); + sbi->s_rsv_window_root = RB_ROOT; + /* Add a single, static dummy reservation to the start of the + * reservation window list --- it gives us a placeholder for + * append-at-start-of-list which makes the allocation logic + * _much_ simpler. */ + sbi->s_rsv_window_head.rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; + sbi->s_rsv_window_head.rsv_alloc_hit = 0; + sbi->s_rsv_window_head.rsv_goal_size = 0; + ext3cow_rsv_window_add(sb, &sbi->s_rsv_window_head); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3cow_sops; + sb->s_export_op = &ext3cow_export_ops; + sb->s_xattr = ext3cow_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext3cow_qctl_operations; + sb->dq_op = &ext3cow_quota_operations; +#endif + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3COW_HAS_INCOMPAT_FEATURE(sb, + EXT3COW_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3cow_load_journal(sb, es, journal_devnum)) + goto failed_mount3; + } else if (journal_inum) { + if (ext3cow_create_journal(sb, es, journal_inum)) + goto failed_mount3; + } else { + if (!silent) + printk (KERN_ERR + "ext3cow: No journal on filesystem on %s\n", + sb->s_id); + goto failed_mount3; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3COW_MOUNT_ORDERED_DATA: + case EXT3COW_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + printk(KERN_ERR "EXT3COW-fs: Journal does not support " + "requested data journaling mode\n"); + goto failed_mount4; + } + default: + break; + } + + if (test_opt(sb, NOBH)) { + if (!(test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA)) { + printk(KERN_WARNING "EXT3COW-fs: Ignoring nobh option - " + "its supported only with writeback mode\n"); + clear_opt(sbi->s_mount_opt, NOBH); + } + } + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = iget(sb, EXT3COW_ROOT_INO); + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + printk(KERN_ERR "EXT3COW-fs: get root inode failed\n"); + iput(root); + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + dput(sb->s_root); + sb->s_root = NULL; + printk(KERN_ERR "EXT3COW-fs: corrupt root inode, run e2fsck\n"); + goto failed_mount4; + } + + ext3cow_setup_super (sb, es, sb->s_flags & MS_RDONLY); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock + * in numerous places. Here we just pop the lock - it's relatively + * harmless, because we are now ready to accept write_super() requests, + * and aviro says that's the only reason for hanging onto the + * superblock lock. + */ + EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ORPHAN_FS; + ext3cow_orphan_cleanup(sb, es); + EXT3COW_SB(sb)->s_mount_state &= ~EXT3COW_ORPHAN_FS; + if (needs_recovery) + printk (KERN_INFO "EXT3COW-fs: recovery complete.\n"); + ext3cow_mark_recovery_complete(sb, es); + printk (KERN_INFO "EXT3COW-fs: mounted filesystem with %s data mode.\n", + test_opt(sb,DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + lock_kernel(); + return 0; + +cantfind_ext3cow: + if (!silent) + printk(KERN_ERR "VFS: Can't find ext3cow filesystem on dev %s.\n", + sb->s_id); + goto failed_mount; + +failed_mount4: + journal_destroy(sbi->s_journal); +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext3cow_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi); + lock_kernel(); + return -EINVAL; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext3cow_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext3cow-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext3cow_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = iget(sb, journal_inum); + if (!journal_inode) { + printk(KERN_ERR "EXT3COW-fs: no journal found.\n"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + printk(KERN_ERR "EXT3COW-fs: journal inode is deleted.\n"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { + printk(KERN_ERR "EXT3COW-fs: invalid journal inode.\n"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) { + printk(KERN_ERR "EXT3COW-fs: Could not load journal inode\n"); + iput(journal_inode); + return NULL; + } + /* Make sure the journal never gets versioned -znjp */ + EXT3COW_I(journal_inode)->i_flags |= EXT3COW_UNVERSIONABLE_FL; + journal->j_private = sb; + ext3cow_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext3cow_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head * bh; + journal_t *journal; + ext3cow_fsblk_t start; + ext3cow_fsblk_t len; + int hblock, blocksize; + ext3cow_fsblk_t sb_block; + unsigned long offset; + struct ext3cow_super_block * es; + struct block_device *bdev; + + bdev = ext3cow_blkdev_get(j_dev); + if (bdev == NULL) + return NULL; + + if (bd_claim(bdev, sb)) { + printk(KERN_ERR + "EXT3COW: failed to claim external journal device.\n"); + blkdev_put(bdev); + return NULL; + } + + blocksize = sb->s_blocksize; + hblock = bdev_hardsect_size(bdev); + if (blocksize < hblock) { + printk(KERN_ERR + "EXT3COW-fs: blocksize too small for journal device.\n"); + goto out_bdev; + } + + sb_block = EXT3COW_MIN_BLOCK_SIZE / blocksize; + offset = EXT3COW_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + printk(KERN_ERR "EXT3COW-fs: couldn't read superblock of " + "external journal\n"); + goto out_bdev; + } + + es = (struct ext3cow_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT3COW_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3COW_FEATURE_INCOMPAT_JOURNAL_DEV)) { + printk(KERN_ERR "EXT3COW-fs: external journal has " + "bad superblock\n"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3COW_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + printk(KERN_ERR "EXT3COW-fs: journal UUID does not match\n"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + printk(KERN_ERR "EXT3COW-fs: failed to create device journal\n"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + printk(KERN_ERR "EXT3COW-fs: I/O error on journal device\n"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT3COW-fs: External journal has more than one " + "user (unsupported) - %d\n", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3COW_SB(sb)->journal_bdev = bdev; + ext3cow_init_journal_params(sb, journal); + return journal; +out_journal: + journal_destroy(journal); +out_bdev: + ext3cow_blkdev_put(bdev); + return NULL; +} + +static int ext3cow_load_journal(struct super_block *sb, + struct ext3cow_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + printk(KERN_INFO "EXT3COW-fs: external journal device major/minor " + "numbers have changed\n"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3COW-fs: INFO: recovery " + "required on readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "EXT3COW-fs: write access " + "unavailable, cannot proceed.\n"); + return -EROFS; + } + printk (KERN_INFO "EXT3COW-fs: write access will " + "be enabled during recovery.\n"); + } + } + + if (journal_inum && journal_dev) { + printk(KERN_ERR "EXT3COW-fs: filesystem has both journal " + "and inode journals!\n"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3cow_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3cow_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + printk(KERN_ERR "EXT3COW-fs: error updating journal.\n"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER)) + err = journal_wipe(journal, !really_read_only); + if (!err) + err = journal_load(journal); + + if (err) { + printk(KERN_ERR "EXT3COW-fs: error loading journal.\n"); + journal_destroy(journal); + return err; + } + + EXT3COW_SB(sb)->s_journal = journal; + ext3cow_clear_journal_err(sb, es); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3cow_commit_super(sb, es, 1); + } + + return 0; +} + +static int ext3cow_create_journal(struct super_block * sb, + struct ext3cow_super_block * es, + unsigned int journal_inum) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR "EXT3COW-fs: readonly filesystem when trying to " + "create journal.\n"); + return -EROFS; + } + + if (!(journal = ext3cow_get_journal(sb, journal_inum))) + return -EINVAL; + + printk(KERN_INFO "EXT3COW-fs: creating new journal on inode %u\n", + journal_inum); + + if (journal_create(journal)) { + printk(KERN_ERR "EXT3COW-fs: error creating journal.\n"); + journal_destroy(journal); + return -EIO; + } + + EXT3COW_SB(sb)->s_journal = journal; + + ext3cow_update_dynamic_rev(sb); + EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + EXT3COW_SET_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3cow_commit_super(sb, es, 1); + + return 0; +} + +static void ext3cow_commit_super (struct super_block * sb, + struct ext3cow_super_block * es, + int sync) +{ + struct buffer_head *sbh = EXT3COW_SB(sb)->s_sbh; + + if (!sbh) + return; + es->s_wtime = cpu_to_le32(get_seconds()); + es->s_free_blocks_count = cpu_to_le32(ext3cow_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext3cow_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) + sync_dirty_buffer(sbh); +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext3cow_mark_recovery_complete(struct super_block * sb, + struct ext3cow_super_block * es) +{ + journal_t *journal = EXT3COW_SB(sb)->s_journal; + + journal_lock_updates(journal); + journal_flush(journal); + if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; + ext3cow_commit_super(sb, es, 1); + } + journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext3cow_clear_journal_err(struct super_block * sb, + struct ext3cow_super_block * es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3COW_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3cow_error() or ext3cow_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3cow_decode_error(sb, j_errno, nbuf); + ext3cow_warning(sb, __FUNCTION__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3cow_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + + EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3COW_ERROR_FS); + ext3cow_commit_super (sb, es, 1); + + journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext3cow_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3COW_SB(sb)->s_journal; + sb->s_dirt = 0; + ret = ext3cow_journal_force_commit(journal); + return ret; +} + +/* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. Just start an async writeback to get the buffers on their way + * to the disk. + * + * This implicitly triggers the writebehind on sync(). + */ + +static void ext3cow_write_super (struct super_block * sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + +static int ext3cow_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + sb->s_dirt = 0; + if (journal_start_commit(EXT3COW_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3COW_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static void ext3cow_write_super_lockfs(struct super_block *sb) +{ + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + journal_t *journal = EXT3COW_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + journal_flush(journal); + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + ext3cow_commit_super(sb, EXT3COW_SB(sb)->s_es, 1); + } +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static void ext3cow_unlockfs(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + lock_super(sb); + /* Reser the needs_recovery flag before the fs is unlocked. */ + EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); + ext3cow_commit_super(sb, EXT3COW_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3COW_SB(sb)->s_journal); + } +} + +static int ext3cow_remount (struct super_block * sb, int * flags, char * data) +{ + struct ext3cow_super_block * es; + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + ext3cow_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext3cow_mount_options old_opts; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (sbi->s_mount_opt & EXT3COW_MOUNT_ABORT) + ext3cow_abort(sb, __FUNCTION__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT3COW_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext3cow_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { + if (sbi->s_mount_opt & EXT3COW_MOUNT_ABORT) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3COW_VALID_FS)) && + (sbi->s_mount_state & EXT3COW_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3cow_mark_recovery_complete(sb, es); + } else { + __le32 ret; + if ((ret = EXT3COW_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3COW_FEATURE_RO_COMPAT_SUPP))) { + printk(KERN_WARNING "EXT3COW-fs: %s: couldn't " + "remount RDWR because of unsupported " + "optional features (%x).\n", + sb->s_id, le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext3cow_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext3cow_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext3cow_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + } + } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + return err; +} + +static int ext3cow_statfs (struct dentry * dentry, struct kstatfs * buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); + struct ext3cow_super_block *es = sbi->s_es; + ext3cow_fsblk_t overhead; + int i; + u64 fsid; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; + else { + unsigned long ngroups; + ngroups = EXT3COW_SB(sb)->s_groups_count; + smp_rmb(); + + /* + * Compute the overhead (FS structures) + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext3cow_bg_has_super(sb, i) + + ext3cow_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += (ngroups * (2 + EXT3COW_SB(sb)->s_itb_per_group)); + } + + buf->f_type = EXT3COW_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT3COW_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext3cow_create() quota_sync() + * journal_start() write_dquot() + * DQUOT_INIT() down(dqio_mutex) + * down(dqio_mutex) journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext3cow_dquot_initialize(struct inode *inode, int type) +{ + handle_t *handle; + int ret, err; + + /* We may create quota structure so we need to reserve enough blocks */ + handle = ext3cow_journal_start(inode, 2*EXT3COW_QUOTA_INIT_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_initialize(inode, type); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3cow_dquot_drop(struct inode *inode) +{ + handle_t *handle; + int ret, err; + + /* We may delete quota structure so we need to reserve enough blocks */ + handle = ext3cow_journal_start(inode, 2*EXT3COW_QUOTA_DEL_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_drop(inode); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3cow_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext3cow_journal_start(inode, + EXT3COW_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3cow_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3cow_journal_start(dquot_to_inode(dquot), + EXT3COW_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3cow_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext3cow_journal_start(dquot_to_inode(dquot), + EXT3COW_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_release(dquot); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext3cow_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journalling quotas? */ + if (EXT3COW_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT3COW_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext3cow_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext3cow_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext3cow_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext3cow_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext3cow_quota_on_mount(struct super_block *sb, int type) +{ + return vfs_quota_on_mount(sb, EXT3COW_SB(sb)->s_qf_names[type], + EXT3COW_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext3cow_quota_on(struct super_block *sb, int type, int format_id, + char *path) +{ + int err; + struct nameidata nd; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + /* Not journalling quota? */ + if (!EXT3COW_SB(sb)->s_qf_names[USRQUOTA] && + !EXT3COW_SB(sb)->s_qf_names[GRPQUOTA]) + return vfs_quota_on(sb, type, format_id, path); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + /* Quotafile not on the same filesystem? */ + if (nd.mnt->mnt_sb != sb) { + path_release(&nd); + return -EXDEV; + } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3COW-fs: Quota file not on filesystem root. " + "Journalled quota will not work.\n"); + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext3cow_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3COW_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext3cow_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext3cow_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> EXT3COW_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + int journal_quota = EXT3COW_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + bh = ext3cow_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3cow_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3cow_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3cow_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); + if (err) + goto out; + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) { + i_size_write(inode, off+len-towrite); + EXT3COW_I(inode)->i_disksize = inode->i_size; + } + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3cow_mark_inode_dirty(handle, inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static int ext3cow_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ext3cow_fill_super, mnt); +} + +/* Code to update the epoch counter in the super block -znjp */ +unsigned int ext3cow_take_snapshot(struct super_block *sb){ + + struct ext3cow_sb_info *sbi = NULL; + struct ext3cow_super_block *es = NULL; + tid_t target; + + if(NULL == sb){ + printk("EXT3COW-fs: superblock is NULL when taking snapshot.\n"); + return -1; + } + + sbi = EXT3COW_SB(sb); + es = sbi->s_es; + + /* Sync the dirty blocks */ + if (journal_start_commit(EXT3COW_SB(sb)->s_journal, &target)) { + log_wait_commit(EXT3COW_SB(sb)->s_journal, target); + } + + + sbi->s_epoch_number = cpu_to_le32(get_seconds()); + es->s_epoch_number = sbi->s_epoch_number; + sb->s_dirt = 1; + + BUFFER_TRACE(EXT3COW_SB(sb)->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3cow_commit_super (sb, es, 1); + + return (unsigned int)sbi->s_epoch_number; +} + +static struct file_system_type ext3cow_fs_type = { + .owner = THIS_MODULE, + .name = "ext3cow", + .get_sb = ext3cow_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_ext3cow_fs(void) +{ + int err = init_ext3cow_xattr(); + if (err) + return err; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext3cow_fs_type); + if (err) + goto out; + return 0; +out: + destroy_inodecache(); +out1: + exit_ext3cow_xattr(); + return err; +} + +static void __exit exit_ext3cow_fs(void) +{ + unregister_filesystem(&ext3cow_fs_type); + destroy_inodecache(); + exit_ext3cow_xattr(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +MODULE_LICENSE("GPL"); +module_init(init_ext3cow_fs) +module_exit(exit_ext3cow_fs) diff -ruN linux-2.6.20.3/fs/ext3cow/symlink.c linux-2.6.20.3-ext3cow/fs/ext3cow/symlink.c --- linux-2.6.20.3/fs/ext3cow/symlink.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/symlink.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,54 @@ +/* + * linux/fs/ext3cow/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3cow symlink handling code + */ + +#include +#include +#include +#include +#include "xattr.h" + +static void * ext3cow_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext3cow_inode_info *ei = EXT3COW_I(dentry->d_inode); + nd_set_link(nd, (char*)ei->i_data); + return NULL; +} + +struct inode_operations ext3cow_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +#ifdef CONFIG_EXT3COW_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3cow_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +struct inode_operations ext3cow_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext3cow_follow_link, +#ifdef CONFIG_EXT3COW_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext3cow_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff -ruN linux-2.6.20.3/fs/ext3cow/xattr.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.c --- linux-2.6.20.3/fs/ext3cow/xattr.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.c 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,1314 @@ +/* + * linux/fs/ext3cow/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . + * Ext3 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT3COW_I(inode)->i_file_acl is protected by EXT3COW_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext3cow_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext3cow_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define IHDR(inode, raw_inode) \ + ((struct ext3cow_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT3COW_GOOD_OLD_INODE_SIZE + \ + EXT3COW_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext3cow_xattr_entry *)((hdr)+1)) + +#ifdef EXT3COW_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext3cow_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext3cow_xattr_cache_find(struct inode *, + struct ext3cow_xattr_header *, + struct mb_cache_entry **); +static void ext3cow_xattr_rehash(struct ext3cow_xattr_header *, + struct ext3cow_xattr_entry *); + +static struct mb_cache *ext3cow_xattr_cache; + +static struct xattr_handler *ext3cow_xattr_handler_map[] = { + [EXT3COW_XATTR_INDEX_USER] = &ext3cow_xattr_user_handler, +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + [EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3cow_xattr_acl_access_handler, + [EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3cow_xattr_acl_default_handler, +#endif + [EXT3COW_XATTR_INDEX_TRUSTED] = &ext3cow_xattr_trusted_handler, +#ifdef CONFIG_EXT3COW_FS_SECURITY + [EXT3COW_XATTR_INDEX_SECURITY] = &ext3cow_xattr_security_handler, +#endif +}; + +struct xattr_handler *ext3cow_xattr_handlers[] = { + &ext3cow_xattr_user_handler, + &ext3cow_xattr_trusted_handler, +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + &ext3cow_xattr_acl_access_handler, + &ext3cow_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT3COW_FS_SECURITY + &ext3cow_xattr_security_handler, +#endif + NULL +}; + +static inline struct xattr_handler * +ext3cow_xattr_handler(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext3cow_xattr_handler_map)) + handler = ext3cow_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext3cow_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext3cow_xattr_list(dentry->d_inode, buffer, size); +} + +static int +ext3cow_xattr_check_names(struct ext3cow_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext3cow_xattr_entry *next = EXT3COW_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext3cow_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3COW_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext3cow_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext3cow_xattr_check_entry(struct ext3cow_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext3cow_xattr_find_entry(struct ext3cow_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext3cow_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT3COW_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext3cow_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext3cow_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext3cow_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT3COW_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3COW_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3cow_xattr_check_block(bh)) { +bad_block: ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3cow_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext3cow_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext3cow_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext3cow_xattr_ibody_header *header; + struct ext3cow_xattr_entry *entry; + struct ext3cow_inode *raw_inode; + struct ext3cow_iloc iloc; + size_t size; + void *end; + int error; + + if (!(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)) + return -ENODATA; + error = ext3cow_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3cow_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; + error = ext3cow_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext3cow_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3cow_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext3cow_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT3COW_I(inode)->xattr_sem); + error = ext3cow_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext3cow_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT3COW_I(inode)->xattr_sem); + return error; +} + +static int +ext3cow_xattr_list_entries(struct inode *inode, struct ext3cow_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT3COW_XATTR_NEXT(entry)) { + struct xattr_handler *handler = + ext3cow_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(inode, buffer, rest, + entry->e_name, + entry->e_name_len); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext3cow_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT3COW_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT3COW_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext3cow_xattr_check_block(bh)) { + ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext3cow_xattr_cache_insert(bh); + error = ext3cow_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext3cow_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct ext3cow_xattr_ibody_header *header; + struct ext3cow_inode *raw_inode; + struct ext3cow_iloc iloc; + void *end; + int error; + + if (!(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)) + return 0; + error = ext3cow_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext3cow_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; + error = ext3cow_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext3cow_xattr_list_entries(inode, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext3cow_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext3cow_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT3COW_I(inode)->xattr_sem); + i_error = ext3cow_xattr_ibody_list(inode, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext3cow_xattr_block_list(inode, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT3COW_I(inode)->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT3COW_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext3cow_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh) == 0) { + EXT3COW_SET_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; + ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext3cow_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + + ce = mb_cache_entry_get(ext3cow_xattr_cache, bh->b_bdev, bh->b_blocknr); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext3cow_free_blocks(handle, inode, bh->b_blocknr, 1); + get_bh(bh); + ext3cow_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + if (ext3cow_journal_get_write_access(handle, bh) == 0) { + lock_buffer(bh); + BHDR(bh)->h_refcount = cpu_to_le32( + le32_to_cpu(BHDR(bh)->h_refcount) - 1); + ext3cow_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + DQUOT_FREE_BLOCK(inode, 1); + unlock_buffer(bh); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } + if (ce) + mb_cache_entry_release(ce); + } +} + +struct ext3cow_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext3cow_xattr_search { + struct ext3cow_xattr_entry *first; + void *base; + void *end; + struct ext3cow_xattr_entry *here; + int not_found; +}; + +static int +ext3cow_xattr_set_entry(struct ext3cow_xattr_info *i, struct ext3cow_xattr_search *s) +{ + struct ext3cow_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT3COW_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT3COW_XATTR_SIZE(size); + } + free += EXT3COW_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT3COW_XATTR_SIZE(i->value_len) || + free < EXT3COW_XATTR_LEN(name_len) + + EXT3COW_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. */ + size_t size = EXT3COW_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT3COW_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT3COW_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT3COW_XATTR_PAD, 0, + EXT3COW_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT3COW_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT3COW_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT3COW_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT3COW_XATTR_PAD, 0, + EXT3COW_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext3cow_xattr_block_find { + struct ext3cow_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext3cow_xattr_block_find(struct inode *inode, struct ext3cow_xattr_info *i, + struct ext3cow_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT3COW_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT3COW_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext3cow_xattr_check_block(bs->bh)) { + ext3cow_error(sb, __FUNCTION__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext3cow_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext3cow_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext3cow_xattr_info *i, + struct ext3cow_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext3cow_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error; + +#define header(x) ((struct ext3cow_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext3cow_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext3cow_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + error = ext3cow_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext3cow_xattr_rehash(header(s->base), + s->here); + ext3cow_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext3cow_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kmalloc(sb->s_blocksize, GFP_KERNEL); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memset(s->base, 0, sb->s_blocksize); + header(s->base)->h_magic = cpu_to_le32(EXT3COW_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext3cow_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext3cow_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext3cow_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. */ + error = -EDQUOT; + if (DQUOT_ALLOC_BLOCK(inode, 1)) + goto cleanup; + error = ext3cow_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + BHDR(new_bh)->h_refcount = cpu_to_le32(1 + + le32_to_cpu(BHDR(new_bh)->h_refcount)); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext3cow_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext3cow_fsblk_t goal = le32_to_cpu( + EXT3COW_SB(sb)->s_es->s_first_data_block) + + (ext3cow_fsblk_t)EXT3COW_I(inode)->i_block_group * + EXT3COW_BLOCKS_PER_GROUP(sb); + ext3cow_fsblk_t block = ext3cow_new_block(handle, inode, + goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext3cow_free_blocks(handle, inode, block, 1); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext3cow_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext3cow_xattr_cache_insert(new_bh); + error = ext3cow_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT3COW_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext3cow_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + DQUOT_FREE_BLOCK(inode, 1); + goto cleanup; + +bad_block: + ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext3cow_xattr_ibody_find { + struct ext3cow_xattr_search s; + struct ext3cow_iloc iloc; +}; + +static int +ext3cow_xattr_ibody_find(struct inode *inode, struct ext3cow_xattr_info *i, + struct ext3cow_xattr_ibody_find *is) +{ + struct ext3cow_xattr_ibody_header *header; + struct ext3cow_inode *raw_inode; + int error; + + if (EXT3COW_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext3cow_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; + if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR) { + error = ext3cow_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. */ + error = ext3cow_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext3cow_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext3cow_xattr_info *i, + struct ext3cow_xattr_ibody_find *is) +{ + struct ext3cow_xattr_ibody_header *header; + struct ext3cow_xattr_search *s = &is->s; + int error; + + if (EXT3COW_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext3cow_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext3cow_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT3COW_XATTR_MAGIC); + EXT3COW_I(inode)->i_state |= EXT3COW_STATE_XATTR; + } else { + header->h_magic = cpu_to_le32(0); + EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_XATTR; + } + return 0; +} + +/* + * ext3cow_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3cow_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext3cow_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext3cow_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext3cow_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT3COW_I(inode)->xattr_sem); + error = ext3cow_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_NEW) { + struct ext3cow_inode *raw_inode = ext3cow_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT3COW_SB(inode->i_sb)->s_inode_size); + EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_NEW; + } + + error = ext3cow_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext3cow_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + error = ext3cow_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (!value) { + if (!is.s.not_found) + error = ext3cow_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext3cow_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext3cow_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext3cow_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + error = ext3cow_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext3cow_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext3cow_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = CURRENT_TIME_SEC; + error = ext3cow_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext3cow_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + up_write(&EXT3COW_I(inode)->xattr_sem); + return error; +} + +/* + * ext3cow_xattr_set() + * + * Like ext3cow_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext3cow_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext3cow_journal_start(inode, EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext3cow_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext3cow_journal_stop(handle); + if (error == -ENOSPC && + ext3cow_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * ext3cow_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext3cow_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT3COW_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); + if (!bh) { + ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: block "E3FSBLK" read error", inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT3COW_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: bad block "E3FSBLK, inode->i_ino, + EXT3COW_I(inode)->i_file_acl); + goto cleanup; + } + ext3cow_xattr_release_block(handle, inode, bh); + EXT3COW_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext3cow_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext3cow_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext3cow_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext3cow_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext3cow_xattr_cache); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext3cow_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext3cow_xattr_cmp(struct ext3cow_xattr_header *header1, + struct ext3cow_xattr_header *header2) +{ + struct ext3cow_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT3COW_XATTR_NEXT(entry1); + entry2 = EXT3COW_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext3cow_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext3cow_xattr_cache_find(struct inode *inode, struct ext3cow_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext3cow_xattr_cache, 0, + inode->i_sb->s_bdev, hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext3cow_error(inode->i_sb, __FUNCTION__, + "inode %lu: block %lu read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT3COW_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT3COW_XATTR_REFCOUNT_MAX); + } else if (ext3cow_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext3cow_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext3cow_xattr_hash_entry(struct ext3cow_xattr_header *header, + struct ext3cow_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n=0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT3COW_XATTR_ROUND) >> EXT3COW_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext3cow_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext3cow_xattr_rehash(struct ext3cow_xattr_header *header, + struct ext3cow_xattr_entry *entry) +{ + struct ext3cow_xattr_entry *here; + __u32 hash = 0; + + ext3cow_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT3COW_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext3cow_xattr(void) +{ + ext3cow_xattr_cache = mb_cache_create("ext3cow_xattr", NULL, + sizeof(struct mb_cache_entry) + + sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); + if (!ext3cow_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext3cow_xattr(void) +{ + if (ext3cow_xattr_cache) + mb_cache_destroy(ext3cow_xattr_cache); + ext3cow_xattr_cache = NULL; +} diff -ruN linux-2.6.20.3/fs/ext3cow/xattr.h linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.h --- linux-2.6.20.3/fs/ext3cow/xattr.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.h 2008-03-09 11:14:49.000000000 -0400 @@ -0,0 +1,145 @@ +/* + File: fs/ext3cow/xattr.h + + On-disk format of extended attributes for the ext3cow filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT3COW_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT3COW_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT3COW_XATTR_INDEX_USER 1 +#define EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT3COW_XATTR_INDEX_TRUSTED 4 +#define EXT3COW_XATTR_INDEX_LUSTRE 5 +#define EXT3COW_XATTR_INDEX_SECURITY 6 + +struct ext3cow_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext3cow_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext3cow_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT3COW_XATTR_PAD_BITS 2 +#define EXT3COW_XATTR_PAD (1<e_name_len)) ) +#define EXT3COW_XATTR_SIZE(size) \ + (((size) + EXT3COW_XATTR_ROUND) & ~EXT3COW_XATTR_ROUND) + +# ifdef CONFIG_EXT3COW_FS_XATTR + +extern struct xattr_handler ext3cow_xattr_user_handler; +extern struct xattr_handler ext3cow_xattr_trusted_handler; +extern struct xattr_handler ext3cow_xattr_acl_access_handler; +extern struct xattr_handler ext3cow_xattr_acl_default_handler; +extern struct xattr_handler ext3cow_xattr_security_handler; + +extern ssize_t ext3cow_listxattr(struct dentry *, char *, size_t); + +extern int ext3cow_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext3cow_xattr_list(struct inode *, char *, size_t); +extern int ext3cow_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext3cow_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext3cow_xattr_delete_inode(handle_t *, struct inode *); +extern void ext3cow_xattr_put_super(struct super_block *); + +extern int init_ext3cow_xattr(void); +extern void exit_ext3cow_xattr(void); + +extern struct xattr_handler *ext3cow_xattr_handlers[]; + +# else /* CONFIG_EXT3COW_FS_XATTR */ + +static inline int +ext3cow_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3cow_xattr_list(struct inode *inode, void *buffer, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3cow_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext3cow_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext3cow_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext3cow_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext3cow_xattr(void) +{ + return 0; +} + +static inline void +exit_ext3cow_xattr(void) +{ +} + +#define ext3cow_xattr_handlers NULL + +# endif /* CONFIG_EXT3COW_FS_XATTR */ + +#ifdef CONFIG_EXT3COW_FS_SECURITY +extern int ext3cow_init_security(handle_t *handle, struct inode *inode, + struct inode *dir); +#else +static inline int ext3cow_init_security(handle_t *handle, struct inode *inode, + struct inode *dir) +{ + return 0; +} +#endif diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_security.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_security.c --- linux-2.6.20.3/fs/ext3cow/xattr_security.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_security.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,77 @@ +/* + * linux/fs/ext3cow/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +static size_t +ext3cow_xattr_security_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3cow_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext3cow_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +int +ext3cow_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext3cow_xattr_set_handle(handle, inode, EXT3COW_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +struct xattr_handler ext3cow_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext3cow_xattr_security_list, + .get = ext3cow_xattr_security_get, + .set = ext3cow_xattr_security_set, +}; diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_trusted.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_trusted.c --- linux-2.6.20.3/fs/ext3cow/xattr_trusted.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_trusted.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,62 @@ +/* + * linux/fs/ext3cow/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t +ext3cow_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3cow_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext3cow_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +struct xattr_handler ext3cow_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext3cow_xattr_trusted_list, + .get = ext3cow_xattr_trusted_get, + .set = ext3cow_xattr_trusted_set, +}; diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_user.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_user.c --- linux-2.6.20.3/fs/ext3cow/xattr_user.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_user.c 2008-03-09 11:14:48.000000000 -0400 @@ -0,0 +1,64 @@ +/* + * linux/fs/ext3cow/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include +#include "xattr.h" + +#define XATTR_USER_PREFIX "user." + +static size_t +ext3cow_xattr_user_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(inode->i_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext3cow_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_USER, name, buffer, size); +} + +static int +ext3cow_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_USER, name, + value, size, flags); +} + +struct xattr_handler ext3cow_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext3cow_xattr_user_list, + .get = ext3cow_xattr_user_get, + .set = ext3cow_xattr_user_set, +}; diff -ruN linux-2.6.20.3/fs/Kconfig linux-2.6.20.3-ext3cow/fs/Kconfig --- linux-2.6.20.3/fs/Kconfig 2007-03-13 14:27:08.000000000 -0400 +++ linux-2.6.20.3-ext3cow/fs/Kconfig 2008-03-09 11:14:25.000000000 -0400 @@ -136,6 +136,77 @@ If you are not using a security module that requires using extended attributes for file security labels, say N. + + +config EXT3COW_FS + tristate "Ext3cow journalling and versioning file system support" + select JBD + help + This is the journalling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journalling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at ). + + To compile this file system support as a module, choose M here: the + module will be called ext3. + +config EXT3COW_FS_XATTR + bool "Ext3cow extended attributes" + depends on EXT3COW_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3cow. + +config EXT3COW_FS_POSIX_ACL + bool "Ext3cow POSIX Access Control Lists" + depends on EXT3COW_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config EXT3COW_FS_SECURITY + bool "Ext3cow Security Labels" + depends on EXT3COW_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3cow filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + + config EXT4DEV_FS tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" depends on EXPERIMENTAL @@ -205,23 +276,23 @@ tristate help This is a generic journalling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could + currently used by the ext3, ext3cow and OCFS2 file systems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using the ext3 or OCFS2 file systems, you need to + If you are using the ext3, ext3cow or OCFS2 file systems, you need to say Y here. If you are not using ext3 OCFS2 then you will probably want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 or OCFS2 into the kernel, + called jbd. If you are compiling ext3, ext3cow or OCFS2 into the kernel, you cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" depends on JBD help - If you are using the ext3 journaled file system (or potentially any + If you are using the ext3 or ext3cow journaled file system (or potentially any other file system/device using JBD), this option allows you to enable debugging output while the system is running, in order to help track down any problems you are having. By default the @@ -266,11 +337,12 @@ "echo 0 > /proc/sys/fs/jbd2-debug". config FS_MBCACHE -# Meta block cache for Extended Attributes (ext2/ext3/ext4) +# Meta block cache for Extended Attributes (ext2/ext3(cow)/ext4) tristate - depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR - default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m + depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT3COW_FS_XATTR || EXT4DEV_FS_XATTR + default y if EXT2_FS=y || EXT3_FS=y || EXT3COW_FS=y || EXT4DEV_FS=y + default m if EXT2_FS=m || EXT3_FS=m || EXT3COW_FS=m || EXT4DEV_FS=m + config REISERFS_FS tristate "Reiserfs support" diff -ruN linux-2.6.20.3/fs/Makefile linux-2.6.20.3-ext3cow/fs/Makefile --- linux-2.6.20.3/fs/Makefile 2007-03-13 14:27:08.000000000 -0400 +++ linux-2.6.20.3-ext3cow/fs/Makefile 2008-03-09 11:14:54.000000000 -0400 @@ -63,6 +63,7 @@ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_EXT3COW_FS) += ext3cow/ # Before ext2 so root fs can be ext3 obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs.h --- linux-2.6.20.3/include/linux/ext3cow_fs.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs.h 2008-03-09 11:10:57.000000000 -0400 @@ -0,0 +1,948 @@ +/* + * linux/include/linux/ext3cow_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_EXT3COW_FS_H +#define _LINUX_EXT3COW_FS_H + +#include +#include + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3COWFS_DEBUG to produce debug messages + */ +#undef EXT3COWFS_DEBUG + + +/* + * Define EXT3COW_RESERVATION to reserve data blocks for expanding files + */ +#define EXT3COW_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT3COW_MAX_RESERVE_BLOCKS 1027 +#define EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED 0 +/* + * Always enable hashed directories + */ +//#define CONFIG_EXT3COW_INDEX + +/* + * Debug code + */ +#ifdef EXT3COWFS_DEBUG +#define ext3cow_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3COW-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3cow_debug(f, a...) do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3COW_BAD_INO 1 /* Bad blocks inode */ +#define EXT3COW_ROOT_INO 2 /* Root inode */ +#define EXT3COW_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3COW_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3COW_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3COW_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3cow filesystems */ +#define EXT3COW_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT3COW_LINK_MAX 32000 + +/* For versioning -znjp */ +#define EXT3COW_FLUX_TOKEN '@' +/* Macros for scoping - in seconds -znjp */ +#define ONEHOUR 3600 +#define YESTERDAY 86400 +#define ONEWEEK 604800 +#define ONEMONTH 2419200 +#define ONEYEAR 31449600 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3COW_MIN_BLOCK_SIZE 1024 +#define EXT3COW_MAX_BLOCK_SIZE 4096 +#define EXT3COW_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT3COW_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT3COW_BLOCK_SIZE(s) (EXT3COW_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +//#define EXT3COW_ADDR_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / sizeof (__u32)) +#ifdef __KERNEL__ +# define EXT3COW_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT3COW_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT3COW_ADDR_PER_BLOCK_BITS(s) (EXT3COW_SB(s)->s_addr_per_block_bits) +#define EXT3COW_INODE_SIZE(s) (EXT3COW_SB(s)->s_inode_size) +#define EXT3COW_FIRST_INO(s) (EXT3COW_SB(s)->s_first_ino) +#else +#define EXT3COW_INODE_SIZE(s) (((s)->s_rev_level == EXT3COW_GOOD_OLD_REV) ? \ + EXT3COW_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT3COW_FIRST_INO(s) (((s)->s_rev_level == EXT3COW_GOOD_OLD_REV) ? \ + EXT3COW_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +/* + * Macro-instructions for versioning support - znjp + */ +#define EXT3COW_COWBITMAP_SIZE (sizeof(__u32) * 8) /* one word */ +#define EXT3COW_COWBITMAPS_PER_IBLOCK(s) \ + (( (EXT3COW_BLOCK_SIZE(s) / sizeof(__u32)) / (EXT3COW_COWBITMAP_SIZE))) +/* Accounts for COW bitmaps */ +#define EXT3COW_ADDR_PER_BLOCK(s) ((EXT3COW_BLOCK_SIZE(s) / sizeof(__u32)) - EXT3COW_COWBITMAPS_PER_IBLOCK(s)) + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3COW_MIN_FRAG_SIZE 1024 +#define EXT3COW_MAX_FRAG_SIZE 4096 +#define EXT3COW_MIN_FRAG_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT3COW_FRAG_SIZE(s) (EXT3COW_SB(s)->s_frag_size) +# define EXT3COW_FRAGS_PER_BLOCK(s) (EXT3COW_SB(s)->s_frags_per_block) +#else +# define EXT3COW_FRAG_SIZE(s) (EXT3COW_MIN_FRAG_SIZE << (s)->s_log_frag_size) +# define EXT3COW_FRAGS_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / EXT3COW_FRAG_SIZE(s)) +#endif + +/* + * Structure of a blocks group descriptor + */ +struct ext3cow_group_desc +{ + __le32 bg_block_bitmap; /* Blocks bitmap block */ + __le32 bg_inode_bitmap; /* Inodes bitmap block */ + __le32 bg_inode_table; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __le32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#ifdef __KERNEL__ +# define EXT3COW_BLOCKS_PER_GROUP(s) (EXT3COW_SB(s)->s_blocks_per_group) +# define EXT3COW_DESC_PER_BLOCK(s) (EXT3COW_SB(s)->s_desc_per_block) +# define EXT3COW_INODES_PER_GROUP(s) (EXT3COW_SB(s)->s_inodes_per_group) +# define EXT3COW_DESC_PER_BLOCK_BITS(s) (EXT3COW_SB(s)->s_desc_per_block_bits) +#else +# define EXT3COW_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT3COW_DESC_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / sizeof (struct ext3cow_group_desc)) +# define EXT3COW_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT3COW_NDIR_BLOCKS 12 +#define EXT3COW_IND_BLOCK EXT3COW_NDIR_BLOCKS +#define EXT3COW_DIND_BLOCK (EXT3COW_IND_BLOCK + 1) +#define EXT3COW_TIND_BLOCK (EXT3COW_DIND_BLOCK + 1) +#define EXT3COW_N_BLOCKS (EXT3COW_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3COW_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3COW_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3COW_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3COW_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3COW_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT3COW_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3COW_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3COW_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT3COW_DIRTY_FL 0x00000100 +#define EXT3COW_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3COW_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3COW_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3COW_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3COW_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3COW_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3COW_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT3COW_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT3COW_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +/* Used for Versioning - znjp */ +#define EXT3COW_UNCHANGEABLE_FL 0x00040000 +#define EXT3COW_UNVERSIONABLE_FL 0x00080000 +#define EXT3COW_FAKEINODE_FL 0x00100000 +#define EXT3COW_RESERVED_FL 0x80000000 /* reserved for ext3cow lib */ + +#define EXT3COW_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3COW_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* + * Inode dynamic state flags + */ +#define EXT3COW_STATE_JDATA 0x00000001 /* journaled data exists */ +#define EXT3COW_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT3COW_STATE_XATTR 0x00000004 /* has in-inode xattrs */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext3cow_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3cow_new_group_input in kernel space, with free_blocks_count */ +struct ext3cow_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + +/* + * ioctl commands + */ +#define EXT3COW_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT3COW_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT3COW_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3COW_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3COW_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3COW_IOC_GROUP_ADD _IOW('f', 8,struct ext3cow_new_group_input) +#define EXT3COW_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT3COW_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD_DEBUG +#define EXT3COW_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT3COW_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT3COW_IOC_SETRSVSZ _IOW('f', 6, long) +/* ioctls for versioning - znjp */ +#define EXT3COW_IOC_TAKESNAPSHOT _IOR('f', 7, long) +#define EXT3COW_IOC_GETEPOCH _IOR('f', 8, long) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT3COW_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT3COW_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT3COW_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT3COW_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT3COW_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT3COW_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT3COW_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD_DEBUG +#define EXT3COW_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT3COW_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT3COW_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext3cow_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext3cow_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Creation time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + //__u32 l_i_reserved1; + /* Direct block COW bitmap -znjp */ + __u16 l_i_direct_cow_bitmap; + __u16 l_i_pad1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT3COW_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl; /* File ACL */ + __le32 i_dir_acl; /* Directory ACL */ + __le32 i_faddr; /* Fragment address */ + union { + struct { + //__u8 l_i_frag; /* Fragment number */ + //__u8 l_i_fsize; /* Fragment size */ + //__u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + //__u32 l_i_reserved2; + /* Epoch number for versioning -znjp */ + __le32 l_i_epoch_number; + __u32 l_i_next_inode; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; +}; + +#define i_size_high i_dir_acl + +#if defined(__KERNEL__) || defined(__linux__) +/* For versioning -znjp */ +//#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_cowbitmap osd1.linux1.l_i_direct_cow_bitmap +//#define i_frag osd2.linux2.l_i_frag +//#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +/* For versioning -znjp */ +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +//#define i_reserved2 osd2.linux2.l_i_reserved2 +#define i_epch_number osd2.linux2.l_i_epoch_number +#define i_nxt_inode osd2.linux2.l_i_next_inode + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_frag osd2.hurd2.h_i_frag; +#define i_fsize osd2.hurd2.h_i_fsize; +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_frag osd2.masix2.m_i_frag +#define i_fsize osd2.masix2.m_i_fsize +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * File system states + */ +#define EXT3COW_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3COW_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3COW_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Mount flags + */ +#define EXT3COW_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +#define EXT3COW_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT3COW_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT3COW_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT3COW_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT3COW_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT3COW_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT3COW_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT3COW_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT3COW_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT3COW_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT3COW_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT3COW_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT3COW_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT3COW_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT3COW_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT3COW_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT3COW_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT3COW_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT3COW_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT3COW_MOUNT_NOBH 0x40000 /* No bufferheads */ +#define EXT3COW_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT3COW_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT3COW_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + +/* Compatibility, for having both ext2_fs.h and ext3cow_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3COW_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3COW_MOUNT_##opt +#define test_opt(sb, opt) (EXT3COW_SB(sb)->s_mount_opt & \ + EXT3COW_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3COW_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3COW_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT3COW_MOUNT_DATA_FLAGS +#endif + +#define ext3cow_set_bit ext2_set_bit +#define ext3cow_set_bit_atomic ext2_set_bit_atomic +#define ext3cow_clear_bit ext2_clear_bit +#define ext3cow_clear_bit_atomic ext2_clear_bit_atomic +#define ext3cow_test_bit ext2_test_bit +#define ext3cow_find_first_zero_bit ext2_find_first_zero_bit +#define ext3cow_find_next_zero_bit ext2_find_next_zero_bit + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3COW_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3COW_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3COW_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3COW_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3COW_ERRORS_PANIC 3 /* Panic */ +#define EXT3COW_ERRORS_DEFAULT EXT3COW_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3cow_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count; /* Blocks count */ + __le32 s_r_blocks_count; /* Reserved blocks count */ + __le32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_frag_size; /* Fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_frags_per_group; /* # Fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3COW_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3COW_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT3COW_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + /* Added for version - znjp */ + __le32 s_epoch_number; + __u32 s_reserved[189]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +#include +#include +static inline struct ext3cow_sb_info * EXT3COW_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext3cow_inode_info *EXT3COW_I(struct inode *inode) +{ + return container_of(inode, struct ext3cow_inode_info, vfs_inode); +} + +static inline int ext3cow_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT3COW_ROOT_INO || + ino == EXT3COW_JOURNAL_INO || + ino == EXT3COW_RESIZE_INO || + (ino >= EXT3COW_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count)); +} +#else +/* Assume that user mode programs are passing in an ext3cowfs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT3COW_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT3COW_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT3COW_OS_LINUX 0 +#define EXT3COW_OS_HURD 1 +#define EXT3COW_OS_MASIX 2 +#define EXT3COW_OS_FREEBSD 3 +#define EXT3COW_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3COW_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3COW_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3COW_CURRENT_REV EXT3COW_GOOD_OLD_REV +#define EXT3COW_MAX_SUPP_REV EXT3COW_DYNAMIC_REV + +#define EXT3COW_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3COW_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3COW_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3COW_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3COW_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3COW_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3COW_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3COW_SET_COMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3COW_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3COW_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3COW_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3COW_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3COW_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3COW_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3COW_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3COW_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3COW_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3COW_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3COW_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3COW_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3COW_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3COW_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3COW_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3COW_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3COW_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3COW_FEATURE_INCOMPAT_META_BG 0x0010 + +#define EXT3COW_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT3COW_FEATURE_INCOMPAT_SUPP (EXT3COW_FEATURE_INCOMPAT_FILETYPE| \ + EXT3COW_FEATURE_INCOMPAT_RECOVER| \ + EXT3COW_FEATURE_INCOMPAT_META_BG) +#define EXT3COW_FEATURE_RO_COMPAT_SUPP (EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3COW_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3COW_DEF_RESUID 0 +#define EXT3COW_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT3COW_DEFM_DEBUG 0x0001 +#define EXT3COW_DEFM_BSDGROUPS 0x0002 +#define EXT3COW_DEFM_XATTR_USER 0x0004 +#define EXT3COW_DEFM_ACL 0x0008 +#define EXT3COW_DEFM_UID16 0x0010 +#define EXT3COW_DEFM_JMODE 0x0060 +#define EXT3COW_DEFM_JMODE_DATA 0x0020 +#define EXT3COW_DEFM_JMODE_ORDERED 0x0040 +#define EXT3COW_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT3COW_NAME_LEN 255 + +struct ext3cow_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT3COW_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3COW structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3cow_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + /* Added for versioning - znjp */ + __u32 birth_epoch; + __u32 death_epoch; + char name[EXT3COW_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT3COW_FT_UNKNOWN 0 +#define EXT3COW_FT_REG_FILE 1 +#define EXT3COW_FT_DIR 2 +#define EXT3COW_FT_CHRDEV 3 +#define EXT3COW_FT_BLKDEV 4 +#define EXT3COW_FT_FIFO 5 +#define EXT3COW_FT_SOCK 6 +#define EXT3COW_FT_SYMLINK 7 + +#define EXT3COW_FT_MAX 8 + +/* Versioning macros - znjp */ +#define EXT3COW_DIRENT_ALIVE 0 +#define EXT3COW_IS_DIRENT_ALIVE(de) ((le32_to_cpu(de->death_epoch) == EXT3COW_DIRENT_ALIVE)) +#define EXT3COW_IS_DIRENT_SCOPED(de, epoch) \ +((le32_to_cpu(de->birth_epoch) <= epoch) && \ +(EXT3COW_IS_DIRENT_ALIVE(de) || (!EXT3COW_IS_DIRENT_ALIVE(de) && \ +le32_to_cpu(de->death_epoch) > epoch))) +#define EXT3COW_I_EPOCHNUMBER(inode) (((unsigned int)EXT3COW_I(inode)->i_epoch_number)) +#define EXT3COW_S_EPOCHNUMBER(sb) (((unsigned int)EXT3COW_SB(sb)->s_epoch_number)) +#define EXT3COW_I_NEXT_INODE(inode) (((unsigned int)EXT3COW_I(inode)->i_next_inode)) +#define EXT3COW_IS_UNVERSIONABLE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_UNVERSIONABLE_FL)) +#define EXT3COW_IS_UNCHANGEABLE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_UNCHANGEABLE_FL)) +#define EXT3COW_IS_FAKEINODE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_FAKEINODE_FL)) + + +/* + * EXT3COW_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3COW_DIR_PAD 4 +#define EXT3COW_DIR_ROUND (EXT3COW_DIR_PAD - 1) +/* Added 8 to account for birth and death epochs -znjp */ +#define EXT3COW_DIR_REC_LEN(name_len) (((name_len) + 16 + EXT3COW_DIR_ROUND) & \ + ~EXT3COW_DIR_ROUND) +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#ifdef CONFIG_EXT3COW_INDEX + #define is_dx(dir) (EXT3COW_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT3COW_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT3COW_I(dir)->i_flags & EXT3COW_INDEX_FL)) +#define EXT3COW_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3COW_LINK_MAX) +#define EXT3COW_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) +#else + #define is_dx(dir) 0 +#define EXT3COW_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3COW_LINK_MAX) +#define EXT3COW_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) +#endif + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT3COW_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext3cow_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3cow_iloc +{ + struct buffer_head *bh; + unsigned long offset; + unsigned long block_group; +}; + +static inline struct ext3cow_inode *ext3cow_raw_inode(struct ext3cow_iloc *iloc) +{ + return (struct ext3cow_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext3cow_fsblk_t +ext3cow_group_first_block_no(struct super_block *sb, unsigned long group_no) +{ + return group_no * (ext3cow_fsblk_t)EXT3COW_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext3cow source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext3cow_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3cow_bg_num_gdb(struct super_block *sb, int group); +extern ext3cow_fsblk_t ext3cow_new_block (handle_t *handle, struct inode *inode, + ext3cow_fsblk_t goal, int *errp); +extern ext3cow_fsblk_t ext3cow_new_blocks (handle_t *handle, struct inode *inode, + ext3cow_fsblk_t goal, unsigned long *count, int *errp); +extern void ext3cow_free_blocks (handle_t *handle, struct inode *inode, + ext3cow_fsblk_t block, unsigned long count); +extern void ext3cow_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext3cow_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext3cow_fsblk_t ext3cow_count_free_blocks (struct super_block *); +extern void ext3cow_check_blocks_bitmap (struct super_block *); +extern struct ext3cow_group_desc * ext3cow_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); +extern int ext3cow_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext3cow_init_block_alloc_info(struct inode *); +extern void ext3cow_rsv_window_add(struct super_block *sb, struct ext3cow_reserve_window_node *rsv); + + +/* dir.c */ +extern int ext3cow_check_dir_entry(const char *, struct inode *, + struct ext3cow_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext3cow_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3cow_dir_entry_2 *dirent); +extern void ext3cow_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext3cow_sync_file (struct file *, struct dentry *, int); + +/* hash.c */ +extern int ext3cowfs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext3cow_new_inode (handle_t *, struct inode *, int); +extern void ext3cow_free_inode (handle_t *, struct inode *); +extern struct inode * ext3cow_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext3cow_count_free_inodes (struct super_block *); +extern unsigned long ext3cow_count_dirs (struct super_block *); +extern void ext3cow_check_inodes_bitmap (struct super_block *); +extern unsigned long ext3cow_count_free (struct buffer_head *, unsigned); + + +/* inode.c */ +int ext3cow_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext3cow_fsblk_t blocknr); +struct buffer_head * ext3cow_getblk (handle_t *, struct inode *, long, int, int *); +struct buffer_head * ext3cow_bread (handle_t *, struct inode *, int, int, int *); +int ext3cow_get_blocks_handle(handle_t *handle, struct inode *inode, + sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, + int create, int extend_disksize); + +extern void ext3cow_read_inode (struct inode *); +extern int ext3cow_write_inode (struct inode *, int); +extern int ext3cow_setattr (struct dentry *, struct iattr *); +extern void ext3cow_delete_inode (struct inode *); +extern int ext3cow_sync_inode (handle_t *, struct inode *); +extern void ext3cow_discard_reservation (struct inode *); +extern void ext3cow_dirty_inode(struct inode *); +extern int ext3cow_change_inode_journal_flag(struct inode *, int); +extern int ext3cow_get_inode_loc(struct inode *, struct ext3cow_iloc *); +extern void ext3cow_truncate (struct inode *); +extern void ext3cow_set_inode_flags(struct inode *); +extern void ext3cow_set_aops(struct inode *inode); + +/* ioctl.c */ +extern int ext3cow_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); +extern long ext3cow_compat_ioctl (struct file *, unsigned int, unsigned long); + +/* namei.c */ +extern int is_unchangeable(struct inode *, struct dentry *); +extern int ext3cow_orphan_add(handle_t *, struct inode *); +extern int ext3cow_orphan_del(handle_t *, struct inode *); +extern int ext3cow_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern struct inode *ext3cow_fake_inode(struct inode *, unsigned int); +extern int ext3cow_dup_inode(struct inode *, struct inode *); +extern int ext3cow_reclaim_dup_inode(struct inode *, struct inode *); + +/* resize.c */ +extern int ext3cow_group_add(struct super_block *sb, + struct ext3cow_new_group_data *input); +extern int ext3cow_group_extend(struct super_block *sb, + struct ext3cow_super_block *es, + ext3cow_fsblk_t n_blocks_count); + +/* super.c */ +extern void ext3cow_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __ext3cow_std_error (struct super_block *, const char *, int); +extern void ext3cow_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext3cow_warning (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext3cow_update_dynamic_rev (struct super_block *sb); +extern unsigned int ext3cow_take_snapshot(struct super_block *sb); + +#define ext3cow_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3cow_std_error((sb), __FUNCTION__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext3cow_dir_operations; + +/* file.c */ +extern struct inode_operations ext3cow_file_inode_operations; +extern const struct file_operations ext3cow_file_operations; + +/* namei.c */ +extern struct inode_operations ext3cow_dir_inode_operations; +extern struct inode_operations ext3cow_special_inode_operations; + +/* symlink.c */ +extern struct inode_operations ext3cow_symlink_inode_operations; +extern struct inode_operations ext3cow_fast_symlink_inode_operations; + + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_EXT3COW_FS_H */ diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs_i.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_i.h --- linux-2.6.20.3/include/linux/ext3cow_fs_i.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_i.h 2008-03-09 11:10:55.000000000 -0400 @@ -0,0 +1,152 @@ +/* + * linux/include/linux/ext3cow_fs_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_EXT3COW_FS_I +#define _LINUX_EXT3COW_FS_I + +#include +#include +#include +#include + +/* data type for block offset of block group */ +typedef int ext3cow_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long ext3cow_fsblk_t; + +#define E3FSBLK "%lu" + +struct ext3cow_reserve_window { + ext3cow_fsblk_t _rsv_start; /* First byte reserved */ + ext3cow_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext3cow_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext3cow_reserve_window rsv_window; +}; + +struct ext3cow_block_alloc_info { + /* information about reservation window */ + struct ext3cow_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext3cow_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + __u32 last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext3cow_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext3cow_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * third extended file system inode data in memory + */ +struct ext3cow_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; +#ifdef EXT3COW_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; +#endif + ext3cow_fsblk_t i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + __u32 i_block_group; + __u32 i_state; /* Dynamic state flags for ext3cow */ + + /* block reservation info */ + struct ext3cow_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; + + /* For versioning -znjp */ + __u16 i_cow_bitmap; + __u32 i_epoch_number; + __u32 i_next_inode; +#ifdef CONFIG_EXT3COW_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT3COW_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3cow_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3cow_get_block (growth) and ext3cow_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * truncate_mutex is for serialising ext3cow_truncate() against + * ext3cow_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3cow because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_mutex. + */ + struct mutex truncate_mutex; + struct inode vfs_inode; +}; + +#endif /* _LINUX_EXT3COW_FS_I */ diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs_sb.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_sb.h --- linux-2.6.20.3/include/linux/ext3cow_fs_sb.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_sb.h 2008-03-09 11:10:57.000000000 -0400 @@ -0,0 +1,86 @@ +/* + * linux/include/linux/ext3cow_fs_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_EXT3COW_FS_SB +#define _LINUX_EXT3COW_FS_SB + +#ifdef __KERNEL__ +#include +#include +#include +#include +#endif +#include + +/* + * third extended-fs super-block data in memory + */ +struct ext3cow_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3cow_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext3cow_reserve_window_node s_rsv_window_head; + + /* For versioning -znjp */ + u32 s_epoch_number; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif +}; + +#endif /* _LINUX_EXT3COW_FS_SB */ diff -ruN linux-2.6.20.3/include/linux/ext3cow_jbd.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_jbd.h --- linux-2.6.20.3/include/linux/ext3cow_jbd.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_jbd.h 2008-03-09 11:10:56.000000000 -0400 @@ -0,0 +1,226 @@ +/* + * linux/include/linux/ext3cow_jbd.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext3-specific journaling extensions. + */ + +#ifndef _LINUX_EXT3COW_JBD_H +#define _LINUX_EXT3COW_JBD_H + +#include +#include +#include + +#define EXT3COW_JOURNAL(inode) (EXT3COW_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + +#define EXT3COW_SINGLEDATA_TRANS_BLOCKS 8U + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT3COW_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT3COW_DATA_TRANS_BLOCKS(sb) (EXT3COW_SINGLEDATA_TRANS_BLOCKS + \ + EXT3COW_XATTR_TRANS_BLOCKS - 2 + \ + 2*EXT3COW_QUOTA_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT3COW_DELETE_TRANS_BLOCKS(sb) (2 * EXT3COW_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT3COW_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT3COW_RESERVE_TRANS_BLOCKS 12U + +#define EXT3COW_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT3COW_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT3COW_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT3COW_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT3COW_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT3COW_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT3COW_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT3COW_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT3COW_QUOTA_DEL_BLOCKS(sb) 0 +#endif + +int +ext3cow_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3cow_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext3cow_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3cow_iloc *iloc); + +int ext3cow_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext3cow calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3cow can control + * ext2 filesystems, so ext2+ext3cow systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext3cow_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + journal_release_buffer(handle, bh); +} + +void ext3cow_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext3cow_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3cow_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3cow_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext3cow_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh); + +int __ext3cow_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext3cow_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext3cow_journal_get_undo_access(handle, bh) \ + __ext3cow_journal_get_undo_access(__FUNCTION__, (handle), (bh)) +#define ext3cow_journal_get_write_access(handle, bh) \ + __ext3cow_journal_get_write_access(__FUNCTION__, (handle), (bh)) +#define ext3cow_journal_revoke(handle, blocknr, bh) \ + __ext3cow_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) +#define ext3cow_journal_get_create_access(handle, bh) \ + __ext3cow_journal_get_create_access(__FUNCTION__, (handle), (bh)) +#define ext3cow_journal_dirty_metadata(handle, bh) \ + __ext3cow_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) +#define ext3cow_journal_forget(handle, bh) \ + __ext3cow_journal_forget(__FUNCTION__, (handle), (bh)) + +int ext3cow_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext3cow_journal_start_sb(struct super_block *sb, int nblocks); +int __ext3cow_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext3cow_journal_start(struct inode *inode, int nblocks) +{ + return ext3cow_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext3cow_journal_stop(handle) \ + __ext3cow_journal_stop(__FUNCTION__, (handle)) + +static inline handle_t *ext3cow_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext3cow_journal_extend(handle_t *handle, int nblocks) +{ + return journal_extend(handle, nblocks); +} + +static inline int ext3cow_journal_restart(handle_t *handle, int nblocks) +{ + return journal_restart(handle, nblocks); +} + +static inline int ext3cow_journal_blocks_per_page(struct inode *inode) +{ + return journal_blocks_per_page(inode); +} + +static inline int ext3cow_journal_force_commit(journal_t *journal) +{ + return journal_force_commit(journal); +} + +/* super.c */ +int ext3cow_force_commit(struct super_block *sb); + +static inline int ext3cow_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA) + return 1; + if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext3cow_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext3cow_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#endif /* _LINUX_EXT3COW_JBD_H */ diff -ruN linux-2.6.20.3/include/linux/magic.h linux-2.6.20.3-ext3cow/include/linux/magic.h --- linux-2.6.20.3/include/linux/magic.h 2007-03-13 14:27:08.000000000 -0400 +++ linux-2.6.20.3-ext3cow/include/linux/magic.h 2008-03-09 11:10:57.000000000 -0400 @@ -9,6 +9,7 @@ #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 #define EXT3_SUPER_MAGIC 0xEF53 +#define EXT3COW_SUPER_MAGIC 0xEF53 #define EXT4_SUPER_MAGIC 0xEF53 #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660