/* * linux/fs/ext4/indirect.c * * from * * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie * (sct@redhat.com), 1993, 1998 */ #include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. * * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { bh = sb_getblk(sb, le32_to_cpu(p->key)); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function find the preferred place for block allocation, * returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet * then it's clear blocks on that path have not allocated */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction * @inode: owner * @indirect_blks: number of allocated indirect blocks * @blks: number of allocated direct blocks * @goal: preferred place for allocation * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value the from failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, int indirect_blks, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { struct ext4_allocation_request ar; struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; /* * Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.len = *blks; ar.logical = iblock; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { ar.goal = goal; new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); } else goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &err); if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar.len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto failed; } *blks = ar.len; return 0; failed: for (; i >= 0; i--) { if (i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], (i == indirect_blks) ? ar.len : 1, 0); } return err; } /** * ext4_splice_branch - splice the allocated branch onto inode. * @handle: handle for this transaction * @inode: owner * @block: (logical) number of block we are adding * @chain: chain of indirect blocks (with a missing link - see * ext4_alloc_branch) * @where: location of missing link * @num: number of indirect blocks we are adding * @blks: number of direct blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct inode *inode, ext4_lblk_t block, Indirect *where, int num, int blks) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, where->bh); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ if (num == 0 && blks > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ ext4_mark_inode_dirty(handle, inode); jbd_debug(5, "splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), blks, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_get_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; ext4_fsblk_t goal; int indirect_blks; int blocks_to_boundary = 0; int depth; int count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /*map more blocks*/ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) goto cleanup; /* * Okay, we need to do block allocation. */ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); return -ENOSPC; } goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ count = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, &count, goal, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, inode, map->m_lblk, partial, indirect_blks, count); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * O_DIRECT for ext3 (or indirect map) based files * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size * if the machine crashes during the write. * * If the O_DIRECT write is intantiating holes inside i_size and the machine * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle; ssize_t ret; int orphan = 0; size_t count = iov_length(iov, nr_segs); int retries = 0; if (rw == WRITE) { loff_t final_size = offset + count; if (final_size > inode->i_size) { /* Credits for sb + inode write */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } orphan = 1; ei->i_disksize = inode->i_size; ext4_journal_stop(handle); } } retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { /* * Nolock dioread optimization may be dynamically disabled * via ext4_inode_block_unlocked_dio(). Check inode's state * while holding extra i_dio_count ref. */ atomic_inc(&inode->i_dio_count); smp_mb(); if (unlikely(ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))) { inode_dio_done(inode); goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) ext4_truncate_failed_write(inode); } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (orphan) { int err; /* Credits for sb + inode write */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data * but cannot extend i_size. Bail out and pretend * the write failed... */ ret = PTR_ERR(handle); if (inode->i_nlink) ext4_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) ext4_orphan_del(handle, inode); if (ret > 0) { loff_t end = offset + ret; if (end > inode->i_size) { ei->i_disksize = end; i_size_write(inode, end); /* * We're going to return a positive `ret' * here due to non-zero-length I/O, so there's * no way of reporting error returns from * ext4_mark_inode_dirty() to userspace. So * ignore it. */ ext4_mark_inode_dirty(handle, inode); } } err = ext4_journal_stop(handle); if (ret == 0) ret = err; } out: return ret; } /* * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file */ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); int blk_bits; if (lblock < EXT4_NDIR_BLOCKS) return 0; lblock -= EXT4_NDIR_BLOCKS; if (ei->i_da_metadata_calc_len && (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { ei->i_da_metadata_calc_len++; return 0; } ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; ei->i_da_metadata_calc_len = 1; blk_bits = order_base_2(lblock); return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. */ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) { if (!ext4_handle_valid(handle)) return 0; if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) return 0; return 1; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto out_err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) goto out_err; err = ext4_truncate_restart_trans(handle, inode, ext4_blocks_for_truncate(inode)); if (unlikely(err)) goto out_err; if (bh) { BUFFER_TRACE(bh, "retaking write access"); err = ext4_journal_get_write_access(handle, bh); if (unlikely(err)) goto out_err; } } for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, this_bh); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * _bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = sb_bread(inode->i_sb, nr); /* * A read failure? Report error and clear slot * (should be rare). */ if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, nr, "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. */ if (ext4_handle_is_aborted(handle)) return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); ext4_truncate_restart_trans(handle, inode, ext4_blocks_for_truncate(inode)); } /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, parent_bh)){ *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. */ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } case EXT4_TIND_BLOCK: ; } } static int free_hole_blocks(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *i_data, int level, ext4_lblk_t first, ext4_lblk_t count, int max) { struct buffer_head *bh = NULL; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ret = 0; int i, inc; ext4_lblk_t offset; __le32 blk; inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { if (offset >= count + first) break; if (*i_data == 0 || (offset + inc) <= first) continue; blk = *i_data; if (level > 0) { ext4_lblk_t first2; bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } first2 = (first > offset) ? first - offset : 0; ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, first2, count - offset, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); goto err; } } if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); *i_data = 0; } brelse(bh); bh = NULL; } err: return ret; } int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t first, ext4_lblk_t stop) { int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int level, ret = 0; int num = EXT4_NDIR_BLOCKS; ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; __le32 *i_data = EXT4_I(inode)->i_data; count = stop - first; for (level = 0; level < 4; level++, max *= addr_per_block) { if (first < max) { ret = free_hole_blocks(handle, inode, NULL, i_data, level, first, count, num); if (ret) goto err; if (count > max - first) count -= max - first; else break; first = 0; } else { first -= max; } i_data += num; if (level == 0) { num = 1; max = 1; } } err: return ret; }