Showing error 1359

User: Jiri Slaby
Error type: Leaving function in locked state
Error type description: A lock acquired in the function is not released on every exit path, so the function can return with the lock still held (the lock is leaked)
File location: fs/ocfs2/alloc.c
Line in file: 6949
Project: Linux Kernel
Project version: 2.6.28
Tools: Stanse (1.2)
Entered: 2012-05-21 20:30:05 UTC
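
To illustrate the error class for readers who do not have the full file at hand: the sketch below is a minimal, hypothetical example of the "Leaving function in locked state" pattern, written against the ip_lock spinlock that appears elsewhere in this file. It is not the code actually flagged at line 6949, which lies beyond the excerpt shown under "Source:" below.

    /*
     * Hypothetical sketch only -- not taken from alloc.c. The early
     * error return exits the function while ip_lock is still held,
     * which is exactly the kind of leaked lock Stanse reports.
     */
    static int example_set_clusters(struct inode *inode, u32 clusters)
    {
            spin_lock(&OCFS2_I(inode)->ip_lock);

            if (clusters == 0)
                    return -EINVAL; /* BUG: returns with ip_lock held */

            OCFS2_I(inode)->ip_clusters = clusters;
            spin_unlock(&OCFS2_I(inode)->ip_lock);

            return 0;
    }

The usual fix is the idiom this file already uses for other resources (see the bail:/out: labels in the listing): route every error path through a single unlock label, e.g. set the return code and "goto out_unlock;", so the spin_unlock() runs on all paths.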


Source:

   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * alloc.c
   5 *
   6 * Extent allocs and frees
   7 *
   8 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9 *
  10 * This program is free software; you can redistribute it and/or
  11 * modify it under the terms of the GNU General Public
  12 * License as published by the Free Software Foundation; either
  13 * version 2 of the License, or (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 * General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public
  21 * License along with this program; if not, write to the
  22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23 * Boston, MA 021110-1307, USA.
  24 */
  25
  26#include <linux/fs.h>
  27#include <linux/types.h>
  28#include <linux/slab.h>
  29#include <linux/highmem.h>
  30#include <linux/swap.h>
  31
  32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
  33#include <cluster/masklog.h>
  34
  35#include "ocfs2.h"
  36
  37#include "alloc.h"
  38#include "aops.h"
  39#include "dlmglue.h"
  40#include "extent_map.h"
  41#include "inode.h"
  42#include "journal.h"
  43#include "localalloc.h"
  44#include "suballoc.h"
  45#include "sysfile.h"
  46#include "file.h"
  47#include "super.h"
  48#include "uptodate.h"
  49
  50#include "buffer_head_io.h"
  51
  52
  53/*
  54 * Operations for a specific extent tree type.
  55 *
  56 * To implement an on-disk btree (extent tree) type in ocfs2, add
  57 * an ocfs2_extent_tree_operations structure and the matching
  58 * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
  59 * for the allocation portion of the extent tree.
  60 */
  61struct ocfs2_extent_tree_operations {
  62        /*
  63         * last_eb_blk is the block number of the right most leaf extent
  64         * block.  Most on-disk structures containing an extent tree store
  65         * this value for fast access.  The ->eo_set_last_eb_blk() and
  66         * ->eo_get_last_eb_blk() operations access this value.  They are
  67         *  both required.
  68         */
  69        void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
  70                                   u64 blkno);
  71        u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
  72
  73        /*
  74         * The on-disk structure usually keeps track of how many total
  75         * clusters are stored in this extent tree.  This function updates
  76         * that value.  new_clusters is the delta, and must be
  77         * added to the total.  Required.
  78         */
  79        void (*eo_update_clusters)(struct inode *inode,
  80                                   struct ocfs2_extent_tree *et,
  81                                   u32 new_clusters);
  82
  83        /*
  84         * If ->eo_insert_check() exists, it is called before rec is
  85         * inserted into the extent tree.  It is optional.
  86         */
  87        int (*eo_insert_check)(struct inode *inode,
  88                               struct ocfs2_extent_tree *et,
  89                               struct ocfs2_extent_rec *rec);
  90        int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
  91
  92        /*
  93         * --------------------------------------------------------------
  94         * The remaining are internal to ocfs2_extent_tree and don't have
  95         * accessor functions
  96         */
  97
  98        /*
  99         * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
 100         * It is required.
 101         */
 102        void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
 103
 104        /*
 105         * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
 106         * it exists.  If it does not, et->et_max_leaf_clusters is set
 107         * to 0 (unlimited).  Optional.
 108         */
 109        void (*eo_fill_max_leaf_clusters)(struct inode *inode,
 110                                          struct ocfs2_extent_tree *et);
 111};
 112
 113
 114/*
 115 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
 116 * in the methods.
 117 */
 118static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
 119static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 120                                         u64 blkno);
 121static void ocfs2_dinode_update_clusters(struct inode *inode,
 122                                         struct ocfs2_extent_tree *et,
 123                                         u32 clusters);
 124static int ocfs2_dinode_insert_check(struct inode *inode,
 125                                     struct ocfs2_extent_tree *et,
 126                                     struct ocfs2_extent_rec *rec);
 127static int ocfs2_dinode_sanity_check(struct inode *inode,
 128                                     struct ocfs2_extent_tree *et);
 129static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
 130static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 131        .eo_set_last_eb_blk        = ocfs2_dinode_set_last_eb_blk,
 132        .eo_get_last_eb_blk        = ocfs2_dinode_get_last_eb_blk,
 133        .eo_update_clusters        = ocfs2_dinode_update_clusters,
 134        .eo_insert_check        = ocfs2_dinode_insert_check,
 135        .eo_sanity_check        = ocfs2_dinode_sanity_check,
 136        .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
 137};
 138
 139static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 140                                         u64 blkno)
 141{
 142        struct ocfs2_dinode *di = et->et_object;
 143
 144        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 145        di->i_last_eb_blk = cpu_to_le64(blkno);
 146}
 147
 148static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
 149{
 150        struct ocfs2_dinode *di = et->et_object;
 151
 152        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 153        return le64_to_cpu(di->i_last_eb_blk);
 154}
 155
 156static void ocfs2_dinode_update_clusters(struct inode *inode,
 157                                         struct ocfs2_extent_tree *et,
 158                                         u32 clusters)
 159{
 160        struct ocfs2_dinode *di = et->et_object;
 161
 162        le32_add_cpu(&di->i_clusters, clusters);
 163        spin_lock(&OCFS2_I(inode)->ip_lock);
 164        OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
 165        spin_unlock(&OCFS2_I(inode)->ip_lock);
 166}
 167
 168static int ocfs2_dinode_insert_check(struct inode *inode,
 169                                     struct ocfs2_extent_tree *et,
 170                                     struct ocfs2_extent_rec *rec)
 171{
 172        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 173
 174        BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 175        mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
 176                        (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
 177                        "Device %s, asking for sparse allocation: inode %llu, "
 178                        "cpos %u, clusters %u\n",
 179                        osb->dev_str,
 180                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 181                        rec->e_cpos,
 182                        OCFS2_I(inode)->ip_clusters);
 183
 184        return 0;
 185}
 186
 187static int ocfs2_dinode_sanity_check(struct inode *inode,
 188                                     struct ocfs2_extent_tree *et)
 189{
 190        int ret = 0;
 191        struct ocfs2_dinode *di;
 192
 193        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 194
 195        di = et->et_object;
 196        if (!OCFS2_IS_VALID_DINODE(di)) {
 197                ret = -EIO;
 198                ocfs2_error(inode->i_sb,
 199                        "Inode %llu has invalid path root",
 200                        (unsigned long long)OCFS2_I(inode)->ip_blkno);
 201        }
 202
 203        return ret;
 204}
 205
 206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 207{
 208        struct ocfs2_dinode *di = et->et_object;
 209
 210        et->et_root_el = &di->id2.i_list;
 211}
 212
 213
 214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 215{
 216        struct ocfs2_xattr_value_root *xv = et->et_object;
 217
 218        et->et_root_el = &xv->xr_list;
 219}
 220
 221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 222                                              u64 blkno)
 223{
 224        struct ocfs2_xattr_value_root *xv =
 225                (struct ocfs2_xattr_value_root *)et->et_object;
 226
 227        xv->xr_last_eb_blk = cpu_to_le64(blkno);
 228}
 229
 230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 231{
 232        struct ocfs2_xattr_value_root *xv =
 233                (struct ocfs2_xattr_value_root *) et->et_object;
 234
 235        return le64_to_cpu(xv->xr_last_eb_blk);
 236}
 237
 238static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 239                                              struct ocfs2_extent_tree *et,
 240                                              u32 clusters)
 241{
 242        struct ocfs2_xattr_value_root *xv =
 243                (struct ocfs2_xattr_value_root *)et->et_object;
 244
 245        le32_add_cpu(&xv->xr_clusters, clusters);
 246}
 247
 248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
 249        .eo_set_last_eb_blk        = ocfs2_xattr_value_set_last_eb_blk,
 250        .eo_get_last_eb_blk        = ocfs2_xattr_value_get_last_eb_blk,
 251        .eo_update_clusters        = ocfs2_xattr_value_update_clusters,
 252        .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
 253};
 254
 255static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
 256{
 257        struct ocfs2_xattr_block *xb = et->et_object;
 258
 259        et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
 260}
 261
 262static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
 263                                                    struct ocfs2_extent_tree *et)
 264{
 265        et->et_max_leaf_clusters =
 266                ocfs2_clusters_for_bytes(inode->i_sb,
 267                                         OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 268}
 269
 270static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 271                                             u64 blkno)
 272{
 273        struct ocfs2_xattr_block *xb = et->et_object;
 274        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 275
 276        xt->xt_last_eb_blk = cpu_to_le64(blkno);
 277}
 278
 279static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
 280{
 281        struct ocfs2_xattr_block *xb = et->et_object;
 282        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 283
 284        return le64_to_cpu(xt->xt_last_eb_blk);
 285}
 286
 287static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
 288                                             struct ocfs2_extent_tree *et,
 289                                             u32 clusters)
 290{
 291        struct ocfs2_xattr_block *xb = et->et_object;
 292
 293        le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 294}
 295
 296static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 297        .eo_set_last_eb_blk        = ocfs2_xattr_tree_set_last_eb_blk,
 298        .eo_get_last_eb_blk        = ocfs2_xattr_tree_get_last_eb_blk,
 299        .eo_update_clusters        = ocfs2_xattr_tree_update_clusters,
 300        .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
 301        .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 302};
 303
 304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 305                                     struct inode *inode,
 306                                     struct buffer_head *bh,
 307                                     void *obj,
 308                                     struct ocfs2_extent_tree_operations *ops)
 309{
 310        et->et_ops = ops;
 311        et->et_root_bh = bh;
 312        if (!obj)
 313                obj = (void *)bh->b_data;
 314        et->et_object = obj;
 315
 316        et->et_ops->eo_fill_root_el(et);
 317        if (!et->et_ops->eo_fill_max_leaf_clusters)
 318                et->et_max_leaf_clusters = 0;
 319        else
 320                et->et_ops->eo_fill_max_leaf_clusters(inode, et);
 321}
 322
 323void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 324                                   struct inode *inode,
 325                                   struct buffer_head *bh)
 326{
 327        __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
 328}
 329
 330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 331                                       struct inode *inode,
 332                                       struct buffer_head *bh)
 333{
 334        __ocfs2_init_extent_tree(et, inode, bh, NULL,
 335                                 &ocfs2_xattr_tree_et_ops);
 336}
 337
 338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 339                                        struct inode *inode,
 340                                        struct buffer_head *bh,
 341                                        struct ocfs2_xattr_value_root *xv)
 342{
 343        __ocfs2_init_extent_tree(et, inode, bh, xv,
 344                                 &ocfs2_xattr_value_et_ops);
 345}
 346
 347static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 348                                            u64 new_last_eb_blk)
 349{
 350        et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
 351}
 352
 353static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
 354{
 355        return et->et_ops->eo_get_last_eb_blk(et);
 356}
 357
 358static inline void ocfs2_et_update_clusters(struct inode *inode,
 359                                            struct ocfs2_extent_tree *et,
 360                                            u32 clusters)
 361{
 362        et->et_ops->eo_update_clusters(inode, et, clusters);
 363}
 364
 365static inline int ocfs2_et_insert_check(struct inode *inode,
 366                                        struct ocfs2_extent_tree *et,
 367                                        struct ocfs2_extent_rec *rec)
 368{
 369        int ret = 0;
 370
 371        if (et->et_ops->eo_insert_check)
 372                ret = et->et_ops->eo_insert_check(inode, et, rec);
 373        return ret;
 374}
 375
 376static inline int ocfs2_et_sanity_check(struct inode *inode,
 377                                        struct ocfs2_extent_tree *et)
 378{
 379        int ret = 0;
 380
 381        if (et->et_ops->eo_sanity_check)
 382                ret = et->et_ops->eo_sanity_check(inode, et);
 383        return ret;
 384}
 385
 386static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 387static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 388                                         struct ocfs2_extent_block *eb);
 389
 390/*
 391 * Structures which describe a path through a btree, and functions to
 392 * manipulate them.
 393 *
 394 * The idea here is to be as generic as possible with the tree
 395 * manipulation code.
 396 */
 397struct ocfs2_path_item {
 398        struct buffer_head                *bh;
 399        struct ocfs2_extent_list        *el;
 400};
 401
 402#define OCFS2_MAX_PATH_DEPTH        5
 403
 404struct ocfs2_path {
 405        int                        p_tree_depth;
 406        struct ocfs2_path_item        p_node[OCFS2_MAX_PATH_DEPTH];
 407};
 408
 409#define path_root_bh(_path) ((_path)->p_node[0].bh)
 410#define path_root_el(_path) ((_path)->p_node[0].el)
 411#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 412#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 413#define path_num_items(_path) ((_path)->p_tree_depth + 1)
 414
 415/*
 416 * Reset the actual path elements so that we can re-use the structure
 417 * to build another path. Generally, this involves freeing the buffer
 418 * heads.
 419 */
 420static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 421{
 422        int i, start = 0, depth = 0;
 423        struct ocfs2_path_item *node;
 424
 425        if (keep_root)
 426                start = 1;
 427
 428        for(i = start; i < path_num_items(path); i++) {
 429                node = &path->p_node[i];
 430
 431                brelse(node->bh);
 432                node->bh = NULL;
 433                node->el = NULL;
 434        }
 435
 436        /*
 437         * Tree depth may change during truncate, or insert. If we're
 438         * keeping the root extent list, then make sure that our path
 439         * structure reflects the proper depth.
 440         */
 441        if (keep_root)
 442                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
 443
 444        path->p_tree_depth = depth;
 445}
 446
 447static void ocfs2_free_path(struct ocfs2_path *path)
 448{
 449        if (path) {
 450                ocfs2_reinit_path(path, 0);
 451                kfree(path);
 452        }
 453}
 454
 455/*
  456 * Copy all the elements of src into dest. After this call, src could be freed
 457 * without affecting dest.
 458 *
 459 * Both paths should have the same root. Any non-root elements of dest
 460 * will be freed.
 461 */
 462static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 463{
 464        int i;
 465
 466        BUG_ON(path_root_bh(dest) != path_root_bh(src));
 467        BUG_ON(path_root_el(dest) != path_root_el(src));
 468
 469        ocfs2_reinit_path(dest, 1);
 470
 471        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 472                dest->p_node[i].bh = src->p_node[i].bh;
 473                dest->p_node[i].el = src->p_node[i].el;
 474
 475                if (dest->p_node[i].bh)
 476                        get_bh(dest->p_node[i].bh);
 477        }
 478}
 479
 480/*
 481 * Make the *dest path the same as src and re-initialize src path to
 482 * have a root only.
 483 */
 484static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 485{
 486        int i;
 487
 488        BUG_ON(path_root_bh(dest) != path_root_bh(src));
 489
 490        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 491                brelse(dest->p_node[i].bh);
 492
 493                dest->p_node[i].bh = src->p_node[i].bh;
 494                dest->p_node[i].el = src->p_node[i].el;
 495
 496                src->p_node[i].bh = NULL;
 497                src->p_node[i].el = NULL;
 498        }
 499}
 500
 501/*
 502 * Insert an extent block at given index.
 503 *
 504 * This will not take an additional reference on eb_bh.
 505 */
 506static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 507                                        struct buffer_head *eb_bh)
 508{
 509        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
 510
 511        /*
 512         * Right now, no root bh is an extent block, so this helps
 513         * catch code errors with dinode trees. The assertion can be
 514         * safely removed if we ever need to insert extent block
 515         * structures at the root.
 516         */
 517        BUG_ON(index == 0);
 518
 519        path->p_node[index].bh = eb_bh;
 520        path->p_node[index].el = &eb->h_list;
 521}
 522
 523static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 524                                         struct ocfs2_extent_list *root_el)
 525{
 526        struct ocfs2_path *path;
 527
 528        BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
 529
 530        path = kzalloc(sizeof(*path), GFP_NOFS);
 531        if (path) {
 532                path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
 533                get_bh(root_bh);
 534                path_root_bh(path) = root_bh;
 535                path_root_el(path) = root_el;
 536        }
 537
 538        return path;
 539}
 540
 541/*
 542 * Convenience function to journal all components in a path.
 543 */
 544static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
 545                                     struct ocfs2_path *path)
 546{
 547        int i, ret = 0;
 548
 549        if (!path)
 550                goto out;
 551
 552        for(i = 0; i < path_num_items(path); i++) {
 553                ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
 554                                           OCFS2_JOURNAL_ACCESS_WRITE);
 555                if (ret < 0) {
 556                        mlog_errno(ret);
 557                        goto out;
 558                }
 559        }
 560
 561out:
 562        return ret;
 563}
 564
 565/*
 566 * Return the index of the extent record which contains cluster #v_cluster.
 567 * -1 is returned if it was not found.
 568 *
 569 * Should work fine on interior and exterior nodes.
 570 */
 571int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
 572{
 573        int ret = -1;
 574        int i;
 575        struct ocfs2_extent_rec *rec;
 576        u32 rec_end, rec_start, clusters;
 577
 578        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 579                rec = &el->l_recs[i];
 580
 581                rec_start = le32_to_cpu(rec->e_cpos);
 582                clusters = ocfs2_rec_clusters(el, rec);
 583
 584                rec_end = rec_start + clusters;
 585
 586                if (v_cluster >= rec_start && v_cluster < rec_end) {
 587                        ret = i;
 588                        break;
 589                }
 590        }
 591
 592        return ret;
 593}
 594
 595enum ocfs2_contig_type {
 596        CONTIG_NONE = 0,
 597        CONTIG_LEFT,
 598        CONTIG_RIGHT,
 599        CONTIG_LEFTRIGHT,
 600};
 601
 602
 603/*
 604 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 605 * ocfs2_extent_contig only work properly against leaf nodes!
 606 */
 607static int ocfs2_block_extent_contig(struct super_block *sb,
 608                                     struct ocfs2_extent_rec *ext,
 609                                     u64 blkno)
 610{
 611        u64 blk_end = le64_to_cpu(ext->e_blkno);
 612
 613        blk_end += ocfs2_clusters_to_blocks(sb,
 614                                    le16_to_cpu(ext->e_leaf_clusters));
 615
 616        return blkno == blk_end;
 617}
 618
 619static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
 620                                  struct ocfs2_extent_rec *right)
 621{
 622        u32 left_range;
 623
 624        left_range = le32_to_cpu(left->e_cpos) +
 625                le16_to_cpu(left->e_leaf_clusters);
 626
 627        return (left_range == le32_to_cpu(right->e_cpos));
 628}
 629
 630static enum ocfs2_contig_type
 631        ocfs2_extent_contig(struct inode *inode,
 632                            struct ocfs2_extent_rec *ext,
 633                            struct ocfs2_extent_rec *insert_rec)
 634{
 635        u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 636
 637        /*
 638         * Refuse to coalesce extent records with different flag
 639         * fields - we don't want to mix unwritten extents with user
 640         * data.
 641         */
 642        if (ext->e_flags != insert_rec->e_flags)
 643                return CONTIG_NONE;
 644
 645        if (ocfs2_extents_adjacent(ext, insert_rec) &&
 646            ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
 647                        return CONTIG_RIGHT;
 648
 649        blkno = le64_to_cpu(ext->e_blkno);
 650        if (ocfs2_extents_adjacent(insert_rec, ext) &&
 651            ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
 652                return CONTIG_LEFT;
 653
 654        return CONTIG_NONE;
 655}
 656
 657/*
 658 * NOTE: We can have pretty much any combination of contiguousness and
 659 * appending.
 660 *
 661 * The usefulness of APPEND_TAIL is more in that it lets us know that
 662 * we'll have to update the path to that leaf.
 663 */
 664enum ocfs2_append_type {
 665        APPEND_NONE = 0,
 666        APPEND_TAIL,
 667};
 668
 669enum ocfs2_split_type {
 670        SPLIT_NONE = 0,
 671        SPLIT_LEFT,
 672        SPLIT_RIGHT,
 673};
 674
 675struct ocfs2_insert_type {
 676        enum ocfs2_split_type        ins_split;
 677        enum ocfs2_append_type        ins_appending;
 678        enum ocfs2_contig_type        ins_contig;
 679        int                        ins_contig_index;
 680        int                        ins_tree_depth;
 681};
 682
 683struct ocfs2_merge_ctxt {
 684        enum ocfs2_contig_type        c_contig_type;
 685        int                        c_has_empty_extent;
 686        int                        c_split_covers_rec;
 687};
 688
 689/*
 690 * How many free extents have we got before we need more meta data?
 691 */
 692int ocfs2_num_free_extents(struct ocfs2_super *osb,
 693                           struct inode *inode,
 694                           struct ocfs2_extent_tree *et)
 695{
 696        int retval;
 697        struct ocfs2_extent_list *el = NULL;
 698        struct ocfs2_extent_block *eb;
 699        struct buffer_head *eb_bh = NULL;
 700        u64 last_eb_blk = 0;
 701
 702        mlog_entry_void();
 703
 704        el = et->et_root_el;
 705        last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 706
 707        if (last_eb_blk) {
 708                retval = ocfs2_read_block(inode, last_eb_blk,
 709                                          &eb_bh);
 710                if (retval < 0) {
 711                        mlog_errno(retval);
 712                        goto bail;
 713                }
 714                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 715                el = &eb->h_list;
 716        }
 717
 718        BUG_ON(el->l_tree_depth != 0);
 719
 720        retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 721bail:
 722        brelse(eb_bh);
 723
 724        mlog_exit(retval);
 725        return retval;
 726}
 727
 728/* expects array to already be allocated
 729 *
 730 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 731 * l_count for you
 732 */
 733static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 734                                     handle_t *handle,
 735                                     struct inode *inode,
 736                                     int wanted,
 737                                     struct ocfs2_alloc_context *meta_ac,
 738                                     struct buffer_head *bhs[])
 739{
 740        int count, status, i;
 741        u16 suballoc_bit_start;
 742        u32 num_got;
 743        u64 first_blkno;
 744        struct ocfs2_extent_block *eb;
 745
 746        mlog_entry_void();
 747
 748        count = 0;
 749        while (count < wanted) {
 750                status = ocfs2_claim_metadata(osb,
 751                                              handle,
 752                                              meta_ac,
 753                                              wanted - count,
 754                                              &suballoc_bit_start,
 755                                              &num_got,
 756                                              &first_blkno);
 757                if (status < 0) {
 758                        mlog_errno(status);
 759                        goto bail;
 760                }
 761
 762                for(i = count;  i < (num_got + count); i++) {
 763                        bhs[i] = sb_getblk(osb->sb, first_blkno);
 764                        if (bhs[i] == NULL) {
 765                                status = -EIO;
 766                                mlog_errno(status);
 767                                goto bail;
 768                        }
 769                        ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 770
 771                        status = ocfs2_journal_access(handle, inode, bhs[i],
 772                                                      OCFS2_JOURNAL_ACCESS_CREATE);
 773                        if (status < 0) {
 774                                mlog_errno(status);
 775                                goto bail;
 776                        }
 777
 778                        memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
 779                        eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
 780                        /* Ok, setup the minimal stuff here. */
 781                        strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 782                        eb->h_blkno = cpu_to_le64(first_blkno);
 783                        eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
 784                        eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
 785                        eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 786                        eb->h_list.l_count =
 787                                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
 788
 789                        suballoc_bit_start++;
 790                        first_blkno++;
 791
 792                        /* We'll also be dirtied by the caller, so
 793                         * this isn't absolutely necessary. */
 794                        status = ocfs2_journal_dirty(handle, bhs[i]);
 795                        if (status < 0) {
 796                                mlog_errno(status);
 797                                goto bail;
 798                        }
 799                }
 800
 801                count += num_got;
 802        }
 803
 804        status = 0;
 805bail:
 806        if (status < 0) {
 807                for(i = 0; i < wanted; i++) {
 808                        brelse(bhs[i]);
 809                        bhs[i] = NULL;
 810                }
 811        }
 812        mlog_exit(status);
 813        return status;
 814}
 815
 816/*
 817 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 818 *
 819 * Returns the sum of the rightmost extent rec logical offset and
 820 * cluster count.
 821 *
 822 * ocfs2_add_branch() uses this to determine what logical cluster
 823 * value should be populated into the leftmost new branch records.
 824 *
 825 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 826 * value for the new topmost tree record.
 827 */
 828static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 829{
 830        int i;
 831
 832        i = le16_to_cpu(el->l_next_free_rec) - 1;
 833
 834        return le32_to_cpu(el->l_recs[i].e_cpos) +
 835                ocfs2_rec_clusters(el, &el->l_recs[i]);
 836}
 837
 838/*
 839 * Add an entire tree branch to our inode. eb_bh is the extent block
 840 * to start at, if we don't want to start the branch at the dinode
 841 * structure.
 842 *
  843 * last_eb_bh is required as we have to update its next_leaf pointer
 844 * for the new last extent block.
 845 *
 846 * the new branch will be 'empty' in the sense that every block will
 847 * contain a single record with cluster count == 0.
 848 */
 849static int ocfs2_add_branch(struct ocfs2_super *osb,
 850                            handle_t *handle,
 851                            struct inode *inode,
 852                            struct ocfs2_extent_tree *et,
 853                            struct buffer_head *eb_bh,
 854                            struct buffer_head **last_eb_bh,
 855                            struct ocfs2_alloc_context *meta_ac)
 856{
 857        int status, new_blocks, i;
 858        u64 next_blkno, new_last_eb_blk;
 859        struct buffer_head *bh;
 860        struct buffer_head **new_eb_bhs = NULL;
 861        struct ocfs2_extent_block *eb;
 862        struct ocfs2_extent_list  *eb_el;
 863        struct ocfs2_extent_list  *el;
 864        u32 new_cpos;
 865
 866        mlog_entry_void();
 867
 868        BUG_ON(!last_eb_bh || !*last_eb_bh);
 869
 870        if (eb_bh) {
 871                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 872                el = &eb->h_list;
 873        } else
 874                el = et->et_root_el;
 875
 876        /* we never add a branch to a leaf. */
 877        BUG_ON(!el->l_tree_depth);
 878
 879        new_blocks = le16_to_cpu(el->l_tree_depth);
 880
 881        /* allocate the number of new eb blocks we need */
 882        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
 883                             GFP_KERNEL);
 884        if (!new_eb_bhs) {
 885                status = -ENOMEM;
 886                mlog_errno(status);
 887                goto bail;
 888        }
 889
 890        status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
 891                                           meta_ac, new_eb_bhs);
 892        if (status < 0) {
 893                mlog_errno(status);
 894                goto bail;
 895        }
 896
 897        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
 898        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
 899
 900        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 901         * linked with the rest of the tree.
  902         * conversely, new_eb_bhs[0] is the new bottommost leaf.
 903         *
 904         * when we leave the loop, new_last_eb_blk will point to the
 905         * newest leaf, and next_blkno will point to the topmost extent
 906         * block. */
 907        next_blkno = new_last_eb_blk = 0;
 908        for(i = 0; i < new_blocks; i++) {
 909                bh = new_eb_bhs[i];
 910                eb = (struct ocfs2_extent_block *) bh->b_data;
 911                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 912                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 913                        status = -EIO;
 914                        goto bail;
 915                }
 916                eb_el = &eb->h_list;
 917
 918                status = ocfs2_journal_access(handle, inode, bh,
 919                                              OCFS2_JOURNAL_ACCESS_CREATE);
 920                if (status < 0) {
 921                        mlog_errno(status);
 922                        goto bail;
 923                }
 924
 925                eb->h_next_leaf_blk = 0;
 926                eb_el->l_tree_depth = cpu_to_le16(i);
 927                eb_el->l_next_free_rec = cpu_to_le16(1);
 928                /*
 929                 * This actually counts as an empty extent as
 930                 * c_clusters == 0
 931                 */
 932                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
 933                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
 934                /*
 935                 * eb_el isn't always an interior node, but even leaf
 936                 * nodes want a zero'd flags and reserved field so
 937                 * this gets the whole 32 bits regardless of use.
 938                 */
 939                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
 940                if (!eb_el->l_tree_depth)
 941                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 942
 943                status = ocfs2_journal_dirty(handle, bh);
 944                if (status < 0) {
 945                        mlog_errno(status);
 946                        goto bail;
 947                }
 948
 949                next_blkno = le64_to_cpu(eb->h_blkno);
 950        }
 951
 952        /* This is a bit hairy. We want to update up to three blocks
 953         * here without leaving any of them in an inconsistent state
 954         * in case of error. We don't have to worry about
 955         * journal_dirty erroring as it won't unless we've aborted the
 956         * handle (in which case we would never be here) so reserving
 957         * the write with journal_access is all we need to do. */
 958        status = ocfs2_journal_access(handle, inode, *last_eb_bh,
 959                                      OCFS2_JOURNAL_ACCESS_WRITE);
 960        if (status < 0) {
 961                mlog_errno(status);
 962                goto bail;
 963        }
 964        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 965                                      OCFS2_JOURNAL_ACCESS_WRITE);
 966        if (status < 0) {
 967                mlog_errno(status);
 968                goto bail;
 969        }
 970        if (eb_bh) {
 971                status = ocfs2_journal_access(handle, inode, eb_bh,
 972                                              OCFS2_JOURNAL_ACCESS_WRITE);
 973                if (status < 0) {
 974                        mlog_errno(status);
 975                        goto bail;
 976                }
 977        }
 978
 979        /* Link the new branch into the rest of the tree (el will
  980         * either be on the root_bh, or the extent block passed in). */
 981        i = le16_to_cpu(el->l_next_free_rec);
 982        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 983        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
 984        el->l_recs[i].e_int_clusters = 0;
 985        le16_add_cpu(&el->l_next_free_rec, 1);
 986
 987        /* fe needs a new last extent block pointer, as does the
 988         * next_leaf on the previously last-extent-block. */
 989        ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
 990
 991        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 992        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 993
 994        status = ocfs2_journal_dirty(handle, *last_eb_bh);
 995        if (status < 0)
 996                mlog_errno(status);
 997        status = ocfs2_journal_dirty(handle, et->et_root_bh);
 998        if (status < 0)
 999                mlog_errno(status);
1000        if (eb_bh) {
1001                status = ocfs2_journal_dirty(handle, eb_bh);
1002                if (status < 0)
1003                        mlog_errno(status);
1004        }
1005
1006        /*
1007         * Some callers want to track the rightmost leaf so pass it
1008         * back here.
1009         */
1010        brelse(*last_eb_bh);
1011        get_bh(new_eb_bhs[0]);
1012        *last_eb_bh = new_eb_bhs[0];
1013
1014        status = 0;
1015bail:
1016        if (new_eb_bhs) {
1017                for (i = 0; i < new_blocks; i++)
1018                        brelse(new_eb_bhs[i]);
1019                kfree(new_eb_bhs);
1020        }
1021
1022        mlog_exit(status);
1023        return status;
1024}
1025
1026/*
1027 * adds another level to the allocation tree.
1028 * returns back the new extent block so you can add a branch to it
1029 * after this call.
1030 */
1031static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1032                                  handle_t *handle,
1033                                  struct inode *inode,
1034                                  struct ocfs2_extent_tree *et,
1035                                  struct ocfs2_alloc_context *meta_ac,
1036                                  struct buffer_head **ret_new_eb_bh)
1037{
1038        int status, i;
1039        u32 new_clusters;
1040        struct buffer_head *new_eb_bh = NULL;
1041        struct ocfs2_extent_block *eb;
1042        struct ocfs2_extent_list  *root_el;
1043        struct ocfs2_extent_list  *eb_el;
1044
1045        mlog_entry_void();
1046
1047        status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1048                                           &new_eb_bh);
1049        if (status < 0) {
1050                mlog_errno(status);
1051                goto bail;
1052        }
1053
1054        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1055        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1056                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1057                status = -EIO;
1058                goto bail;
1059        }
1060
1061        eb_el = &eb->h_list;
1062        root_el = et->et_root_el;
1063
1064        status = ocfs2_journal_access(handle, inode, new_eb_bh,
1065                                      OCFS2_JOURNAL_ACCESS_CREATE);
1066        if (status < 0) {
1067                mlog_errno(status);
1068                goto bail;
1069        }
1070
1071        /* copy the root extent list data into the new extent block */
1072        eb_el->l_tree_depth = root_el->l_tree_depth;
1073        eb_el->l_next_free_rec = root_el->l_next_free_rec;
1074        for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1075                eb_el->l_recs[i] = root_el->l_recs[i];
1076
1077        status = ocfs2_journal_dirty(handle, new_eb_bh);
1078        if (status < 0) {
1079                mlog_errno(status);
1080                goto bail;
1081        }
1082
1083        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
1084                                      OCFS2_JOURNAL_ACCESS_WRITE);
1085        if (status < 0) {
1086                mlog_errno(status);
1087                goto bail;
1088        }
1089
1090        new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1091
1092        /* update root_bh now */
1093        le16_add_cpu(&root_el->l_tree_depth, 1);
1094        root_el->l_recs[0].e_cpos = 0;
1095        root_el->l_recs[0].e_blkno = eb->h_blkno;
1096        root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1097        for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1098                memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1099        root_el->l_next_free_rec = cpu_to_le16(1);
1100
1101        /* If this is our 1st tree depth shift, then last_eb_blk
1102         * becomes the allocated extent block */
1103        if (root_el->l_tree_depth == cpu_to_le16(1))
1104                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1105
1106        status = ocfs2_journal_dirty(handle, et->et_root_bh);
1107        if (status < 0) {
1108                mlog_errno(status);
1109                goto bail;
1110        }
1111
1112        *ret_new_eb_bh = new_eb_bh;
1113        new_eb_bh = NULL;
1114        status = 0;
1115bail:
1116        brelse(new_eb_bh);
1117
1118        mlog_exit(status);
1119        return status;
1120}
1121
1122/*
1123 * Should only be called when there is no space left in any of the
1124 * leaf nodes. What we want to do is find the lowest tree depth
1125 * non-leaf extent block with room for new records. There are three
1126 * valid results of this search:
1127 *
1128 * 1) a lowest extent block is found, then we pass it back in
1129 *    *lowest_eb_bh and return '0'
1130 *
1131 * 2) the search fails to find anything, but the root_el has room. We
1132 *    pass NULL back in *lowest_eb_bh, but still return '0'
1133 *
1134 * 3) the search fails to find anything AND the root_el is full, in
1135 *    which case we return > 0
1136 *
1137 * return status < 0 indicates an error.
1138 */
1139static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1140                                    struct inode *inode,
1141                                    struct ocfs2_extent_tree *et,
1142                                    struct buffer_head **target_bh)
1143{
1144        int status = 0, i;
1145        u64 blkno;
1146        struct ocfs2_extent_block *eb;
1147        struct ocfs2_extent_list  *el;
1148        struct buffer_head *bh = NULL;
1149        struct buffer_head *lowest_bh = NULL;
1150
1151        mlog_entry_void();
1152
1153        *target_bh = NULL;
1154
1155        el = et->et_root_el;
1156
1157        while(le16_to_cpu(el->l_tree_depth) > 1) {
1158                if (le16_to_cpu(el->l_next_free_rec) == 0) {
1159                        ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1160                                    "extent list (next_free_rec == 0)",
1161                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
1162                        status = -EIO;
1163                        goto bail;
1164                }
1165                i = le16_to_cpu(el->l_next_free_rec) - 1;
1166                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1167                if (!blkno) {
1168                        ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1169                                    "list where extent # %d has no physical "
1170                                    "block start",
1171                                    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1172                        status = -EIO;
1173                        goto bail;
1174                }
1175
1176                brelse(bh);
1177                bh = NULL;
1178
1179                status = ocfs2_read_block(inode, blkno, &bh);
1180                if (status < 0) {
1181                        mlog_errno(status);
1182                        goto bail;
1183                }
1184
1185                eb = (struct ocfs2_extent_block *) bh->b_data;
1186                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1187                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1188                        status = -EIO;
1189                        goto bail;
1190                }
1191                el = &eb->h_list;
1192
1193                if (le16_to_cpu(el->l_next_free_rec) <
1194                    le16_to_cpu(el->l_count)) {
1195                        brelse(lowest_bh);
1196                        lowest_bh = bh;
1197                        get_bh(lowest_bh);
1198                }
1199        }
1200
1201        /* If we didn't find one and the fe doesn't have any room,
1202         * then return '1' */
1203        el = et->et_root_el;
1204        if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1205                status = 1;
1206
1207        *target_bh = lowest_bh;
1208bail:
1209        brelse(bh);
1210
1211        mlog_exit(status);
1212        return status;
1213}
1214
1215/*
1216 * Grow a b-tree so that it has more records.
1217 *
1218 * We might shift the tree depth in which case existing paths should
1219 * be considered invalid.
1220 *
1221 * Tree depth after the grow is returned via *final_depth.
1222 *
1223 * *last_eb_bh will be updated by ocfs2_add_branch().
1224 */
1225static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1226                           struct ocfs2_extent_tree *et, int *final_depth,
1227                           struct buffer_head **last_eb_bh,
1228                           struct ocfs2_alloc_context *meta_ac)
1229{
1230        int ret, shift;
1231        struct ocfs2_extent_list *el = et->et_root_el;
1232        int depth = le16_to_cpu(el->l_tree_depth);
1233        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1234        struct buffer_head *bh = NULL;
1235
1236        BUG_ON(meta_ac == NULL);
1237
1238        shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1239        if (shift < 0) {
1240                ret = shift;
1241                mlog_errno(ret);
1242                goto out;
1243        }
1244
1245        /* We traveled all the way to the bottom of the allocation tree
1246         * and didn't find room for any more extents - we need to add
1247         * another tree level */
1248        if (shift) {
1249                BUG_ON(bh);
1250                mlog(0, "need to shift tree depth (current = %d)\n", depth);
1251
1252                /* ocfs2_shift_tree_depth will return us a buffer with
1253                 * the new extent block (so we can pass that to
1254                 * ocfs2_add_branch). */
1255                ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1256                                             meta_ac, &bh);
1257                if (ret < 0) {
1258                        mlog_errno(ret);
1259                        goto out;
1260                }
1261                depth++;
1262                if (depth == 1) {
1263                        /*
1264                         * Special case: we have room now if we shifted from
1265                         * tree_depth 0, so no more work needs to be done.
1266                         *
1267                         * We won't be calling add_branch, so pass
1268                         * back *last_eb_bh as the new leaf. At depth
1269                         * zero, it should always be null so there's
1270                         * no reason to brelse.
1271                         */
1272                        BUG_ON(*last_eb_bh);
1273                        get_bh(bh);
1274                        *last_eb_bh = bh;
1275                        goto out;
1276                }
1277        }
1278
1279        /* call ocfs2_add_branch to add the final part of the tree with
1280         * the new data. */
1281        mlog(0, "add branch. bh = %p\n", bh);
1282        ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1283                               meta_ac);
1284        if (ret < 0) {
1285                mlog_errno(ret);
1286                goto out;
1287        }
1288
1289out:
1290        if (final_depth)
1291                *final_depth = depth;
1292        brelse(bh);
1293        return ret;
1294}
1295
1296/*
1297 * This function will discard the rightmost extent record.
1298 */
1299static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1300{
1301        int next_free = le16_to_cpu(el->l_next_free_rec);
1302        int count = le16_to_cpu(el->l_count);
1303        unsigned int num_bytes;
1304
1305        BUG_ON(!next_free);
1306        /* This will cause us to go off the end of our extent list. */
1307        BUG_ON(next_free >= count);
1308
1309        num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1310
1311        memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1312}
1313
1314static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1315                              struct ocfs2_extent_rec *insert_rec)
1316{
1317        int i, insert_index, next_free, has_empty, num_bytes;
1318        u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1319        struct ocfs2_extent_rec *rec;
1320
1321        next_free = le16_to_cpu(el->l_next_free_rec);
1322        has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1323
1324        BUG_ON(!next_free);
1325
1326        /* The tree code before us didn't allow enough room in the leaf. */
1327        BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1328
1329        /*
1330         * The easiest way to approach this is to just remove the
1331         * empty extent and temporarily decrement next_free.
1332         */
1333        if (has_empty) {
1334                /*
1335                 * If next_free was 1 (only an empty extent), this
1336                 * loop won't execute, which is fine. We still want
1337                 * the decrement above to happen.
1338                 */
1339                for(i = 0; i < (next_free - 1); i++)
1340                        el->l_recs[i] = el->l_recs[i+1];
1341
1342                next_free--;
1343        }
1344
1345        /*
1346         * Figure out what the new record index should be.
1347         */
1348        for(i = 0; i < next_free; i++) {
1349                rec = &el->l_recs[i];
1350
1351                if (insert_cpos < le32_to_cpu(rec->e_cpos))
1352                        break;
1353        }
1354        insert_index = i;
1355
1356        mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1357             insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1358
1359        BUG_ON(insert_index < 0);
1360        BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1361        BUG_ON(insert_index > next_free);
1362
1363        /*
1364         * No need to memmove if we're just adding to the tail.
1365         */
1366        if (insert_index != next_free) {
1367                BUG_ON(next_free >= le16_to_cpu(el->l_count));
1368
1369                num_bytes = next_free - insert_index;
1370                num_bytes *= sizeof(struct ocfs2_extent_rec);
1371                memmove(&el->l_recs[insert_index + 1],
1372                        &el->l_recs[insert_index],
1373                        num_bytes);
1374        }
1375
1376        /*
1377         * Either we had an empty extent, and need to re-increment or
1378         * there was no empty extent on a non full rightmost leaf node,
1379         * in which case we still need to increment.
1380         */
1381        next_free++;
1382        el->l_next_free_rec = cpu_to_le16(next_free);
1383        /*
1384         * Make sure none of the math above just messed up our tree.
1385         */
1386        BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1387
1388        el->l_recs[insert_index] = *insert_rec;
1389
1390}
1391
1392static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1393{
1394        int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1395
1396        BUG_ON(num_recs == 0);
1397
1398        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1399                num_recs--;
1400                size = num_recs * sizeof(struct ocfs2_extent_rec);
1401                memmove(&el->l_recs[0], &el->l_recs[1], size);
1402                memset(&el->l_recs[num_recs], 0,
1403                       sizeof(struct ocfs2_extent_rec));
1404                el->l_next_free_rec = cpu_to_le16(num_recs);
1405        }
1406}
1407
1408/*
 1409 * Create an empty extent record.
1410 *
1411 * l_next_free_rec may be updated.
1412 *
1413 * If an empty extent already exists do nothing.
1414 */
1415static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1416{
1417        int next_free = le16_to_cpu(el->l_next_free_rec);
1418
1419        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1420
1421        if (next_free == 0)
1422                goto set_and_inc;
1423
1424        if (ocfs2_is_empty_extent(&el->l_recs[0]))
1425                return;
1426
1427        mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1428                        "Asked to create an empty extent in a full list:\n"
1429                        "count = %u, tree depth = %u",
1430                        le16_to_cpu(el->l_count),
1431                        le16_to_cpu(el->l_tree_depth));
1432
1433        ocfs2_shift_records_right(el);
1434
1435set_and_inc:
1436        le16_add_cpu(&el->l_next_free_rec, 1);
1437        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1438}
1439
1440/*
1441 * For a rotation which involves two leaf nodes, the "root node" is
 1442 * the lowest level tree node which contains a path to both leaves. This
1443 * resulting set of information can be used to form a complete "subtree"
1444 *
1445 * This function is passed two full paths from the dinode down to a
 1446 * pair of adjacent leaves. Its task is to figure out which path
1447 * index contains the subtree root - this can be the root index itself
1448 * in a worst-case rotation.
1449 *
1450 * The array index of the subtree root is passed back.
1451 */
1452static int ocfs2_find_subtree_root(struct inode *inode,
1453                                   struct ocfs2_path *left,
1454                                   struct ocfs2_path *right)
1455{
1456        int i = 0;
1457
1458        /*
1459         * Check that the caller passed in two paths from the same tree.
1460         */
1461        BUG_ON(path_root_bh(left) != path_root_bh(right));
1462
1463        do {
1464                i++;
1465
1466                /*
1467                 * The caller didn't pass two adjacent paths.
1468                 */
1469                mlog_bug_on_msg(i > left->p_tree_depth,
1470                                "Inode %lu, left depth %u, right depth %u\n"
1471                                "left leaf blk %llu, right leaf blk %llu\n",
1472                                inode->i_ino, left->p_tree_depth,
1473                                right->p_tree_depth,
1474                                (unsigned long long)path_leaf_bh(left)->b_blocknr,
1475                                (unsigned long long)path_leaf_bh(right)->b_blocknr);
1476        } while (left->p_node[i].bh->b_blocknr ==
1477                 right->p_node[i].bh->b_blocknr);
1478
1479        return i - 1;
1480}
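ocfs2_find_subtree_root() above walks the two paths from the shared root downward and returns the index of the last level at which both paths still reference the same block; that index is the "subtree root" the rotation code operates on. A hedged stand-alone sketch of the same lockstep comparison, using plain arrays of block numbers in place of struct ocfs2_path:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * left[] and right[] are root-to-leaf paths of block numbers, index 0 being
 * the shared root and index 'depth' the leaf.  Returns the index of the
 * lowest node common to both paths.
 */
static int find_subtree_root(const uint64_t *left, const uint64_t *right,
			     int depth)
{
	int i = 0;

	assert(left[0] == right[0]);	/* same tree, same root block */
	do {
		i++;
		assert(i <= depth);	/* adjacent paths must diverge */
	} while (left[i] == right[i]);

	return i - 1;
}

int main(void)
{
	/* Depth-3 tree: the paths share the root and one interior block. */
	uint64_t left[]  = { 500, 510, 520, 530 };
	uint64_t right[] = { 500, 510, 521, 531 };

	printf("subtree root index = %d\n", find_subtree_root(left, right, 3));
	return 0;
}

With the sample paths the divergence happens at index 2, so the subtree root is index 1, mirroring the i - 1 returned by the kernel routine.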
1481
1482typedef void (path_insert_t)(void *, struct buffer_head *);
1483
1484/*
1485 * Traverse a btree path in search of cpos, starting at root_el.
1486 *
1487 * This code can be called with a cpos larger than the tree, in which
1488 * case it will return the rightmost path.
1489 */
1490static int __ocfs2_find_path(struct inode *inode,
1491                             struct ocfs2_extent_list *root_el, u32 cpos,
1492                             path_insert_t *func, void *data)
1493{
1494        int i, ret = 0;
1495        u32 range;
1496        u64 blkno;
1497        struct buffer_head *bh = NULL;
1498        struct ocfs2_extent_block *eb;
1499        struct ocfs2_extent_list *el;
1500        struct ocfs2_extent_rec *rec;
1501        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1502
1503        el = root_el;
1504        while (el->l_tree_depth) {
1505                if (le16_to_cpu(el->l_next_free_rec) == 0) {
1506                        ocfs2_error(inode->i_sb,
1507                                    "Inode %llu has empty extent list at "
1508                                    "depth %u\n",
1509                                    (unsigned long long)oi->ip_blkno,
1510                                    le16_to_cpu(el->l_tree_depth));
1511                        ret = -EROFS;
1512                        goto out;
1513
1514                }
1515
1516                for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1517                        rec = &el->l_recs[i];
1518
1519                        /*
1520                         * In the case that cpos is off the allocation
1521                         * tree, this should just wind up returning the
1522                         * rightmost record.
1523                         */
1524                        range = le32_to_cpu(rec->e_cpos) +
1525                                ocfs2_rec_clusters(el, rec);
1526                        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1527                            break;
1528                }
1529
1530                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1531                if (blkno == 0) {
1532                        ocfs2_error(inode->i_sb,
1533                                    "Inode %llu has bad blkno in extent list "
1534                                    "at depth %u (index %d)\n",
1535                                    (unsigned long long)oi->ip_blkno,
1536                                    le16_to_cpu(el->l_tree_depth), i);
1537                        ret = -EROFS;
1538                        goto out;
1539                }
1540
1541                brelse(bh);
1542                bh = NULL;
1543                ret = ocfs2_read_block(inode, blkno, &bh);
1544                if (ret) {
1545                        mlog_errno(ret);
1546                        goto out;
1547                }
1548
1549                eb = (struct ocfs2_extent_block *) bh->b_data;
1550                el = &eb->h_list;
1551                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1552                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1553                        ret = -EIO;
1554                        goto out;
1555                }
1556
1557                if (le16_to_cpu(el->l_next_free_rec) >
1558                    le16_to_cpu(el->l_count)) {
1559                        ocfs2_error(inode->i_sb,
1560                                    "Inode %llu has bad count in extent list "
1561                                    "at block %llu (next free=%u, count=%u)\n",
1562                                    (unsigned long long)oi->ip_blkno,
1563                                    (unsigned long long)bh->b_blocknr,
1564                                    le16_to_cpu(el->l_next_free_rec),
1565                                    le16_to_cpu(el->l_count));
1566                        ret = -EROFS;
1567                        goto out;
1568                }
1569
1570                if (func)
1571                        func(data, bh);
1572        }
1573
1574out:
1575        /*
1576         * Catch any trailing bh that the loop didn't handle.
1577         */
1578        brelse(bh);
1579
1580        return ret;
1581}
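At every interior level, the loop above picks the record whose [e_cpos, e_cpos + clusters) range contains cpos; because the scan deliberately stops one short of the last record, a cpos beyond the end of the tree just falls through to the rightmost record, which is how the function delivers the rightmost path in that case. A toy model of only that record-selection step (toy_rec is invented; there is no endian handling, block reading, or validation here):

#include <stdint.h>
#include <stdio.h>

/* Toy interior extent record: covers clusters [cpos, cpos + clusters). */
struct toy_rec { uint32_t cpos, clusters; uint64_t blkno; };

/*
 * Choose the child block to descend into for a given cpos.  Falls through
 * to the last record when cpos lies past the end of the tree.
 */
static uint64_t pick_child(const struct toy_rec *recs, int nr, uint32_t cpos)
{
	int i;

	for (i = 0; i < nr - 1; i++) {
		uint32_t range = recs[i].cpos + recs[i].clusters;

		if (cpos >= recs[i].cpos && cpos < range)
			break;
	}
	return recs[i].blkno;
}

int main(void)
{
	struct toy_rec recs[] = {
		{ 0,   64, 1000 },
		{ 64,  64, 1001 },
		{ 128, 64, 1002 },
	};

	printf("cpos 70  -> blk %llu\n",
	       (unsigned long long)pick_child(recs, 3, 70));	/* 1001 */
	printf("cpos 999 -> blk %llu\n",
	       (unsigned long long)pick_child(recs, 3, 999));	/* rightmost */
	return 0;
}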
1582
1583/*
1584 * Given an initialized path (that is, it has a valid root extent
1585 * list), this function will traverse the btree in search of the path
1586 * which would contain cpos.
1587 *
1588 * The path traveled is recorded in the path structure.
1589 *
1590 * Note that this will not do any comparisons on leaf node extent
1591 * records, so it will work fine in the case that we just added a tree
1592 * branch.
1593 */
1594struct find_path_data {
1595        int index;
1596        struct ocfs2_path *path;
1597};
1598static void find_path_ins(void *data, struct buffer_head *bh)
1599{
1600        struct find_path_data *fp = data;
1601
1602        get_bh(bh);
1603        ocfs2_path_insert_eb(fp->path, fp->index, bh);
1604        fp->index++;
1605}
1606static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1607                           u32 cpos)
1608{
1609        struct find_path_data data;
1610
1611        data.index = 1;
1612        data.path = path;
1613        return __ocfs2_find_path(inode, path_root_el(path), cpos,
1614                                 find_path_ins, &data);
1615}
1616
1617static void find_leaf_ins(void *data, struct buffer_head *bh)
1618{
1619        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1620        struct ocfs2_extent_list *el = &eb->h_list;
1621        struct buffer_head **ret = data;
1622
1623        /* We want to retain only the leaf block. */
1624        if (le16_to_cpu(el->l_tree_depth) == 0) {
1625                get_bh(bh);
1626                *ret = bh;
1627        }
1628}
1629/*
1630 * Find the leaf block in the tree which would contain cpos. No
1631 * checking of the actual leaf is done.
1632 *
1633 * Some paths want to call this instead of allocating a path structure
1634 * and calling ocfs2_find_path().
1635 *
1636 * This function doesn't handle non btree extent lists.
1637 */
1638int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1639                    u32 cpos, struct buffer_head **leaf_bh)
1640{
1641        int ret;
1642        struct buffer_head *bh = NULL;
1643
1644        ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1645        if (ret) {
1646                mlog_errno(ret);
1647                goto out;
1648        }
1649
1650        *leaf_bh = bh;
1651out:
1652        return ret;
1653}
1654
1655/*
1656 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1657 *
1658 * Basically, we've moved stuff around at the bottom of the tree and
1659 * we need to fix up the extent records above the changes to reflect
1660 * the new changes.
1661 *
1662 * left_rec: the record on the left.
1663 * left_child_el: the child list pointed to by left_rec
1664 * right_rec: the record to the right of left_rec
1665 * right_child_el: the child list pointed to by right_rec
1666 *
1667 * By definition, this only works on interior nodes.
1668 */
1669static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1670                                  struct ocfs2_extent_list *left_child_el,
1671                                  struct ocfs2_extent_rec *right_rec,
1672                                  struct ocfs2_extent_list *right_child_el)
1673{
1674        u32 left_clusters, right_end;
1675
1676        /*
1677         * Interior nodes never have holes. Their cpos is the cpos of
1678         * the leftmost record in their child list. Their cluster
1679         * count covers the full theoretical range of their child list
1680         * - the range between their cpos and the cpos of the record
1681         * immediately to their right.
1682         */
1683        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1684        if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1685                BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1686                left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1687        }
1688        left_clusters -= le32_to_cpu(left_rec->e_cpos);
1689        left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1690
1691        /*
1692         * Calculate the rightmost cluster count boundary before
1693         * moving cpos - we will need to adjust clusters after
1694         * updating e_cpos to keep the same highest cluster count.
1695         */
1696        right_end = le32_to_cpu(right_rec->e_cpos);
1697        right_end += le32_to_cpu(right_rec->e_int_clusters);
1698
1699        right_rec->e_cpos = left_rec->e_cpos;
1700        le32_add_cpu(&right_rec->e_cpos, left_clusters);
1701
1702        right_end -= le32_to_cpu(right_rec->e_cpos);
1703        right_rec->e_int_clusters = cpu_to_le32(right_end);
1704}
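The arithmetic above can be read as: widen or narrow the left interior record so it ends exactly where the right child's first real record now begins, then move the right record's start to that boundary while keeping its original right edge. A stand-alone sketch of just that arithmetic (toy_rec is invented; the kernel reads the boundary out of the right child list and works on little-endian fields):

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos, clusters; };

/*
 * 'boundary' is the cpos of the first real record in the right child list.
 * The left record is resized to end at the boundary; the right record is
 * moved to the boundary and resized to keep its previous right edge.
 */
static void adjust_adjacent(struct toy_rec *left, struct toy_rec *right,
			    uint32_t boundary)
{
	uint32_t right_end = right->cpos + right->clusters;

	left->clusters = boundary - left->cpos;
	right->cpos = boundary;
	right->clusters = right_end - right->cpos;
}

int main(void)
{
	/* Before: left covers [0, 100), right covers [100, 200). */
	struct toy_rec left = { 0, 100 }, right = { 100, 100 };

	/* A rotation moved the child boundary from cpos 100 down to 90. */
	adjust_adjacent(&left, &right, 90);
	printf("left  [%u, %u)\n", (unsigned)left.cpos,
	       (unsigned)(left.cpos + left.clusters));
	printf("right [%u, %u)\n", (unsigned)right.cpos,
	       (unsigned)(right.cpos + right.clusters));
	return 0;
}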
1705
1706/*
1707 * Adjust the adjacent root node records involved in a
1708 * rotation. left_el_blkno is passed in as a key so that we can easily
1709 * find its index in the root list.
1710 */
1711static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1712                                      struct ocfs2_extent_list *left_el,
1713                                      struct ocfs2_extent_list *right_el,
1714                                      u64 left_el_blkno)
1715{
1716        int i;
1717
1718        BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1719               le16_to_cpu(left_el->l_tree_depth));
1720
1721        for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1722                if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1723                        break;
1724        }
1725
1726        /*
1727         * The path walking code should have never returned a root and
1728         * two paths which are not adjacent.
1729         */
1730        BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1731
1732        ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1733                                      &root_el->l_recs[i + 1], right_el);
1734}
1735
1736/*
1737 * We've changed a leaf block (in right_path) and need to reflect that
1738 * change back up the subtree.
1739 *
1740 * This happens in multiple places:
1741 *   - When we've moved an extent record from the left path leaf to the right
1742 *     path leaf to make room for an empty extent in the left path leaf.
1743 *   - When our insert into the right path leaf is at the leftmost edge
1744 *     and requires an update of the path immediately to its left. This
1745 *     can occur at the end of some types of rotation and appending inserts.
1746 *   - When we've adjusted the last extent record in the left path leaf and the
1747 *     1st extent record in the right path leaf during cross extent block merge.
1748 */
1749static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1750                                       struct ocfs2_path *left_path,
1751                                       struct ocfs2_path *right_path,
1752                                       int subtree_index)
1753{
1754        int ret, i, idx;
1755        struct ocfs2_extent_list *el, *left_el, *right_el;
1756        struct ocfs2_extent_rec *left_rec, *right_rec;
1757        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1758
1759        /*
1760         * Update the counts and position values within all the
1761         * interior nodes to reflect the leaf rotation we just did.
1762         *
1763         * The root node is handled below the loop.
1764         *
1765         * We begin the loop with right_el and left_el pointing to the
1766         * leaf lists and work our way up.
1767         *
1768         * NOTE: within this loop, left_el and right_el always refer
1769         * to the *child* lists.
1770         */
1771        left_el = path_leaf_el(left_path);
1772        right_el = path_leaf_el(right_path);
1773        for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1774                mlog(0, "Adjust records at index %u\n", i);
1775
1776                /*
1777                 * One nice property of knowing that all of these
1778                 * nodes are below the root is that we only deal with
1779                 * the leftmost right node record and the rightmost
1780                 * left node record.
1781                 */
1782                el = left_path->p_node[i].el;
1783                idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1784                left_rec = &el->l_recs[idx];
1785
1786                el = right_path->p_node[i].el;
1787                right_rec = &el->l_recs[0];
1788
1789                ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1790                                              right_el);
1791
1792                ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1793                if (ret)
1794                        mlog_errno(ret);
1795
1796                ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1797                if (ret)
1798                        mlog_errno(ret);
1799
1800                /*
1801                 * Setup our list pointers now so that the current
1802                 * parents become children in the next iteration.
1803                 */
1804                left_el = left_path->p_node[i].el;
1805                right_el = right_path->p_node[i].el;
1806        }
1807
1808        /*
1809         * At the root node, adjust the two adjacent records which
1810         * begin our path to the leaves.
1811         */
1812
1813        el = left_path->p_node[subtree_index].el;
1814        left_el = left_path->p_node[subtree_index + 1].el;
1815        right_el = right_path->p_node[subtree_index + 1].el;
1816
1817        ocfs2_adjust_root_records(el, left_el, right_el,
1818                                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
1819
1820        root_bh = left_path->p_node[subtree_index].bh;
1821
1822        ret = ocfs2_journal_dirty(handle, root_bh);
1823        if (ret)
1824                mlog_errno(ret);
1825}
1826
1827static int ocfs2_rotate_subtree_right(struct inode *inode,
1828                                      handle_t *handle,
1829                                      struct ocfs2_path *left_path,
1830                                      struct ocfs2_path *right_path,
1831                                      int subtree_index)
1832{
1833        int ret, i;
1834        struct buffer_head *right_leaf_bh;
1835        struct buffer_head *left_leaf_bh = NULL;
1836        struct buffer_head *root_bh;
1837        struct ocfs2_extent_list *right_el, *left_el;
1838        struct ocfs2_extent_rec move_rec;
1839
1840        left_leaf_bh = path_leaf_bh(left_path);
1841        left_el = path_leaf_el(left_path);
1842
1843        if (left_el->l_next_free_rec != left_el->l_count) {
1844                ocfs2_error(inode->i_sb,
1845                            "Inode %llu has non-full interior leaf node %llu "
1846                            "(next free = %u)",
1847                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
1848                            (unsigned long long)left_leaf_bh->b_blocknr,
1849                            le16_to_cpu(left_el->l_next_free_rec));
1850                return -EROFS;
1851        }
1852
1853        /*
1854         * This extent block may already have an empty record, so we
1855         * return early if so.
1856         */
1857        if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1858                return 0;
1859
1860        root_bh = left_path->p_node[subtree_index].bh;
1861        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1862
1863        ret = ocfs2_journal_access(handle, inode, root_bh,
1864                                   OCFS2_JOURNAL_ACCESS_WRITE);
1865        if (ret) {
1866                mlog_errno(ret);
1867                goto out;
1868        }
1869
1870        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1871                ret = ocfs2_journal_access(handle, inode,
1872                                           right_path->p_node[i].bh,
1873                                           OCFS2_JOURNAL_ACCESS_WRITE);
1874                if (ret) {
1875                        mlog_errno(ret);
1876                        goto out;
1877                }
1878
1879                ret = ocfs2_journal_access(handle, inode,
1880                                           left_path->p_node[i].bh,
1881                                           OCFS2_JOURNAL_ACCESS_WRITE);
1882                if (ret) {
1883                        mlog_errno(ret);
1884                        goto out;
1885                }
1886        }
1887
1888        right_leaf_bh = path_leaf_bh(right_path);
1889        right_el = path_leaf_el(right_path);
1890
1891        /* This is a code error, not a disk corruption. */
1892        mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1893                        "because rightmost leaf block %llu is empty\n",
1894                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1895                        (unsigned long long)right_leaf_bh->b_blocknr);
1896
1897        ocfs2_create_empty_extent(right_el);
1898
1899        ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1900        if (ret) {
1901                mlog_errno(ret);
1902                goto out;
1903        }
1904
1905        /* Do the copy now. */
1906        i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1907        move_rec = left_el->l_recs[i];
1908        right_el->l_recs[0] = move_rec;
1909
1910        /*
1911         * Clear out the record we just copied and shift everything
1912         * over, leaving an empty extent in the left leaf.
1913         *
1914         * We temporarily subtract from next_free_rec so that the
1915         * shift will lose the tail record (which is now defunct).
1916         */
1917        le16_add_cpu(&left_el->l_next_free_rec, -1);
1918        ocfs2_shift_records_right(left_el);
1919        memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1920        le16_add_cpu(&left_el->l_next_free_rec, 1);
1921
1922        ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1923        if (ret) {
1924                mlog_errno(ret);
1925                goto out;
1926        }
1927
1928        ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1929                                subtree_index);
1930
1931out:
1932        return ret;
1933}
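At the leaf level the rotation above boils down to: copy the rightmost record of the left leaf into the empty slot 0 that was just created in the right leaf, then shift the left leaf's own records right so that its slot 0 becomes the new empty record, with the temporary next_free decrement making the shift drop the now-defunct tail. A user-space sketch of that record movement (invented toy types, no journaling, and it assumes the caller has already made slot 0 of the right leaf empty, as ocfs2_create_empty_extent() does above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_REC_COUNT 4
struct toy_rec { uint32_t cpos, clusters; };
struct toy_list { uint16_t next_free; struct toy_rec recs[TOY_REC_COUNT]; };

/* Shift every live record one slot to the right; slot 0 becomes reusable. */
static void shift_right(struct toy_list *el)
{
	memmove(&el->recs[1], &el->recs[0],
		el->next_free * sizeof(struct toy_rec));
}

/*
 * Move the tail record of the left leaf into slot 0 of the right leaf and
 * leave an empty record at slot 0 of the left leaf.
 */
static void rotate_leaves_right(struct toy_list *left, struct toy_list *right)
{
	int tail = left->next_free - 1;

	right->recs[0] = left->recs[tail];	/* fills the empty slot */

	left->next_free--;		/* forget the defunct tail ...    */
	shift_right(left);		/* ... so the shift skips it      */
	memset(&left->recs[0], 0, sizeof(struct toy_rec));
	left->next_free++;		/* slot 0 is the new empty record */
}

int main(void)
{
	struct toy_list left  = { 2, { { 0, 10 }, { 10, 10 } } };
	struct toy_list right = { 3, { { 0, 0 }, { 20, 10 }, { 30, 10 } } };

	rotate_leaves_right(&left, &right);
	printf("left rec1 cpos=%u, right rec0 cpos=%u\n",
	       (unsigned)left.recs[1].cpos, (unsigned)right.recs[0].cpos);
	return 0;
}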
1934
1935/*
1936 * Given a full path, determine what cpos value would return us a path
1937 * containing the leaf immediately to the left of the current one.
1938 *
1939 * Will return zero if the path passed in is already the leftmost path.
1940 */
1941static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1942                                         struct ocfs2_path *path, u32 *cpos)
1943{
1944        int i, j, ret = 0;
1945        u64 blkno;
1946        struct ocfs2_extent_list *el;
1947
1948        BUG_ON(path->p_tree_depth == 0);
1949
1950        *cpos = 0;
1951
1952        blkno = path_leaf_bh(path)->b_blocknr;
1953
1954        /* Start at the tree node just above the leaf and work our way up. */
1955        i = path->p_tree_depth - 1;
1956        while (i >= 0) {
1957                el = path->p_node[i].el;
1958
1959                /*
1960                 * Find the extent record just before the one in our
1961                 * path.
1962                 */
1963                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1964                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1965                                if (j == 0) {
1966                                        if (i == 0) {
1967                                                /*
1968                                                 * We've determined that the
1969                                                 * path specified is already
1970                                                 * the leftmost one - return a
1971                                                 * cpos of zero.
1972                                                 */
1973                                                goto out;
1974                                        }
1975                                        /*
1976                                         * The leftmost record points to our
1977                                         * leaf - we need to travel up the
1978                                         * tree one level.
1979                                         */
1980                                        goto next_node;
1981                                }
1982
1983                                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1984                                *cpos = *cpos + ocfs2_rec_clusters(el,
1985                                                           &el->l_recs[j - 1]);
1986                                *cpos = *cpos - 1;
1987                                goto out;
1988                        }
1989                }
1990
1991                /*
1992                 * If we got here, we never found a valid node where
1993                 * the tree indicated one should be.
1994                 */
1995                ocfs2_error(sb,
1996                            "Invalid extent tree at extent block %llu\n",
1997                            (unsigned long long)blkno);
1998                ret = -EROFS;
1999                goto out;
2000
2001next_node:
2002                blkno = path->p_node[i].bh->b_blocknr;
2003                i--;
2004        }
2005
2006out:
2007        return ret;
2008}
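The walk above starts at the node just over the leaf and climbs toward the root; at the first level where our block is not record 0, the record to its left ends at e_cpos + clusters - 1, and that cluster offset is exactly what a later ocfs2_find_path() call needs to land in the neighbouring leaf. A hedged sketch of the same walk over invented toy path nodes (it omits the corruption check the kernel performs when the block is never found):

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos, clusters; uint64_t blkno; };
struct toy_node { const struct toy_rec *recs; int nr; uint64_t blkno; };

/*
 * path[0] is the root node, path[depth - 1] sits directly above the leaf.
 * Returns the last cpos covered by the leaf to the left of leaf_blkno, or
 * 0 when the given leaf is already the leftmost one.
 */
static uint32_t cpos_for_left_leaf(const struct toy_node *path, int depth,
				   uint64_t leaf_blkno)
{
	uint64_t blkno = leaf_blkno;
	int i, j;

	for (i = depth - 1; i >= 0; i--) {
		const struct toy_node *node = &path[i];

		for (j = 0; j < node->nr; j++) {
			if (node->recs[j].blkno != blkno)
				continue;
			if (j == 0)
				break;	/* leftmost here, look one level up */
			return node->recs[j - 1].cpos +
			       node->recs[j - 1].clusters - 1;
		}
		blkno = node->blkno;	/* travel up the tree */
	}
	return 0;			/* already the leftmost path */
}

int main(void)
{
	struct toy_rec root_recs[]  = { { 0, 128, 10 }, { 128, 128, 11 } };
	struct toy_rec right_recs[] = { { 128, 64, 21 }, { 192, 64, 22 } };
	struct toy_node path[] = {
		{ root_recs,  2, 1  },	/* root block 1      */
		{ right_recs, 2, 11 },	/* interior block 11 */
	};

	/* Leaf 21 is record 0 of block 11, so the answer comes from the
	 * root level: record 0 there ends at cpos 127. */
	printf("left cpos = %u\n",
	       (unsigned)cpos_for_left_leaf(path, 2, 21));
	return 0;
}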
2009
2010/*
2011 * Extend the transaction by enough credits to complete the rotation,
2012 * and still leave at least the original number of credits allocated
2013 * to this transaction.
2014 */
2015static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2016                                           int op_credits,
2017                                           struct ocfs2_path *path)
2018{
2019        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2020
2021        if (handle->h_buffer_credits < credits)
2022                return ocfs2_extend_trans(handle, credits);
2023
2024        return 0;
2025}
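The estimate above is two buffers per level between the subtree root and the leaves (one on each path) plus one for the subtree root itself, on top of whatever the caller originally reserved. A tiny sketch of that arithmetic with a worked example (the function name is illustrative, not from the kernel):

#include <stdio.h>

/*
 * Journal credits needed for one subtree rotation: two blocks per level
 * below the subtree root (left path + right path), the subtree root block,
 * and the credits the original operation still needs.
 */
static int rotate_credits(int tree_depth, int subtree_depth, int op_credits)
{
	return (tree_depth - subtree_depth) * 2 + 1 + op_credits;
}

int main(void)
{
	/* Depth-3 tree, subtree root at level 1, caller wanted 8 credits. */
	printf("credits = %d\n", rotate_credits(3, 1, 8));	/* 13 */
	return 0;
}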
2026
2027/*
2028 * Trap the case where we're inserting into the theoretical range past
2029 * the _actual_ left leaf range. Otherwise, we'll rotate a record
2030 * whose cpos is less than ours into the right leaf.
2031 *
2032 * It's only necessary to look at the rightmost record of the left
2033 * leaf because the logic that calls us should ensure that the
2034 * theoretical ranges in the path components above the leaves are
2035 * correct.
2036 */
2037static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2038                                                 u32 insert_cpos)
2039{
2040        struct ocfs2_extent_list *left_el;
2041        struct ocfs2_extent_rec *rec;
2042        int next_free;
2043
2044        left_el = path_leaf_el(left_path);
2045        next_free = le16_to_cpu(left_el->l_next_free_rec);
2046        rec = &left_el->l_recs[next_free - 1];
2047
2048        if (insert_cpos > le32_to_cpu(rec->e_cpos))
2049                return 1;
2050        return 0;
2051}
2052
2053static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2054{
2055        int next_free = le16_to_cpu(el->l_next_free_rec);
2056        unsigned int range;
2057        struct ocfs2_extent_rec *rec;
2058
2059        if (next_free == 0)
2060                return 0;
2061
2062        rec = &el->l_recs[0];
2063        if (ocfs2_is_empty_extent(rec)) {
2064                /* Empty list. */
2065                if (next_free == 1)
2066                        return 0;
2067                rec = &el->l_recs[1];
2068        }
2069
2070        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2071        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2072                return 1;
2073        return 0;
2074}
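The helper above has to step over the possible empty record in slot 0 before doing an ordinary range test. A toy version of the check (the cpos == 0 && clusters == 0 test is a stand-in for ocfs2_is_empty_extent(), which in the kernel keys off the record's cluster count):

#include <stdint.h>
#include <stdio.h>

struct toy_rec { uint32_t cpos, clusters; };

/*
 * Return 1 if cpos falls inside the leftmost *real* record: recs[0], or
 * recs[1] when slot 0 holds the empty record.
 */
static int leftmost_contains(const struct toy_rec *recs, int next_free,
			     uint32_t cpos)
{
	const struct toy_rec *rec;

	if (next_free == 0)
		return 0;
	rec = &recs[0];
	if (rec->cpos == 0 && rec->clusters == 0) {	/* toy "empty" test */
		if (next_free == 1)
			return 0;
		rec = &recs[1];
	}
	return cpos >= rec->cpos && cpos < rec->cpos + rec->clusters;
}

int main(void)
{
	struct toy_rec recs[] = { { 0, 0 }, { 50, 10 } };	/* empty + one */

	printf("%d %d\n", leftmost_contains(recs, 2, 55),
	       leftmost_contains(recs, 2, 70));	/* prints: 1 0 */
	return 0;
}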
2075
2076/*
2077 * Rotate all the records in a btree right one record, starting at insert_cpos.
2078 *
2079 * The path to the rightmost leaf should be passed in.
2080 *
2081 * The array is assumed to be large enough to hold an entire path (tree depth).
2082 *
2083 * Upon successful return from this function:
2084 *
2085 * - The 'right_path' array will contain a path to the leaf block
2086 *   whose range contains e_cpos.
2087 * - That leaf block will have a single empty extent in list index 0.
2088 * - In the case that the rotation requires a post-insert update,
2089 *   *ret_left_path will contain a valid path which can be passed to
2090 *   ocfs2_insert_path().
2091 */
2092static int ocfs2_rotate_tree_right(struct inode *inode,
2093                                   handle_t *handle,
2094                                   enum ocfs2_split_type split,
2095                                   u32 insert_cpos,
2096                                   struct ocfs2_path *right_path,
2097                                   struct ocfs2_path **ret_left_path)
2098{
2099        int ret, start, orig_credits = handle->h_buffer_credits;
2100        u32 cpos;
2101        struct ocfs2_path *left_path = NULL;
2102
2103        *ret_left_path = NULL;
2104
2105        left_path = ocfs2_new_path(path_root_bh(right_path),
2106                                   path_root_el(right_path));
2107        if (!left_path) {
2108                ret = -ENOMEM;
2109                mlog_errno(ret);
2110                goto out;
2111        }
2112
2113        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2114        if (ret) {
2115                mlog_errno(ret);
2116                goto out;
2117        }
2118
2119        mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2120
2121        /*
2122         * What we want to do here is:
2123         *
2124         * 1) Start with the rightmost path.
2125         *
2126         * 2) Determine a path to the leaf block directly to the left
2127         *    of that leaf.
2128         *
2129         * 3) Determine the 'subtree root' - the lowest level tree node
2130         *    which contains a path to both leaves.
2131         *
2132         * 4) Rotate the subtree.
2133         *
2134         * 5) Find the next subtree by considering the left path to be
2135         *    the new right path.
2136         *
2137         * The check at the top of this while loop also accepts
2138         * insert_cpos == cpos because cpos is only a _theoretical_
2139         * value to get us the left path - insert_cpos might very well
2140         * be filling that hole.
2141         *
2142         * Stop at a cpos of '0' because we either started at the
2143         * leftmost branch (i.e., a tree with one branch and a
2144         * rotation inside of it), or we've gone as far as we can in
2145         * rotating subtrees.
2146         */
2147        while (cpos && insert_cpos <= cpos) {
2148                mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2149                     insert_cpos, cpos);
2150
2151                ret = ocfs2_find_path(inode, left_path, cpos);
2152                if (ret) {
2153                        mlog_errno(ret);
2154                        goto out;
2155                }
2156
2157                mlog_bug_on_msg(path_leaf_bh(left_path) ==
2158                                path_leaf_bh(right_path),
2159                                "Inode %lu: error during insert of %u "
2160                                "(left path cpos %u) results in two identical "
2161                                "paths ending at %llu\n",
2162                                inode->i_ino, insert_cpos, cpos,
2163                                (unsigned long long)
2164                                path_leaf_bh(left_path)->b_blocknr);
2165
2166                if (split == SPLIT_NONE &&
2167                    ocfs2_rotate_requires_path_adjustment(left_path,
2168                                                          insert_cpos)) {
2169
2170                        /*
2171                         * We've rotated the tree as much as we
2172                         * should. The rest is up to
2173                         * ocfs2_insert_path() to complete, after the
2174                         * record insertion. We indicate this
2175                         * situation by returning the left path.
2176                         *
2177                         * The reason we don't adjust the records here
2178                         * before the record insert is that an error
2179                         * later might break the rule where a parent
2180                         * record e_cpos will reflect the actual
2181                         * e_cpos of the 1st nonempty record of the
2182                         * child list.
2183                         */
2184                        *ret_left_path = left_path;
2185                        goto out_ret_path;
2186                }
2187
2188                start = ocfs2_find_subtree_root(inode, left_path, right_path);
2189
2190                mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2191                     start,
2192                     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2193                     right_path->p_tree_depth);
2194
2195                ret = ocfs2_extend_rotate_transaction(handle, start,
2196                                                      orig_credits, right_path);
2197                if (ret) {
2198                        mlog_errno(ret);
2199                        goto out;
2200                }
2201
2202                ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2203                                                 right_path, start);
2204                if (ret) {
2205                        mlog_errno(ret);
2206                        goto out;
2207                }
2208
2209                if (split != SPLIT_NONE &&
2210                    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2211                                                insert_cpos)) {
2212                        /*
2213                         * A rotate moves the rightmost left leaf
2214                         * record over to the leftmost right leaf
2215                         * slot. If we're doing an extent split
2216                         * instead of a real insert, then we have to
2217                         * check that the extent to be split wasn't
2218                         * just moved over. If it was, then we can
2219                         * exit here, passing left_path back -
2220                         * ocfs2_split_extent() is smart enough to
2221                         * search both leaves.
2222                         */
2223                        *ret_left_path = left_path;
2224                        goto out_ret_path;
2225                }
2226
2227                /*
2228                 * There is no need to re-read the next right path
2229                 * as we know that it'll be our current left
2230                 * path. Optimize by copying values instead.
2231                 */
2232                ocfs2_mv_path(right_path, left_path);
2233
2234                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2235                                                    &cpos);
2236                if (ret) {
2237                        mlog_errno(ret);
2238                        goto out;
2239                }
2240        }
2241
2242out:
2243        ocfs2_free_path(left_path);
2244
2245out_ret_path:
2246        return ret;
2247}
2248
2249static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2250                                      struct ocfs2_path *path)
2251{
2252        int i, idx;
2253        struct ocfs2_extent_rec *rec;
2254        struct ocfs2_extent_list *el;
2255        struct ocfs2_extent_block *eb;
2256        u32 range;
2257
2258        /* Path should always be rightmost. */
2259        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2260        BUG_ON(eb->h_next_leaf_blk != 0ULL);
2261
2262        el = &eb->h_list;
2263        BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2264        idx = le16_to_cpu(el->l_next_free_rec) - 1;
2265        rec = &el->l_recs[idx];
2266        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2267
2268        for (i = 0; i < path->p_tree_depth; i++) {
2269                el = path->p_node[i].el;
2270                idx = le16_to_cpu(el->l_next_free_rec) - 1;
2271                rec = &el->l_recs[idx];
2272
2273                rec->e_int_clusters = cpu_to_le32(range);
2274                le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2275
2276                ocfs2_journal_dirty(handle, path->p_node[i].bh);
2277        }
2278}
2279
2280static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2281                              struct ocfs2_cached_dealloc_ctxt *dealloc,
2282                              struct ocfs2_path *path, int unlink_start)
2283{
2284        int ret, i;
2285        struct ocfs2_extent_block *eb;
2286        struct ocfs2_extent_list *el;
2287        struct buffer_head *bh;
2288
2289        for(i = unlink_start; i < path_num_items(path); i++) {
2290                bh = path->p_node[i].bh;
2291
2292                eb = (struct ocfs2_extent_block *)bh->b_data;
2293                /*
2294                 * Not all nodes might have had their final count
2295                 * decremented by the caller - handle this here.
2296                 */
2297                el = &eb->h_list;
2298                if (le16_to_cpu(el->l_next_free_rec) > 1) {
2299                        mlog(ML_ERROR,
2300                             "Inode %llu, attempted to remove extent block "
2301                             "%llu with %u records\n",
2302                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
2303                             (unsigned long long)le64_to_cpu(eb->h_blkno),
2304                             le16_to_cpu(el->l_next_free_rec));
2305
2306                        ocfs2_journal_dirty(handle, bh);
2307                        ocfs2_remove_from_cache(inode, bh);
2308                        continue;
2309                }
2310
2311                el->l_next_free_rec = 0;
2312                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2313
2314                ocfs2_journal_dirty(handle, bh);
2315
2316                ret = ocfs2_cache_extent_block_free(dealloc, eb);
2317                if (ret)
2318                        mlog_errno(ret);
2319
2320                ocfs2_remove_from_cache(inode, bh);
2321        }
2322}
2323
2324static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2325                                 struct ocfs2_path *left_path,
2326                                 struct ocfs2_path *right_path,
2327                                 int subtree_index,
2328                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2329{
2330        int i;
2331        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2332        struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2333        struct ocfs2_extent_list *el;
2334        struct ocfs2_extent_block *eb;
2335
2336        el = path_leaf_el(left_path);
2337
2338        eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2339
2340        for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2341                if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2342                        break;
2343
2344        BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2345
2346        memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2347        le16_add_cpu(&root_el->l_next_free_rec, -1);
2348
2349        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2350        eb->h_next_leaf_blk = 0;
2351
2352        ocfs2_journal_dirty(handle, root_bh);
2353        ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2354
2355        ocfs2_unlink_path(inode, handle, dealloc, right_path,
2356                          subtree_index + 1);
2357}
2358
2359static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2360                                     struct ocfs2_path *left_path,
2361                                     struct ocfs2_path *right_path,
2362                                     int subtree_index,
2363                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2364                                     int *deleted,
2365                                     struct ocfs2_extent_tree *et)
2366{
2367        int ret, i, del_right_subtree = 0, right_has_empty = 0;
2368        struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2369        struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2370        struct ocfs2_extent_block *eb;
2371
2372        *deleted = 0;
2373
2374        right_leaf_el = path_leaf_el(right_path);
2375        left_leaf_el = path_leaf_el(left_path);
2376        root_bh = left_path->p_node[subtree_index].bh;
2377        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2378
2379        if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2380                return 0;
2381
2382        eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2383        if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2384                /*
2385                 * It's legal for us to proceed if the right leaf is
2386                 * the rightmost one and it has an empty extent. There
2387                 * are two cases to handle - whether the leaf will be
2388                 * empty after removal or not. If the leaf isn't empty
2389                 * then just remove the empty extent up front. The
2390                 * next block will handle empty leaves by flagging
2391                 * them for unlink.
2392                 *
2393         * Non-rightmost leaves will throw -EAGAIN and the
2394                 * caller can manually move the subtree and retry.
2395                 */
2396
2397                if (eb->h_next_leaf_blk != 0ULL)
2398                        return -EAGAIN;
2399
2400                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2401                        ret = ocfs2_journal_access(handle, inode,
2402                                                   path_leaf_bh(right_path),
2403                                                   OCFS2_JOURNAL_ACCESS_WRITE);
2404                        if (ret) {
2405                                mlog_errno(ret);
2406                                goto out;
2407                        }
2408
2409                        ocfs2_remove_empty_extent(right_leaf_el);
2410                } else
2411                        right_has_empty = 1;
2412        }
2413
2414        if (eb->h_next_leaf_blk == 0ULL &&
2415            le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2416                /*
2417                 * We have to update i_last_eb_blk during the meta
2418                 * data delete.
2419                 */
2420                ret = ocfs2_journal_access(handle, inode, et_root_bh,
2421                                           OCFS2_JOURNAL_ACCESS_WRITE);
2422                if (ret) {
2423                        mlog_errno(ret);
2424                        goto out;
2425                }
2426
2427                del_right_subtree = 1;
2428        }
2429
2430        /*
2431         * Getting here with an empty extent in the right path implies
2432         * that it's the rightmost path and will be deleted.
2433         */
2434        BUG_ON(right_has_empty && !del_right_subtree);
2435
2436        ret = ocfs2_journal_access(handle, inode, root_bh,
2437                                   OCFS2_JOURNAL_ACCESS_WRITE);
2438        if (ret) {
2439                mlog_errno(ret);
2440                goto out;
2441        }
2442
2443        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2444                ret = ocfs2_journal_access(handle, inode,
2445                                           right_path->p_node[i].bh,
2446                                           OCFS2_JOURNAL_ACCESS_WRITE);
2447                if (ret) {
2448                        mlog_errno(ret);
2449                        goto out;
2450                }
2451
2452                ret = ocfs2_journal_access(handle, inode,
2453                                           left_path->p_node[i].bh,
2454                                           OCFS2_JOURNAL_ACCESS_WRITE);
2455                if (ret) {
2456                        mlog_errno(ret);
2457                        goto out;
2458                }
2459        }
2460
2461        if (!right_has_empty) {
2462                /*
2463                 * Only do this if we're moving a real
2464                 * record. Otherwise, the action is delayed until
2465                 * after removal of the right path in which case we
2466                 * can do a simple shift to remove the empty extent.
2467                 */
2468                ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2469                memset(&right_leaf_el->l_recs[0], 0,
2470                       sizeof(struct ocfs2_extent_rec));
2471        }
2472        if (eb->h_next_leaf_blk == 0ULL) {
2473                /*
2474                 * Move recs over to get rid of empty extent, decrease
2475                 * next_free. This is allowed to remove the last
2476                 * extent in our leaf (setting l_next_free_rec to
2477                 * zero) - the delete code below won't care.
2478                 */
2479                ocfs2_remove_empty_extent(right_leaf_el);
2480        }
2481
2482        ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2483        if (ret)
2484                mlog_errno(ret);
2485        ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2486        if (ret)
2487                mlog_errno(ret);
2488
2489        if (del_right_subtree) {
2490                ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2491                                     subtree_index, dealloc);
2492                ocfs2_update_edge_lengths(inode, handle, left_path);
2493
2494                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2495                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2496
2497                /*
2498                 * Removal of the extent in the left leaf was skipped
2499                 * above so we could delete the right path
2500                 * 1st.
2501                 */
2502                if (right_has_empty)
2503                        ocfs2_remove_empty_extent(left_leaf_el);
2504
2505                ret = ocfs2_journal_dirty(handle, et_root_bh);
2506                if (ret)
2507                        mlog_errno(ret);
2508
2509                *deleted = 1;
2510        } else
2511                ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2512                                           subtree_index);
2513
2514out:
2515        return ret;
2516}
2517
2518/*
2519 * Given a full path, determine what cpos value would return us a path
2520 * containing the leaf immediately to the right of the current one.
2521 *
2522 * Will return zero if the path passed in is already the rightmost path.
2523 *
2524 * This looks similar, but is subtly different to
2525 * ocfs2_find_cpos_for_left_leaf().
2526 */
2527static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2528                                          struct ocfs2_path *path, u32 *cpos)
2529{
2530        int i, j, ret = 0;
2531        u64 blkno;
2532        struct ocfs2_extent_list *el;
2533
2534        *cpos = 0;
2535
2536        if (path->p_tree_depth == 0)
2537                return 0;
2538
2539        blkno = path_leaf_bh(path)->b_blocknr;
2540
2541        /* Start at the tree node just above the leaf and work our way up. */
2542        i = path->p_tree_depth - 1;
2543        while (i >= 0) {
2544                int next_free;
2545
2546                el = path->p_node[i].el;
2547
2548                /*
2549                 * Find the extent record just after the one in our
2550                 * path.
2551                 */
2552                next_free = le16_to_cpu(el->l_next_free_rec);
2553                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2554                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2555                                if (j == (next_free - 1)) {
2556                                        if (i == 0) {
2557                                                /*
2558                                                 * We've determined that the
2559                                                 * path specified is already
2560                                                 * the rightmost one - return a
2561                                                 * cpos of zero.
2562                                                 */
2563                                                goto out;
2564                                        }
2565                                        /*
2566                                         * The rightmost record points to our
2567                                         * leaf - we need to travel up the
2568                                         * tree one level.
2569                                         */
2570                                        goto next_node;
2571                                }
2572
2573                                *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2574                                goto out;
2575                        }
2576                }
2577
2578                /*
2579                 * If we got here, we never found a valid node where
2580                 * the tree indicated one should be.
2581                 */
2582                ocfs2_error(sb,
2583                            "Invalid extent tree at extent block %llu\n",
2584                            (unsigned long long)blkno);
2585                ret = -EROFS;
2586                goto out;
2587
2588next_node:
2589                blkno = path->p_node[i].bh->b_blocknr;
2590                i--;
2591        }
2592
2593out:
2594        return ret;
2595}
2596
2597static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2598                                            handle_t *handle,
2599                                            struct buffer_head *bh,
2600                                            struct ocfs2_extent_list *el)
2601{
2602        int ret;
2603
2604        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2605                return 0;
2606
2607        ret = ocfs2_journal_access(handle, inode, bh,
2608                                   OCFS2_JOURNAL_ACCESS_WRITE);
2609        if (ret) {
2610                mlog_errno(ret);
2611                goto out;
2612        }
2613
2614        ocfs2_remove_empty_extent(el);
2615
2616        ret = ocfs2_journal_dirty(handle, bh);
2617        if (ret)
2618                mlog_errno(ret);
2619
2620out:
2621        return ret;
2622}
2623
2624static int __ocfs2_rotate_tree_left(struct inode *inode,
2625                                    handle_t *handle, int orig_credits,
2626                                    struct ocfs2_path *path,
2627                                    struct ocfs2_cached_dealloc_ctxt *dealloc,
2628                                    struct ocfs2_path **empty_extent_path,
2629                                    struct ocfs2_extent_tree *et)
2630{
2631        int ret, subtree_root, deleted;
2632        u32 right_cpos;
2633        struct ocfs2_path *left_path = NULL;
2634        struct ocfs2_path *right_path = NULL;
2635
2636        BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2637
2638        *empty_extent_path = NULL;
2639
2640        ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2641                                             &right_cpos);
2642        if (ret) {
2643                mlog_errno(ret);
2644                goto out;
2645        }
2646
2647        left_path = ocfs2_new_path(path_root_bh(path),
2648                                   path_root_el(path));
2649        if (!left_path) {
2650                ret = -ENOMEM;
2651                mlog_errno(ret);
2652                goto out;
2653        }
2654
2655        ocfs2_cp_path(left_path, path);
2656
2657        right_path = ocfs2_new_path(path_root_bh(path),
2658                                    path_root_el(path));
2659        if (!right_path) {
2660                ret = -ENOMEM;
2661                mlog_errno(ret);
2662                goto out;
2663        }
2664
2665        while (right_cpos) {
2666                ret = ocfs2_find_path(inode, right_path, right_cpos);
2667                if (ret) {
2668                        mlog_errno(ret);
2669                        goto out;
2670                }
2671
2672                subtree_root = ocfs2_find_subtree_root(inode, left_path,
2673                                                       right_path);
2674
2675                mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2676                     subtree_root,
2677                     (unsigned long long)
2678                     right_path->p_node[subtree_root].bh->b_blocknr,
2679                     right_path->p_tree_depth);
2680
2681                ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2682                                                      orig_credits, left_path);
2683                if (ret) {
2684                        mlog_errno(ret);
2685                        goto out;
2686                }
2687
2688                /*
2689                 * Caller might still want to make changes to the
2690                 * tree root, so re-add it to the journal here.
2691                 */
2692                ret = ocfs2_journal_access(handle, inode,
2693                                           path_root_bh(left_path),
2694                                           OCFS2_JOURNAL_ACCESS_WRITE);
2695                if (ret) {
2696                        mlog_errno(ret);
2697                        goto out;
2698                }
2699
2700                ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2701                                                right_path, subtree_root,
2702                                                dealloc, &deleted, et);
2703                if (ret == -EAGAIN) {
2704                        /*
2705                         * The rotation has to temporarily stop due to
2706                         * the right subtree having an empty
2707                         * extent. Pass it back to the caller for a
2708                         * fixup.
2709                         */
2710                        *empty_extent_path = right_path;
2711                        right_path = NULL;
2712                        goto out;
2713                }
2714                if (ret) {
2715                        mlog_errno(ret);
2716                        goto out;
2717                }
2718
2719                /*
2720                 * The subtree rotate might have removed records on
2721                 * the rightmost edge. If so, then rotation is
2722                 * complete.
2723                 */
2724                if (deleted)
2725                        break;
2726
2727                ocfs2_mv_path(left_path, right_path);
2728
2729                ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2730                                                     &right_cpos);
2731                if (ret) {
2732                        mlog_errno(ret);
2733                        goto out;
2734                }
2735        }
2736
2737out:
2738        ocfs2_free_path(right_path);
2739        ocfs2_free_path(left_path);
2740
2741        return ret;
2742}
2743
2744static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2745                                struct ocfs2_path *path,
2746                                struct ocfs2_cached_dealloc_ctxt *dealloc,
2747                                struct ocfs2_extent_tree *et)
2748{
2749        int ret, subtree_index;
2750        u32 cpos;
2751        struct ocfs2_path *left_path = NULL;
2752        struct ocfs2_extent_block *eb;
2753        struct ocfs2_extent_list *el;
2754
2755
2756        ret = ocfs2_et_sanity_check(inode, et);
2757        if (ret)
2758                goto out;
2759        /*
2760         * There are two ways we handle this, depending on
2761         * whether path is the only existing one.
2762         */
2763        ret = ocfs2_extend_rotate_transaction(handle, 0,
2764                                              handle->h_buffer_credits,
2765                                              path);
2766        if (ret) {
2767                mlog_errno(ret);
2768                goto out;
2769        }
2770
2771        ret = ocfs2_journal_access_path(inode, handle, path);
2772        if (ret) {
2773                mlog_errno(ret);
2774                goto out;
2775        }
2776
2777        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2778        if (ret) {
2779                mlog_errno(ret);
2780                goto out;
2781        }
2782
2783        if (cpos) {
2784                /*
2785                 * We have a path to the left of this one - it needs
2786                 * an update too.
2787                 */
2788                left_path = ocfs2_new_path(path_root_bh(path),
2789                                           path_root_el(path));
2790                if (!left_path) {
2791                        ret = -ENOMEM;
2792                        mlog_errno(ret);
2793                        goto out;
2794                }
2795
2796                ret = ocfs2_find_path(inode, left_path, cpos);
2797                if (ret) {
2798                        mlog_errno(ret);
2799                        goto out;
2800                }
2801
2802                ret = ocfs2_journal_access_path(inode, handle, left_path);
2803                if (ret) {
2804                        mlog_errno(ret);
2805                        goto out;
2806                }
2807
2808                subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2809
2810                ocfs2_unlink_subtree(inode, handle, left_path, path,
2811                                     subtree_index, dealloc);
2812                ocfs2_update_edge_lengths(inode, handle, left_path);
2813
2814                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2815                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2816        } else {
2817                /*
2818                 * 'path' is also the leftmost path which
2819                 * means it must be the only one. This gets
2820                 * handled differently because we want to
2821                 * revert the inode back to having extents
2822                 * in-line.
2823                 */
2824                ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2825
2826                el = et->et_root_el;
2827                el->l_tree_depth = 0;
2828                el->l_next_free_rec = 0;
2829                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2830
2831                ocfs2_et_set_last_eb_blk(et, 0);
2832        }
2833
2834        ocfs2_journal_dirty(handle, path_root_bh(path));
2835
2836out:
2837        ocfs2_free_path(left_path);
2838        return ret;
2839}
2840
2841/*
2842 * Left rotation of btree records.
2843 *
2844 * In many ways, this is (unsurprisingly) the opposite of right
2845 * rotation. We start at some non-rightmost path containing an empty
2846 * extent in the leaf block. The code works its way to the rightmost
2847 * path by rotating records to the left in every subtree.
2848 *
2849 * This is used by any code which reduces the number of extent records
2850 * in a leaf. After removal, an empty record should be placed in the
2851 * leftmost list position.
2852 *
2853 * This won't handle a length update of the rightmost path records if
2854 * the rightmost tree leaf record is removed, so the caller is
2855 * responsible for detecting and correcting that.
2856 */
2857static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2858                                  struct ocfs2_path *path,
2859                                  struct ocfs2_cached_dealloc_ctxt *dealloc,
2860                                  struct ocfs2_extent_tree *et)
2861{
2862        int ret, orig_credits = handle->h_buffer_credits;
2863        struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2864        struct ocfs2_extent_block *eb;
2865        struct ocfs2_extent_list *el;
2866
2867        el = path_leaf_el(path);
2868        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2869                return 0;
2870
2871        if (path->p_tree_depth == 0) {
2872rightmost_no_delete:
2873                /*
2874                 * Inline extents. This is trivially handled, so do
2875                 * it up front.
2876                 */
2877                ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2878                                                       path_leaf_bh(path),
2879                                                       path_leaf_el(path));
2880                if (ret)
2881                        mlog_errno(ret);
2882                goto out;
2883        }
2884
2885        /*
2886         * Handle the rightmost branch now. There are several cases:
2887         *  1) simple rotation leaving records in there. That's trivial.
2888         *  2) rotation requiring a branch delete - there are no more
2889         *     records left. Two cases of this:
2890         *     a) There are branches to the left.
2891         *     b) This is also the leftmost (the only) branch.
2892         *
2893         *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2894         *  2a) we need the left branch so that we can update it with the unlink
2895         *  2b) we need to bring the inode back to inline extents.
2896         */
2897
2898        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2899        el = &eb->h_list;
2900        if (eb->h_next_leaf_blk == 0) {
2901                /*
2902                 * This gets a bit tricky if we're going to delete the
2903                 * rightmost path. Get the other cases out of the way
2904                 * 1st.
2905                 */
2906                if (le16_to_cpu(el->l_next_free_rec) > 1)
2907                        goto rightmost_no_delete;
2908
2909                if (le16_to_cpu(el->l_next_free_rec) == 0) {
2910                        ret = -EIO;
2911                        ocfs2_error(inode->i_sb,
2912                                    "Inode %llu has empty extent block at %llu",
2913                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
2914                                    (unsigned long long)le64_to_cpu(eb->h_blkno));
2915                        goto out;
2916                }
2917
2918                /*
2919                 * XXX: The caller can not trust "path" any more after
2920                 * this as it will have been deleted. What do we do?
2921                 *
2922                 * In theory the rotate-for-merge code will never get
2923                 * here because it'll always ask for a rotate in a
2924                 * nonempty list.
2925                 */
2926
2927                ret = ocfs2_remove_rightmost_path(inode, handle, path,
2928                                                  dealloc, et);
2929                if (ret)
2930                        mlog_errno(ret);
2931                goto out;
2932        }
2933
2934        /*
2935         * Now we can loop, remembering the path we get from -EAGAIN
2936         * and restarting from there.
2937         */
2938try_rotate:
2939        ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2940                                       dealloc, &restart_path, et);
2941        if (ret && ret != -EAGAIN) {
2942                mlog_errno(ret);
2943                goto out;
2944        }
2945
2946        while (ret == -EAGAIN) {
2947                tmp_path = restart_path;
2948                restart_path = NULL;
2949
2950                ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2951                                               tmp_path, dealloc,
2952                                               &restart_path, et);
2953                if (ret && ret != -EAGAIN) {
2954                        mlog_errno(ret);
2955                        goto out;
2956                }
2957
2958                ocfs2_free_path(tmp_path);
2959                tmp_path = NULL;
2960
2961                if (ret == 0)
2962                        goto try_rotate;
2963        }
2964
2965out:
2966        ocfs2_free_path(tmp_path);
2967        ocfs2_free_path(restart_path);
2968        return ret;
2969}
2970
2971static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2972                                int index)
2973{
2974        struct ocfs2_extent_rec *rec = &el->l_recs[index];
2975        unsigned int size;
2976
2977        if (rec->e_leaf_clusters == 0) {
2978                /*
2979                 * We consumed all of the merged-from record. An empty
2980                 * extent cannot exist anywhere but the 1st array
2981                 * position, so move things over if the merged-from
2982                 * record doesn't occupy that position.
2983                 *
2984                 * This creates a new empty extent so the caller
2985                 * should be smart enough to have removed any existing
2986                 * ones.
2987                 */
2988                if (index > 0) {
2989                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2990                        size = index * sizeof(struct ocfs2_extent_rec);
2991                        memmove(&el->l_recs[1], &el->l_recs[0], size);
2992                }
2993
2994                /*
2995                 * Always memset - the caller doesn't check whether it
2996                 * created an empty extent, so there could be junk in
2997                 * the other fields.
2998                 */
2999                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3000        }
3001}
3002
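    /*
     * Find and read the path to the leaf immediately to the right of
     * left_path, sharing left_path's root buffer.  On success the
     * caller owns *ret_right_path and must release it with
     * ocfs2_free_path().
     */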
3003static int ocfs2_get_right_path(struct inode *inode,
3004                                struct ocfs2_path *left_path,
3005                                struct ocfs2_path **ret_right_path)
3006{
3007        int ret;
3008        u32 right_cpos;
3009        struct ocfs2_path *right_path = NULL;
3010        struct ocfs2_extent_list *left_el;
3011
3012        *ret_right_path = NULL;
3013
3014        /* This function shouldn't be called for non-trees. */
3015        BUG_ON(left_path->p_tree_depth == 0);
3016
3017        left_el = path_leaf_el(left_path);
3018        BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3019
3020        ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3021                                             &right_cpos);
3022        if (ret) {
3023                mlog_errno(ret);
3024                goto out;
3025        }
3026
3027        /* This function shouldn't be called for the rightmost leaf. */
3028        BUG_ON(right_cpos == 0);
3029
3030        right_path = ocfs2_new_path(path_root_bh(left_path),
3031                                    path_root_el(left_path));
3032        if (!right_path) {
3033                ret = -ENOMEM;
3034                mlog_errno(ret);
3035                goto out;
3036        }
3037
3038        ret = ocfs2_find_path(inode, right_path, right_cpos);
3039        if (ret) {
3040                mlog_errno(ret);
3041                goto out;
3042        }
3043
3044        *ret_right_path = right_path;
3045out:
3046        if (ret)
3047                ocfs2_free_path(right_path);
3048        return ret;
3049}
3050
3051/*
3052 * Remove split_rec clusters from the record at index and merge them
3053 * onto the beginning of the record "next" to it.
3054 * For index < l_count - 1, the "next" means the extent rec at index + 1.
3055 * For index == l_count - 1, the "next" means the 1st extent rec of the
3056 * next extent block.
3057 */
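    /*
     * Illustrative example (hypothetical numbers): given a left record
     * of (cpos 100, 8 clusters) followed by a right record of
     * (cpos 108, 4 clusters), merging a 2-cluster split_rec off the
     * tail of the left record leaves (cpos 100, 6 clusters) and
     * (cpos 106, 6 clusters) respectively.
     */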
3058static int ocfs2_merge_rec_right(struct inode *inode,
3059                                 struct ocfs2_path *left_path,
3060                                 handle_t *handle,
3061                                 struct ocfs2_extent_rec *split_rec,
3062                                 int index)
3063{
3064        int ret, next_free, i;
3065        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3066        struct ocfs2_extent_rec *left_rec;
3067        struct ocfs2_extent_rec *right_rec;
3068        struct ocfs2_extent_list *right_el;
3069        struct ocfs2_path *right_path = NULL;
3070        int subtree_index = 0;
3071        struct ocfs2_extent_list *el = path_leaf_el(left_path);
3072        struct buffer_head *bh = path_leaf_bh(left_path);
3073        struct buffer_head *root_bh = NULL;
3074
3075        BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3076        left_rec = &el->l_recs[index];
3077
3078        if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3079            le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3080                /* We have a cross extent block merge. */
3081                ret = ocfs2_get_right_path(inode, left_path, &right_path);
3082                if (ret) {
3083                        mlog_errno(ret);
3084                        goto out;
3085                }
3086
3087                right_el = path_leaf_el(right_path);
3088                next_free = le16_to_cpu(right_el->l_next_free_rec);
3089                BUG_ON(next_free <= 0);
3090                right_rec = &right_el->l_recs[0];
3091                if (ocfs2_is_empty_extent(right_rec)) {
3092                        BUG_ON(next_free <= 1);
3093                        right_rec = &right_el->l_recs[1];
3094                }
3095
3096                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3097                       le16_to_cpu(left_rec->e_leaf_clusters) !=
3098                       le32_to_cpu(right_rec->e_cpos));
3099
3100                subtree_index = ocfs2_find_subtree_root(inode,
3101                                                        left_path, right_path);
3102
3103                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3104                                                      handle->h_buffer_credits,
3105                                                      right_path);
3106                if (ret) {
3107                        mlog_errno(ret);
3108                        goto out;
3109                }
3110
3111                root_bh = left_path->p_node[subtree_index].bh;
3112                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3113
3114                ret = ocfs2_journal_access(handle, inode, root_bh,
3115                                           OCFS2_JOURNAL_ACCESS_WRITE);
3116                if (ret) {
3117                        mlog_errno(ret);
3118                        goto out;
3119                }
3120
3121                for (i = subtree_index + 1;
3122                     i < path_num_items(right_path); i++) {
3123                        ret = ocfs2_journal_access(handle, inode,
3124                                                   right_path->p_node[i].bh,
3125                                                   OCFS2_JOURNAL_ACCESS_WRITE);
3126                        if (ret) {
3127                                mlog_errno(ret);
3128                                goto out;
3129                        }
3130
3131                        ret = ocfs2_journal_access(handle, inode,
3132                                                   left_path->p_node[i].bh,
3133                                                   OCFS2_JOURNAL_ACCESS_WRITE);
3134                        if (ret) {
3135                                mlog_errno(ret);
3136                                goto out;
3137                        }
3138                }
3139
3140        } else {
3141                BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3142                right_rec = &el->l_recs[index + 1];
3143        }
3144
3145        ret = ocfs2_journal_access(handle, inode, bh,
3146                                   OCFS2_JOURNAL_ACCESS_WRITE);
3147        if (ret) {
3148                mlog_errno(ret);
3149                goto out;
3150        }
3151
3152        le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3153
3154        le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3155        le64_add_cpu(&right_rec->e_blkno,
3156                     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3157        le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3158
3159        ocfs2_cleanup_merge(el, index);
3160
3161        ret = ocfs2_journal_dirty(handle, bh);
3162        if (ret)
3163                mlog_errno(ret);
3164
3165        if (right_path) {
3166                ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3167                if (ret)
3168                        mlog_errno(ret);
3169
3170                ocfs2_complete_edge_insert(inode, handle, left_path,
3171                                           right_path, subtree_index);
3172        }
3173out:
3174        if (right_path)
3175                ocfs2_free_path(right_path);
3176        return ret;
3177}
3178
3179static int ocfs2_get_left_path(struct inode *inode,
3180                               struct ocfs2_path *right_path,
3181                               struct ocfs2_path **ret_left_path)
3182{
3183        int ret;
3184        u32 left_cpos;
3185        struct ocfs2_path *left_path = NULL;
3186
3187        *ret_left_path = NULL;
3188
3189        /* This function shouldn't be called for non-trees. */
3190        BUG_ON(right_path->p_tree_depth == 0);
3191
3192        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3193                                            right_path, &left_cpos);
3194        if (ret) {
3195                mlog_errno(ret);
3196                goto out;
3197        }
3198
3199        /* This function shouldn't be called for the leftmost leaf. */
3200        BUG_ON(left_cpos == 0);
3201
3202        left_path = ocfs2_new_path(path_root_bh(right_path),
3203                                   path_root_el(right_path));
3204        if (!left_path) {
3205                ret = -ENOMEM;
3206                mlog_errno(ret);
3207                goto out;
3208        }
3209
3210        ret = ocfs2_find_path(inode, left_path, left_cpos);
3211        if (ret) {
3212                mlog_errno(ret);
3213                goto out;
3214        }
3215
3216        *ret_left_path = left_path;
3217out:
3218        if (ret)
3219                ocfs2_free_path(left_path);
3220        return ret;
3221}
3222
3223/*
3224 * Remove split_rec clusters from the record at index and merge them
3225 * onto the tail of the record "before" it.
3226 * For index > 0, the "before" means the extent rec at index - 1.
3227 *
3228 * For index == 0, the "before" means the last record of the previous
3229 * extent block. There is also a case where we may need to remove
3230 * the rightmost leaf extent block from the right_path and change
3231 * the right path to point at the new rightmost path.
3232 */
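    /*
     * For example (hypothetical numbers): merging a 2-cluster
     * split_rec off the head of a right record (cpos 104, 8 clusters)
     * into a left record (cpos 100, 4 clusters) leaves (cpos 106,
     * 6 clusters) and (cpos 100, 6 clusters) respectively.
     */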
3233static int ocfs2_merge_rec_left(struct inode *inode,
3234                                struct ocfs2_path *right_path,
3235                                handle_t *handle,
3236                                struct ocfs2_extent_rec *split_rec,
3237                                struct ocfs2_cached_dealloc_ctxt *dealloc,
3238                                struct ocfs2_extent_tree *et,
3239                                int index)
3240{
3241        int ret, i, subtree_index = 0, has_empty_extent = 0;
3242        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3243        struct ocfs2_extent_rec *left_rec;
3244        struct ocfs2_extent_rec *right_rec;
3245        struct ocfs2_extent_list *el = path_leaf_el(right_path);
3246        struct buffer_head *bh = path_leaf_bh(right_path);
3247        struct buffer_head *root_bh = NULL;
3248        struct ocfs2_path *left_path = NULL;
3249        struct ocfs2_extent_list *left_el;
3250
3251        BUG_ON(index < 0);
3252
3253        right_rec = &el->l_recs[index];
3254        if (index == 0) {
3255                /* We have a cross extent block merge. */
3256                ret = ocfs2_get_left_path(inode, right_path, &left_path);
3257                if (ret) {
3258                        mlog_errno(ret);
3259                        goto out;
3260                }
3261
3262                left_el = path_leaf_el(left_path);
3263                BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3264                       le16_to_cpu(left_el->l_count));
3265
3266                left_rec = &left_el->l_recs[
3267                                le16_to_cpu(left_el->l_next_free_rec) - 1];
3268                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3269                       le16_to_cpu(left_rec->e_leaf_clusters) !=
3270                       le32_to_cpu(split_rec->e_cpos));
3271
3272                subtree_index = ocfs2_find_subtree_root(inode,
3273                                                        left_path, right_path);
3274
3275                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3276                                                      handle->h_buffer_credits,
3277                                                      left_path);
3278                if (ret) {
3279                        mlog_errno(ret);
3280                        goto out;
3281                }
3282
3283                root_bh = left_path->p_node[subtree_index].bh;
3284                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3285
3286                ret = ocfs2_journal_access(handle, inode, root_bh,
3287                                           OCFS2_JOURNAL_ACCESS_WRITE);
3288                if (ret) {
3289                        mlog_errno(ret);
3290                        goto out;
3291                }
3292
3293                for (i = subtree_index + 1;
3294                     i < path_num_items(right_path); i++) {
3295                        ret = ocfs2_journal_access(handle, inode,
3296                                                   right_path->p_node[i].bh,
3297                                                   OCFS2_JOURNAL_ACCESS_WRITE);
3298                        if (ret) {
3299                                mlog_errno(ret);
3300                                goto out;
3301                        }
3302
3303                        ret = ocfs2_journal_access(handle, inode,
3304                                                   left_path->p_node[i].bh,
3305                                                   OCFS2_JOURNAL_ACCESS_WRITE);
3306                        if (ret) {
3307                                mlog_errno(ret);
3308                                goto out;
3309                        }
3310                }
3311        } else {
3312                left_rec = &el->l_recs[index - 1];
3313                if (ocfs2_is_empty_extent(&el->l_recs[0]))
3314                        has_empty_extent = 1;
3315        }
3316
3317        ret = ocfs2_journal_access(handle, inode, bh,
3318                                   OCFS2_JOURNAL_ACCESS_WRITE);
3319        if (ret) {
3320                mlog_errno(ret);
3321                goto out;
3322        }
3323
3324        if (has_empty_extent && index == 1) {
3325                /*
3326                 * The easy case - we can just plop the record right in.
3327                 */
3328                *left_rec = *split_rec;
3329
3330                has_empty_extent = 0;
3331        } else
3332                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3333
3334        le32_add_cpu(&right_rec->e_cpos, split_clusters);
3335        le64_add_cpu(&right_rec->e_blkno,
3336                     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3337        le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3338
3339        ocfs2_cleanup_merge(el, index);
3340
3341        ret = ocfs2_journal_dirty(handle, bh);
3342        if (ret)
3343                mlog_errno(ret);
3344
3345        if (left_path) {
3346                ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3347                if (ret)
3348                        mlog_errno(ret);
3349
3350                /*
3351                 * If the right_rec is empty and the extent block is also
3352                 * empty, ocfs2_complete_edge_insert() can't handle it and
3353                 * we need to delete the right extent block.
3354                 */
3355                if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3356                    le16_to_cpu(el->l_next_free_rec) == 1) {
3357
3358                        ret = ocfs2_remove_rightmost_path(inode, handle,
3359                                                          right_path,
3360                                                          dealloc, et);
3361                        if (ret) {
3362                                mlog_errno(ret);
3363                                goto out;
3364                        }
3365
3366                        /* Now the rightmost extent block has been deleted.
3367                         * So we use the new rightmost path.
3368                         */
3369                        ocfs2_mv_path(right_path, left_path);
3370                        left_path = NULL;
3371                } else
3372                        ocfs2_complete_edge_insert(inode, handle, left_path,
3373                                                   right_path, subtree_index);
3374        }
3375out:
3376        if (left_path)
3377                ocfs2_free_path(left_path);
3378        return ret;
3379}
3380
3381static int ocfs2_try_to_merge_extent(struct inode *inode,
3382                                     handle_t *handle,
3383                                     struct ocfs2_path *path,
3384                                     int split_index,
3385                                     struct ocfs2_extent_rec *split_rec,
3386                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
3387                                     struct ocfs2_merge_ctxt *ctxt,
3388                                     struct ocfs2_extent_tree *et)
3389
3390{
3391        int ret = 0;
3392        struct ocfs2_extent_list *el = path_leaf_el(path);
3393        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3394
3395        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3396
3397        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3398                /*
3399                 * The merge code will need to create an empty
3400                 * extent to take the place of the newly
3401                 * emptied slot. Remove any pre-existing empty
3402                 * extents - having more than one in a leaf is
3403                 * illegal.
3404                 */
3405                ret = ocfs2_rotate_tree_left(inode, handle, path,
3406                                             dealloc, et);
3407                if (ret) {
3408                        mlog_errno(ret);
3409                        goto out;
3410                }
3411                split_index--;
3412                rec = &el->l_recs[split_index];
3413        }
3414
3415        if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3416                /*
3417                 * Left-right contig implies this.
3418                 */
3419                BUG_ON(!ctxt->c_split_covers_rec);
3420
3421                /*
3422                 * Since the leftright insert always covers the entire
3423                 * extent, this call will delete the insert record
3424                 * entirely, resulting in an empty extent record added to
3425                 * the extent block.
3426                 *
3427                 * Since the adding of an empty extent shifts
3428                 * everything back to the right, there's no need to
3429                 * update split_index here.
3430                 *
3431                 * When the split_index is zero, we need to merge it to the
3432                 * previous extent block. It is more efficient and easier
3433                 * if we do merge_right first and merge_left later.
3434                 */
3435                ret = ocfs2_merge_rec_right(inode, path,
3436                                            handle, split_rec,
3437                                            split_index);
3438                if (ret) {
3439                        mlog_errno(ret);
3440                        goto out;
3441                }
3442
3443                /*
3444                 * We can only get this from a logic error above.
3445                 */
3446                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3447
3448                /* The merge left us with an empty extent, remove it. */
3449                ret = ocfs2_rotate_tree_left(inode, handle, path,
3450                                             dealloc, et);
3451                if (ret) {
3452                        mlog_errno(ret);
3453                        goto out;
3454                }
3455
3456                rec = &el->l_recs[split_index];
3457
3458                /*
3459                 * Note that we don't pass split_rec here on purpose -
3460                 * we've merged it into the rec already.
3461                 */
3462                ret = ocfs2_merge_rec_left(inode, path,
3463                                           handle, rec,
3464                                           dealloc, et,
3465                                           split_index);
3466
3467                if (ret) {
3468                        mlog_errno(ret);
3469                        goto out;
3470                }
3471
3472                ret = ocfs2_rotate_tree_left(inode, handle, path,
3473                                             dealloc, et);
3474                /*
3475                 * Error from this last rotate is not critical, so
3476                 * print but don't bubble it up.
3477                 */
3478                if (ret)
3479                        mlog_errno(ret);
3480                ret = 0;
3481        } else {
3482                /*
3483                 * Merge a record to the left or right.
3484                 *
3485                 * 'contig_type' is relative to the existing record,
3486                 * so for example, if we're "right contig", it's to
3487                 * the record on the left (hence the left merge).
3488                 */
3489                if (ctxt->c_contig_type == CONTIG_RIGHT) {
3490                        ret = ocfs2_merge_rec_left(inode,
3491                                                   path,
3492                                                   handle, split_rec,
3493                                                   dealloc, et,
3494                                                   split_index);
3495                        if (ret) {
3496                                mlog_errno(ret);
3497                                goto out;
3498                        }
3499                } else {
3500                        ret = ocfs2_merge_rec_right(inode,
3501                                                    path,
3502                                                    handle, split_rec,
3503                                                    split_index);
3504                        if (ret) {
3505                                mlog_errno(ret);
3506                                goto out;
3507                        }
3508                }
3509
3510                if (ctxt->c_split_covers_rec) {
3511                        /*
3512                         * The merge may have left an empty extent in
3513                         * our leaf. Try to rotate it away.
3514                         */
3515                        ret = ocfs2_rotate_tree_left(inode, handle, path,
3516                                                     dealloc, et);
3517                        if (ret)
3518                                mlog_errno(ret);
3519                        ret = 0;
3520                }
3521        }
3522
3523out:
3524        return ret;
3525}
3526
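    /*
     * Trim split_rec's clusters off one edge of rec.  SPLIT_LEFT
     * advances rec past the split region; SPLIT_RIGHT simply shortens
     * it.  For example (hypothetical numbers), subtracting a 3-cluster
     * split from a record of (cpos 0, 10 clusters) leaves
     * (cpos 3, 7 clusters) for SPLIT_LEFT and (cpos 0, 7 clusters)
     * for SPLIT_RIGHT.
     */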
3527static void ocfs2_subtract_from_rec(struct super_block *sb,
3528                                    enum ocfs2_split_type split,
3529                                    struct ocfs2_extent_rec *rec,
3530                                    struct ocfs2_extent_rec *split_rec)
3531{
3532        u64 len_blocks;
3533
3534        len_blocks = ocfs2_clusters_to_blocks(sb,
3535                                le16_to_cpu(split_rec->e_leaf_clusters));
3536
3537        if (split == SPLIT_LEFT) {
3538                /*
3539                 * Region is on the left edge of the existing
3540                 * record.
3541                 */
3542                le32_add_cpu(&rec->e_cpos,
3543                             le16_to_cpu(split_rec->e_leaf_clusters));
3544                le64_add_cpu(&rec->e_blkno, len_blocks);
3545                le16_add_cpu(&rec->e_leaf_clusters,
3546                             -le16_to_cpu(split_rec->e_leaf_clusters));
3547        } else {
3548                /*
3549                 * Region is on the right edge of the existing
3550                 * record.
3551                 */
3552                le16_add_cpu(&rec->e_leaf_clusters,
3553                             -le16_to_cpu(split_rec->e_leaf_clusters));
3554        }
3555}
3556
3557/*
3558 * Do the final bits of extent record insertion at the target leaf
3559 * list. If this leaf is part of an allocation tree, it is assumed
3560 * that the tree above has been prepared.
3561 */
3562static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3563                                 struct ocfs2_extent_list *el,
3564                                 struct ocfs2_insert_type *insert,
3565                                 struct inode *inode)
3566{
3567        int i = insert->ins_contig_index;
3568        unsigned int range;
3569        struct ocfs2_extent_rec *rec;
3570
3571        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3572
3573        if (insert->ins_split != SPLIT_NONE) {
3574                i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3575                BUG_ON(i == -1);
3576                rec = &el->l_recs[i];
3577                ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3578                                        insert_rec);
3579                goto rotate;
3580        }
3581
3582        /*
3583         * Contiguous insert - either left or right.
3584         */
3585        if (insert->ins_contig != CONTIG_NONE) {
3586                rec = &el->l_recs[i];
3587                if (insert->ins_contig == CONTIG_LEFT) {
3588                        rec->e_blkno = insert_rec->e_blkno;
3589                        rec->e_cpos = insert_rec->e_cpos;
3590                }
3591                le16_add_cpu(&rec->e_leaf_clusters,
3592                             le16_to_cpu(insert_rec->e_leaf_clusters));
3593                return;
3594        }
3595
3596        /*
3597         * Handle insert into an empty leaf.
3598         */
3599        if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3600            ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3601             ocfs2_is_empty_extent(&el->l_recs[0]))) {
3602                el->l_recs[0] = *insert_rec;
3603                el->l_next_free_rec = cpu_to_le16(1);
3604                return;
3605        }
3606
3607        /*
3608         * Appending insert.
3609         */
3610        if (insert->ins_appending == APPEND_TAIL) {
3611                i = le16_to_cpu(el->l_next_free_rec) - 1;
3612                rec = &el->l_recs[i];
3613                range = le32_to_cpu(rec->e_cpos)
3614                        + le16_to_cpu(rec->e_leaf_clusters);
3615                BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3616
3617                mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3618                                le16_to_cpu(el->l_count),
3619                                "inode %lu, depth %u, count %u, next free %u, "
3620                                "rec.cpos %u, rec.clusters %u, "
3621                                "insert.cpos %u, insert.clusters %u\n",
3622                                inode->i_ino,
3623                                le16_to_cpu(el->l_tree_depth),
3624                                le16_to_cpu(el->l_count),
3625                                le16_to_cpu(el->l_next_free_rec),
3626                                le32_to_cpu(el->l_recs[i].e_cpos),
3627                                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3628                                le32_to_cpu(insert_rec->e_cpos),
3629                                le16_to_cpu(insert_rec->e_leaf_clusters));
3630                i++;
3631                el->l_recs[i] = *insert_rec;
3632                le16_add_cpu(&el->l_next_free_rec, 1);
3633                return;
3634        }
3635
3636rotate:
3637        /*
3638         * Ok, we have to rotate.
3639         *
3640         * At this point, it is safe to assume that inserting into an
3641         * empty leaf and appending to a leaf have both been handled
3642         * above.
3643         *
3644         * This leaf needs to have space, either by the empty 1st
3645         * extent record, or by virtue of an l_next_free_rec < l_count.
3646         */
3647        ocfs2_rotate_leaf(el, insert_rec);
3648}
3649
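    /*
     * Stretch the rightmost record of every interior node in 'path' so
     * that its e_int_clusters runs from that record's cpos to the end
     * of insert_rec.  The leaf itself is left for the caller to update.
     */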
3650static void ocfs2_adjust_rightmost_records(struct inode *inode,
3651                                           handle_t *handle,
3652                                           struct ocfs2_path *path,
3653                                           struct ocfs2_extent_rec *insert_rec)
3654{
3655        int ret, i, next_free;
3656        struct buffer_head *bh;
3657        struct ocfs2_extent_list *el;
3658        struct ocfs2_extent_rec *rec;
3659
3660        /*
3661         * Update everything except the leaf block.
3662         */
3663        for (i = 0; i < path->p_tree_depth; i++) {
3664                bh = path->p_node[i].bh;
3665                el = path->p_node[i].el;
3666
3667                next_free = le16_to_cpu(el->l_next_free_rec);
3668                if (next_free == 0) {
3669                        ocfs2_error(inode->i_sb,
3670                                    "Dinode %llu has a bad extent list",
3671                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3672                        ret = -EIO;
3673                        return;
3674                }
3675
3676                rec = &el->l_recs[next_free - 1];
3677
3678                rec->e_int_clusters = insert_rec->e_cpos;
3679                le32_add_cpu(&rec->e_int_clusters,
3680                             le16_to_cpu(insert_rec->e_leaf_clusters));
3681                le32_add_cpu(&rec->e_int_clusters,
3682                             -le32_to_cpu(rec->e_cpos));
3683
3684                ret = ocfs2_journal_dirty(handle, bh);
3685                if (ret)
3686                        mlog_errno(ret);
3687
3688        }
3689}
3690
3691static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3692                                    struct ocfs2_extent_rec *insert_rec,
3693                                    struct ocfs2_path *right_path,
3694                                    struct ocfs2_path **ret_left_path)
3695{
3696        int ret, next_free;
3697        struct ocfs2_extent_list *el;
3698        struct ocfs2_path *left_path = NULL;
3699
3700        *ret_left_path = NULL;
3701
3702        /*
3703         * This shouldn't happen for non-trees. The extent rec cluster
3704         * count manipulation below only works for interior nodes.
3705         */
3706        BUG_ON(right_path->p_tree_depth == 0);
3707
3708        /*
3709         * If our appending insert is at the leftmost edge of a leaf,
3710         * then we might need to update the rightmost records of the
3711         * neighboring path.
3712         */
3713        el = path_leaf_el(right_path);
3714        next_free = le16_to_cpu(el->l_next_free_rec);
3715        if (next_free == 0 ||
3716            (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3717                u32 left_cpos;
3718
3719                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3720                                                    &left_cpos);
3721                if (ret) {
3722                        mlog_errno(ret);
3723                        goto out;
3724                }
3725
3726                mlog(0, "Append may need a left path update. cpos: %u, "
3727                     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3728                     left_cpos);
3729
3730                /*
3731                 * No need to worry if the append is already in the
3732                 * leftmost leaf.
3733                 */
3734                if (left_cpos) {
3735                        left_path = ocfs2_new_path(path_root_bh(right_path),
3736                                                   path_root_el(right_path));
3737                        if (!left_path) {
3738                                ret = -ENOMEM;
3739                                mlog_errno(ret);
3740                                goto out;
3741                        }
3742
3743                        ret = ocfs2_find_path(inode, left_path, left_cpos);
3744                        if (ret) {
3745                                mlog_errno(ret);
3746                                goto out;
3747                        }
3748
3749                        /*
3750                         * ocfs2_insert_path() will pass the left_path to the
3751                         * journal for us.
3752                         */
3753                }
3754        }
3755
3756        ret = ocfs2_journal_access_path(inode, handle, right_path);
3757        if (ret) {
3758                mlog_errno(ret);
3759                goto out;
3760        }
3761
3762        ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3763
3764        *ret_left_path = left_path;
3765        ret = 0;
3766out:
3767        if (ret != 0)
3768                ocfs2_free_path(left_path);
3769
3770        return ret;
3771}
3772
3773static void ocfs2_split_record(struct inode *inode,
3774                               struct ocfs2_path *left_path,
3775                               struct ocfs2_path *right_path,
3776                               struct ocfs2_extent_rec *split_rec,
3777                               enum ocfs2_split_type split)
3778{
3779        int index;
3780        u32 cpos = le32_to_cpu(split_rec->e_cpos);
3781        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3782        struct ocfs2_extent_rec *rec, *tmprec;
3783
3784        right_el = path_leaf_el(right_path);
3785        if (left_path)
3786                left_el = path_leaf_el(left_path);
3787
3788        el = right_el;
3789        insert_el = right_el;
3790        index = ocfs2_search_extent_list(el, cpos);
3791        if (index != -1) {
3792                if (index == 0 && left_path) {
3793                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3794
3795                        /*
3796                         * This typically means that the record
3797                         * started in the left path but moved to the
3798                         * right as a result of rotation. We either
3799                         * move the existing record to the left, or we
3800                         * do the later insert there.
3801                         *
3802                         * In this case, the left path should always
3803                         * exist as the rotate code will have passed
3804                         * it back for a post-insert update.
3805                         */
3806
3807                        if (split == SPLIT_LEFT) {
3808                                /*
3809                                 * It's a left split. Since we know
3810                                 * that the rotate code gave us an
3811                                 * empty extent in the left path, we
3812                                 * can just do the insert there.
3813                                 */
3814                                insert_el = left_el;
3815                        } else {
3816                                /*
3817                                 * Right split - we have to move the
3818                                 * existing record over to the left
3819                                 * leaf. The insert will be into the
3820                                 * newly created empty extent in the
3821                                 * right leaf.
3822                                 */
3823                                tmprec = &right_el->l_recs[index];
3824                                ocfs2_rotate_leaf(left_el, tmprec);
3825                                el = left_el;
3826
3827                                memset(tmprec, 0, sizeof(*tmprec));
3828                                index = ocfs2_search_extent_list(left_el, cpos);
3829                                BUG_ON(index == -1);
3830                        }
3831                }
3832        } else {
3833                BUG_ON(!left_path);
3834                BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3835                /*
3836                 * Left path is easy - we can just allow the insert to
3837                 * happen.
3838                 */
3839                el = left_el;
3840                insert_el = left_el;
3841                index = ocfs2_search_extent_list(el, cpos);
3842                BUG_ON(index == -1);
3843        }
3844
3845        rec = &el->l_recs[index];
3846        ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3847        ocfs2_rotate_leaf(insert_el, split_rec);
3848}
3849
3850/*
3851 * This function only does inserts on an allocation b-tree. For tree
3852 * depth = 0, ocfs2_insert_at_leaf() is called directly.
3853 *
3854 * right_path is the path we want to do the actual insert
3855 * in. left_path should only be passed in if we need to update that
3856 * portion of the tree after an edge insert.
3857 */
3858static int ocfs2_insert_path(struct inode *inode,
3859                             handle_t *handle,
3860                             struct ocfs2_path *left_path,
3861                             struct ocfs2_path *right_path,
3862                             struct ocfs2_extent_rec *insert_rec,
3863                             struct ocfs2_insert_type *insert)
3864{
3865        int ret, subtree_index;
3866        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3867
3868        if (left_path) {
3869                int credits = handle->h_buffer_credits;
3870
3871                /*
3872                 * There's a chance that left_path got passed back to
3873                 * us without being accounted for in the
3874                 * journal. Extend our transaction here to be sure we
3875                 * can change those blocks.
3876                 */
3877                credits += left_path->p_tree_depth;
3878
3879                ret = ocfs2_extend_trans(handle, credits);
3880                if (ret < 0) {
3881                        mlog_errno(ret);
3882                        goto out;
3883                }
3884
3885                ret = ocfs2_journal_access_path(inode, handle, left_path);
3886                if (ret < 0) {
3887                        mlog_errno(ret);
3888                        goto out;
3889                }
3890        }
3891
3892        /*
3893         * Pass both paths to the journal. The majority of inserts
3894         * will be touching all components anyway.
3895         */
3896        ret = ocfs2_journal_access_path(inode, handle, right_path);
3897        if (ret < 0) {
3898                mlog_errno(ret);
3899                goto out;
3900        }
3901
3902        if (insert->ins_split != SPLIT_NONE) {
3903                /*
3904                 * We could call ocfs2_insert_at_leaf() for some types
3905                 * of splits, but it's easier to just let one separate
3906                 * function sort it all out.
3907                 */
3908                ocfs2_split_record(inode, left_path, right_path,
3909                                   insert_rec, insert->ins_split);
3910
3911                /*
3912                 * Split might have modified either leaf and we don't
3913                 * have a guarantee that the later edge insert will
3914                 * dirty this for us.
3915                 */
3916                if (left_path) {
3917                        ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3918                        if (ret)
3919                                mlog_errno(ret);
3920                }
3921        } else
3922                ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3923                                     insert, inode);
3924
3925        ret = ocfs2_journal_dirty(handle, leaf_bh);
3926        if (ret)
3927                mlog_errno(ret);
3928
3929        if (left_path) {
3930                /*
3931                 * The rotate code has indicated that we need to fix
3932                 * up portions of the tree after the insert.
3933                 *
3934                 * XXX: Should we extend the transaction here?
3935                 */
3936                subtree_index = ocfs2_find_subtree_root(inode, left_path,
3937                                                        right_path);
3938                ocfs2_complete_edge_insert(inode, handle, left_path,
3939                                           right_path, subtree_index);
3940        }
3941
3942        ret = 0;
3943out:
3944        return ret;
3945}
3946
3947static int ocfs2_do_insert_extent(struct inode *inode,
3948                                  handle_t *handle,
3949                                  struct ocfs2_extent_tree *et,
3950                                  struct ocfs2_extent_rec *insert_rec,
3951                                  struct ocfs2_insert_type *type)
3952{
3953        int ret, rotate = 0;
3954        u32 cpos;
3955        struct ocfs2_path *right_path = NULL;
3956        struct ocfs2_path *left_path = NULL;
3957        struct ocfs2_extent_list *el;
3958
3959        el = et->et_root_el;
3960
3961        ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3962                                   OCFS2_JOURNAL_ACCESS_WRITE);
3963        if (ret) {
3964                mlog_errno(ret);
3965                goto out;
3966        }
3967
3968        if (le16_to_cpu(el->l_tree_depth) == 0) {
3969                ocfs2_insert_at_leaf(insert_rec, el, type, inode);
3970                goto out_update_clusters;
3971        }
3972
3973        right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3974        if (!right_path) {
3975                ret = -ENOMEM;
3976                mlog_errno(ret);
3977                goto out;
3978        }
3979
3980        /*
3981         * Determine the path to start with. Rotations need the
3982         * rightmost path, everything else can go directly to the
3983         * target leaf.
3984         */
3985        cpos = le32_to_cpu(insert_rec->e_cpos);
3986        if (type->ins_appending == APPEND_NONE &&
3987            type->ins_contig == CONTIG_NONE) {
3988                rotate = 1;
3989                cpos = UINT_MAX;
3990        }
3991
3992        ret = ocfs2_find_path(inode, right_path, cpos);
3993        if (ret) {
3994                mlog_errno(ret);
3995                goto out;
3996        }
3997
3998        /*
3999         * Rotations and appends need special treatment - they modify
4000         * parts of the tree above them.
4001         *
4002         * Both might pass back a path immediately to the left of the
4003         * one being inserted into. This will cause
4004         * ocfs2_insert_path() to modify the rightmost records of
4005         * left_path to account for an edge insert.
4006         *
4007         * XXX: When modifying this code, keep in mind that an insert
4008         * can wind up skipping both of these two special cases...
4009         */
4010        if (rotate) {
4011                ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4012                                              le32_to_cpu(insert_rec->e_cpos),
4013                                              right_path, &left_path);
4014                if (ret) {
4015                        mlog_errno(ret);
4016                        goto out;
4017                }
4018
4019                /*
4020                 * ocfs2_rotate_tree_right() might have extended the
4021                 * transaction without re-journaling our tree root.
4022                 */
4023                ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
4024                                           OCFS2_JOURNAL_ACCESS_WRITE);
4025                if (ret) {
4026                        mlog_errno(ret);
4027                        goto out;
4028                }
4029        } else if (type->ins_appending == APPEND_TAIL
4030                   && type->ins_contig != CONTIG_LEFT) {
4031                ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4032                                               right_path, &left_path);
4033                if (ret) {
4034                        mlog_errno(ret);
4035                        goto out;
4036                }
4037        }
4038
4039        ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4040                                insert_rec, type);
4041        if (ret) {
4042                mlog_errno(ret);
4043                goto out;
4044        }
4045
4046out_update_clusters:
4047        if (type->ins_split == SPLIT_NONE)
4048                ocfs2_et_update_clusters(inode, et,
4049                                         le16_to_cpu(insert_rec->e_leaf_clusters));
4050
4051        ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4052        if (ret)
4053                mlog_errno(ret);
4054
4055out:
4056        ocfs2_free_path(left_path);
4057        ocfs2_free_path(right_path);
4058
4059        return ret;
4060}
4061
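    /*
     * Work out how split_rec is contiguous with its neighbours in the
     * leaf at 'path': check the record before index (possibly the tail
     * of the previous leaf) and the record after it (possibly the head
     * of the next leaf).  Returns CONTIG_LEFT, CONTIG_RIGHT,
     * CONTIG_LEFTRIGHT or CONTIG_NONE.
     */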
4062static enum ocfs2_contig_type
4063ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4064                               struct ocfs2_extent_list *el, int index,
4065                               struct ocfs2_extent_rec *split_rec)
4066{
4067        int status;
4068        enum ocfs2_contig_type ret = CONTIG_NONE;
4069        u32 left_cpos, right_cpos;
4070        struct ocfs2_extent_rec *rec = NULL;
4071        struct ocfs2_extent_list *new_el;
4072        struct ocfs2_path *left_path = NULL, *right_path = NULL;
4073        struct buffer_head *bh;
4074        struct ocfs2_extent_block *eb;
4075
4076        if (index > 0) {
4077                rec = &el->l_recs[index - 1];
4078        } else if (path->p_tree_depth > 0) {
4079                status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4080                                                       path, &left_cpos);
4081                if (status)
4082                        goto out;
4083
4084                if (left_cpos != 0) {
4085                        left_path = ocfs2_new_path(path_root_bh(path),
4086                                                   path_root_el(path));
4087                        if (!left_path)
4088                                goto out;
4089
4090                        status = ocfs2_find_path(inode, left_path, left_cpos);
4091                        if (status)
4092                                goto out;
4093
4094                        new_el = path_leaf_el(left_path);
4095
4096                        if (le16_to_cpu(new_el->l_next_free_rec) !=
4097                            le16_to_cpu(new_el->l_count)) {
4098                                bh = path_leaf_bh(left_path);
4099                                eb = (struct ocfs2_extent_block *)bh->b_data;
4100                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4101                                                                 eb);
4102                                goto out;
4103                        }
4104                        rec = &new_el->l_recs[
4105                                le16_to_cpu(new_el->l_next_free_rec) - 1];
4106                }
4107        }
4108
4109        /*
4110         * We're careful to check for an empty extent record here -
4111         * the merge code will know what to do if it sees one.
4112         */
4113        if (rec) {
4114                if (index == 1 && ocfs2_is_empty_extent(rec)) {
4115                        if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4116                                ret = CONTIG_RIGHT;
4117                } else {
4118                        ret = ocfs2_extent_contig(inode, rec, split_rec);
4119                }
4120        }
4121
4122        rec = NULL;
4123        if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4124                rec = &el->l_recs[index + 1];
4125        else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4126                 path->p_tree_depth > 0) {
4127                status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4128                                                        path, &right_cpos);
4129                if (status)
4130                        goto out;
4131
4132                if (right_cpos == 0)
4133                        goto out;
4134
4135                right_path = ocfs2_new_path(path_root_bh(path),
4136                                            path_root_el(path));
4137                if (!right_path)
4138                        goto out;
4139
4140                status = ocfs2_find_path(inode, right_path, right_cpos);
4141                if (status)
4142                        goto out;
4143
4144                new_el = path_leaf_el(right_path);
4145                rec = &new_el->l_recs[0];
4146                if (ocfs2_is_empty_extent(rec)) {
4147                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4148                                bh = path_leaf_bh(right_path);
4149                                eb = (struct ocfs2_extent_block *)bh->b_data;
4150                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
4151                                                                 eb);
4152                                goto out;
4153                        }
4154                        rec = &new_el->l_recs[1];
4155                }
4156        }
4157
4158        if (rec) {
4159                enum ocfs2_contig_type contig_type;
4160
4161                contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4162
4163                if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4164                        ret = CONTIG_LEFTRIGHT;
4165                else if (ret == CONTIG_NONE)
4166                        ret = contig_type;
4167        }
4168
4169out:
4170        if (left_path)
4171                ocfs2_free_path(left_path);
4172        if (right_path)
4173                ocfs2_free_path(right_path);
4174
4175        return ret;
4176}
4177
4178static void ocfs2_figure_contig_type(struct inode *inode,
4179                                     struct ocfs2_insert_type *insert,
4180                                     struct ocfs2_extent_list *el,
4181                                     struct ocfs2_extent_rec *insert_rec,
4182                                     struct ocfs2_extent_tree *et)
4183{
4184        int i;
4185        enum ocfs2_contig_type contig_type = CONTIG_NONE;
4186
4187        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4188
4189        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4190                contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4191                                                  insert_rec);
4192                if (contig_type != CONTIG_NONE) {
4193                        insert->ins_contig_index = i;
4194                        break;
4195                }
4196        }
4197        insert->ins_contig = contig_type;
4198
4199        if (insert->ins_contig != CONTIG_NONE) {
4200                struct ocfs2_extent_rec *rec =
4201                                &el->l_recs[insert->ins_contig_index];
4202                unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4203                                   le16_to_cpu(insert_rec->e_leaf_clusters);
4204
4205                /*
4206                 * Caller might want us to limit the size of extents; don't
4207                 * calculate contiguousness if we might exceed that limit.
4208                 */
4209                if (et->et_max_leaf_clusters &&
4210                    (len > et->et_max_leaf_clusters))
4211                        insert->ins_contig = CONTIG_NONE;
4212        }
4213}
4214
4215/*
4216 * This should only be called against the rightmost leaf extent list.
4217 *
4218 * ocfs2_figure_appending_type() will figure out whether we'll have to
4219 * insert at the tail of the rightmost leaf.
4220 *
4221 * This should also work against the root extent list for trees with 0
4222 * depth. If we consider the root extent list to be the rightmost leaf node
4223 * then the logic here makes sense.
4224 */
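    /*
     * A small worked example of the check below (values invented for
     * illustration): if the last live record in the rightmost leaf is
     * { e_cpos = 100, e_leaf_clusters = 8 }, i.e. it covers clusters
     * [100, 108), then an insert_rec with e_cpos >= 108 starts at or
     * past everything currently mapped and is flagged APPEND_TAIL; an
     * insert_rec starting below 108 leaves ins_appending as APPEND_NONE.
     */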
4225static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4226                                        struct ocfs2_extent_list *el,
4227                                        struct ocfs2_extent_rec *insert_rec)
4228{
4229        int i;
4230        u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4231        struct ocfs2_extent_rec *rec;
4232
4233        insert->ins_appending = APPEND_NONE;
4234
4235        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4236
4237        if (!el->l_next_free_rec)
4238                goto set_tail_append;
4239
4240        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4241                /* Were all records empty? */
4242                if (le16_to_cpu(el->l_next_free_rec) == 1)
4243                        goto set_tail_append;
4244        }
4245
4246        i = le16_to_cpu(el->l_next_free_rec) - 1;
4247        rec = &el->l_recs[i];
4248
4249        if (cpos >=
4250            (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4251                goto set_tail_append;
4252
4253        return;
4254
4255set_tail_append:
4256        insert->ins_appending = APPEND_TAIL;
4257}
4258
4259/*
4260 * Helper function called at the beginning of an insert.
4261 *
4262 * This computes a few things that are commonly used in the process of
4263 * inserting into the btree:
4264 *   - Whether the new extent is contiguous with an existing one.
4265 *   - The current tree depth.
4266 *   - Whether the insert is an appending one.
4267 *   - The total # of free records in the tree.
4268 *
4269 * All of the information is stored on the ocfs2_insert_type
4270 * structure.
4271 */
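    /*
     * As a concrete illustration (numbers invented): for a depth-0
     * tree whose root list has l_count = 243 and l_next_free_rec = 5,
     * the function reports ins_tree_depth = 0 and *free_records = 238,
     * and classifies contiguousness and appending directly against the
     * root list, since at depth 0 the root doubles as the only leaf.
     */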
4272static int ocfs2_figure_insert_type(struct inode *inode,
4273                                    struct ocfs2_extent_tree *et,
4274                                    struct buffer_head **last_eb_bh,
4275                                    struct ocfs2_extent_rec *insert_rec,
4276                                    int *free_records,
4277                                    struct ocfs2_insert_type *insert)
4278{
4279        int ret;
4280        struct ocfs2_extent_block *eb;
4281        struct ocfs2_extent_list *el;
4282        struct ocfs2_path *path = NULL;
4283        struct buffer_head *bh = NULL;
4284
4285        insert->ins_split = SPLIT_NONE;
4286
4287        el = et->et_root_el;
4288        insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4289
4290        if (el->l_tree_depth) {
4291                /*
4292                 * If we have tree depth, we read in the
4293                 * rightmost extent block ahead of time as
4294                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4295                 * may want it later.
4296                 */
4297                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
4298                if (ret) {
4299                        mlog_exit(ret);
4300                        goto out;
4301                }
4302                eb = (struct ocfs2_extent_block *) bh->b_data;
4303                el = &eb->h_list;
4304        }
4305
4306        /*
4307         * Unless we have a contiguous insert, we'll need to know if
4308         * there is room left in our allocation tree for another
4309         * extent record.
4310         *
4311         * XXX: This test is simplistic, we can search for empty
4312         * extent records too.
4313         */
4314        *free_records = le16_to_cpu(el->l_count) -
4315                le16_to_cpu(el->l_next_free_rec);
4316
4317        if (!insert->ins_tree_depth) {
4318                ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4319                ocfs2_figure_appending_type(insert, el, insert_rec);
4320                return 0;
4321        }
4322
4323        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4324        if (!path) {
4325                ret = -ENOMEM;
4326                mlog_errno(ret);
4327                goto out;
4328        }
4329
4330        /*
4331         * In the case that we're inserting past what the tree
4332         * currently accounts for, ocfs2_find_path() will return for
4333         * us the rightmost tree path. This is accounted for below in
4334         * the appending code.
4335         */
4336        ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4337        if (ret) {
4338                mlog_errno(ret);
4339                goto out;
4340        }
4341
4342        el = path_leaf_el(path);
4343
4344        /*
4345         * Now that we have the path, there are two things we want to determine:
4346         * 1) Contiguousness (also set contig_index if this is so)
4347         *
4348         * 2) Are we doing an append? We can trivially break this up
4349         *     into two types of appends: simple record append, or a
4350         *     rotate inside the tail leaf.
4351         */
4352        ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4353
4354        /*
4355         * The insert code isn't quite ready to deal with all cases of
4356         * left contiguousness. Specifically, if it's an insert into
4357         * the 1st record in a leaf, it will require the adjustment of
4358         * cluster count on the last record of the path directly to its
4359         * left. For now, just catch that case and fool the layers
4360         * above us. This works just fine for tree_depth == 0, which
4361         * is why we allow that above.
4362         */
4363        if (insert->ins_contig == CONTIG_LEFT &&
4364            insert->ins_contig_index == 0)
4365                insert->ins_contig = CONTIG_NONE;
4366
4367        /*
4368         * Ok, so we can simply compare against last_eb to figure out
4369         * whether the path doesn't exist. This will only happen in
4370         * the case that we're doing a tail append, so maybe we can
4371         * take advantage of that information somehow.
4372         */
4373        if (ocfs2_et_get_last_eb_blk(et) ==
4374            path_leaf_bh(path)->b_blocknr) {
4375                /*
4376                 * Ok, ocfs2_find_path() returned us the rightmost
4377                 * tree path. This might be an appending insert. There are
4378                 * two cases:
4379                 *    1) We're doing a true append at the tail:
4380                 *        -This might even be off the end of the leaf
4381                 *    2) We're "appending" by rotating in the tail
4382                 */
4383                ocfs2_figure_appending_type(insert, el, insert_rec);
4384        }
4385
4386out:
4387        ocfs2_free_path(path);
4388
4389        if (ret == 0)
4390                *last_eb_bh = bh;
4391        else
4392                brelse(bh);
4393        return ret;
4394}
4395
4396/*
4397 * Insert an extent into an inode btree.
4398 *
4399 * The caller needs to update fe->i_clusters
4400 */
4401int ocfs2_insert_extent(struct ocfs2_super *osb,
4402                        handle_t *handle,
4403                        struct inode *inode,
4404                        struct ocfs2_extent_tree *et,
4405                        u32 cpos,
4406                        u64 start_blk,
4407                        u32 new_clusters,
4408                        u8 flags,
4409                        struct ocfs2_alloc_context *meta_ac)
4410{
4411        int status;
4412        int uninitialized_var(free_records);
4413        struct buffer_head *last_eb_bh = NULL;
4414        struct ocfs2_insert_type insert = {0, };
4415        struct ocfs2_extent_rec rec;
4416
4417        mlog(0, "add %u clusters at position %u to inode %llu\n",
4418             new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4419
4420        memset(&rec, 0, sizeof(rec));
4421        rec.e_cpos = cpu_to_le32(cpos);
4422        rec.e_blkno = cpu_to_le64(start_blk);
4423        rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4424        rec.e_flags = flags;
4425        status = ocfs2_et_insert_check(inode, et, &rec);
4426        if (status) {
4427                mlog_errno(status);
4428                goto bail;
4429        }
4430
4431        status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4432                                          &free_records, &insert);
4433        if (status < 0) {
4434                mlog_errno(status);
4435                goto bail;
4436        }
4437
4438        mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4439             "Insert.contig_index: %d, Insert.free_records: %d, "
4440             "Insert.tree_depth: %d\n",
4441             insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4442             free_records, insert.ins_tree_depth);
4443
4444        if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4445                status = ocfs2_grow_tree(inode, handle, et,
4446                                         &insert.ins_tree_depth, &last_eb_bh,
4447                                         meta_ac);
4448                if (status) {
4449                        mlog_errno(status);
4450                        goto bail;
4451                }
4452        }
4453
4454        /* Finally, we can add clusters. This might rotate the tree for us. */
4455        status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4456        if (status < 0)
4457                mlog_errno(status);
4458        else if (et->et_ops == &ocfs2_dinode_et_ops)
4459                ocfs2_extent_map_insert_rec(inode, &rec);
4460
4461bail:
4462        brelse(last_eb_bh);
4463
4464        mlog_exit(status);
4465        return status;
4466}
4467
4468/*
4469 * Allocate and add clusters into the extent b-tree.
4470 * The new clusters (clusters_to_add) will be inserted at logical_offset.
4471 * The extent b-tree's root is specified by et, and
4472 * it is not limited to the file storage. Any extent tree can use this
4473 * function if it implements the proper ocfs2_extent_tree.
4474 */
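    /*
     * A rough sketch of the restart protocol a caller is expected to
     * drive (simplified pseudocode, not lifted verbatim from any real
     * caller; recomputing clusters_to_add between passes is omitted):
     *
     *     status = ocfs2_add_clusters_in_btree(osb, inode,
     *                                          &logical_offset,
     *                                          clusters_to_add, 0, et,
     *                                          handle, data_ac, meta_ac,
     *                                          &why);
     *     if (status == -EAGAIN && why == RESTART_TRANS)
     *             refresh the transaction, then call again;
     *     else if (status == -EAGAIN && why == RESTART_META)
     *             reserve more metadata blocks, then call again;
     *
     * Each successful pass claims at most one physically contiguous
     * chunk of clusters and advances *logical_offset by the number of
     * clusters actually inserted.
     */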
4475int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4476                                struct inode *inode,
4477                                u32 *logical_offset,
4478                                u32 clusters_to_add,
4479                                int mark_unwritten,
4480                                struct ocfs2_extent_tree *et,
4481                                handle_t *handle,
4482                                struct ocfs2_alloc_context *data_ac,
4483                                struct ocfs2_alloc_context *meta_ac,
4484                                enum ocfs2_alloc_restarted *reason_ret)
4485{
4486        int status = 0;
4487        int free_extents;
4488        enum ocfs2_alloc_restarted reason = RESTART_NONE;
4489        u32 bit_off, num_bits;
4490        u64 block;
4491        u8 flags = 0;
4492
4493        BUG_ON(!clusters_to_add);
4494
4495        if (mark_unwritten)
4496                flags = OCFS2_EXT_UNWRITTEN;
4497
4498        free_extents = ocfs2_num_free_extents(osb, inode, et);
4499        if (free_extents < 0) {
4500                status = free_extents;
4501                mlog_errno(status);
4502                goto leave;
4503        }
4504
4505        /* there are two cases which could cause us to return -EAGAIN in the
4506         * we-need-more-metadata case:
4507         * 1) we haven't reserved *any*
4508         * 2) we are so fragmented, we've needed to add metadata too
4509         *    many times. */
4510        if (!free_extents && !meta_ac) {
4511                mlog(0, "we haven't reserved any metadata!\n");
4512                status = -EAGAIN;
4513                reason = RESTART_META;
4514                goto leave;
4515        } else if ((!free_extents)
4516                   && (ocfs2_alloc_context_bits_left(meta_ac)
4517                       < ocfs2_extend_meta_needed(et->et_root_el))) {
4518                mlog(0, "filesystem is really fragmented...\n");
4519                status = -EAGAIN;
4520                reason = RESTART_META;
4521                goto leave;
4522        }
4523
4524        status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4525                                        clusters_to_add, &bit_off, &num_bits);
4526        if (status < 0) {
4527                if (status != -ENOSPC)
4528                        mlog_errno(status);
4529                goto leave;
4530        }
4531
4532        BUG_ON(num_bits > clusters_to_add);
4533
4534        /* reserve our write early -- insert_extent may update the inode */
4535        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4536                                      OCFS2_JOURNAL_ACCESS_WRITE);
4537        if (status < 0) {
4538                mlog_errno(status);
4539                goto leave;
4540        }
4541
4542        block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4543        mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4544             num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4545        status = ocfs2_insert_extent(osb, handle, inode, et,
4546                                     *logical_offset, block,
4547                                     num_bits, flags, meta_ac);
4548        if (status < 0) {
4549                mlog_errno(status);
4550                goto leave;
4551        }
4552
4553        status = ocfs2_journal_dirty(handle, et->et_root_bh);
4554        if (status < 0) {
4555                mlog_errno(status);
4556                goto leave;
4557        }
4558
4559        clusters_to_add -= num_bits;
4560        *logical_offset += num_bits;
4561
4562        if (clusters_to_add) {
4563                mlog(0, "need to alloc once more, wanted = %u\n",
4564                     clusters_to_add);
4565                status = -EAGAIN;
4566                reason = RESTART_TRANS;
4567        }
4568
4569leave:
4570        mlog_exit(status);
4571        if (reason_ret)
4572                *reason_ret = reason;
4573        return status;
4574}
4575
4576static void ocfs2_make_right_split_rec(struct super_block *sb,
4577                                       struct ocfs2_extent_rec *split_rec,
4578                                       u32 cpos,
4579                                       struct ocfs2_extent_rec *rec)
4580{
4581        u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4582        u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4583
4584        memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4585
4586        split_rec->e_cpos = cpu_to_le32(cpos);
4587        split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4588
4589        split_rec->e_blkno = rec->e_blkno;
4590        le64_add_cpu(&split_rec->e_blkno,
4591                     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4592
4593        split_rec->e_flags = rec->e_flags;
4594}
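
    /*
     * Worked example of the arithmetic above (illustrative numbers):
     * with rec = { e_cpos = 10, e_leaf_clusters = 20, e_blkno = B }
     * and cpos = 15, the resulting split_rec is { e_cpos = 15,
     * e_leaf_clusters = 15, e_blkno = B +
     * ocfs2_clusters_to_blocks(sb, 5) }, i.e. the right-hand tail
     * [15, 30) of the original [10, 30) range, with e_flags copied
     * unchanged.
     */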
4595
4596static int ocfs2_split_and_insert(struct inode *inode,
4597                                  handle_t *handle,
4598                                  struct ocfs2_path *path,
4599                                  struct ocfs2_extent_tree *et,
4600                                  struct buffer_head **last_eb_bh,
4601                                  int split_index,
4602                                  struct ocfs2_extent_rec *orig_split_rec,
4603                                  struct ocfs2_alloc_context *meta_ac)
4604{
4605        int ret = 0, depth;
4606        unsigned int insert_range, rec_range, do_leftright = 0;
4607        struct ocfs2_extent_rec tmprec;
4608        struct ocfs2_extent_list *rightmost_el;
4609        struct ocfs2_extent_rec rec;
4610        struct ocfs2_extent_rec split_rec = *orig_split_rec;
4611        struct ocfs2_insert_type insert;
4612        struct ocfs2_extent_block *eb;
4613
4614leftright:
4615        /*
4616         * Store a copy of the record on the stack - it might move
4617         * around as the tree is manipulated below.
4618         */
4619        rec = path_leaf_el(path)->l_recs[split_index];
4620
4621        rightmost_el = et->et_root_el;
4622
4623        depth = le16_to_cpu(rightmost_el->l_tree_depth);
4624        if (depth) {
4625                BUG_ON(!(*last_eb_bh));
4626                eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4627                rightmost_el = &eb->h_list;
4628        }
4629
4630        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4631            le16_to_cpu(rightmost_el->l_count)) {
4632                ret = ocfs2_grow_tree(inode, handle, et,
4633                                      &depth, last_eb_bh, meta_ac);
4634                if (ret) {
4635                        mlog_errno(ret);
4636                        goto out;
4637                }
4638        }
4639
4640        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4641        insert.ins_appending = APPEND_NONE;
4642        insert.ins_contig = CONTIG_NONE;
4643        insert.ins_tree_depth = depth;
4644
4645        insert_range = le32_to_cpu(split_rec.e_cpos) +
4646                le16_to_cpu(split_rec.e_leaf_clusters);
4647        rec_range = le32_to_cpu(rec.e_cpos) +
4648                le16_to_cpu(rec.e_leaf_clusters);
4649
4650        if (split_rec.e_cpos == rec.e_cpos) {
4651                insert.ins_split = SPLIT_LEFT;
4652        } else if (insert_range == rec_range) {
4653                insert.ins_split = SPLIT_RIGHT;
4654        } else {
4655                /*
4656                 * Left/right split. We fake this as a right split
4657                 * first and then make a second pass as a left split.
4658                 */
4659                insert.ins_split = SPLIT_RIGHT;
4660
4661                ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4662                                           &rec);
4663
4664                split_rec = tmprec;
4665
4666                BUG_ON(do_leftright);
4667                do_leftright = 1;
4668        }
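
            /*
             * A rough illustration of the three cases just classified
             * (ranges invented): against an existing record covering
             * [0, 100), a split_rec covering [0, 40) shares the left
             * edge and is a SPLIT_LEFT; [60, 100) shares the right
             * edge and is a SPLIT_RIGHT; [40, 60) shares neither, so
             * the tail [60, 100) is carved off as a right split first
             * and the loop back to the leftright label places
             * [40, 60) in a second pass.
             */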
4669
4670        ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4671        if (ret) {
4672                mlog_errno(ret);
4673                goto out;
4674        }
4675
4676        if (do_leftright == 1) {
4677                u32 cpos;
4678                struct ocfs2_extent_list *el;
4679
4680                do_leftright++;
4681                split_rec = *orig_split_rec;
4682
4683                ocfs2_reinit_path(path, 1);
4684
4685                cpos = le32_to_cpu(split_rec.e_cpos);
4686                ret = ocfs2_find_path(inode, path, cpos);
4687                if (ret) {
4688                        mlog_errno(ret);
4689                        goto out;
4690                }
4691
4692                el = path_leaf_el(path);
4693                split_index = ocfs2_search_extent_list(el, cpos);
4694                goto leftright;
4695        }
4696out:
4697
4698        return ret;
4699}
4700
4701/*
4702 * Mark part or all of the extent record at split_index in the leaf
4703 * pointed to by path as written. This removes the unwritten
4704 * extent flag.
4705 *
4706 * Care is taken to handle contiguousness so as to not grow the tree.
4707 *
4708 * meta_ac is not strictly necessary - we only truly need it if growth
4709 * of the tree is required. All other cases will degrade into a less
4710 * optimal tree layout.
4711 *
4712 * last_eb_bh should be the rightmost leaf block for any extent
4713 * btree. Since a split may grow the tree or a merge might shrink it,
4714 * the caller cannot trust the contents of that buffer after this call.
4715 *
4716 * This code is optimized for readability - several passes might be
4717 * made over certain portions of the tree. All of those blocks will
4718 * have been brought into cache (and pinned via the journal), so the
4719 * extra overhead is not expressed in terms of disk reads.
4720 */
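    /*
     * For example (made-up ranges): an unwritten record covering
     * [0, 16) that is being marked written over exactly [0, 16), with
     * no contiguous neighbour, is simply overwritten in place; the
     * same record marked written over only [4, 8) goes through
     * ocfs2_split_and_insert(); and if the written range turns out to
     * be contiguous with a neighbouring record, the work is handed to
     * ocfs2_try_to_merge_extent() instead.
     */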
4721static int __ocfs2_mark_extent_written(struct inode *inode,
4722                                       struct ocfs2_extent_tree *et,
4723                                       handle_t *handle,
4724                                       struct ocfs2_path *path,
4725                                       int split_index,
4726                                       struct ocfs2_extent_rec *split_rec,
4727                                       struct ocfs2_alloc_context *meta_ac,
4728                                       struct ocfs2_cached_dealloc_ctxt *dealloc)
4729{
4730        int ret = 0;
4731        struct ocfs2_extent_list *el = path_leaf_el(path);
4732        struct buffer_head *last_eb_bh = NULL;
4733        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4734        struct ocfs2_merge_ctxt ctxt;
4735        struct ocfs2_extent_list *rightmost_el;
4736
4737        if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4738                ret = -EIO;
4739                mlog_errno(ret);
4740                goto out;
4741        }
4742
4743        if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4744            ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4745             (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4746                ret = -EIO;
4747                mlog_errno(ret);
4748                goto out;
4749        }
4750
4751        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4752                                                            split_index,
4753                                                            split_rec);
4754
4755        /*
4756         * The core merge / split code wants to know how much room is
4757         * left in this inode's allocation tree, so we pass the
4758         * rightmost extent list.
4759         */
4760        if (path->p_tree_depth) {
4761                struct ocfs2_extent_block *eb;
4762
4763                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4764                                       &last_eb_bh);
4765                if (ret) {
4766                        mlog_exit(ret);
4767                        goto out;
4768                }
4769
4770                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4771                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4772                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4773                        ret = -EROFS;
4774                        goto out;
4775                }
4776
4777                rightmost_el = &eb->h_list;
4778        } else
4779                rightmost_el = path_root_el(path);
4780
4781        if (rec->e_cpos == split_rec->e_cpos &&
4782            rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4783                ctxt.c_split_covers_rec = 1;
4784        else
4785                ctxt.c_split_covers_rec = 0;
4786
4787        ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4788
4789        mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4790             split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4791             ctxt.c_split_covers_rec);
4792
4793        if (ctxt.c_contig_type == CONTIG_NONE) {
4794                if (ctxt.c_split_covers_rec)
4795                        el->l_recs[split_index] = *split_rec;
4796                else
4797                        ret = ocfs2_split_and_insert(inode, handle, path, et,
4798                                                     &last_eb_bh, split_index,
4799                                                     split_rec, meta_ac);
4800                if (ret)
4801                        mlog_errno(ret);
4802        } else {
4803                ret = ocfs2_try_to_merge_extent(inode, handle, path,
4804                                                split_index, split_rec,
4805                                                dealloc, &ctxt, et);
4806                if (ret)
4807                        mlog_errno(ret);
4808        }
4809
4810out:
4811        brelse(last_eb_bh);
4812        return ret;
4813}
4814
4815/*
4816 * Mark the already-existing extent at cpos as written for len clusters.
4817 *
4818 * If the existing extent is larger than the request, initiate a
4819 * split. An attempt will be made at merging with adjacent extents.
4820 *
4821 * The caller is responsible for passing down meta_ac if we'll need it.
4822 */
4823int ocfs2_mark_extent_written(struct inode *inode,
4824                              struct ocfs2_extent_tree *et,
4825                              handle_t *handle, u32 cpos, u32 len, u32 phys,
4826                              struct ocfs2_alloc_context *meta_ac,
4827                              struct ocfs2_cached_dealloc_ctxt *dealloc)
4828{
4829        int ret, index;
4830        u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4831        struct ocfs2_extent_rec split_rec;
4832        struct ocfs2_path *left_path = NULL;
4833        struct ocfs2_extent_list *el;
4834
4835        mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4836             inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4837
4838        if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4839                ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4840                            "that are being written to, but the feature bit "
4841                            "is not set in the super block.",
4842                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
4843                ret = -EROFS;
4844                goto out;
4845        }
4846
4847        /*
4848         * XXX: This should be fixed up so that we just re-insert the
4849         * next extent records.
4850         *
4851         * XXX: This is a hack on the extent tree, maybe it should be
4852         * an op?
4853         */
4854        if (et->et_ops == &ocfs2_dinode_et_ops)
4855                ocfs2_extent_map_trunc(inode, 0);
4856
4857        left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4858        if (!left_path) {
4859                ret = -ENOMEM;
4860                mlog_errno(ret);
4861                goto out;
4862        }
4863
4864        ret = ocfs2_find_path(inode, left_path, cpos);
4865        if (ret) {
4866                mlog_errno(ret);
4867                goto out;
4868        }
4869        el = path_leaf_el(left_path);
4870
4871        index = ocfs2_search_extent_list(el, cpos);
4872        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4873                ocfs2_error(inode->i_sb,
4874                            "Inode %llu has an extent at cpos %u which can no "
4875                            "longer be found.\n",
4876                            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4877                ret = -EROFS;
4878                goto out;
4879        }
4880
4881        memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4882        split_rec.e_cpos = cpu_to_le32(cpos);
4883        split_rec.e_leaf_clusters = cpu_to_le16(len);
4884        split_rec.e_blkno = cpu_to_le64(start_blkno);
4885        split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4886        split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4887
4888        ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4889                                          index, &split_rec, meta_ac,
4890                                          dealloc);
4891        if (ret)
4892                mlog_errno(ret);
4893
4894out:
4895        ocfs2_free_path(left_path);
4896        return ret;
4897}
4898
4899static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4900                            handle_t *handle, struct ocfs2_path *path,
4901                            int index, u32 new_range,
4902                            struct ocfs2_alloc_context *meta_ac)
4903{
4904        int ret, depth, credits = handle->h_buffer_credits;
4905        struct buffer_head *last_eb_bh = NULL;
4906        struct ocfs2_extent_block *eb;
4907        struct ocfs2_extent_list *rightmost_el, *el;
4908        struct ocfs2_extent_rec split_rec;
4909        struct ocfs2_extent_rec *rec;
4910        struct ocfs2_insert_type insert;
4911
4912        /*
4913         * Setup the record to split before we grow the tree.
4914         */
4915        el = path_leaf_el(path);
4916        rec = &el->l_recs[index];
4917        ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4918
4919        depth = path->p_tree_depth;
4920        if (depth > 0) {
4921                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4922                                       &last_eb_bh);
4923                if (ret < 0) {
4924                        mlog_errno(ret);
4925                        goto out;
4926                }
4927
4928                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4929                rightmost_el = &eb->h_list;
4930        } else
4931                rightmost_el = path_leaf_el(path);
4932
4933        credits += path->p_tree_depth +
4934                   ocfs2_extend_meta_needed(et->et_root_el);
4935        ret = ocfs2_extend_trans(handle, credits);
4936        if (ret) {
4937                mlog_errno(ret);
4938                goto out;
4939        }
4940
4941        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4942            le16_to_cpu(rightmost_el->l_count)) {
4943                ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4944                                      meta_ac);
4945                if (ret) {
4946                        mlog_errno(ret);
4947                        goto out;
4948                }
4949        }
4950
4951        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4952        insert.ins_appending = APPEND_NONE;
4953        insert.ins_contig = CONTIG_NONE;
4954        insert.ins_split = SPLIT_RIGHT;
4955        insert.ins_tree_depth = depth;
4956
4957        ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4958        if (ret)
4959                mlog_errno(ret);
4960
4961out:
4962        brelse(last_eb_bh);
4963        return ret;
4964}
4965
4966static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4967                              struct ocfs2_path *path, int index,
4968                              struct ocfs2_cached_dealloc_ctxt *dealloc,
4969                              u32 cpos, u32 len,
4970                              struct ocfs2_extent_tree *et)
4971{
4972        int ret;
4973        u32 left_cpos, rec_range, trunc_range;
4974        int wants_rotate = 0, is_rightmost_tree_rec = 0;
4975        struct super_block *sb = inode->i_sb;
4976        struct ocfs2_path *left_path = NULL;
4977        struct ocfs2_extent_list *el = path_leaf_el(path);
4978        struct ocfs2_extent_rec *rec;
4979        struct ocfs2_extent_block *eb;
4980
4981        if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4982                ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4983                if (ret) {
4984                        mlog_errno(ret);
4985                        goto out;
4986                }
4987
4988                index--;
4989        }
4990
4991        if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4992            path->p_tree_depth) {
4993                /*
4994                 * Check whether this is the rightmost tree record. If
4995                 * we remove all of this record or part of its right
4996                 * edge then an update of the record lengths above it
4997                 * will be required.
4998                 */
4999                eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5000                if (eb->h_next_leaf_blk == 0)
5001                        is_rightmost_tree_rec = 1;
5002        }
5003
5004        rec = &el->l_recs[index];
5005        if (index == 0 && path->p_tree_depth &&
5006            le32_to_cpu(rec->e_cpos) == cpos) {
5007                /*
5008                 * Changing the leftmost offset (via partial or whole
5009                 * record truncate) of an interior (or rightmost) path
5010                 * means we have to update the subtree that is formed
5011                 * by this leaf and the one to its left.
5012                 *
5013                 * There are two cases we can skip:
5014                 *   1) Path is the leftmost one in our inode tree.
5015                 *   2) The leaf is rightmost and will be empty after
5016                 *      we remove the extent record - the rotate code
5017                 *      knows how to update the newly formed edge.
5018                 */
5019
5020                ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5021                                                    &left_cpos);
5022                if (ret) {
5023                        mlog_errno(ret);
5024                        goto out;
5025                }
5026
5027                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5028                        left_path = ocfs2_new_path(path_root_bh(path),
5029                                                   path_root_el(path));
5030                        if (!left_path) {
5031                                ret = -ENOMEM;
5032                                mlog_errno(ret);
5033                                goto out;
5034                        }
5035
5036                        ret = ocfs2_find_path(inode, left_path, left_cpos);
5037                        if (ret) {
5038                                mlog_errno(ret);
5039                                goto out;
5040                        }
5041                }
5042        }
5043
5044        ret = ocfs2_extend_rotate_transaction(handle, 0,
5045                                              handle->h_buffer_credits,
5046                                              path);
5047        if (ret) {
5048                mlog_errno(ret);
5049                goto out;
5050        }
5051
5052        ret = ocfs2_journal_access_path(inode, handle, path);
5053        if (ret) {
5054                mlog_errno(ret);
5055                goto out;
5056        }
5057
5058        ret = ocfs2_journal_access_path(inode, handle, left_path);
5059        if (ret) {
5060                mlog_errno(ret);
5061                goto out;
5062        }
5063
5064        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5065        trunc_range = cpos + len;
5066
5067        if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5068                int next_free;
5069
5070                memset(rec, 0, sizeof(*rec));
5071                ocfs2_cleanup_merge(el, index);
5072                wants_rotate = 1;
5073
5074                next_free = le16_to_cpu(el->l_next_free_rec);
5075                if (is_rightmost_tree_rec && next_free > 1) {
5076                        /*
5077                         * We skip the edge update if this path will
5078                         * be deleted by the rotate code.
5079                         */
5080                        rec = &el->l_recs[next_free - 1];
5081                        ocfs2_adjust_rightmost_records(inode, handle, path,
5082                                                       rec);
5083                }
5084        } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5085                /* Remove leftmost portion of the record. */
5086                le32_add_cpu(&rec->e_cpos, len);
5087                le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5088                le16_add_cpu(&rec->e_leaf_clusters, -len);
5089        } else if (rec_range == trunc_range) {
5090                /* Remove rightmost portion of the record */
5091                le16_add_cpu(&rec->e_leaf_clusters, -len);
5092                if (is_rightmost_tree_rec)
5093                        ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5094        } else {
5095                /* Caller should have trapped this. */
5096                mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5097                     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5098                     le32_to_cpu(rec->e_cpos),
5099                     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5100                BUG();
5101        }
5102
5103        if (left_path) {
5104                int subtree_index;
5105
5106                subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5107                ocfs2_complete_edge_insert(inode, handle, left_path, path,
5108                                           subtree_index);
5109        }
5110
5111        ocfs2_journal_dirty(handle, path_leaf_bh(path));
5112
5113        ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5114        if (ret) {
5115                mlog_errno(ret);
5116                goto out;
5117        }
5118
5119out:
5120        ocfs2_free_path(left_path);
5121        return ret;
5122}
5123
5124int ocfs2_remove_extent(struct inode *inode,
5125                        struct ocfs2_extent_tree *et,
5126                        u32 cpos, u32 len, handle_t *handle,
5127                        struct ocfs2_alloc_context *meta_ac,
5128                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5129{
5130        int ret, index;
5131        u32 rec_range, trunc_range;
5132        struct ocfs2_extent_rec *rec;
5133        struct ocfs2_extent_list *el;
5134        struct ocfs2_path *path = NULL;
5135
5136        ocfs2_extent_map_trunc(inode, 0);
5137
5138        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
5139        if (!path) {
5140                ret = -ENOMEM;
5141                mlog_errno(ret);
5142                goto out;
5143        }
5144
5145        ret = ocfs2_find_path(inode, path, cpos);
5146        if (ret) {
5147                mlog_errno(ret);
5148                goto out;
5149        }
5150
5151        el = path_leaf_el(path);
5152        index = ocfs2_search_extent_list(el, cpos);
5153        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5154                ocfs2_error(inode->i_sb,
5155                            "Inode %llu has an extent at cpos %u which can no "
5156                            "longer be found.\n",
5157                            (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5158                ret = -EROFS;
5159                goto out;
5160        }
5161
5162        /*
5163         * We have 3 cases of extent removal:
5164         *   1) Range covers the entire extent rec
5165         *   2) Range begins or ends on one edge of the extent rec
5166         *   3) Range is in the middle of the extent rec (no shared edges)
5167         *
5168         * For case 1 we remove the extent rec and left rotate to
5169         * fill the hole.
5170         *
5171         * For case 2 we just shrink the existing extent rec, with a
5172         * tree update if the shrinking edge is also the edge of an
5173         * extent block.
5174         *
5175         * For case 3 we do a right split to turn the extent rec into
5176         * something case 2 can handle.
5177         */
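            /*
             * Worked example (illustrative numbers): with rec covering
             * [100, 200), removing (cpos = 100, len = 100) is case 1;
             * (cpos = 100, len = 50) or (cpos = 150, len = 50) are
             * case 2; (cpos = 125, len = 50) is case 3, so the record
             * is first split at trunc_range = 175 and the middle piece
             * [125, 175) is then removed as a case-2 truncate on the
             * second pass below.
             */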
5178        rec = &el->l_recs[index];
5179        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5180        trunc_range = cpos + len;
5181
5182        BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5183
5184        mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5185             "(cpos %u, len %u)\n",
5186             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5187             le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5188
5189        if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5190                ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5191                                         cpos, len, et);
5192                if (ret) {
5193                        mlog_errno(ret);
5194                        goto out;
5195                }
5196        } else {
5197                ret = ocfs2_split_tree(inode, et, handle, path, index,
5198                                       trunc_range, meta_ac);
5199                if (ret) {
5200                        mlog_errno(ret);
5201                        goto out;
5202                }
5203
5204                /*
5205                 * The split could have manipulated the tree enough to
5206                 * move the record location, so we have to look for it again.
5207                 */
5208                ocfs2_reinit_path(path, 1);
5209
5210                ret = ocfs2_find_path(inode, path, cpos);
5211                if (ret) {
5212                        mlog_errno(ret);
5213                        goto out;
5214                }
5215
5216                el = path_leaf_el(path);
5217                index = ocfs2_search_extent_list(el, cpos);
5218                if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5219                        ocfs2_error(inode->i_sb,
5220                                    "Inode %llu: split at cpos %u lost record.",
5221                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5222                                    cpos);
5223                        ret = -EROFS;
5224                        goto out;
5225                }
5226
5227                /*
5228                 * Double check our values here. If anything is fishy,
5229                 * it's easier to catch it at the top level.
5230                 */
5231                rec = &el->l_recs[index];
5232                rec_range = le32_to_cpu(rec->e_cpos) +
5233                        ocfs2_rec_clusters(el, rec);
5234                if (rec_range != trunc_range) {
5235                        ocfs2_error(inode->i_sb,
5236                                    "Inode %llu: error after split at cpos %u "
5237                                    "trunc len %u, existing record is (%u,%u)",
5238                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5239                                    cpos, len, le32_to_cpu(rec->e_cpos),
5240                                    ocfs2_rec_clusters(el, rec));
5241                        ret = -EROFS;
5242                        goto out;
5243                }
5244
5245                ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5246                                         cpos, len, et);
5247                if (ret) {
5248                        mlog_errno(ret);
5249                        goto out;
5250                }
5251        }
5252
5253out:
5254        ocfs2_free_path(path);
5255        return ret;
5256}
5257
5258int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5259{
5260        struct buffer_head *tl_bh = osb->osb_tl_bh;
5261        struct ocfs2_dinode *di;
5262        struct ocfs2_truncate_log *tl;
5263
5264        di = (struct ocfs2_dinode *) tl_bh->b_data;
5265        tl = &di->id2.i_dealloc;
5266
5267        mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5268                        "slot %d, invalid truncate log parameters: used = "
5269                        "%u, count = %u\n", osb->slot_num,
5270                        le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5271        return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5272}
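
    /*
     * A sketch of the expected calling pattern (simplified): a caller
     * that is about to log freed clusters takes tl_inode->i_mutex,
     * flushes the log with __ocfs2_flush_truncate_log() if
     * ocfs2_truncate_log_needs_flush() reports it full, and only then
     * starts the transaction in which ocfs2_truncate_log_append() is
     * called.
     */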
5273
5274static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5275                                           unsigned int new_start)
5276{
5277        unsigned int tail_index;
5278        unsigned int current_tail;
5279
5280        /* No records, nothing to coalesce */
5281        if (!le16_to_cpu(tl->tl_used))
5282                return 0;
5283
5284        tail_index = le16_to_cpu(tl->tl_used) - 1;
5285        current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5286        current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5287
5288        return current_tail == new_start;
5289}
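
    /*
     * Coalescing example (made-up values): if the most recent log
     * record is { t_start = 500, t_clusters = 32 }, a new range that
     * starts at cluster 532 extends it and can be folded into the
     * same slot, while a range starting at 540 must take a new slot.
     */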
5290
5291int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5292                              handle_t *handle,
5293                              u64 start_blk,
5294                              unsigned int num_clusters)
5295{
5296        int status, index;
5297        unsigned int start_cluster, tl_count;
5298        struct inode *tl_inode = osb->osb_tl_inode;
5299        struct buffer_head *tl_bh = osb->osb_tl_bh;
5300        struct ocfs2_dinode *di;
5301        struct ocfs2_truncate_log *tl;
5302
5303        mlog_entry("start_blk = %llu, num_clusters = %u\n",
5304                   (unsigned long long)start_blk, num_clusters);
5305
5306        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5307
5308        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5309
5310        di = (struct ocfs2_dinode *) tl_bh->b_data;
5311        tl = &di->id2.i_dealloc;
5312        if (!OCFS2_IS_VALID_DINODE(di)) {
5313                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5314                status = -EIO;
5315                goto bail;
5316        }
5317
5318        tl_count = le16_to_cpu(tl->tl_count);
5319        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5320                        tl_count == 0,
5321                        "Truncate record count on #%llu invalid "
5322                        "wanted %u, actual %u\n",
5323                        (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5324                        ocfs2_truncate_recs_per_inode(osb->sb),
5325                        le16_to_cpu(tl->tl_count));
5326
5327        /* Caller should have known to flush before calling us. */
5328        index = le16_to_cpu(tl->tl_used);
5329        if (index >= tl_count) {
5330                status = -ENOSPC;
5331                mlog_errno(status);
5332                goto bail;
5333        }
5334
5335        status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5336                                      OCFS2_JOURNAL_ACCESS_WRITE);
5337        if (status < 0) {
5338                mlog_errno(status);
5339                goto bail;
5340        }
5341
5342        mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5343             "%llu (index = %d)\n", num_clusters, start_cluster,
5344             (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5345
5346        if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5347                /*
5348                 * Move index back to the record we are coalescing with.
5349                 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5350                 */
5351                index--;
5352
5353                num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5354                mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5355                     index, le32_to_cpu(tl->tl_recs[index].t_start),
5356                     num_clusters);
5357        } else {
5358                tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5359                tl->tl_used = cpu_to_le16(index + 1);
5360        }
5361        tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5362
5363        status = ocfs2_journal_dirty(handle, tl_bh);
5364        if (status < 0) {
5365                mlog_errno(status);
5366                goto bail;
5367        }
5368
5369bail:
5370        mlog_exit(status);
5371        return status;
5372}
5373
5374static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5375                                         handle_t *handle,
5376                                         struct inode *data_alloc_inode,
5377                                         struct buffer_head *data_alloc_bh)
5378{
5379        int status = 0;
5380        int i;
5381        unsigned int num_clusters;
5382        u64 start_blk;
5383        struct ocfs2_truncate_rec rec;
5384        struct ocfs2_dinode *di;
5385        struct ocfs2_truncate_log *tl;
5386        struct inode *tl_inode = osb->osb_tl_inode;
5387        struct buffer_head *tl_bh = osb->osb_tl_bh;
5388
5389        mlog_entry_void();
5390
5391        di = (struct ocfs2_dinode *) tl_bh->b_data;
5392        tl = &di->id2.i_dealloc;
5393        i = le16_to_cpu(tl->tl_used) - 1;
5394        while (i >= 0) {
5395                /* Caller has given us at least enough credits to
5396                 * update the truncate log dinode */
5397                status = ocfs2_journal_access(handle, tl_inode, tl_bh,
5398                                              OCFS2_JOURNAL_ACCESS_WRITE);
5399                if (status < 0) {
5400                        mlog_errno(status);
5401                        goto bail;
5402                }
5403
5404                tl->tl_used = cpu_to_le16(i);
5405
5406                status = ocfs2_journal_dirty(handle, tl_bh);
5407                if (status < 0) {
5408                        mlog_errno(status);
5409                        goto bail;
5410                }
5411
5412                /* TODO: Perhaps we can calculate the bulk of the
5413                 * credits up front rather than extending like
5414                 * this. */
5415                status = ocfs2_extend_trans(handle,
5416                                            OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5417                if (status < 0) {
5418                        mlog_errno(status);
5419                        goto bail;
5420                }
5421
5422                rec = tl->tl_recs[i];
5423                start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5424                                                    le32_to_cpu(rec.t_start));
5425                num_clusters = le32_to_cpu(rec.t_clusters);
5426
5427                /* if start_blk is not set, we ignore the record as
5428                 * invalid. */
5429                if (start_blk) {
5430                        mlog(0, "free record %d, start = %u, clusters = %u\n",
5431                             i, le32_to_cpu(rec.t_start), num_clusters);
5432
5433                        status = ocfs2_free_clusters(handle, data_alloc_inode,
5434                                                     data_alloc_bh, start_blk,
5435                                                     num_clusters);
5436                        if (status < 0) {
5437                                mlog_errno(status);
5438                                goto bail;
5439                        }
5440                }
5441                i--;
5442        }
5443
5444bail:
5445        mlog_exit(status);
5446        return status;
5447}
5448
5449/* Expects you to already be holding tl_inode->i_mutex */
5450int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5451{
5452        int status;
5453        unsigned int num_to_flush;
5454        handle_t *handle;
5455        struct inode *tl_inode = osb->osb_tl_inode;
5456        struct inode *data_alloc_inode = NULL;
5457        struct buffer_head *tl_bh = osb->osb_tl_bh;
5458        struct buffer_head *data_alloc_bh = NULL;
5459        struct ocfs2_dinode *di;
5460        struct ocfs2_truncate_log *tl;
5461
5462        mlog_entry_void();
5463
5464        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5465
5466        di = (struct ocfs2_dinode *) tl_bh->b_data;
5467        tl = &di->id2.i_dealloc;
5468        if (!OCFS2_IS_VALID_DINODE(di)) {
5469                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5470                status = -EIO;
5471                goto out;
5472        }
5473
5474        num_to_flush = le16_to_cpu(tl->tl_used);
5475        mlog(0, "Flush %u records from truncate log #%llu\n",
5476             num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5477        if (!num_to_flush) {
5478                status = 0;
5479                goto out;
5480        }
5481
5482        data_alloc_inode = ocfs2_get_system_file_inode(osb,
5483                                                       GLOBAL_BITMAP_SYSTEM_INODE,
5484                                                       OCFS2_INVALID_SLOT);
5485        if (!data_alloc_inode) {
5486                status = -EINVAL;
5487                mlog(ML_ERROR, "Could not get bitmap inode!\n");
5488                goto out;
5489        }
5490
5491        mutex_lock(&data_alloc_inode->i_mutex);
5492
5493        status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5494        if (status < 0) {
5495                mlog_errno(status);
5496                goto out_mutex;
5497        }
5498
5499        handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5500        if (IS_ERR(handle)) {
5501                status = PTR_ERR(handle);
5502                mlog_errno(status);
5503                goto out_unlock;
5504        }
5505
5506        status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5507                                               data_alloc_bh);
5508        if (status < 0)
5509                mlog_errno(status);
5510
5511        ocfs2_commit_trans(osb, handle);
5512
5513out_unlock:
5514        brelse(data_alloc_bh);
5515        ocfs2_inode_unlock(data_alloc_inode, 1);
5516
5517out_mutex:
5518        mutex_unlock(&data_alloc_inode->i_mutex);
5519        iput(data_alloc_inode);
5520
5521out:
5522        mlog_exit(status);
5523        return status;
5524}
5525
5526int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5527{
5528        int status;
5529        struct inode *tl_inode = osb->osb_tl_inode;
5530
5531        mutex_lock(&tl_inode->i_mutex);
5532        status = __ocfs2_flush_truncate_log(osb);
5533        mutex_unlock(&tl_inode->i_mutex);
5534
5535        return status;
5536}
5537
5538static void ocfs2_truncate_log_worker(struct work_struct *work)
5539{
5540        int status;
5541        struct ocfs2_super *osb =
5542                container_of(work, struct ocfs2_super,
5543                             osb_truncate_log_wq.work);
5544
5545        mlog_entry_void();
5546
5547        status = ocfs2_flush_truncate_log(osb);
5548        if (status < 0)
5549                mlog_errno(status);
5550        else
5551                ocfs2_init_inode_steal_slot(osb);
5552
5553        mlog_exit(status);
5554}
5555
5556#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5557void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5558                                       int cancel)
5559{
5560        if (osb->osb_tl_inode) {
5561                /* We want to push off log flushes while truncates are
5562                 * still running. */
5563                if (cancel)
5564                        cancel_delayed_work(&osb->osb_truncate_log_wq);
5565
5566                queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5567                                   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5568        }
5569}
5570
5571static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5572                                       int slot_num,
5573                                       struct inode **tl_inode,
5574                                       struct buffer_head **tl_bh)
5575{
5576        int status;
5577        struct inode *inode = NULL;
5578        struct buffer_head *bh = NULL;
5579
5580        inode = ocfs2_get_system_file_inode(osb,
5581                                           TRUNCATE_LOG_SYSTEM_INODE,
5582                                           slot_num);
5583        if (!inode) {
5584                status = -EINVAL;
5585                mlog(ML_ERROR, "Could not load truncate log inode!\n");
5586                goto bail;
5587        }
5588
5589        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
5590        if (status < 0) {
5591                iput(inode);
5592                mlog_errno(status);
5593                goto bail;
5594        }
5595
5596        *tl_inode = inode;
5597        *tl_bh    = bh;
5598bail:
5599        mlog_exit(status);
5600        return status;
5601}
5602
5603/* called during the 1st stage of node recovery. we stamp a clean
5604 * truncate log and pass back a copy for processing later. if the
5605 * truncate log does not require processing, *tl_copy is set to
5606 * NULL. */
5607int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5608                                      int slot_num,
5609                                      struct ocfs2_dinode **tl_copy)
5610{
5611        int status;
5612        struct inode *tl_inode = NULL;
5613        struct buffer_head *tl_bh = NULL;
5614        struct ocfs2_dinode *di;
5615        struct ocfs2_truncate_log *tl;
5616
5617        *tl_copy = NULL;
5618
5619        mlog(0, "recover truncate log from slot %d\n", slot_num);
5620
5621        status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5622        if (status < 0) {
5623                mlog_errno(status);
5624                goto bail;
5625        }
5626
5627        di = (struct ocfs2_dinode *) tl_bh->b_data;
5628        tl = &di->id2.i_dealloc;
5629        if (!OCFS2_IS_VALID_DINODE(di)) {
5630                OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5631                status = -EIO;
5632                goto bail;
5633        }
5634
5635        if (le16_to_cpu(tl->tl_used)) {
5636                mlog(0, "We'll have %u logs to recover\n",
5637                     le16_to_cpu(tl->tl_used));
5638
5639                *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5640                if (!(*tl_copy)) {
5641                        status = -ENOMEM;
5642                        mlog_errno(status);
5643                        goto bail;
5644                }
5645
5646                /* Assuming the write-out below goes well, this copy
5647                 * will be passed back to recovery for processing. */
5648                memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5649
5650                /* All we need to do to clear the truncate log is set
5651                 * tl_used. */
5652                tl->tl_used = 0;
5653
5654                status = ocfs2_write_block(osb, tl_bh, tl_inode);
5655                if (status < 0) {
5656                        mlog_errno(status);
5657                        goto bail;
5658                }
5659        }
5660
5661bail:
5662        if (tl_inode)
5663                iput(tl_inode);
5664        brelse(tl_bh);
5665
5666        if (status < 0 && (*tl_copy)) {
5667                kfree(*tl_copy);
5668                *tl_copy = NULL;
5669        }
5670
5671        mlog_exit(status);
5672        return status;
5673}
5674
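/*
 * Second stage of recovery: replay the records saved in tl_copy into the
 * local node's truncate log, flushing our own log whenever it needs it.
 */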
5675int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5676                                         struct ocfs2_dinode *tl_copy)
5677{
5678        int status = 0;
5679        int i;
5680        unsigned int clusters, num_recs, start_cluster;
5681        u64 start_blk;
5682        handle_t *handle;
5683        struct inode *tl_inode = osb->osb_tl_inode;
5684        struct ocfs2_truncate_log *tl;
5685
5686        mlog_entry_void();
5687
5688        if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5689                mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5690                return -EINVAL;
5691        }
5692
5693        tl = &tl_copy->id2.i_dealloc;
5694        num_recs = le16_to_cpu(tl->tl_used);
5695        mlog(0, "cleanup %u records from %llu\n", num_recs,
5696             (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5697
5698        mutex_lock(&tl_inode->i_mutex);
5699        for(i = 0; i < num_recs; i++) {
5700                if (ocfs2_truncate_log_needs_flush(osb)) {
5701                        status = __ocfs2_flush_truncate_log(osb);
5702                        if (status < 0) {
5703                                mlog_errno(status);
5704                                goto bail_up;
5705                        }
5706                }
5707
5708                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5709                if (IS_ERR(handle)) {
5710                        status = PTR_ERR(handle);
5711                        mlog_errno(status);
5712                        goto bail_up;
5713                }
5714
5715                clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5716                start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5717                start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5718
5719                status = ocfs2_truncate_log_append(osb, handle,
5720                                                   start_blk, clusters);
5721                ocfs2_commit_trans(osb, handle);
5722                if (status < 0) {
5723                        mlog_errno(status);
5724                        goto bail_up;
5725                }
5726        }
5727
5728bail_up:
5729        mutex_unlock(&tl_inode->i_mutex);
5730
5731        mlog_exit(status);
5732        return status;
5733}
5734
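/*
 * Called at unmount: cancel any pending flush work, flush what is left
 * in the truncate log and drop the buffer and inode references taken at
 * init time.
 */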
5735void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5736{
5737        int status;
5738        struct inode *tl_inode = osb->osb_tl_inode;
5739
5740        mlog_entry_void();
5741
5742        if (tl_inode) {
5743                cancel_delayed_work(&osb->osb_truncate_log_wq);
5744                flush_workqueue(ocfs2_wq);
5745
5746                status = ocfs2_flush_truncate_log(osb);
5747                if (status < 0)
5748                        mlog_errno(status);
5749
5750                brelse(osb->osb_tl_bh);
5751                iput(osb->osb_tl_inode);
5752        }
5753
5754        mlog_exit_void();
5755}
5756
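/*
 * Mount-time setup: load this node's truncate log inode and buffer and
 * initialize the delayed flush work.  On failure the osb pointers stay
 * NULL, which ocfs2_truncate_log_shutdown() keys on.
 */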
5757int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5758{
5759        int status;
5760        struct inode *tl_inode = NULL;
5761        struct buffer_head *tl_bh = NULL;
5762
5763        mlog_entry_void();
5764
5765        status = ocfs2_get_truncate_log_info(osb,
5766                                             osb->slot_num,
5767                                             &tl_inode,
5768                                             &tl_bh);
5769        if (status < 0)
5770                mlog_errno(status);
5771
5772        /* ocfs2_truncate_log_shutdown keys on the existence of
5773         * osb->osb_tl_inode so we don't set any of the osb variables
5774         * until we're sure all is well. */
5775        INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5776                          ocfs2_truncate_log_worker);
5777        osb->osb_tl_bh    = tl_bh;
5778        osb->osb_tl_inode = tl_inode;
5779
5780        mlog_exit(status);
5781        return status;
5782}
5783
5784/*
5785 * Delayed de-allocation of suballocator blocks.
5786 *
5787 * Some sets of block de-allocations might involve multiple suballocator inodes.
5788 *
5789 * The locking for this can get extremely complicated, especially when
5790 * the suballocator inodes to delete from aren't known until deep
5791 * within an unrelated codepath.
5792 *
5793 * ocfs2_extent_block structures are a good example of this - an inode
5794 * btree could have been grown by any number of nodes each allocating
5795 * out of their own suballoc inode.
5796 *
5797 * These structures allow the delay of block de-allocation until a
5798 * later time, when locking of multiple cluster inodes won't cause
5799 * deadlock.
5800 */
5801
5802/*
5803 * Describes a single block free from a suballocator
5804 */
5805struct ocfs2_cached_block_free {
5806        struct ocfs2_cached_block_free                *free_next;
5807        u64                                        free_blk;
5808        unsigned int                                free_bit;
5809};
5810
5811struct ocfs2_per_slot_free_list {
5812        struct ocfs2_per_slot_free_list                *f_next_suballocator;
5813        int                                        f_inode_type;
5814        int                                        f_slot;
5815        struct ocfs2_cached_block_free                *f_first;
5816};
5817
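/*
 * Free every queued item on @head against a single suballocator inode.
 * The inode's i_mutex and cluster lock are held for the duration, and
 * the transaction is extended after each bit is freed.
 */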
5818static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5819                                   int sysfile_type,
5820                                   int slot,
5821                                   struct ocfs2_cached_block_free *head)
5822{
5823        int ret;
5824        u64 bg_blkno;
5825        handle_t *handle;
5826        struct inode *inode;
5827        struct buffer_head *di_bh = NULL;
5828        struct ocfs2_cached_block_free *tmp;
5829
5830        inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5831        if (!inode) {
5832                ret = -EINVAL;
5833                mlog_errno(ret);
5834                goto out;
5835        }
5836
5837        mutex_lock(&inode->i_mutex);
5838
5839        ret = ocfs2_inode_lock(inode, &di_bh, 1);
5840        if (ret) {
5841                mlog_errno(ret);
5842                goto out_mutex;
5843        }
5844
5845        handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5846        if (IS_ERR(handle)) {
5847                ret = PTR_ERR(handle);
5848                mlog_errno(ret);
5849                goto out_unlock;
5850        }
5851
5852        while (head) {
5853                bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5854                                                      head->free_bit);
5855                mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5856                     head->free_bit, (unsigned long long)head->free_blk);
5857
5858                ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5859                                               head->free_bit, bg_blkno, 1);
5860                if (ret) {
5861                        mlog_errno(ret);
5862                        goto out_journal;
5863                }
5864
5865                ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5866                if (ret) {
5867                        mlog_errno(ret);
5868                        goto out_journal;
5869                }
5870
5871                tmp = head;
5872                head = head->free_next;
5873                kfree(tmp);
5874        }
5875
5876out_journal:
5877        ocfs2_commit_trans(osb, handle);
5878
5879out_unlock:
5880        ocfs2_inode_unlock(inode, 1);
5881        brelse(di_bh);
5882out_mutex:
5883        mutex_unlock(&inode->i_mutex);
5884        iput(inode);
5885out:
5886        while(head) {
5887                /* Premature exit may have left some dangling items. */
5888                tmp = head;
5889                head = head->free_next;
5890                kfree(tmp);
5891        }
5892
5893        return ret;
5894}
5895
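/*
 * Process all per-slot free lists queued in @ctxt.  The first error seen
 * is returned, but the remaining lists are still freed.
 */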
5896int ocfs2_run_deallocs(struct ocfs2_super *osb,
5897                       struct ocfs2_cached_dealloc_ctxt *ctxt)
5898{
5899        int ret = 0, ret2;
5900        struct ocfs2_per_slot_free_list *fl;
5901
5902        if (!ctxt)
5903                return 0;
5904
5905        while (ctxt->c_first_suballocator) {
5906                fl = ctxt->c_first_suballocator;
5907
5908                if (fl->f_first) {
5909                        mlog(0, "Free items: (type %u, slot %d)\n",
5910                             fl->f_inode_type, fl->f_slot);
5911                        ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5912                                                       fl->f_slot, fl->f_first);
5913                        if (ret2)
5914                                mlog_errno(ret2);
5915                        if (!ret)
5916                                ret = ret2;
5917                }
5918
5919                ctxt->c_first_suballocator = fl->f_next_suballocator;
5920                kfree(fl);
5921        }
5922
5923        return ret;
5924}
5925
5926static struct ocfs2_per_slot_free_list *
5927ocfs2_find_per_slot_free_list(int type,
5928                              int slot,
5929                              struct ocfs2_cached_dealloc_ctxt *ctxt)
5930{
5931        struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5932
5933        while (fl) {
5934                if (fl->f_inode_type == type && fl->f_slot == slot)
5935                        return fl;
5936
5937                fl = fl->f_next_suballocator;
5938        }
5939
5940        fl = kmalloc(sizeof(*fl), GFP_NOFS);
5941        if (fl) {
5942                fl->f_inode_type = type;
5943                fl->f_slot = slot;
5944                fl->f_first = NULL;
5945                fl->f_next_suballocator = ctxt->c_first_suballocator;
5946
5947                ctxt->c_first_suballocator = fl;
5948        }
5949        return fl;
5950}
5951
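/*
 * Queue a single suballocator block (blkno/bit) for deferred freeing on
 * the per-slot list matching @type and @slot.
 */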
5952static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5953                                     int type, int slot, u64 blkno,
5954                                     unsigned int bit)
5955{
5956        int ret;
5957        struct ocfs2_per_slot_free_list *fl;
5958        struct ocfs2_cached_block_free *item;
5959
5960        fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5961        if (fl == NULL) {
5962                ret = -ENOMEM;
5963                mlog_errno(ret);
5964                goto out;
5965        }
5966
5967        item = kmalloc(sizeof(*item), GFP_NOFS);
5968        if (item == NULL) {
5969                ret = -ENOMEM;
5970                mlog_errno(ret);
5971                goto out;
5972        }
5973
5974        mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5975             type, slot, bit, (unsigned long long)blkno);
5976
5977        item->free_blk = blkno;
5978        item->free_bit = bit;
5979        item->free_next = fl->f_first;
5980
5981        fl->f_first = item;
5982
5983        ret = 0;
5984out:
5985        return ret;
5986}
5987
5988static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5989                                         struct ocfs2_extent_block *eb)
5990{
5991        return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5992                                         le16_to_cpu(eb->h_suballoc_slot),
5993                                         le64_to_cpu(eb->h_blkno),
5994                                         le16_to_cpu(eb->h_suballoc_bit));
5995}
5996
5997/* This function will figure out whether the currently last extent
5998 * block will be deleted, and if it will, what the new last extent
5999 * block will be so we can update its h_next_leaf_blk field, as well
6000 * as the dinode's i_last_eb_blk */
6001static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6002                                       unsigned int clusters_to_del,
6003                                       struct ocfs2_path *path,
6004                                       struct buffer_head **new_last_eb)
6005{
6006        int next_free, ret = 0;
6007        u32 cpos;
6008        struct ocfs2_extent_rec *rec;
6009        struct ocfs2_extent_block *eb;
6010        struct ocfs2_extent_list *el;
6011        struct buffer_head *bh = NULL;
6012
6013        *new_last_eb = NULL;
6014
6015        /* we have no tree, so of course, no last_eb. */
6016        if (!path->p_tree_depth)
6017                goto out;
6018
6019        /* trunc to zero special case - this makes tree_depth = 0
6020         * regardless of what it is.  */
6021        if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6022                goto out;
6023
6024        el = path_leaf_el(path);
6025        BUG_ON(!el->l_next_free_rec);
6026
6027        /*
6028         * Make sure that this extent list will actually be empty
6029         * after we clear away the data. We can shortcut out if
6030         * there's more than one non-empty extent in the
6031         * list. Otherwise, a check of the remaining extent is
6032         * necessary.
6033         */
6034        next_free = le16_to_cpu(el->l_next_free_rec);
6035        rec = NULL;
6036        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6037                if (next_free > 2)
6038                        goto out;
6039
6040                /* We may have a valid extent in index 1, check it. */
6041                if (next_free == 2)
6042                        rec = &el->l_recs[1];
6043
6044                /*
6045                 * Fall through - no more nonempty extents, so we want
6046                 * to delete this leaf.
6047                 */
6048        } else {
6049                if (next_free > 1)
6050                        goto out;
6051
6052                rec = &el->l_recs[0];
6053        }
6054
6055        if (rec) {
6056                /*
6057                 * Check if we'll only be trimming off the end of this
6058                 * cluster.
6059                 */
6060                if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6061                        goto out;
6062        }
6063
6064        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6065        if (ret) {
6066                mlog_errno(ret);
6067                goto out;
6068        }
6069
6070        ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6071        if (ret) {
6072                mlog_errno(ret);
6073                goto out;
6074        }
6075
6076        eb = (struct ocfs2_extent_block *) bh->b_data;
6077        el = &eb->h_list;
6078        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6079                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6080                ret = -EROFS;
6081                goto out;
6082        }
6083
6084        *new_last_eb = bh;
6085        get_bh(*new_last_eb);
6086        mlog(0, "returning block %llu, (cpos: %u)\n",
6087             (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6088out:
6089        brelse(bh);
6090
6091        return ret;
6092}
6093
6094/*
6095 * Trim some clusters off the rightmost edge of a tree. Only called
6096 * during truncate.
6097 *
6098 * The caller needs to:
6099 *   - start journaling of each path component.
6100 *   - compute and fully set up any new last ext block
6101 */
6102static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6103                           handle_t *handle, struct ocfs2_truncate_context *tc,
6104                           u32 clusters_to_del, u64 *delete_start)
6105{
6106        int ret, i, index = path->p_tree_depth;
6107        u32 new_edge = 0;
6108        u64 deleted_eb = 0;
6109        struct buffer_head *bh;
6110        struct ocfs2_extent_list *el;
6111        struct ocfs2_extent_rec *rec;
6112
6113        *delete_start = 0;
6114
6115        while (index >= 0) {
6116                bh = path->p_node[index].bh;
6117                el = path->p_node[index].el;
6118
6119                mlog(0, "traveling tree (index = %d, block = %llu)\n",
6120                     index,  (unsigned long long)bh->b_blocknr);
6121
6122                BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6123
6124                if (index !=
6125                    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6126                        ocfs2_error(inode->i_sb,
6127                                    "Inode %lu has invalid ext. block %llu",
6128                                    inode->i_ino,
6129                                    (unsigned long long)bh->b_blocknr);
6130                        ret = -EROFS;
6131                        goto out;
6132                }
6133
6134find_tail_record:
6135                i = le16_to_cpu(el->l_next_free_rec) - 1;
6136                rec = &el->l_recs[i];
6137
6138                mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6139                     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6140                     ocfs2_rec_clusters(el, rec),
6141                     (unsigned long long)le64_to_cpu(rec->e_blkno),
6142                     le16_to_cpu(el->l_next_free_rec));
6143
6144                BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6145
6146                if (le16_to_cpu(el->l_tree_depth) == 0) {
6147                        /*
6148                         * If the leaf block contains a single empty
6149                         * extent and no records, we can just remove
6150                         * the block.
6151                         */
6152                        if (i == 0 && ocfs2_is_empty_extent(rec)) {
6153                                memset(rec, 0,
6154                                       sizeof(struct ocfs2_extent_rec));
6155                                el->l_next_free_rec = cpu_to_le16(0);
6156
6157                                goto delete;
6158                        }
6159
6160                        /*
6161                         * Remove any empty extents by shifting things
6162                         * left. That should make life much easier on
6163                         * the code below. This condition is rare
6164                         * enough that we shouldn't see a performance
6165                         * hit.
6166                         */
6167                        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6168                                le16_add_cpu(&el->l_next_free_rec, -1);
6169
6170                                for(i = 0;
6171                                    i < le16_to_cpu(el->l_next_free_rec); i++)
6172                                        el->l_recs[i] = el->l_recs[i + 1];
6173
6174                                memset(&el->l_recs[i], 0,
6175                                       sizeof(struct ocfs2_extent_rec));
6176
6177                                /*
6178                                 * We've modified our extent list. The
6179                                 * simplest way to handle this change
6180                                 * is to begin the search from the
6181                                 * start again.
6182                                 */
6183                                goto find_tail_record;
6184                        }
6185
6186                        le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6187
6188                        /*
6189                         * We'll use "new_edge" on our way back up the
6190                         * tree to know what our rightmost cpos is.
6191                         */
6192                        new_edge = le16_to_cpu(rec->e_leaf_clusters);
6193                        new_edge += le32_to_cpu(rec->e_cpos);
6194
6195                        /*
6196                         * The caller will use this to delete data blocks.
6197                         */
6198                        *delete_start = le64_to_cpu(rec->e_blkno)
6199                                + ocfs2_clusters_to_blocks(inode->i_sb,
6200                                        le16_to_cpu(rec->e_leaf_clusters));
6201
6202                        /*
6203                         * If it's now empty, remove this record.
6204                         */
6205                        if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6206                                memset(rec, 0,
6207                                       sizeof(struct ocfs2_extent_rec));
6208                                le16_add_cpu(&el->l_next_free_rec, -1);
6209                        }
6210                } else {
6211                        if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6212                                memset(rec, 0,
6213                                       sizeof(struct ocfs2_extent_rec));
6214                                le16_add_cpu(&el->l_next_free_rec, -1);
6215
6216                                goto delete;
6217                        }
6218
6219                        /* Can this actually happen? */
6220                        if (le16_to_cpu(el->l_next_free_rec) == 0)
6221                                goto delete;
6222
6223                        /*
6224                         * We never actually deleted any clusters
6225                         * because our leaf was empty. There's no
6226                         * reason to adjust the rightmost edge then.
6227                         */
6228                        if (new_edge == 0)
6229                                goto delete;
6230
6231                        rec->e_int_clusters = cpu_to_le32(new_edge);
6232                        le32_add_cpu(&rec->e_int_clusters,
6233                                     -le32_to_cpu(rec->e_cpos));
6234
6235                         /*
6236                          * A deleted child record should have been
6237                          * caught above.
6238                          */
6239                         BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6240                }
6241
6242delete:
6243                ret = ocfs2_journal_dirty(handle, bh);
6244                if (ret) {
6245                        mlog_errno(ret);
6246                        goto out;
6247                }
6248
6249                mlog(0, "extent list container %llu, after: record %d: "
6250                     "(%u, %u, %llu), next = %u.\n",
6251                     (unsigned long long)bh->b_blocknr, i,
6252                     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6253                     (unsigned long long)le64_to_cpu(rec->e_blkno),
6254                     le16_to_cpu(el->l_next_free_rec));
6255
6256                /*
6257                 * We must be careful to only attempt delete of an
6258                 * extent block (and not the root inode block).
6259                 */
6260                if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6261                        struct ocfs2_extent_block *eb =
6262                                (struct ocfs2_extent_block *)bh->b_data;
6263
6264                        /*
6265                         * Save this for use when processing the
6266                         * parent block.
6267                         */
6268                        deleted_eb = le64_to_cpu(eb->h_blkno);
6269
6270                        mlog(0, "deleting this extent block.\n");
6271
6272                        ocfs2_remove_from_cache(inode, bh);
6273
6274                        BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6275                        BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6276                        BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6277
6278                        ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6279                        /* An error here is not fatal. */
6280                        if (ret < 0)
6281                                mlog_errno(ret);
6282                } else {
6283                        deleted_eb = 0;
6284                }
6285
6286                index--;
6287        }
6288
6289        ret = 0;
6290out:
6291        return ret;
6292}
6293
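/*
 * Remove clusters_to_del clusters from the rightmost edge of the tree
 * within one transaction: journal the path, trim it, update i_clusters
 * and i_last_eb_blk, and log the freed range in the truncate log.
 */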
6294static int ocfs2_do_truncate(struct ocfs2_super *osb,
6295                             unsigned int clusters_to_del,
6296                             struct inode *inode,
6297                             struct buffer_head *fe_bh,
6298                             handle_t *handle,
6299                             struct ocfs2_truncate_context *tc,
6300                             struct ocfs2_path *path)
6301{
6302        int status;
6303        struct ocfs2_dinode *fe;
6304        struct ocfs2_extent_block *last_eb = NULL;
6305        struct ocfs2_extent_list *el;
6306        struct buffer_head *last_eb_bh = NULL;
6307        u64 delete_blk = 0;
6308
6309        fe = (struct ocfs2_dinode *) fe_bh->b_data;
6310
6311        status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6312                                             path, &last_eb_bh);
6313        if (status < 0) {
6314                mlog_errno(status);
6315                goto bail;
6316        }
6317
6318        /*
6319         * Each component will be touched, so we might as well journal
6320         * here to avoid having to handle errors later.
6321         */
6322        status = ocfs2_journal_access_path(inode, handle, path);
6323        if (status < 0) {
6324                mlog_errno(status);
6325                goto bail;
6326        }
6327
6328        if (last_eb_bh) {
6329                status = ocfs2_journal_access(handle, inode, last_eb_bh,
6330                                              OCFS2_JOURNAL_ACCESS_WRITE);
6331                if (status < 0) {
6332                        mlog_errno(status);
6333                        goto bail;
6334                }
6335
6336                last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6337        }
6338
6339        el = &(fe->id2.i_list);
6340
6341        /*
6342         * Lower levels depend on this never happening, but it's best
6343         * to check it up here before changing the tree.
6344         */
6345        if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6346                ocfs2_error(inode->i_sb,
6347                            "Inode %lu has an empty extent record, depth %u\n",
6348                            inode->i_ino, le16_to_cpu(el->l_tree_depth));
6349                status = -EROFS;
6350                goto bail;
6351        }
6352
6353        spin_lock(&OCFS2_I(inode)->ip_lock);
6354        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6355                                      clusters_to_del;
6356        spin_unlock(&OCFS2_I(inode)->ip_lock);
6357        le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6358        inode->i_blocks = ocfs2_inode_sector_count(inode);
6359
6360        status = ocfs2_trim_tree(inode, path, handle, tc,
6361                                 clusters_to_del, &delete_blk);
6362        if (status) {
6363                mlog_errno(status);
6364                goto bail;
6365        }
6366
6367        if (le32_to_cpu(fe->i_clusters) == 0) {
6368                /* trunc to zero is a special case. */
6369                el->l_tree_depth = 0;
6370                fe->i_last_eb_blk = 0;
6371        } else if (last_eb)
6372                fe->i_last_eb_blk = last_eb->h_blkno;
6373
6374        status = ocfs2_journal_dirty(handle, fe_bh);
6375        if (status < 0) {
6376                mlog_errno(status);
6377                goto bail;
6378        }
6379
6380        if (last_eb) {
6381                /* If there will be a new last extent block, then by
6382                 * definition, there cannot be any leaves to the right of
6383                 * it. */
6384                last_eb->h_next_leaf_blk = 0;
6385                status = ocfs2_journal_dirty(handle, last_eb_bh);
6386                if (status < 0) {
6387                        mlog_errno(status);
6388                        goto bail;
6389                }
6390        }
6391
6392        if (delete_blk) {
6393                status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6394                                                   clusters_to_del);
6395                if (status < 0) {
6396                        mlog_errno(status);
6397                        goto bail;
6398                }
6399        }
6400        status = 0;
6401bail:
6402
6403        mlog_exit(status);
6404        return status;
6405}
6406
6407static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6408{
6409        set_buffer_uptodate(bh);
6410        mark_buffer_dirty(bh);
6411        return 0;
6412}
6413
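/*
 * Map buffers for the [from, to) range of @page, optionally zero it,
 * then mark the buffers dirty (or add them to the journal for ordered
 * data) and set the page up to date once all of its buffers are.
 */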
6414static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6415                                     unsigned int from, unsigned int to,
6416                                     struct page *page, int zero, u64 *phys)
6417{
6418        int ret, partial = 0;
6419
6420        ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6421        if (ret)
6422                mlog_errno(ret);
6423
6424        if (zero)
6425                zero_user_segment(page, from, to);
6426
6427        /*
6428         * Need to set the buffers we zero'd into uptodate
6429         * here if they aren't - ocfs2_map_page_blocks()
6430         * might've skipped some
6431         */
6432        ret = walk_page_buffers(handle, page_buffers(page),
6433                                from, to, &partial,
6434                                ocfs2_zero_func);
6435        if (ret < 0)
6436                mlog_errno(ret);
6437        else if (ocfs2_should_order_data(inode)) {
6438                ret = ocfs2_jbd2_file_inode(handle, inode);
6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6440                ret = walk_page_buffers(handle, page_buffers(page),
6441                                        from, to, &partial,
6442                                        ocfs2_journal_dirty_data);
6443#endif
6444                if (ret < 0)
6445                        mlog_errno(ret);
6446        }
6447
6448        if (!partial)
6449                SetPageUptodate(page);
6450
6451        flush_dcache_page(page);
6452}
6453
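/*
 * Zero and dirty the byte range [start, end) across the given pages,
 * then unlock and release them.
 */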
6454static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6455                                     loff_t end, struct page **pages,
6456                                     int numpages, u64 phys, handle_t *handle)
6457{
6458        int i;
6459        struct page *page;
6460        unsigned int from, to = PAGE_CACHE_SIZE;
6461        struct super_block *sb = inode->i_sb;
6462
6463        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6464
6465        if (numpages == 0)
6466                goto out;
6467
6468        to = PAGE_CACHE_SIZE;
6469        for(i = 0; i < numpages; i++) {
6470                page = pages[i];
6471
6472                from = start & (PAGE_CACHE_SIZE - 1);
6473                if ((end >> PAGE_CACHE_SHIFT) == page->index)
6474                        to = end & (PAGE_CACHE_SIZE - 1);
6475
6476                BUG_ON(from > PAGE_CACHE_SIZE);
6477                BUG_ON(to > PAGE_CACHE_SIZE);
6478
6479                ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6480                                         &phys);
6481
6482                start = (page->index + 1) << PAGE_CACHE_SHIFT;
6483        }
6484out:
6485        if (pages)
6486                ocfs2_unlock_and_free_pages(pages, numpages);
6487}
6488
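/*
 * Grab and lock the page cache pages covering [start, end).  Both
 * offsets must lie in the same cluster.  On error any pages already
 * taken are unlocked and released again.
 */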
6489static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6490                                struct page **pages, int *num)
6491{
6492        int numpages, ret = 0;
6493        struct super_block *sb = inode->i_sb;
6494        struct address_space *mapping = inode->i_mapping;
6495        unsigned long index;
6496        loff_t last_page_bytes;
6497
6498        BUG_ON(start > end);
6499
6500        BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6501               (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6502
6503        numpages = 0;
6504        last_page_bytes = PAGE_ALIGN(end);
6505        index = start >> PAGE_CACHE_SHIFT;
6506        do {
6507                pages[numpages] = grab_cache_page(mapping, index);
6508                if (!pages[numpages]) {
6509                        ret = -ENOMEM;
6510                        mlog_errno(ret);
6511                        goto out;
6512                }
6513
6514                numpages++;
6515                index++;
6516        } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6517
6518out:
6519        if (ret != 0) {
6520                if (pages)
6521                        ocfs2_unlock_and_free_pages(pages, numpages);
6522                numpages = 0;
6523        }
6524
6525        *num = numpages;
6526
6527        return ret;
6528}
6529
6530/*
6531 * Zero the area past i_size but still within an allocated
6532 * cluster. This avoids exposing nonzero data on subsequent file
6533 * extends.
6534 *
6535 * We need to call this before i_size is updated on the inode because
6536 * otherwise block_write_full_page() will skip writeout of pages past
6537 * i_size. The range to zero is passed in explicitly for this reason.
6538 */
6539int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6540                                  u64 range_start, u64 range_end)
6541{
6542        int ret = 0, numpages;
6543        struct page **pages = NULL;
6544        u64 phys;
6545        unsigned int ext_flags;
6546        struct super_block *sb = inode->i_sb;
6547
6548        /*
6549         * File systems which don't support sparse files zero on every
6550         * extend.
6551         */
6552        if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6553                return 0;
6554
6555        pages = kcalloc(ocfs2_pages_per_cluster(sb),
6556                        sizeof(struct page *), GFP_NOFS);
6557        if (pages == NULL) {
6558                ret = -ENOMEM;
6559                mlog_errno(ret);
6560                goto out;
6561        }
6562
6563        if (range_start == range_end)
6564                goto out;
6565
6566        ret = ocfs2_extent_map_get_blocks(inode,
6567                                          range_start >> sb->s_blocksize_bits,
6568                                          &phys, NULL, &ext_flags);
6569        if (ret) {
6570                mlog_errno(ret);
6571                goto out;
6572        }
6573
6574        /*
6575         * Tail is a hole, or is marked unwritten. In either case, we
6576         * can count on read and write to return/push zeros.
6577         */
6578        if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6579                goto out;
6580
6581        ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6582                                   &numpages);
6583        if (ret) {
6584                mlog_errno(ret);
6585                goto out;
6586        }
6587
6588        ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6589                                 numpages, phys, handle);
6590
6591        /*
6592         * Initiate writeout of the pages we zero'd here. We don't
6593         * wait on them - the truncate_inode_pages() call later will
6594         * do that for us.
6595         */
6596        ret = do_sync_mapping_range(inode->i_mapping, range_start,
6597                                    range_end - 1, SYNC_FILE_RANGE_WRITE);
6598        if (ret)
6599                mlog_errno(ret);
6600
6601out:
6602        if (pages)
6603                kfree(pages);
6604
6605        return ret;
6606}
6607
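/*
 * Zero the id2 union of a dinode, stopping short of any inline extended
 * attribute area stored at the end of the inode block.
 */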
6608static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6609                                             struct ocfs2_dinode *di)
6610{
6611        unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6612        unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6613
6614        if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6615                memset(&di->id2, 0, blocksize -
6616                                    offsetof(struct ocfs2_dinode, id2) -
6617                                    xattrsize);
6618        else
6619                memset(&di->id2, 0, blocksize -
6620                                    offsetof(struct ocfs2_dinode, id2));
6621}
6622
6623void ocfs2_dinode_new_extent_list(struct inode *inode,
6624                                  struct ocfs2_dinode *di)
6625{
6626        ocfs2_zero_dinode_id2_with_xattr(inode, di);
6627        di->id2.i_list.l_tree_depth = 0;
6628        di->id2.i_list.l_next_free_rec = 0;
6629        di->id2.i_list.l_count = cpu_to_le16(
6630                ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6631}
6632
6633void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6634{
6635        struct ocfs2_inode_info *oi = OCFS2_I(inode);
6636        struct ocfs2_inline_data *idata = &di->id2.i_data;
6637
6638        spin_lock(&oi->ip_lock);
6639        oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6640        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6641        spin_unlock(&oi->ip_lock);
6642
6643        /*
6644         * We clear the entire i_data structure here so that all
6645         * fields can be properly initialized.
6646         */
6647        ocfs2_zero_dinode_id2_with_xattr(inode, di);
6648
6649        idata->id_count = cpu_to_le16(
6650                        ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6651}
6652
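/*
 * Convert an inline-data inode to an extent list: allocate a cluster for
 * any existing data, copy the inline bytes out through the page cache,
 * clear OCFS2_INLINE_DATA_FL and insert the new cluster as extent 0.
 */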
6653int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6654                                         struct buffer_head *di_bh)
6655{
6656        int ret, i, has_data, num_pages = 0;
6657        handle_t *handle;
6658        u64 uninitialized_var(block);
6659        struct ocfs2_inode_info *oi = OCFS2_I(inode);
6660        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6661        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6662        struct ocfs2_alloc_context *data_ac = NULL;
6663        struct page **pages = NULL;
6664        loff_t end = osb->s_clustersize;
6665        struct ocfs2_extent_tree et;
6666
6667        has_data = i_size_read(inode) ? 1 : 0;
6668
6669        if (has_data) {
6670                pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6671                                sizeof(struct page *), GFP_NOFS);
6672                if (pages == NULL) {
6673                        ret = -ENOMEM;
6674                        mlog_errno(ret);
6675                        goto out;
6676                }
6677
6678                ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6679                if (ret) {
6680                        mlog_errno(ret);
6681                        goto out;
6682                }
6683        }
6684
6685        handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
6686        if (IS_ERR(handle)) {
6687                ret = PTR_ERR(handle);
6688                mlog_errno(ret);
6689                goto out_unlock;
6690        }
6691
6692        ret = ocfs2_journal_access(handle, inode, di_bh,
6693                                   OCFS2_JOURNAL_ACCESS_WRITE);
6694        if (ret) {
6695                mlog_errno(ret);
6696                goto out_commit;
6697        }
6698
6699        if (has_data) {
6700                u32 bit_off, num;
6701                unsigned int page_end;
6702                u64 phys;
6703
6704                ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6705                                           &num);
6706                if (ret) {
6707                        mlog_errno(ret);
6708                        goto out_commit;
6709                }
6710
6711                /*
6712                 * Save two copies, one for insert, and one that can
6713                 * be changed by ocfs2_map_and_dirty_page() below.
6714                 */
6715                block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
6716
6717                /*
6718                 * Non sparse file systems zero on extend, so no need
6719                 * to do that now.
6720                 */
6721                if (!ocfs2_sparse_alloc(osb) &&
6722                    PAGE_CACHE_SIZE < osb->s_clustersize)
6723                        end = PAGE_CACHE_SIZE;
6724
6725                ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
6726                if (ret) {
6727                        mlog_errno(ret);
6728                        goto out_commit;
6729                }
6730
6731                /*
6732                 * This should populate the 1st page for us and mark
6733                 * it up to date.
6734                 */
6735                ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
6736                if (ret) {
6737                        mlog_errno(ret);
6738                        goto out_commit;
6739                }
6740
6741                page_end = PAGE_CACHE_SIZE;
6742                if (PAGE_CACHE_SIZE > osb->s_clustersize)
6743                        page_end = osb->s_clustersize;
6744
6745                for (i = 0; i < num_pages; i++)
6746                        ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
6747                                                 pages[i], i > 0, &phys);
6748        }
6749
6750        spin_lock(&oi->ip_lock);
6751        oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
6752        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6753        spin_unlock(&oi->ip_lock);
6754
6755        ocfs2_dinode_new_extent_list(inode, di);
6756
6757        ocfs2_journal_dirty(handle, di_bh);
6758
6759        if (has_data) {
6760                /*
6761                 * An error at this point should be extremely rare. If
6762                 * this proves to be false, we could always re-build
6763                 * the in-inode data from our pages.
6764                 */
6765                ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6766                ret = ocfs2_insert_extent(osb, handle, inode, &et,
6767                                          0, block, 1, 0, NULL);
6768                if (ret) {
6769                        mlog_errno(ret);
6770                        goto out_commit;
6771                }
6772
6773                inode->i_blocks = ocfs2_inode_sector_count(inode);
6774        }
6775
6776out_commit:
6777        ocfs2_commit_trans(osb, handle);
6778
6779out_unlock:
6780        if (data_ac)
6781                ocfs2_free_alloc_context(data_ac);
6782
6783out:
6784        if (pages) {
6785                ocfs2_unlock_and_free_pages(pages, num_pages);
6786                kfree(pages);
6787        }
6788
6789        return ret;
6790}
6791
6792/*
6793 * It is expected, that by the time you call this function,
6794 * inode->i_size and fe->i_size have been adjusted.
6795 *
6796 * WARNING: This will kfree the truncate context
6797 */
6798int ocfs2_commit_truncate(struct ocfs2_super *osb,
6799                          struct inode *inode,
6800                          struct buffer_head *fe_bh,
6801                          struct ocfs2_truncate_context *tc)
6802{
6803        int status, i, credits, tl_sem = 0;
6804        u32 clusters_to_del, new_highest_cpos, range;
6805        struct ocfs2_extent_list *el;
6806        handle_t *handle = NULL;
6807        struct inode *tl_inode = osb->osb_tl_inode;
6808        struct ocfs2_path *path = NULL;
6809        struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6810
6811        mlog_entry_void();
6812
6813        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6814                                                     i_size_read(inode));
6815
6816        path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6817        if (!path) {
6818                status = -ENOMEM;
6819                mlog_errno(status);
6820                goto bail;
6821        }
6822
6823        ocfs2_extent_map_trunc(inode, new_highest_cpos);
6824
6825start:
6826        /*
6827         * Check that we still have allocation to delete.
6828         */
6829        if (OCFS2_I(inode)->ip_clusters == 0) {
6830                status = 0;
6831                goto bail;
6832        }
6833
6834        /*
6835         * Truncate always works against the rightmost tree branch.
6836         */
6837        status = ocfs2_find_path(inode, path, UINT_MAX);
6838        if (status) {
6839                mlog_errno(status);
6840                goto bail;
6841        }
6842
6843        mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6844             OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6845
6846        /*
6847         * By now, el will point to the extent list on the bottom most
6848         * portion of this tree. Only the tail record is considered in
6849         * each pass.
6850         *
6851         * We handle the following cases, in order:
6852         * - empty extent: delete the remaining branch
6853         * - remove the entire record
6854         * - remove a partial record
6855         * - no record needs to be removed (truncate has completed)
6856         */
6857        el = path_leaf_el(path);
6858        if (le16_to_cpu(el->l_next_free_rec) == 0) {
6859                ocfs2_error(inode->i_sb,
6860                            "Inode %llu has empty extent block at %llu\n",
6861                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
6862                            (unsigned long long)path_leaf_bh(path)->b_blocknr);
6863                status = -EROFS;
6864                goto bail;
6865        }
6866
6867        i = le16_to_cpu(el->l_next_free_rec) - 1;
6868        range = le32_to_cpu(el->l_recs[i].e_cpos) +
6869                ocfs2_rec_clusters(el, &el->l_recs[i]);
6870        if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6871                clusters_to_del = 0;
6872        } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6873                clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6874        } else if (range > new_highest_cpos) {
6875                clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6876                                   le32_to_cpu(el->l_recs[i].e_cpos)) -
6877                                  new_highest_cpos;
6878        } else {
6879                status = 0;
6880                goto bail;
6881        }
6882
6883        mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6884             clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6885
6886        mutex_lock(&tl_inode->i_mutex);
6887        tl_sem = 1;
6888        /* ocfs2_truncate_log_needs_flush guarantees us at least one
6889         * record is free for use. If there isn't any, we flush to get
6890         * an empty truncate log.  */
6891        if (ocfs2_truncate_log_needs_flush(osb)) {
6892                status = __ocfs2_flush_truncate_log(osb);
6893                if (status < 0) {
6894                        mlog_errno(status);
6895                        goto bail;
6896                }
6897        }
6898
6899        credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6900                                                (struct ocfs2_dinode *)fe_bh->b_data,
6901                                                el);
6902        handle = ocfs2_start_trans(osb, credits);
6903        if (IS_ERR(handle)) {
6904                status = PTR_ERR(handle);
6905                handle = NULL;
6906                mlog_errno(status);
6907                goto bail;
6908        }
6909
6910        status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6911                                   tc, path);
6912        if (status < 0) {
6913                mlog_errno(status);
6914                goto bail;
6915        }
6916
6917        mutex_unlock(&tl_inode->i_mutex);
6918        tl_sem = 0;
6919
6920        ocfs2_commit_trans(osb, handle);
6921        handle = NULL;
6922
6923        ocfs2_reinit_path(path, 1);
6924
6925        /*
6926         * The check above will catch the case where we've truncated
6927         * away all allocation.
6928         */
6929        goto start;
6930
6931bail:
6932
6933        ocfs2_schedule_truncate_log_flush(osb, 1);
6934
6935        if (tl_sem)
6936                mutex_unlock(&tl_inode->i_mutex);
6937
6938        if (handle)
6939                ocfs2_commit_trans(osb, handle);
6940
6941        ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6942
6943        ocfs2_free_path(path);
6944
6945        /* This will drop the ext_alloc cluster lock for us */
6946        ocfs2_free_truncate_context(tc);
6947
6948        mlog_exit(status);
6949        return status;
6950}
6951
6952/*
6953 * Expects the inode to already be locked.
6954 */
6955int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6956                           struct inode *inode,
6957                           struct buffer_head *fe_bh,
6958                           struct ocfs2_truncate_context **tc)
6959{
6960        int status;
6961        unsigned int new_i_clusters;
6962        struct ocfs2_dinode *fe;
6963        struct ocfs2_extent_block *eb;
6964        struct buffer_head *last_eb_bh = NULL;
6965
6966        mlog_entry_void();
6967
6968        *tc = NULL;
6969
6970        new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6971                                                  i_size_read(inode));
6972        fe = (struct ocfs2_dinode *) fe_bh->b_data;
6973
6974        mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6975             "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6976             (unsigned long long)le64_to_cpu(fe->i_size));
6977
6978        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6979        if (!(*tc)) {
6980                status = -ENOMEM;
6981                mlog_errno(status);
6982                goto bail;
6983        }
6984        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6985
6986        if (fe->id2.i_list.l_tree_depth) {
6987                status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
6988                                          &last_eb_bh);
6989                if (status < 0) {
6990                        mlog_errno(status);
6991                        goto bail;
6992                }
6993                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6994                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6995                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6996
6997                        brelse(last_eb_bh);
6998                        status = -EIO;
6999                        goto bail;
7000                }
7001        }
7002
7003        (*tc)->tc_last_eb_bh = last_eb_bh;
7004
7005        status = 0;
7006bail:
7007        if (status < 0) {
7008                if (*tc)
7009                        ocfs2_free_truncate_context(*tc);
7010                *tc = NULL;
7011        }
7012        mlog_exit_void();
7013        return status;
7014}
7015
7016/*
7017 * 'start' is inclusive, 'end' is not.
7018 */
7019int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7020                          unsigned int start, unsigned int end, int trunc)
7021{
7022        int ret;
7023        unsigned int numbytes;
7024        handle_t *handle;
7025        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7026        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7027        struct ocfs2_inline_data *idata = &di->id2.i_data;
7028
7029        if (end > i_size_read(inode))
7030                end = i_size_read(inode);
7031
7032        BUG_ON(start >= end);
7033
7034        if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7035            !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7036            !ocfs2_supports_inline_data(osb)) {
7037                ocfs2_error(inode->i_sb,
7038                            "Inline data flags for inode %llu don't agree! "
7039                            "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7040                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
7041                            le16_to_cpu(di->i_dyn_features),
7042                            OCFS2_I(inode)->ip_dyn_features,
7043                            osb->s_feature_incompat);
7044                ret = -EROFS;
7045                goto out;
7046        }
7047
7048        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7049        if (IS_ERR(handle)) {
7050                ret = PTR_ERR(handle);
7051                mlog_errno(ret);
7052                goto out;
7053        }
7054
7055        ret = ocfs2_journal_access(handle, inode, di_bh,
7056                                   OCFS2_JOURNAL_ACCESS_WRITE);
7057        if (ret) {
7058                mlog_errno(ret);
7059                goto out_commit;
7060        }
7061
7062        numbytes = end - start;
7063        memset(idata->id_data + start, 0, numbytes);
7064
7065        /*
7066         * No need to worry about the data page here - it's been
7067         * truncated already and inline data doesn't need it for
7068         * pushing zeros to disk, so we'll let readpage pick it up
7069         * later.
7070         */
7071        if (trunc) {
7072                i_size_write(inode, start);
7073                di->i_size = cpu_to_le64(start);
7074        }
7075
7076        inode->i_blocks = ocfs2_inode_sector_count(inode);
7077        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7078
7079        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7080        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7081
7082        ocfs2_journal_dirty(handle, di_bh);
7083
7084out_commit:
7085        ocfs2_commit_trans(osb, handle);
7086
7087out:
7088        return ret;
7089}
7090
7091static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7092{
7093        /*
7094         * The caller is responsible for completing deallocation
7095         * before freeing the context.
7096         */
7097        if (tc->tc_dealloc.c_first_suballocator != NULL)
7098                mlog(ML_NOTICE,
7099                     "Truncate completion has non-empty dealloc context\n");
7100
7101        brelse(tc->tc_last_eb_bh);
7102
7103        kfree(tc);
7104}