Showing error 1670

User: Jiri Slaby
Error type: Invalid Pointer Dereference
Error type description: An invalid pointer is being dereferenced
File location: fs/xfs/xfs_inode.c
Line in file: 2128
Project: Linux Kernel
Project version: 2.6.28
Tools: Smatch (1.59)
Entered: 2013-09-10 07:54:05 UTC
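
The statement flagged at line 2128 falls past the end of the excerpt below.
As a minimal sketch of the error class only, assuming a hypothetical
linked-list lookup (none of these names come from xfs_inode.c), an invalid
pointer dereference looks like:

    struct item { int key, value; struct item *next; };

    /* hypothetical lookup; returns NULL when key is absent */
    static struct item *find_item(struct item *head, int key)
    {
            while (head && head->key != key)
                    head = head->next;
            return head;
    }

    int value_of(struct item *head, int key)
    {
            struct item *it = find_item(head, key);

            return it->value;   /* invalid dereference when it == NULL */
    }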


Source:

   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include <linux/log2.h>
  19
  20#include "xfs.h"
  21#include "xfs_fs.h"
  22#include "xfs_types.h"
  23#include "xfs_bit.h"
  24#include "xfs_log.h"
  25#include "xfs_inum.h"
  26#include "xfs_imap.h"
  27#include "xfs_trans.h"
  28#include "xfs_trans_priv.h"
  29#include "xfs_sb.h"
  30#include "xfs_ag.h"
  31#include "xfs_dir2.h"
  32#include "xfs_dmapi.h"
  33#include "xfs_mount.h"
  34#include "xfs_bmap_btree.h"
  35#include "xfs_alloc_btree.h"
  36#include "xfs_ialloc_btree.h"
  37#include "xfs_dir2_sf.h"
  38#include "xfs_attr_sf.h"
  39#include "xfs_dinode.h"
  40#include "xfs_inode.h"
  41#include "xfs_buf_item.h"
  42#include "xfs_inode_item.h"
  43#include "xfs_btree.h"
  44#include "xfs_alloc.h"
  45#include "xfs_ialloc.h"
  46#include "xfs_bmap.h"
  47#include "xfs_rw.h"
  48#include "xfs_error.h"
  49#include "xfs_utils.h"
  50#include "xfs_dir2_trace.h"
  51#include "xfs_quota.h"
  52#include "xfs_acl.h"
  53#include "xfs_filestream.h"
  54#include "xfs_vnodeops.h"
  55
  56kmem_zone_t *xfs_ifork_zone;
  57kmem_zone_t *xfs_inode_zone;
  58
  59/*
  60 * Used in xfs_itruncate().  This is the maximum number of extents
  61 * freed from a file in a single transaction.
  62 */
  63#define        XFS_ITRUNC_MAX_EXTENTS        2
  64
  65STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  66STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
  67STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
  68STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  69
  70#ifdef DEBUG
  71/*
  72 * Make sure that the extents in the given memory buffer
  73 * are valid.
  74 */
  75STATIC void
  76xfs_validate_extents(
  77        xfs_ifork_t                *ifp,
  78        int                        nrecs,
  79        xfs_exntfmt_t                fmt)
  80{
  81        xfs_bmbt_irec_t                irec;
  82        xfs_bmbt_rec_host_t        rec;
  83        int                        i;
  84
  85        for (i = 0; i < nrecs; i++) {
  86                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
  87                rec.l0 = get_unaligned(&ep->l0);
  88                rec.l1 = get_unaligned(&ep->l1);
  89                xfs_bmbt_get_all(&rec, &irec);
  90                if (fmt == XFS_EXTFMT_NOSTATE)
  91                        ASSERT(irec.br_state == XFS_EXT_NORM);
  92        }
  93}
  94#else /* DEBUG */
  95#define xfs_validate_extents(ifp, nrecs, fmt)
  96#endif /* DEBUG */
  97
  98/*
  99 * Check that none of the inodes in the buffer have a next
 100 * unlinked field of 0.
 101 */
 102#if defined(DEBUG)
 103void
 104xfs_inobp_check(
 105        xfs_mount_t        *mp,
 106        xfs_buf_t        *bp)
 107{
 108        int                i;
 109        int                j;
 110        xfs_dinode_t        *dip;
 111
 112        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 113
 114        for (i = 0; i < j; i++) {
 115                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 116                                        i * mp->m_sb.sb_inodesize);
 117                if (!dip->di_next_unlinked)  {
 118                        xfs_fs_cmn_err(CE_ALERT, mp,
 119                                "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
 120                                bp);
 121                        ASSERT(dip->di_next_unlinked);
 122                }
 123        }
 124}
 125#endif
 126
 127/*
 128 * Find the buffer associated with the given inode map
 129 * We do basic validation checks on the buffer once it has been
 130 * retrieved from disk.
 131 */
 132STATIC int
 133xfs_imap_to_bp(
 134        xfs_mount_t        *mp,
 135        xfs_trans_t        *tp,
 136        xfs_imap_t        *imap,
 137        xfs_buf_t        **bpp,
 138        uint                buf_flags,
 139        uint                imap_flags)
 140{
 141        int                error;
 142        int                i;
 143        int                ni;
 144        xfs_buf_t        *bp;
 145
 146        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 147                                   (int)imap->im_len, buf_flags, &bp);
 148        if (error) {
 149                if (error != EAGAIN) {
 150                        cmn_err(CE_WARN,
 151                                "xfs_imap_to_bp: xfs_trans_read_buf() returned "
 152                                "an error %d on %s.  Returning error.",
 153                                error, mp->m_fsname);
 154                } else {
 155                        ASSERT(buf_flags & XFS_BUF_TRYLOCK);
 156                }
 157                return error;
 158        }
 159
 160        /*
 161         * Validate the magic number and version of every inode in the buffer
 162         * (if DEBUG kernel), or the first inode in the buffer otherwise.
 163         */
 164#ifdef DEBUG
 165        ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
 166#else        /* usual case */
 167        ni = 1;
 168#endif
 169
 170        for (i = 0; i < ni; i++) {
 171                int                di_ok;
 172                xfs_dinode_t        *dip;
 173
 174                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 175                                        (i << mp->m_sb.sb_inodelog));
 176                di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
 177                            XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
 178                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 179                                                XFS_ERRTAG_ITOBP_INOTOBP,
 180                                                XFS_RANDOM_ITOBP_INOTOBP))) {
 181                        if (imap_flags & XFS_IMAP_BULKSTAT) {
 182                                xfs_trans_brelse(tp, bp);
 183                                return XFS_ERROR(EINVAL);
 184                        }
 185                        XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
 186                                                XFS_ERRLEVEL_HIGH, mp, dip);
 187#ifdef DEBUG
 188                        cmn_err(CE_PANIC,
 189                                        "Device %s - bad inode magic/vsn "
 190                                        "daddr %lld #%d (magic=%x)",
 191                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
 192                                (unsigned long long)imap->im_blkno, i,
 193                                be16_to_cpu(dip->di_core.di_magic));
 194#endif
 195                        xfs_trans_brelse(tp, bp);
 196                        return XFS_ERROR(EFSCORRUPTED);
 197                }
 198        }
 199
 200        xfs_inobp_check(mp, bp);
 201
 202        /*
 203         * Mark the buffer as an inode buffer now that it looks good
 204         */
 205        XFS_BUF_SET_VTYPE(bp, B_FS_INO);
 206
 207        *bpp = bp;
 208        return 0;
 209}
 210
 211/*
 212 * This routine is called to map an inode number within a file
 213 * system to the buffer containing the on-disk version of the
 214 * inode.  It returns a pointer to the buffer containing the
 215 * on-disk inode in the bpp parameter, and in the dip parameter
 216 * it returns a pointer to the on-disk inode within that buffer.
 217 *
 218 * If a non-zero error is returned, then the contents of bpp and
 219 * dipp are undefined.
 220 *
 221 * Use xfs_imap() to determine the size and location of the
 222 * buffer to read from disk.
 223 */
 224STATIC int
 225xfs_inotobp(
 226        xfs_mount_t        *mp,
 227        xfs_trans_t        *tp,
 228        xfs_ino_t        ino,
 229        xfs_dinode_t        **dipp,
 230        xfs_buf_t        **bpp,
 231        int                *offset)
 232{
 233        xfs_imap_t        imap;
 234        xfs_buf_t        *bp;
 235        int                error;
 236
 237        imap.im_blkno = 0;
 238        error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
 239        if (error)
 240                return error;
 241
 242        error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
 243        if (error)
 244                return error;
 245
 246        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 247        *bpp = bp;
 248        *offset = imap.im_boffset;
 249        return 0;
 250}
 251
 252
 253/*
 254 * This routine is called to map an inode to the buffer containing
 255 * the on-disk version of the inode.  It returns a pointer to the
 256 * buffer containing the on-disk inode in the bpp parameter, and in
 257 * the dip parameter it returns a pointer to the on-disk inode within
 258 * that buffer.
 259 *
 260 * If a non-zero error is returned, then the contents of bpp and
 261 * dipp are undefined.
 262 *
 263 * If the inode is new and has not yet been initialized, use xfs_imap()
 264 * to determine the size and location of the buffer to read from disk.
 265 * If the inode has already been mapped to its buffer and read in once,
 266 * then use the mapping information stored in the inode rather than
 267 * calling xfs_imap().  This allows us to avoid the overhead of looking
 268 * at the inode btree for small block file systems (see xfs_dilocate()).
 269 * We can tell whether the inode has been mapped in before by comparing
 270 * its disk block address to 0.  Only uninitialized inodes will have
 271 * 0 for the disk block address.
 272 */
 273int
 274xfs_itobp(
 275        xfs_mount_t        *mp,
 276        xfs_trans_t        *tp,
 277        xfs_inode_t        *ip,
 278        xfs_dinode_t        **dipp,
 279        xfs_buf_t        **bpp,
 280        xfs_daddr_t        bno,
 281        uint                imap_flags,
 282        uint                buf_flags)
 283{
 284        xfs_imap_t        imap;
 285        xfs_buf_t        *bp;
 286        int                error;
 287
 288        if (ip->i_blkno == (xfs_daddr_t)0) {
 289                imap.im_blkno = bno;
 290                error = xfs_imap(mp, tp, ip->i_ino, &imap,
 291                                        XFS_IMAP_LOOKUP | imap_flags);
 292                if (error)
 293                        return error;
 294
 295                /*
 296                 * Fill in the fields in the inode that will be used to
 297                 * map the inode to its buffer from now on.
 298                 */
 299                ip->i_blkno = imap.im_blkno;
 300                ip->i_len = imap.im_len;
 301                ip->i_boffset = imap.im_boffset;
 302        } else {
 303                /*
 304                 * We've already mapped the inode once, so just use the
 305                 * mapping that we saved the first time.
 306                 */
 307                imap.im_blkno = ip->i_blkno;
 308                imap.im_len = ip->i_len;
 309                imap.im_boffset = ip->i_boffset;
 310        }
 311        ASSERT(bno == 0 || bno == imap.im_blkno);
 312
 313        error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
 314        if (error)
 315                return error;
 316
 317        if (!bp) {
 318                ASSERT(buf_flags & XFS_BUF_TRYLOCK);
 319                ASSERT(tp == NULL);
 320                *bpp = NULL;
 321                return EAGAIN;
 322        }
 323
 324        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 325        *bpp = bp;
 326        return 0;
 327}
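
/*
 * Illustrative caller sketch (hypothetical, not part of this file): when
 * xfs_itobp() is invoked with XFS_BUF_TRYLOCK and the buffer cannot be
 * locked, it returns EAGAIN and sets *bpp to NULL, so neither bp nor dip
 * may be dereferenced on that path:
 *
 *	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, XFS_BUF_TRYLOCK);
 *	if (error)
 *		return error;	(on EAGAIN bp is NULL; try again later)
 *	(only now is it safe to dereference dip and bp)
 */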
 328
 329/*
 330 * Move inode type and inode format specific information from the
 331 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 332 * this means set if_rdev to the proper value.  For files, directories,
 333 * and symlinks this means to bring in the in-line data or extent
 334 * pointers.  For a file in B-tree format, only the root is immediately
 335 * brought in-core.  The rest will be in-lined in if_extents when it
 336 * is first referenced (see xfs_iread_extents()).
 337 */
 338STATIC int
 339xfs_iformat(
 340        xfs_inode_t                *ip,
 341        xfs_dinode_t                *dip)
 342{
 343        xfs_attr_shortform_t        *atp;
 344        int                        size;
 345        int                        error;
 346        xfs_fsize_t             di_size;
 347        ip->i_df.if_ext_max =
 348                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 349        error = 0;
 350
 351        if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
 352                     be16_to_cpu(dip->di_core.di_anextents) >
 353                     be64_to_cpu(dip->di_core.di_nblocks))) {
 354                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 355                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 356                        (unsigned long long)ip->i_ino,
 357                        (int)(be32_to_cpu(dip->di_core.di_nextents) +
 358                              be16_to_cpu(dip->di_core.di_anextents)),
 359                        (unsigned long long)
 360                                be64_to_cpu(dip->di_core.di_nblocks));
 361                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 362                                     ip->i_mount, dip);
 363                return XFS_ERROR(EFSCORRUPTED);
 364        }
 365
 366        if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 367                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 368                        "corrupt dinode %Lu, forkoff = 0x%x.",
 369                        (unsigned long long)ip->i_ino,
 370                        dip->di_core.di_forkoff);
 371                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 372                                     ip->i_mount, dip);
 373                return XFS_ERROR(EFSCORRUPTED);
 374        }
 375
 376        switch (ip->i_d.di_mode & S_IFMT) {
 377        case S_IFIFO:
 378        case S_IFCHR:
 379        case S_IFBLK:
 380        case S_IFSOCK:
 381                if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
 382                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 383                                              ip->i_mount, dip);
 384                        return XFS_ERROR(EFSCORRUPTED);
 385                }
 386                ip->i_d.di_size = 0;
 387                ip->i_size = 0;
 388                ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
 389                break;
 390
 391        case S_IFREG:
 392        case S_IFLNK:
 393        case S_IFDIR:
 394                switch (dip->di_core.di_format) {
 395                case XFS_DINODE_FMT_LOCAL:
 396                        /*
 397                         * no local regular files yet
 398                         */
 399                        if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
 400                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 401                                        "corrupt inode %Lu "
 402                                        "(local format for regular file).",
 403                                        (unsigned long long) ip->i_ino);
 404                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 405                                                     XFS_ERRLEVEL_LOW,
 406                                                     ip->i_mount, dip);
 407                                return XFS_ERROR(EFSCORRUPTED);
 408                        }
 409
 410                        di_size = be64_to_cpu(dip->di_core.di_size);
 411                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 412                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 413                                        "corrupt inode %Lu "
 414                                        "(bad size %Ld for local inode).",
 415                                        (unsigned long long) ip->i_ino,
 416                                        (long long) di_size);
 417                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 418                                                     XFS_ERRLEVEL_LOW,
 419                                                     ip->i_mount, dip);
 420                                return XFS_ERROR(EFSCORRUPTED);
 421                        }
 422
 423                        size = (int)di_size;
 424                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
 425                        break;
 426                case XFS_DINODE_FMT_EXTENTS:
 427                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 428                        break;
 429                case XFS_DINODE_FMT_BTREE:
 430                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 431                        break;
 432                default:
 433                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 434                                         ip->i_mount);
 435                        return XFS_ERROR(EFSCORRUPTED);
 436                }
 437                break;
 438
 439        default:
 440                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 441                return XFS_ERROR(EFSCORRUPTED);
 442        }
 443        if (error) {
 444                return error;
 445        }
 446        if (!XFS_DFORK_Q(dip))
 447                return 0;
 448        ASSERT(ip->i_afp == NULL);
 449        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
 450        ip->i_afp->if_ext_max =
 451                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 452        switch (dip->di_core.di_aformat) {
 453        case XFS_DINODE_FMT_LOCAL:
 454                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 455                size = be16_to_cpu(atp->hdr.totsize);
 456                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 457                break;
 458        case XFS_DINODE_FMT_EXTENTS:
 459                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
 460                break;
 461        case XFS_DINODE_FMT_BTREE:
 462                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 463                break;
 464        default:
 465                error = XFS_ERROR(EFSCORRUPTED);
 466                break;
 467        }
 468        if (error) {
 469                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 470                ip->i_afp = NULL;
 471                xfs_idestroy_fork(ip, XFS_DATA_FORK);
 472        }
 473        return error;
 474}
 475
 476/*
 477 * The file is in-lined in the on-disk inode.
 478 * If it fits into if_inline_data, then copy
 479 * it there, otherwise allocate a buffer for it
 480 * and copy the data there.  Either way, set
 481 * if_data to point at the data.
 482 * If we allocate a buffer for the data, make
 483 * sure that its size is a multiple of 4 and
 484 * record the real size in i_real_bytes.
 485 */
 486STATIC int
 487xfs_iformat_local(
 488        xfs_inode_t        *ip,
 489        xfs_dinode_t        *dip,
 490        int                whichfork,
 491        int                size)
 492{
 493        xfs_ifork_t        *ifp;
 494        int                real_size;
 495
 496        /*
 497         * If the size is unreasonable, then something
 498         * is wrong and we just bail out rather than crash in
 499         * kmem_alloc() or memcpy() below.
 500         */
 501        if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 502                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 503                        "corrupt inode %Lu "
 504                        "(bad size %d for local fork, size = %d).",
 505                        (unsigned long long) ip->i_ino, size,
 506                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 507                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 508                                     ip->i_mount, dip);
 509                return XFS_ERROR(EFSCORRUPTED);
 510        }
 511        ifp = XFS_IFORK_PTR(ip, whichfork);
 512        real_size = 0;
 513        if (size == 0)
 514                ifp->if_u1.if_data = NULL;
 515        else if (size <= sizeof(ifp->if_u2.if_inline_data))
 516                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 517        else {
 518                real_size = roundup(size, 4);
 519                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
 520        }
 521        ifp->if_bytes = size;
 522        ifp->if_real_bytes = real_size;
 523        if (size)
 524                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
 525        ifp->if_flags &= ~XFS_IFEXTENTS;
 526        ifp->if_flags |= XFS_IFINLINE;
 527        return 0;
 528}
 529
 530/*
 531 * The file consists of a set of extents all
 532 * of which fit into the on-disk inode.
 533 * If there are few enough extents to fit into
 534 * the if_inline_ext, then copy them there.
 535 * Otherwise allocate a buffer for them and copy
 536 * them into it.  Either way, set if_extents
 537 * to point at the extents.
 538 */
 539STATIC int
 540xfs_iformat_extents(
 541        xfs_inode_t        *ip,
 542        xfs_dinode_t        *dip,
 543        int                whichfork)
 544{
 545        xfs_bmbt_rec_t        *dp;
 546        xfs_ifork_t        *ifp;
 547        int                nex;
 548        int                size;
 549        int                i;
 550
 551        ifp = XFS_IFORK_PTR(ip, whichfork);
 552        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
 553        size = nex * (uint)sizeof(xfs_bmbt_rec_t);
 554
 555        /*
 556         * If the number of extents is unreasonable, then something
 557         * is wrong and we just bail out rather than crash in
 558         * kmem_alloc() or memcpy() below.
 559         */
 560        if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 561                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 562                        "corrupt inode %Lu ((a)extents = %d).",
 563                        (unsigned long long) ip->i_ino, nex);
 564                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 565                                     ip->i_mount, dip);
 566                return XFS_ERROR(EFSCORRUPTED);
 567        }
 568
 569        ifp->if_real_bytes = 0;
 570        if (nex == 0)
 571                ifp->if_u1.if_extents = NULL;
 572        else if (nex <= XFS_INLINE_EXTS)
 573                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 574        else
 575                xfs_iext_add(ifp, 0, nex);
 576
 577        ifp->if_bytes = size;
 578        if (size) {
 579                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
 580                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 581                for (i = 0; i < nex; i++, dp++) {
 582                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 583                        ep->l0 = get_unaligned_be64(&dp->l0);
 584                        ep->l1 = get_unaligned_be64(&dp->l1);
 585                }
 586                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
 587                if (whichfork != XFS_DATA_FORK ||
 588                        XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
 589                                if (unlikely(xfs_check_nostate_extents(
 590                                    ifp, 0, nex))) {
 591                                        XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 592                                                         XFS_ERRLEVEL_LOW,
 593                                                         ip->i_mount);
 594                                        return XFS_ERROR(EFSCORRUPTED);
 595                                }
 596        }
 597        ifp->if_flags |= XFS_IFEXTENTS;
 598        return 0;
 599}
 600
 601/*
 602 * The file has too many extents to fit into
 603 * the inode, so they are in B-tree format.
 604 * Allocate a buffer for the root of the B-tree
 605 * and copy the root into it.  The i_extents
 606 * field will remain NULL until all of the
 607 * extents are read in (when they are needed).
 608 */
 609STATIC int
 610xfs_iformat_btree(
 611        xfs_inode_t                *ip,
 612        xfs_dinode_t                *dip,
 613        int                        whichfork)
 614{
 615        xfs_bmdr_block_t        *dfp;
 616        xfs_ifork_t                *ifp;
 617        /* REFERENCED */
 618        int                        nrecs;
 619        int                        size;
 620
 621        ifp = XFS_IFORK_PTR(ip, whichfork);
 622        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 623        size = XFS_BMAP_BROOT_SPACE(dfp);
 624        nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
 625
 626        /*
 627         * blow out if -- fork has less extents than can fit in
 628         * fork (fork shouldn't be a btree format), root btree
 629         * block has more records than can fit into the fork,
 630         * or the number of extents is greater than the number of
 631         * blocks.
 632         */
 633        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
 634            || XFS_BMDR_SPACE_CALC(nrecs) >
 635                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
 636            || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 637                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 638                        "corrupt inode %Lu (btree).",
 639                        (unsigned long long) ip->i_ino);
 640                XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 641                                 ip->i_mount);
 642                return XFS_ERROR(EFSCORRUPTED);
 643        }
 644
 645        ifp->if_broot_bytes = size;
 646        ifp->if_broot = kmem_alloc(size, KM_SLEEP);
 647        ASSERT(ifp->if_broot != NULL);
 648        /*
 649         * Copy and convert from the on-disk structure
 650         * to the in-memory structure.
 651         */
 652        xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
 653                ifp->if_broot, size);
 654        ifp->if_flags &= ~XFS_IFEXTENTS;
 655        ifp->if_flags |= XFS_IFBROOT;
 656
 657        return 0;
 658}
 659
 660void
 661xfs_dinode_from_disk(
 662        xfs_icdinode_t                *to,
 663        xfs_dinode_core_t        *from)
 664{
 665        to->di_magic = be16_to_cpu(from->di_magic);
 666        to->di_mode = be16_to_cpu(from->di_mode);
 667        to->di_version = from->di_version;
 668        to->di_format = from->di_format;
 669        to->di_onlink = be16_to_cpu(from->di_onlink);
 670        to->di_uid = be32_to_cpu(from->di_uid);
 671        to->di_gid = be32_to_cpu(from->di_gid);
 672        to->di_nlink = be32_to_cpu(from->di_nlink);
 673        to->di_projid = be16_to_cpu(from->di_projid);
 674        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 675        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 676        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 677        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 678        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 679        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 680        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 681        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 682        to->di_size = be64_to_cpu(from->di_size);
 683        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 684        to->di_extsize = be32_to_cpu(from->di_extsize);
 685        to->di_nextents = be32_to_cpu(from->di_nextents);
 686        to->di_anextents = be16_to_cpu(from->di_anextents);
 687        to->di_forkoff = from->di_forkoff;
 688        to->di_aformat        = from->di_aformat;
 689        to->di_dmevmask        = be32_to_cpu(from->di_dmevmask);
 690        to->di_dmstate        = be16_to_cpu(from->di_dmstate);
 691        to->di_flags        = be16_to_cpu(from->di_flags);
 692        to->di_gen        = be32_to_cpu(from->di_gen);
 693}
 694
 695void
 696xfs_dinode_to_disk(
 697        xfs_dinode_core_t        *to,
 698        xfs_icdinode_t                *from)
 699{
 700        to->di_magic = cpu_to_be16(from->di_magic);
 701        to->di_mode = cpu_to_be16(from->di_mode);
 702        to->di_version = from->di_version;
 703        to->di_format = from->di_format;
 704        to->di_onlink = cpu_to_be16(from->di_onlink);
 705        to->di_uid = cpu_to_be32(from->di_uid);
 706        to->di_gid = cpu_to_be32(from->di_gid);
 707        to->di_nlink = cpu_to_be32(from->di_nlink);
 708        to->di_projid = cpu_to_be16(from->di_projid);
 709        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 710        to->di_flushiter = cpu_to_be16(from->di_flushiter);
 711        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 712        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 713        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 714        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 715        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 716        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 717        to->di_size = cpu_to_be64(from->di_size);
 718        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 719        to->di_extsize = cpu_to_be32(from->di_extsize);
 720        to->di_nextents = cpu_to_be32(from->di_nextents);
 721        to->di_anextents = cpu_to_be16(from->di_anextents);
 722        to->di_forkoff = from->di_forkoff;
 723        to->di_aformat = from->di_aformat;
 724        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 725        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 726        to->di_flags = cpu_to_be16(from->di_flags);
 727        to->di_gen = cpu_to_be32(from->di_gen);
 728}
 729
 730STATIC uint
 731_xfs_dic2xflags(
 732        __uint16_t                di_flags)
 733{
 734        uint                        flags = 0;
 735
 736        if (di_flags & XFS_DIFLAG_ANY) {
 737                if (di_flags & XFS_DIFLAG_REALTIME)
 738                        flags |= XFS_XFLAG_REALTIME;
 739                if (di_flags & XFS_DIFLAG_PREALLOC)
 740                        flags |= XFS_XFLAG_PREALLOC;
 741                if (di_flags & XFS_DIFLAG_IMMUTABLE)
 742                        flags |= XFS_XFLAG_IMMUTABLE;
 743                if (di_flags & XFS_DIFLAG_APPEND)
 744                        flags |= XFS_XFLAG_APPEND;
 745                if (di_flags & XFS_DIFLAG_SYNC)
 746                        flags |= XFS_XFLAG_SYNC;
 747                if (di_flags & XFS_DIFLAG_NOATIME)
 748                        flags |= XFS_XFLAG_NOATIME;
 749                if (di_flags & XFS_DIFLAG_NODUMP)
 750                        flags |= XFS_XFLAG_NODUMP;
 751                if (di_flags & XFS_DIFLAG_RTINHERIT)
 752                        flags |= XFS_XFLAG_RTINHERIT;
 753                if (di_flags & XFS_DIFLAG_PROJINHERIT)
 754                        flags |= XFS_XFLAG_PROJINHERIT;
 755                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 756                        flags |= XFS_XFLAG_NOSYMLINKS;
 757                if (di_flags & XFS_DIFLAG_EXTSIZE)
 758                        flags |= XFS_XFLAG_EXTSIZE;
 759                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 760                        flags |= XFS_XFLAG_EXTSZINHERIT;
 761                if (di_flags & XFS_DIFLAG_NODEFRAG)
 762                        flags |= XFS_XFLAG_NODEFRAG;
 763                if (di_flags & XFS_DIFLAG_FILESTREAM)
 764                        flags |= XFS_XFLAG_FILESTREAM;
 765        }
 766
 767        return flags;
 768}
 769
 770uint
 771xfs_ip2xflags(
 772        xfs_inode_t                *ip)
 773{
 774        xfs_icdinode_t                *dic = &ip->i_d;
 775
 776        return _xfs_dic2xflags(dic->di_flags) |
 777                                (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 778}
 779
 780uint
 781xfs_dic2xflags(
 782        xfs_dinode_t                *dip)
 783{
 784        xfs_dinode_core_t        *dic = &dip->di_core;
 785
 786        return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
 787                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 788}
 789
 790/*
 791 * Given a mount structure and an inode number, return a pointer
 792 * to a newly allocated in-core inode corresponding to the given
 793 * inode number.
 794 *
 795 * Initialize the inode's attributes and extent pointers if it
 796 * already has them (it will not if the inode has no links).
 797 */
 798int
 799xfs_iread(
 800        xfs_mount_t        *mp,
 801        xfs_trans_t        *tp,
 802        xfs_ino_t        ino,
 803        xfs_inode_t        **ipp,
 804        xfs_daddr_t        bno,
 805        uint                imap_flags)
 806{
 807        xfs_buf_t        *bp;
 808        xfs_dinode_t        *dip;
 809        xfs_inode_t        *ip;
 810        int                error;
 811
 812        ASSERT(xfs_inode_zone != NULL);
 813
 814        ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
 815        ip->i_ino = ino;
 816        ip->i_mount = mp;
 817        atomic_set(&ip->i_iocount, 0);
 818        spin_lock_init(&ip->i_flags_lock);
 819
 820        /*
 821         * Get pointers to the on-disk inode and the buffer containing it.
 822         * If the inode number refers to a block outside the file system
 823         * then xfs_itobp() will return NULL.  In this case we should
 824         * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
 825         * know that this is a new incore inode.
 826         */
 827        error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
 828        if (error) {
 829                kmem_zone_free(xfs_inode_zone, ip);
 830                return error;
 831        }
 832
 833        /*
 834         * Initialize inode's trace buffers.
 835         * Do this before xfs_iformat in case it adds entries.
 836         */
 837#ifdef        XFS_INODE_TRACE
 838        ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
 839#endif
 840#ifdef XFS_BMAP_TRACE
 841        ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
 842#endif
 843#ifdef XFS_BMBT_TRACE
 844        ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
 845#endif
 846#ifdef XFS_RW_TRACE
 847        ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
 848#endif
 849#ifdef XFS_ILOCK_TRACE
 850        ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
 851#endif
 852#ifdef XFS_DIR2_TRACE
 853        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 854#endif
 855
 856        /*
 857         * If we got something that isn't an inode it means someone
 858         * (nfs or dmi) has a stale handle.
 859         */
 860        if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
 861                kmem_zone_free(xfs_inode_zone, ip);
 862                xfs_trans_brelse(tp, bp);
 863#ifdef DEBUG
 864                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
 865                                "dip->di_core.di_magic (0x%x) != "
 866                                "XFS_DINODE_MAGIC (0x%x)",
 867                                be16_to_cpu(dip->di_core.di_magic),
 868                                XFS_DINODE_MAGIC);
 869#endif /* DEBUG */
 870                return XFS_ERROR(EINVAL);
 871        }
 872
 873        /*
 874         * If the on-disk inode is already linked to a directory
 875         * entry, copy all of the inode into the in-core inode.
 876         * xfs_iformat() handles copying in the inode format
 877         * specific information.
 878         * Otherwise, just get the truly permanent information.
 879         */
 880        if (dip->di_core.di_mode) {
 881                xfs_dinode_from_disk(&ip->i_d, &dip->di_core);
 882                error = xfs_iformat(ip, dip);
 883                if (error)  {
 884                        kmem_zone_free(xfs_inode_zone, ip);
 885                        xfs_trans_brelse(tp, bp);
 886#ifdef DEBUG
 887                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
 888                                        "xfs_iformat() returned error %d",
 889                                        error);
 890#endif /* DEBUG */
 891                        return error;
 892                }
 893        } else {
 894                ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic);
 895                ip->i_d.di_version = dip->di_core.di_version;
 896                ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen);
 897                ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter);
 898                /*
 899                 * Make sure to pull in the mode here as well in
 900                 * case the inode is released without being used.
 901                 * This ensures that xfs_inactive() will see that
 902                 * the inode is already free and not try to mess
 903                 * with the uninitialized part of it.
 904                 */
 905                ip->i_d.di_mode = 0;
 906                /*
 907                 * Initialize the per-fork minima and maxima for a new
 908                 * inode here.  xfs_iformat will do it for old inodes.
 909                 */
 910                ip->i_df.if_ext_max =
 911                        XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 912        }
 913
 914        INIT_LIST_HEAD(&ip->i_reclaim);
 915
 916        /*
 917         * The inode format changed when we moved the link count and
 918         * made it 32 bits long.  If this is an old format inode,
 919         * convert it in memory to look like a new one.  If it gets
 920         * flushed to disk we will convert back before flushing or
 921         * logging it.  We zero out the new projid field and the old link
 922         * count field.  We'll handle clearing the pad field (the remains
 923         * of the old uuid field) when we actually convert the inode to
 924         * the new format. We don't change the version number so that we
 925         * can distinguish this from a real new format inode.
 926         */
 927        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
 928                ip->i_d.di_nlink = ip->i_d.di_onlink;
 929                ip->i_d.di_onlink = 0;
 930                ip->i_d.di_projid = 0;
 931        }
 932
 933        ip->i_delayed_blks = 0;
 934        ip->i_size = ip->i_d.di_size;
 935
 936        /*
 937         * Mark the buffer containing the inode as something to keep
 938         * around for a while.  This helps to keep recently accessed
 939         * meta-data in-core longer.
 940         */
 941        XFS_BUF_SET_REF(bp, XFS_INO_REF);
 942
 943        /*
 944         * Use xfs_trans_brelse() to release the buffer containing the
 945         * on-disk inode, because it was acquired with xfs_trans_read_buf()
 946         * in xfs_itobp() above.  If tp is NULL, this is just a normal
 947         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 948         * will only release the buffer if it is not dirty within the
 949         * transaction.  It will be OK to release the buffer in this case,
 950         * because inodes on disk are never destroyed and we will be
 951         * locking the new in-core inode before putting it in the hash
 952         * table where other processes can find it.  Thus we don't have
 953         * to worry about the inode being changed just because we released
 954         * the buffer.
 955         */
 956        xfs_trans_brelse(tp, bp);
 957        *ipp = ip;
 958        return 0;
 959}
 960
 961/*
 962 * Read in extents from a btree-format inode.
 963 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 964 */
 965int
 966xfs_iread_extents(
 967        xfs_trans_t        *tp,
 968        xfs_inode_t        *ip,
 969        int                whichfork)
 970{
 971        int                error;
 972        xfs_ifork_t        *ifp;
 973        xfs_extnum_t        nextents;
 974        size_t                size;
 975
 976        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 977                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 978                                 ip->i_mount);
 979                return XFS_ERROR(EFSCORRUPTED);
 980        }
 981        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
 982        size = nextents * sizeof(xfs_bmbt_rec_t);
 983        ifp = XFS_IFORK_PTR(ip, whichfork);
 984
 985        /*
 986         * We know that the size is valid (it's checked in iformat_btree)
 987         */
 988        ifp->if_lastex = NULLEXTNUM;
 989        ifp->if_bytes = ifp->if_real_bytes = 0;
 990        ifp->if_flags |= XFS_IFEXTENTS;
 991        xfs_iext_add(ifp, 0, nextents);
 992        error = xfs_bmap_read_extents(tp, ip, whichfork);
 993        if (error) {
 994                xfs_iext_destroy(ifp);
 995                ifp->if_flags &= ~XFS_IFEXTENTS;
 996                return error;
 997        }
 998        xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
 999        return 0;
1000}
1001
1002/*
1003 * Allocate an inode on disk and return a copy of its in-core version.
1004 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
1005 * appropriately within the inode.  The uid and gid for the inode are
1006 * set according to the contents of the given cred structure.
1007 *
1008 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1009 * has a free inode available, call xfs_iget()
1010 * to obtain the in-core version of the allocated inode.  Finally,
1011 * fill in the inode and log its initial contents.  In this case,
1012 * ialloc_context would be set to NULL and call_again set to false.
1013 *
1014 * If xfs_dialloc() does not have an available inode,
1015 * it will replenish its supply by doing an allocation. Since we can
1016 * only do one allocation within a transaction without deadlocks, we
1017 * must commit the current transaction before returning the inode itself.
1018 * In this case, therefore, we will set call_again to true and return.
1019 * The caller should then commit the current transaction, start a new
1020 * transaction, and call xfs_ialloc() again to actually get the inode.
1021 *
1022 * To ensure that some other process does not grab the inode that
1023 * was allocated during the first call to xfs_ialloc(), this routine
1024 * also returns the [locked] bp pointing to the head of the freelist
1025 * as ialloc_context.  The caller should hold this buffer across
1026 * the commit and pass it back into this routine on the second call.
1027 *
1028 * If we are allocating quota inodes, we do not have a parent inode
1029 * to attach to or associate with (i.e. pip == NULL) because they
1030 * are not linked into the directory structure - they are attached
1031 * directly to the superblock - and so have no parent.
1032 */
1033int
1034xfs_ialloc(
1035        xfs_trans_t        *tp,
1036        xfs_inode_t        *pip,
1037        mode_t                mode,
1038        xfs_nlink_t        nlink,
1039        xfs_dev_t        rdev,
1040        cred_t                *cr,
1041        xfs_prid_t        prid,
1042        int                okalloc,
1043        xfs_buf_t        **ialloc_context,
1044        boolean_t        *call_again,
1045        xfs_inode_t        **ipp)
1046{
1047        xfs_ino_t        ino;
1048        xfs_inode_t        *ip;
1049        uint                flags;
1050        int                error;
1051        timespec_t        tv;
1052
1053        /*
1054         * Call the space management code to pick
1055         * the on-disk inode to be allocated.
1056         */
1057        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1058                            ialloc_context, call_again, &ino);
1059        if (error != 0) {
1060                return error;
1061        }
1062        if (*call_again || ino == NULLFSINO) {
1063                *ipp = NULL;
1064                return 0;
1065        }
1066        ASSERT(*ialloc_context == NULL);
1067
1068        /*
1069         * Get the in-core inode with the lock held exclusively.
1070         * This is because we're setting fields here we need
1071         * to prevent others from looking at until we're done.
1072         */
1073        error = xfs_trans_iget(tp->t_mountp, tp, ino,
1074                                XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1075        if (error != 0) {
1076                return error;
1077        }
1078        ASSERT(ip != NULL);
1079
1080        ip->i_d.di_mode = (__uint16_t)mode;
1081        ip->i_d.di_onlink = 0;
1082        ip->i_d.di_nlink = nlink;
1083        ASSERT(ip->i_d.di_nlink == nlink);
1084        ip->i_d.di_uid = current_fsuid();
1085        ip->i_d.di_gid = current_fsgid();
1086        ip->i_d.di_projid = prid;
1087        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1088
1089        /*
1090         * If the superblock version is up to where we support new format
1091         * inodes and this is currently an old format inode, then change
1092         * the inode version number now.  This way we only do the conversion
1093         * here rather than here and in the flush/logging code.
1094         */
1095        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1096            ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1097                ip->i_d.di_version = XFS_DINODE_VERSION_2;
1098                /*
1099                 * We've already zeroed the old link count, the projid field,
1100                 * and the pad field.
1101                 */
1102        }
1103
1104        /*
1105         * Project ids won't be stored on disk if we are using a version 1 inode.
1106         */
1107        if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1108                xfs_bump_ino_vers2(tp, ip);
1109
1110        if (pip && XFS_INHERIT_GID(pip)) {
1111                ip->i_d.di_gid = pip->i_d.di_gid;
1112                if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1113                        ip->i_d.di_mode |= S_ISGID;
1114                }
1115        }
1116
1117        /*
1118         * If the group ID of the new file does not match the effective group
1119         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1120         * (and only if the irix_sgid_inherit compatibility variable is set).
1121         */
1122        if ((irix_sgid_inherit) &&
1123            (ip->i_d.di_mode & S_ISGID) &&
1124            (!in_group_p((gid_t)ip->i_d.di_gid))) {
1125                ip->i_d.di_mode &= ~S_ISGID;
1126        }
1127
1128        ip->i_d.di_size = 0;
1129        ip->i_size = 0;
1130        ip->i_d.di_nextents = 0;
1131        ASSERT(ip->i_d.di_nblocks == 0);
1132
1133        nanotime(&tv);
1134        ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1135        ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1136        ip->i_d.di_atime = ip->i_d.di_mtime;
1137        ip->i_d.di_ctime = ip->i_d.di_mtime;
1138
1139        /*
1140         * di_gen will have been taken care of in xfs_iread.
1141         */
1142        ip->i_d.di_extsize = 0;
1143        ip->i_d.di_dmevmask = 0;
1144        ip->i_d.di_dmstate = 0;
1145        ip->i_d.di_flags = 0;
1146        flags = XFS_ILOG_CORE;
1147        switch (mode & S_IFMT) {
1148        case S_IFIFO:
1149        case S_IFCHR:
1150        case S_IFBLK:
1151        case S_IFSOCK:
1152                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1153                ip->i_df.if_u2.if_rdev = rdev;
1154                ip->i_df.if_flags = 0;
1155                flags |= XFS_ILOG_DEV;
1156                break;
1157        case S_IFREG:
1158                if (pip && xfs_inode_is_filestream(pip)) {
1159                        error = xfs_filestream_associate(pip, ip);
1160                        if (error < 0)
1161                                return -error;
1162                        if (!error)
1163                                xfs_iflags_set(ip, XFS_IFILESTREAM);
1164                }
1165                /* fall through */
1166        case S_IFDIR:
1167                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1168                        uint        di_flags = 0;
1169
1170                        if ((mode & S_IFMT) == S_IFDIR) {
1171                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1172                                        di_flags |= XFS_DIFLAG_RTINHERIT;
1173                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1174                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1175                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
1176                                }
1177                        } else if ((mode & S_IFMT) == S_IFREG) {
1178                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1179                                        di_flags |= XFS_DIFLAG_REALTIME;
1180                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1181                                        di_flags |= XFS_DIFLAG_EXTSIZE;
1182                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
1183                                }
1184                        }
1185                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1186                            xfs_inherit_noatime)
1187                                di_flags |= XFS_DIFLAG_NOATIME;
1188                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1189                            xfs_inherit_nodump)
1190                                di_flags |= XFS_DIFLAG_NODUMP;
1191                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1192                            xfs_inherit_sync)
1193                                di_flags |= XFS_DIFLAG_SYNC;
1194                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1195                            xfs_inherit_nosymlinks)
1196                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
1197                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1198                                di_flags |= XFS_DIFLAG_PROJINHERIT;
1199                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1200                            xfs_inherit_nodefrag)
1201                                di_flags |= XFS_DIFLAG_NODEFRAG;
1202                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1203                                di_flags |= XFS_DIFLAG_FILESTREAM;
1204                        ip->i_d.di_flags |= di_flags;
1205                }
1206                /* FALLTHROUGH */
1207        case S_IFLNK:
1208                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1209                ip->i_df.if_flags = XFS_IFEXTENTS;
1210                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1211                ip->i_df.if_u1.if_extents = NULL;
1212                break;
1213        default:
1214                ASSERT(0);
1215        }
1216        /*
1217         * Attribute fork settings for new inode.
1218         */
1219        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1220        ip->i_d.di_anextents = 0;
1221
1222        /*
1223         * Log the new values stuffed into the inode.
1224         */
1225        xfs_trans_log_inode(tp, ip, flags);
1226
1227        /* now that we have an i_mode we can setup inode ops and unlock */
1228        xfs_setup_inode(ip);
1229
1230        *ipp = ip;
1231        return 0;
1232}
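
/*
 * Illustrative caller sketch of the commit-and-retry protocol described in
 * the comment above xfs_ialloc().  This is a hypothetical condensation (in
 * this kernel the real caller is xfs_dir_ialloc() in fs/xfs/xfs_utils.c);
 * the log-space reservation on the new transaction is elided:
 *
 *	error = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
 *			   &ialloc_context, &call_again, &ip);
 *	if (!error && call_again) {
 *		xfs_trans_bhold(tp, ialloc_context);	(hold bp across commit)
 *		ntp = xfs_trans_dup(tp);
 *		error = xfs_trans_commit(tp, 0);
 *		tp = ntp;
 *		(... xfs_trans_reserve() on the new transaction ...)
 *		xfs_trans_bjoin(tp, ialloc_context);
 *		error = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
 *				   okalloc, &ialloc_context, &call_again, &ip);
 *	}
 */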
1233
1234/*
1235 * Check to make sure that there are no blocks allocated to the
1236 * file beyond the size of the file.  We don't check this for
1237 * files with fixed size extents or real time extents, but we
1238 * at least do it for regular files.
1239 */
1240#ifdef DEBUG
1241void
1242xfs_isize_check(
1243        xfs_mount_t        *mp,
1244        xfs_inode_t        *ip,
1245        xfs_fsize_t        isize)
1246{
1247        xfs_fileoff_t        map_first;
1248        int                nimaps;
1249        xfs_bmbt_irec_t        imaps[2];
1250
1251        if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1252                return;
1253
1254        if (XFS_IS_REALTIME_INODE(ip))
1255                return;
1256
1257        if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
1258                return;
1259
1260        nimaps = 2;
1261        map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1262        /*
1263         * The filesystem could be shutting down, so bmapi may return
1264         * an error.
1265         */
1266        if (xfs_bmapi(NULL, ip, map_first,
1267                         (XFS_B_TO_FSB(mp,
1268                                       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
1269                          map_first),
1270                         XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
1271                         NULL, NULL))
1272            return;
1273        ASSERT(nimaps == 1);
1274        ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1275}
1276#endif        /* DEBUG */
1277
1278/*
1279 * Calculate the last possible buffered byte in a file.  This must
1280 * include data that was buffered beyond the EOF by the write code.
1281 * This also needs to deal with overflowing the xfs_fsize_t type
1282 * which can happen for sizes near the limit.
1283 *
1284 * We also need to take into account any blocks beyond the EOF.  It
1285 * may be the case that they were buffered by a write which failed.
1286 * In that case the pages will still be in memory, but the inode size
1287 * will never have been updated.
1288 */
1289xfs_fsize_t
1290xfs_file_last_byte(
1291        xfs_inode_t        *ip)
1292{
1293        xfs_mount_t        *mp;
1294        xfs_fsize_t        last_byte;
1295        xfs_fileoff_t        last_block;
1296        xfs_fileoff_t        size_last_block;
1297        int                error;
1298
1299        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
1300
1301        mp = ip->i_mount;
1302        /*
1303         * Only check for blocks beyond the EOF if the extents have
1304         * been read in.  This eliminates the need for the inode lock,
1305         * and it also saves us from looking when it really isn't
1306         * necessary.
1307         */
1308        if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1309                error = xfs_bmap_last_offset(NULL, ip, &last_block,
1310                        XFS_DATA_FORK);
1311                if (error) {
1312                        last_block = 0;
1313                }
1314        } else {
1315                last_block = 0;
1316        }
1317        size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
1318        last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1319
1320        last_byte = XFS_FSB_TO_B(mp, last_block);
1321        if (last_byte < 0) {
1322                return XFS_MAXIOFFSET(mp);
1323        }
1324        last_byte += (1 << mp->m_writeio_log);
1325        if (last_byte < 0) {
1326                return XFS_MAXIOFFSET(mp);
1327        }
1328        return last_byte;
1329}
1330
1331#if defined(XFS_RW_TRACE)
1332STATIC void
1333xfs_itrunc_trace(
1334        int                tag,
1335        xfs_inode_t        *ip,
1336        int                flag,
1337        xfs_fsize_t        new_size,
1338        xfs_off_t        toss_start,
1339        xfs_off_t        toss_finish)
1340{
1341        if (ip->i_rwtrace == NULL) {
1342                return;
1343        }
1344
1345        ktrace_enter(ip->i_rwtrace,
1346                     (void*)((long)tag),
1347                     (void*)ip,
1348                     (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
1349                     (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
1350                     (void*)((long)flag),
1351                     (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
1352                     (void*)(unsigned long)(new_size & 0xffffffff),
1353                     (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
1354                     (void*)(unsigned long)(toss_start & 0xffffffff),
1355                     (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
1356                     (void*)(unsigned long)(toss_finish & 0xffffffff),
1357                     (void*)(unsigned long)current_cpu(),
1358                     (void*)(unsigned long)current_pid(),
1359                     (void*)NULL,
1360                     (void*)NULL,
1361                     (void*)NULL);
1362}
1363#else
1364#define        xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
1365#endif
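
ktrace_enter() stores void pointers, so the 64-bit sizes above travel as separate high and low 32-bit words. A trivial illustration of the packing (hypothetical helper):

    #include <stdint.h>

    /* Split a 64-bit value into the two 32-bit words logged above. */
    static void split64(uint64_t v, uint32_t *hi, uint32_t *lo)
    {
            *hi = (uint32_t)((v >> 32) & 0xffffffff);
            *lo = (uint32_t)(v & 0xffffffff);
    }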
1366
1367/*
1368 * Start the truncation of the file to new_size.  The new size
1369 * must be smaller than the current size.  This routine will
1370 * clear the buffer and page caches of file data in the removed
1371 * range, and xfs_itruncate_finish() will remove the underlying
1372 * disk blocks.
1373 *
1374 * The inode must have its I/O lock locked EXCLUSIVELY, and it
1375 * must NOT have the inode lock held at all.  This is because we're
1376 * calling into the buffer/page cache code and we can't hold the
1377 * inode lock when we do so.
1378 *
1379 * We need to wait for any direct I/Os in flight to complete before we
1380 * proceed with the truncate. This is needed to prevent the extents
1381 * being read or written by the direct I/Os from being removed while the
1382 * I/O is in flight as there is no other method of synchronising
1383 * direct I/O with the truncate operation.  Also, because we hold
1384 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1385 * started until the truncate completes and drops the lock. Essentially,
1386 * the vn_iowait() call forms an I/O barrier that provides strict ordering
1387 * between direct I/Os and the truncate operation.
1388 *
1389 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1390 * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
1391 * in the case that the caller is locking things out of order and
1392 * may not be able to call xfs_itruncate_finish() with the inode lock
1393 * held without dropping the I/O lock.  If the caller must drop the
1394 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1395 * must be called again with all the same restrictions as the initial
1396 * call.
1397 */
1398int
1399xfs_itruncate_start(
1400        xfs_inode_t        *ip,
1401        uint                flags,
1402        xfs_fsize_t        new_size)
1403{
1404        xfs_fsize_t        last_byte;
1405        xfs_off_t        toss_start;
1406        xfs_mount_t        *mp;
1407        int                error = 0;
1408
1409        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1410        ASSERT((new_size == 0) || (new_size <= ip->i_size));
1411        ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1412               (flags == XFS_ITRUNC_MAYBE));
1413
1414        mp = ip->i_mount;
1415
1416        /* wait for the completion of any pending DIOs */
1417        if (new_size == 0 || new_size < ip->i_size)
1418                vn_iowait(ip);
1419
1420        /*
1421         * Call toss_pages or flushinval_pages to get rid of pages
1422         * overlapping the region being removed.  We have to use
1423         * the less efficient flushinval_pages in the case that the
1424         * caller may not be able to finish the truncate without
1425         * dropping the inode's I/O lock.  Make sure
1426         * to catch any pages brought in by buffers overlapping
1427         * the EOF by searching out beyond the isize by our
1428         * block size. We round new_size up to a block boundary
1429         * so that we don't toss things on the same block as
1430         * new_size but before it.
1431         *
1432         * Before calling toss_pages or flushinval_pages, make sure to
1433         * call remapf() over the same region if the file is mapped.
1434         * This frees up mapped file references to the pages in the
1435         * given range and for the flushinval_pages case it ensures
1436         * that we get the latest mapped changes flushed out.
1437         */
1438        toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1439        toss_start = XFS_FSB_TO_B(mp, toss_start);
1440        if (toss_start < 0) {
1441                /*
1442                 * The place to start tossing is beyond our maximum
1443                 * file size, so there is no way that the data extended
1444                 * out there.
1445                 */
1446                return 0;
1447        }
1448        last_byte = xfs_file_last_byte(ip);
1449        xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
1450                         last_byte);
1451        if (last_byte > toss_start) {
1452                if (flags & XFS_ITRUNC_DEFINITE) {
1453                        xfs_tosspages(ip, toss_start,
1454                                        -1, FI_REMAPF_LOCKED);
1455                } else {
1456                        error = xfs_flushinval_pages(ip, toss_start,
1457                                        -1, FI_REMAPF_LOCKED);
1458                }
1459        }
1460
1461#ifdef DEBUG
1462        if (new_size == 0) {
1463                ASSERT(VN_CACHED(VFS_I(ip)) == 0);
1464        }
1465#endif
1466        return error;
1467}
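
The toss_start arithmetic rounds new_size up to a block boundary in bytes; if the rounded value goes negative it has wrapped past the maximum file size and there is nothing to toss. A userspace model of that round-trip (blocklog is assumed to be sb_blocklog; illustrative only):

    #include <stdint.h>

    static int64_t toss_start_bytes(int64_t new_size, unsigned int blocklog)
    {
            /* XFS_B_TO_FSB: round up to whole blocks ... */
            uint64_t fsb = ((uint64_t)new_size + (1ULL << blocklog) - 1)
                            >> blocklog;
            /* ... XFS_FSB_TO_B: back to bytes; < 0 here means overflow */
            return (int64_t)(fsb << blocklog);
    }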
1468
1469/*
1470 * Shrink the file to the given new_size.  The new size must be smaller than
1471 * the current size.  This will free up the underlying blocks in the removed
1472 * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
1473 *
1474 * The transaction passed to this routine must have made a permanent log
1475 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1476 * given transaction and start new ones, so make sure everything involved in
1477 * the transaction is tidy before calling here.  Some transaction will be
1478 * returned to the caller to be committed.  The incoming transaction must
1479 * already include the inode, and both inode locks must be held exclusively.
1480 * The inode must also be "held" within the transaction.  On return the inode
1481 * will be "held" within the returned transaction.  This routine does NOT
1482 * require any disk space to be reserved for it within the transaction.
1483 *
1484 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
1485 * indicates the fork which is to be truncated.  For the attribute fork we only
1486 * support truncation to size 0.
1487 *
1488 * We use the sync parameter to indicate whether or not the first transaction
1489 * we perform might have to be synchronous.  For the attr fork, it needs to be
1490 * so if the unlink of the inode is not yet known to be permanent in the log.
1491 * This keeps us from freeing and reusing the blocks of the attribute fork
1492 * before the unlink of the inode becomes permanent.
1493 *
1494 * For the data fork, we normally have to run synchronously if we're being
1495 * called out of the inactive path or we're being called out of the create path
1496 * where we're truncating an existing file.  Either way, the truncate needs to
1497 * be sync so blocks don't reappear in the file with altered data in case of a
1498 * crash.  wsync filesystems can run the first case async because anything that
1499 * shrinks the inode has to run sync so by the time we're called here from
1500 * inactive, the inode size is permanently set to 0.
1501 *
1502 * Calls from the truncate path always need to be sync unless we're in a wsync
1503 * filesystem and the file has already been unlinked.
1504 *
1505 * The caller is responsible for correctly setting the sync parameter.  It gets
1506 * too hard for us to guess here which path we're being called out of just
1507 * based on inode state.
1508 *
1509 * If we get an error, we must return with the inode locked and linked into the
1510 * current transaction. This keeps things simple for the higher level code,
1511 * because it always knows that the inode is locked and held in the transaction
1512 * that returns to it whether errors occur or not.  We don't mark the inode
1513 * dirty on error so that transactions can be easily aborted if possible.
1514 */
1515int
1516xfs_itruncate_finish(
1517        xfs_trans_t        **tp,
1518        xfs_inode_t        *ip,
1519        xfs_fsize_t        new_size,
1520        int                fork,
1521        int                sync)
1522{
1523        xfs_fsblock_t        first_block;
1524        xfs_fileoff_t        first_unmap_block;
1525        xfs_fileoff_t        last_block;
1526        xfs_filblks_t        unmap_len = 0;
1527        xfs_mount_t        *mp;
1528        xfs_trans_t        *ntp;
1529        int                done;
1530        int                committed;
1531        xfs_bmap_free_t        free_list;
1532        int                error;
1533
1534        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1535        ASSERT((new_size == 0) || (new_size <= ip->i_size));
1536        ASSERT(*tp != NULL);
1537        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1538        ASSERT(ip->i_transp == *tp);
1539        ASSERT(ip->i_itemp != NULL);
1540        ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1541
1542
1543        ntp = *tp;
1544        mp = (ntp)->t_mountp;
1545        ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1546
1547        /*
1548         * We only support truncating the entire attribute fork.
1549         */
1550        if (fork == XFS_ATTR_FORK) {
1551                new_size = 0LL;
1552        }
1553        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1554        xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1555        /*
1556         * The first thing we do is set the size to new_size permanently
1557         * on disk.  This way we don't have to worry about anyone ever
1558         * being able to look at the data being freed even in the face
1559         * of a crash.  What we're getting around here is the case where
1560         * we free a block, it is allocated to another file, it is written
1561         * to, and then we crash.  If the new data gets written to the
1562         * file but the log buffers containing the free and reallocation
1563         * don't, then we'd end up with garbage in the blocks being freed.
1564         * As long as we make the new_size permanent before actually
1565         * freeing any blocks, it doesn't matter if they get written to.
1566         *
1567         * The callers must signal into us whether or not the size
1568         * setting here must be synchronous.  There are a few cases
1569         * where it doesn't have to be synchronous.  Those cases
1570         * occur if the file is unlinked and we know the unlink is
1571         * permanent or if the blocks being truncated are guaranteed
1572         * to be beyond the inode eof (regardless of the link count)
1573         * and the eof value is permanent.  Both of these cases occur
1574         * only on wsync-mounted filesystems.  In those cases, we're
1575         * guaranteed that no user will ever see the data in the blocks
1576         * that are being truncated so the truncate can run async.
1577         * In the free beyond eof case, the file may wind up with
1578         * more blocks allocated to it than it needs if we crash
1579         * and that won't get fixed until the next time the file
1580         * is re-opened and closed but that's ok as that shouldn't
1581         * be too many blocks.
1582         *
1583         * However, we can't just make all wsync xactions run async
1584         * because there's one call out of the create path that needs
1585         * to run sync: truncating an existing file whose size is > 0
1586         * down to size 0.
1587         *
1588         * It's probably possible to come up with a test in this
1589         * routine that would correctly distinguish all the above
1590         * cases from the values of the function parameters and the
1591         * inode state but for sanity's sake, I've decided to let the
1592         * layers above just tell us.  It's simpler to correctly figure
1593         * out in the layer above exactly under what conditions we
1594         * can run async, and I think it's easier for others to read and
1595         * follow the logic in case something has to be changed.
1596         * cscope is your friend -- rcc.
1597         *
1598         * The attribute fork is much simpler.
1599         *
1600         * For the attribute fork we allow the caller to tell us whether
1601         * the unlink of the inode that led to this call is yet permanent
1602         * in the on disk log.  If it is not and we will be freeing extents
1603         * in this inode then we make the first transaction synchronous
1604         * to make sure that the unlink is permanent by the time we free
1605         * the blocks.
1606         */
1607        if (fork == XFS_DATA_FORK) {
1608                if (ip->i_d.di_nextents > 0) {
1609                        /*
1610                         * If we are not changing the file size then do
1611                         * not update the on-disk file size - we may be
1612                         * called from xfs_inactive_free_eofblocks().  If we
1613                         * update the on-disk file size and then the system
1614                         * crashes before the contents of the file are
1615                         * flushed to disk then the files may be full of
1616                         * holes (ie NULL files bug).
1617                         * holes (i.e. the NULL files bug).
1618                        if (ip->i_size != new_size) {
1619                                ip->i_d.di_size = new_size;
1620                                ip->i_size = new_size;
1621                                xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1622                        }
1623                }
1624        } else if (sync) {
1625                ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1626                if (ip->i_d.di_anextents > 0)
1627                        xfs_trans_set_sync(ntp);
1628        }
1629        ASSERT(fork == XFS_DATA_FORK ||
1630                (fork == XFS_ATTR_FORK &&
1631                        ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1632                         (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1633
1634        /*
1635         * Since it is possible for space to become allocated beyond
1636         * the end of the file (in a crash where the space is allocated
1637         * but the inode size is not yet updated), simply remove any
1638         * blocks which show up between the new EOF and the maximum
1639         * possible file size.  If the first block to be removed is
1640         * beyond the maximum file size (ie it is the same as last_block),
1641         * then there is nothing to do.
1642         */
1643        last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1644        ASSERT(first_unmap_block <= last_block);
1645        done = 0;
1646        if (last_block == first_unmap_block) {
1647                done = 1;
1648        } else {
1649                unmap_len = last_block - first_unmap_block + 1;
1650        }
1651        while (!done) {
1652                /*
1653                 * Free up to XFS_ITRUNC_MAX_EXTENTS extents.  xfs_bunmapi()
1654                 * will tell us whether it freed the entire range or
1655                 * not.  If this is a synchronous mount (wsync),
1656                 * then we can tell bunmapi to keep all the
1657                 * transactions asynchronous since the unlink
1658                 * transaction that made this inode inactive has
1659                 * already hit the disk.  There's no danger of
1660                 * the freed blocks being reused, there being a
1661                 * crash, and the reused blocks suddenly reappearing
1662                 * in this file with garbage in them once recovery
1663                 * runs.
1664                 */
1665                XFS_BMAP_INIT(&free_list, &first_block);
1666                error = xfs_bunmapi(ntp, ip,
1667                                    first_unmap_block, unmap_len,
1668                                    XFS_BMAPI_AFLAG(fork) |
1669                                      (sync ? 0 : XFS_BMAPI_ASYNC),
1670                                    XFS_ITRUNC_MAX_EXTENTS,
1671                                    &first_block, &free_list,
1672                                    NULL, &done);
1673                if (error) {
1674                        /*
1675                         * If the bunmapi call encounters an error,
1676                         * return to the caller where the transaction
1677                         * can be properly aborted.  We just need to
1678                         * make sure we're not holding any resources
1679                         * that we were not when we came in.
1680                         */
1681                        xfs_bmap_cancel(&free_list);
1682                        return error;
1683                }
1684
1685                /*
1686                 * Duplicate the transaction that has the permanent
1687                 * reservation and commit the old transaction.
1688                 */
1689                error = xfs_bmap_finish(tp, &free_list, &committed);
1690                ntp = *tp;
1691                if (committed) {
1692                        /* link the inode into the next xact in the chain */
1693                        xfs_trans_ijoin(ntp, ip,
1694                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1695                        xfs_trans_ihold(ntp, ip);
1696                }
1697
1698                if (error) {
1699                        /*
1700                         * If the bmap finish call encounters an error, return
1701                         * to the caller where the transaction can be properly
1702                         * aborted.  We just need to make sure we're not
1703                         * holding any resources that we were not when we came
1704                         * in.
1705                         *
1706                         * Aborting from this point might lose some blocks in
1707                         * the file system, but oh well.
1708                         */
1709                        xfs_bmap_cancel(&free_list);
1710                        return error;
1711                }
1712
1713                if (committed) {
1714                        /*
1715                         * Mark the inode dirty so it will be logged and
1716                         * moved forward in the log as part of every commit.
1717                         */
1718                        xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1719                }
1720
1721                ntp = xfs_trans_dup(ntp);
1722                error = xfs_trans_commit(*tp, 0);
1723                *tp = ntp;
1724
1725                /* link the inode into the next transaction in the chain */
1726                xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1727                xfs_trans_ihold(ntp, ip);
1728
1729                if (!error)
1730                        error = xfs_trans_reserve(ntp, 0,
1731                                        XFS_ITRUNCATE_LOG_RES(mp), 0,
1732                                        XFS_TRANS_PERM_LOG_RES,
1733                                        XFS_ITRUNCATE_LOG_COUNT);
1734                if (error)
1735                        return error;
1736        }
1737        /*
1738         * Only update the size in the case of the data fork, but
1739         * always re-log the inode so that our permanent transaction
1740         * can keep on rolling it forward in the log.
1741         */
1742        if (fork == XFS_DATA_FORK) {
1743                xfs_isize_check(mp, ip, new_size);
1744                /*
1745                 * If we are not changing the file size then do
1746                 * not update the on-disk file size - we may be
1747                 * called from xfs_inactive_free_eofblocks().  If we
1748                 * update the on-disk file size and then the system
1749                 * crashes before the contents of the file are
1750                 * flushed to disk then the files may be full of
1751                 * holes (i.e. the NULL files bug).
1752                 */
1753                if (ip->i_size != new_size) {
1754                        ip->i_d.di_size = new_size;
1755                        ip->i_size = new_size;
1756                }
1757        }
1758        xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1759        ASSERT((new_size != 0) ||
1760               (fork == XFS_ATTR_FORK) ||
1761               (ip->i_delayed_blks == 0));
1762        ASSERT((new_size != 0) ||
1763               (fork == XFS_ATTR_FORK) ||
1764               (ip->i_d.di_nextents == 0));
1765        xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1766        return 0;
1767}
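
The while loop above is a bounded-work transaction chain: unmap at most XFS_ITRUNC_MAX_EXTENTS extents, commit, duplicate the permanent log reservation, and repeat, so no single transaction's log footprint depends on the size of the file. A toy model of the pattern, with all of the transaction machinery elided (names are illustrative):

    #define MAX_EXTENTS_PER_TX 2    /* mirrors XFS_ITRUNC_MAX_EXTENTS */

    static int truncate_model(int extents_to_free)
    {
            int transactions = 0;

            while (extents_to_free > 0) {
                    int n = extents_to_free < MAX_EXTENTS_PER_TX ?
                            extents_to_free : MAX_EXTENTS_PER_TX;
                    extents_to_free -= n;   /* xfs_bunmapi() partial unmap  */
                    transactions++;         /* commit, dup, re-reserve log */
            }
            return transactions;
    }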
1768
1769/*
1770 * This is called when the inode's link count goes to 0.
1771 * We place the on-disk inode on a list in the AGI.  It
1772 * will be pulled from this list when the inode is freed.
1773 */
1774int
1775xfs_iunlink(
1776        xfs_trans_t        *tp,
1777        xfs_inode_t        *ip)
1778{
1779        xfs_mount_t        *mp;
1780        xfs_agi_t        *agi;
1781        xfs_dinode_t        *dip;
1782        xfs_buf_t        *agibp;
1783        xfs_buf_t        *ibp;
1784        xfs_agnumber_t        agno;
1785        xfs_daddr_t        agdaddr;
1786        xfs_agino_t        agino;
1787        short                bucket_index;
1788        int                offset;
1789        int                error;
1790        int                agi_ok;
1791
1792        ASSERT(ip->i_d.di_nlink == 0);
1793        ASSERT(ip->i_d.di_mode != 0);
1794        ASSERT(ip->i_transp == tp);
1795
1796        mp = tp->t_mountp;
1797
1798        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1799        agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1800
1801        /*
1802         * Get the agi buffer first.  It ensures lock ordering
1803         * on the list.
1804         */
1805        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1806                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1807        if (error)
1808                return error;
1809
1810        /*
1811         * Validate the magic number of the agi block.
1812         */
1813        agi = XFS_BUF_TO_AGI(agibp);
1814        agi_ok =
1815                be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1816                XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1817        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1818                        XFS_RANDOM_IUNLINK))) {
1819                XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1820                xfs_trans_brelse(tp, agibp);
1821                return XFS_ERROR(EFSCORRUPTED);
1822        }
1823        /*
1824         * Get the index into the agi hash table for the
1825         * list this inode will go on.
1826         */
1827        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1828        ASSERT(agino != 0);
1829        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1830        ASSERT(agi->agi_unlinked[bucket_index]);
1831        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1832
1833        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
1834                /*
1835                 * There is already another inode in the bucket we need
1836                 * to add ourselves to.  Add us at the front of the list.
1837                 * Here we put the head pointer into our next pointer,
1838                 * and then we fall through to point the head at us.
1839                 */
1840                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
1841                if (error)
1842                        return error;
1843
1844                ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1845                /* both on-disk, don't endian flip twice */
1846                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1847                offset = ip->i_boffset +
1848                        offsetof(xfs_dinode_t, di_next_unlinked);
1849                xfs_trans_inode_buf(tp, ibp);
1850                xfs_trans_log_buf(tp, ibp, offset,
1851                                  (offset + sizeof(xfs_agino_t) - 1));
1852                xfs_inobp_check(mp, ibp);
1853        }
1854
1855        /*
1856         * Point the bucket head pointer at the inode being inserted.
1857         */
1858        ASSERT(agino != 0);
1859        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1860        offset = offsetof(xfs_agi_t, agi_unlinked) +
1861                (sizeof(xfs_agino_t) * bucket_index);
1862        xfs_trans_log_buf(tp, agibp, offset,
1863                          (offset + sizeof(xfs_agino_t) - 1));
1864        return 0;
1865}
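
Each AGI carries XFS_AGI_UNLINKED_BUCKETS (64 in this release) list heads, and xfs_iunlink() pushes the inode on the front of its bucket by copying the old head into the inode's on-disk next pointer. A userspace model of the bucket math and the splice, assuming NULLAGINO is the all-ones terminator (the structures are stand-ins, not the on-disk layout):

    #include <stdint.h>

    #define UNLINKED_BUCKETS 64             /* XFS_AGI_UNLINKED_BUCKETS */
    #define MODEL_NULLAGINO  0xffffffffu    /* list terminator */

    struct agi_model  { uint32_t unlinked[UNLINKED_BUCKETS]; };
    struct dino_model { uint32_t next_unlinked; };

    static void iunlink_model(struct agi_model *agi,
                              struct dino_model *dip, uint32_t agino)
    {
            unsigned int bucket = agino % UNLINKED_BUCKETS;

            if (agi->unlinked[bucket] != MODEL_NULLAGINO)
                    dip->next_unlinked = agi->unlinked[bucket]; /* old head */
            agi->unlinked[bucket] = agino;                      /* new head */
    }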
1866
1867/*
1868 * Pull the on-disk inode from the AGI unlinked list.
1869 */
1870STATIC int
1871xfs_iunlink_remove(
1872        xfs_trans_t        *tp,
1873        xfs_inode_t        *ip)
1874{
1875        xfs_ino_t        next_ino;
1876        xfs_mount_t        *mp;
1877        xfs_agi_t        *agi;
1878        xfs_dinode_t        *dip;
1879        xfs_buf_t        *agibp;
1880        xfs_buf_t        *ibp;
1881        xfs_agnumber_t        agno;
1882        xfs_daddr_t        agdaddr;
1883        xfs_agino_t        agino;
1884        xfs_agino_t        next_agino;
1885        xfs_buf_t        *last_ibp;
1886        xfs_dinode_t        *last_dip = NULL;
1887        short                bucket_index;
1888        int                offset, last_offset = 0;
1889        int                error;
1890        int                agi_ok;
1891
1892        /*
1893         * First pull the on-disk inode from the AGI unlinked list.
1894         */
1895        mp = tp->t_mountp;
1896
1897        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1898        agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1899
1900        /*
1901         * Get the agi buffer first.  It ensures lock ordering
1902         * on the list.
1903         */
1904        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1905                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1906        if (error) {
1907                cmn_err(CE_WARN,
1908                        "xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.  Returning error.",
1909                        error, mp->m_fsname);
1910                return error;
1911        }
1912        /*
1913         * Validate the magic number of the agi block.
1914         */
1915        agi = XFS_BUF_TO_AGI(agibp);
1916        agi_ok =
1917                be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1918                XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1919        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1920                        XFS_RANDOM_IUNLINK_REMOVE))) {
1921                XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1922                                     mp, agi);
1923                xfs_trans_brelse(tp, agibp);
1924                cmn_err(CE_WARN,
1925                        "xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.  Returning EFSCORRUPTED.",
1926                         mp->m_fsname);
1927                return XFS_ERROR(EFSCORRUPTED);
1928        }
1929        /*
1930         * Get the index into the agi hash table for the
1931         * list this inode will go on.
1932         */
1933        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1934        ASSERT(agino != 0);
1935        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1936        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
1937        ASSERT(agi->agi_unlinked[bucket_index]);
1938
1939        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1940                /*
1941                 * We're at the head of the list.  Get the inode's
1942                 * on-disk buffer to see if there is anyone after us
1943                 * on the list.  Only modify our next pointer if it
1944                 * is not already NULLAGINO.  This saves us the overhead
1945                 * of dealing with the buffer when there is no need to
1946                 * change it.
1947                 */
1948                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
1949                if (error) {
1950                        cmn_err(CE_WARN,
1951                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
1952                                error, mp->m_fsname);
1953                        return error;
1954                }
1955                next_agino = be32_to_cpu(dip->di_next_unlinked);
1956                ASSERT(next_agino != 0);
1957                if (next_agino != NULLAGINO) {
1958                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1959                        offset = ip->i_boffset +
1960                                offsetof(xfs_dinode_t, di_next_unlinked);
1961                        xfs_trans_inode_buf(tp, ibp);
1962                        xfs_trans_log_buf(tp, ibp, offset,
1963                                          (offset + sizeof(xfs_agino_t) - 1));
1964                        xfs_inobp_check(mp, ibp);
1965                } else {
1966                        xfs_trans_brelse(tp, ibp);
1967                }
1968                /*
1969                 * Point the bucket head pointer at the next inode.
1970                 */
1971                ASSERT(next_agino != 0);
1972                ASSERT(next_agino != agino);
1973                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
1974                offset = offsetof(xfs_agi_t, agi_unlinked) +
1975                        (sizeof(xfs_agino_t) * bucket_index);
1976                xfs_trans_log_buf(tp, agibp, offset,
1977                                  (offset + sizeof(xfs_agino_t) - 1));
1978        } else {
1979                /*
1980                 * We need to search the list for the inode being freed.
1981                 */
1982                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1983                last_ibp = NULL;
1984                while (next_agino != agino) {
1985                        /*
1986                         * If the last inode wasn't the one pointing to
1987                         * us, then release its buffer since we're not
1988                         * going to do anything with it.
1989                         */
1990                        if (last_ibp != NULL) {
1991                                xfs_trans_brelse(tp, last_ibp);
1992                        }
1993                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1994                        error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1995                                            &last_ibp, &last_offset);
1996                        if (error) {
1997                                cmn_err(CE_WARN,
1998                        "xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
1999                                        error, mp->m_fsname);
2000                                return error;
2001                        }
2002                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2003                        ASSERT(next_agino != NULLAGINO);
2004                        ASSERT(next_agino != 0);
2005                }
2006                /*
2007                 * Now last_ibp points to the buffer previous to us on
2008                 * the unlinked list.  Pull us from the list.
2009                 */
2010                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2011                if (error) {
2012                        cmn_err(CE_WARN,
2013                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
2014                                error, mp->m_fsname);
2015                        return error;
2016                }
2017                next_agino = be32_to_cpu(dip->di_next_unlinked);
2018                ASSERT(next_agino != 0);
2019                ASSERT(next_agino != agino);
2020                if (next_agino != NULLAGINO) {
2021                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2022                        offset = ip->i_boffset +
2023                                offsetof(xfs_dinode_t, di_next_unlinked);
2024                        xfs_trans_inode_buf(tp, ibp);
2025                        xfs_trans_log_buf(tp, ibp, offset,
2026                                          (offset + sizeof(xfs_agino_t) - 1));
2027                        xfs_inobp_check(mp, ibp);
2028                } else {
2029                        xfs_trans_brelse(tp, ibp);
2030                }
2031                /*
2032                 * Point the previous inode on the list to the next inode.
2033                 */
2034                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2035                ASSERT(next_agino != 0);
2036                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2037                xfs_trans_inode_buf(tp, last_ibp);
2038                xfs_trans_log_buf(tp, last_ibp, offset,
2039                                  (offset + sizeof(xfs_agino_t) - 1));
2040                xfs_inobp_check(mp, last_ibp);
2041        }
2042        return 0;
2043}
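
xfs_iunlink_remove() is a textbook singly linked list delete performed on disk: if the inode is the bucket head, the head is repointed at its successor; otherwise the bucket is walked to find the predecessor, whose next pointer is spliced past the inode being freed. A minimal in-memory model, with an array indexed by AG inode number standing in for the di_next_unlinked fields (illustrative; no buffers, logging, or corruption checks):

    #include <stdint.h>

    #define MODEL_NULLAGINO 0xffffffffu

    static void iunlink_remove_model(uint32_t *head, uint32_t *next,
                                     uint32_t agino)
    {
            if (*head == agino) {
                    *head = next[agino];
            } else {
                    uint32_t prev = *head;

                    while (next[prev] != agino)
                            prev = next[prev];      /* the xfs_inotobp() walk */
                    next[prev] = next[agino];       /* splice past us */
            }
            next[agino] = MODEL_NULLAGINO;          /* we are off the list */
    }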
2044
2045STATIC void
2046xfs_ifree_cluster(
2047        xfs_inode_t        *free_ip,
2048        xfs_trans_t        *tp,
2049        xfs_ino_t        inum)
2050{
2051        xfs_mount_t                *mp = free_ip->i_mount;
2052        int                        blks_per_cluster;
2053        int                        nbufs;
2054        int                        ninodes;
2055        int                        i, j, found, pre_flushed;
2056        xfs_daddr_t                blkno;
2057        xfs_buf_t                *bp;
2058        xfs_inode_t                *ip, **ip_found;
2059        xfs_inode_log_item_t        *iip;
2060        xfs_log_item_t                *lip;
2061        xfs_perag_t                *pag = xfs_get_perag(mp, inum);
2062
2063        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2064                blks_per_cluster = 1;
2065                ninodes = mp->m_sb.sb_inopblock;
2066                nbufs = XFS_IALLOC_BLOCKS(mp);
2067        } else {
2068                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2069                                        mp->m_sb.sb_blocksize;
2070                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2071                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2072        }
2073
2074        ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
2075
2076        for (j = 0; j < nbufs; j++, inum += ninodes) {
2077                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2078                                         XFS_INO_TO_AGBNO(mp, inum));
2079
2080
2081                /*
2082                 * Look for each inode in memory and attempt to lock it;
2083                 * we can be racing with flush and tail pushing here.
2084                 * Any inode we get the locks on is added to an array of
2085                 * inode items to process later.
2086                 *
2087                 * If we get the buffer lock first, we could beat a flush
2088                 * or tail pushing thread to the lock here, in which
2089                 * case they will go looking for the inode buffer
2090                 * and fail; we need some other form of interlock
2091                 * here.
2092                 */
2093                found = 0;
2094                for (i = 0; i < ninodes; i++) {
2095                        read_lock(&pag->pag_ici_lock);
2096                        ip = radix_tree_lookup(&pag->pag_ici_root,
2097                                        XFS_INO_TO_AGINO(mp, (inum + i)));
2098
2099                        /* Inode not in memory or we found it already,
2100                         * nothing to do
2101                         */
2102                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
2103                                read_unlock(&pag->pag_ici_lock);
2104                                continue;
2105                        }
2106
2107                        if (xfs_inode_clean(ip)) {
2108                                read_unlock(&pag->pag_ici_lock);
2109                                continue;
2110                        }
2111
2112                        /* If we can get the locks then add it to the
2113                         * list, otherwise by the time we get the bp lock
2114                         * below it will already be attached to the
2115                         * inode buffer.
2116                         */
2117
2118                        /* This inode will already be locked - by us, let's
2119                         * keep it that way.
2120                         */
2121
2122                        if (ip == free_ip) {
2123                                if (xfs_iflock_nowait(ip)) {
2124                                        xfs_iflags_set(ip, XFS_ISTALE);
2125                                        if (xfs_inode_clean(ip)) {
2126                                                xfs_ifunlock(ip);
2127                                        } else {
2128                                                ip_found[found++] = ip;
2129                                        }
2130                                }
2131                                read_unlock(&pag->pag_ici_lock);
2132                                continue;
2133                        }
2134
2135                        if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2136                                if (xfs_iflock_nowait(ip)) {
2137                                        xfs_iflags_set(ip, XFS_ISTALE);
2138
2139                                        if (xfs_inode_clean(ip)) {
2140                                                xfs_ifunlock(ip);
2141                                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2142                                        } else {
2143                                                ip_found[found++] = ip;
2144                                        }
2145                                } else {
2146                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
2147                                }
2148                        }
2149                        read_unlock(&pag->pag_ici_lock);
2150                }
2151
2152                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
2153                                        mp->m_bsize * blks_per_cluster,
2154                                        XFS_BUF_LOCK);
2155
2156                pre_flushed = 0;
2157                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
2158                while (lip) {
2159                        if (lip->li_type == XFS_LI_INODE) {
2160                                iip = (xfs_inode_log_item_t *)lip;
2161                                ASSERT(iip->ili_logged == 1);
2162                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2163                                spin_lock(&mp->m_ail_lock);
2164                                iip->ili_flush_lsn = iip->ili_item.li_lsn;
2165                                spin_unlock(&mp->m_ail_lock);
2166                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2167                                pre_flushed++;
2168                        }
2169                        lip = lip->li_bio_list;
2170                }
2171
2172                for (i = 0; i < found; i++) {
2173                        ip = ip_found[i];
2174                        iip = ip->i_itemp;
2175
2176                        if (!iip) {
2177                                ip->i_update_core = 0;
2178                                xfs_ifunlock(ip);
2179                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2180                                continue;
2181                        }
2182
2183                        iip->ili_last_fields = iip->ili_format.ilf_fields;
2184                        iip->ili_format.ilf_fields = 0;
2185                        iip->ili_logged = 1;
2186                        spin_lock(&mp->m_ail_lock);
2187                        iip->ili_flush_lsn = iip->ili_item.li_lsn;
2188                        spin_unlock(&mp->m_ail_lock);
2189
2190                        xfs_buf_attach_iodone(bp,
2191                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
2192                                xfs_istale_done, (xfs_log_item_t *)iip);
2193                        if (ip != free_ip) {
2194                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
2195                        }
2196                }
2197
2198                if (found || pre_flushed)
2199                        xfs_trans_stale_inode_buf(tp, bp);
2200                xfs_trans_binval(tp, bp);
2201        }
2202
2203        kmem_free(ip_found);
2204        xfs_put_perag(mp, pag);
2205}
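
The geometry block at the top of xfs_ifree_cluster() decides how many buffers cover an inode allocation chunk: one buffer per block when a cluster fits in a single block, otherwise multi-block buffers with proportionally more inodes each. The computation restated with plain ints in place of the mount fields (sketch only):

    struct cluster_geom {
            int blks_per_cluster;
            int ninodes;
            int nbufs;
    };

    static struct cluster_geom cluster_geometry(int blocksize,
                    int cluster_size, int inopblock, int ialloc_blocks)
    {
            struct cluster_geom g;

            if (blocksize >= cluster_size) {
                    g.blks_per_cluster = 1;
                    g.ninodes = inopblock;
                    g.nbufs = ialloc_blocks;
            } else {
                    g.blks_per_cluster = cluster_size / blocksize;
                    g.ninodes = g.blks_per_cluster * inopblock;
                    g.nbufs = ialloc_blocks / g.blks_per_cluster;
            }
            return g;
    }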
2206
2207/*
2208 * This is called to return an inode to the inode free list.
2209 * The inode should already be truncated to 0 length and have
2210 * no pages associated with it.  This routine also assumes that
2211 * the inode is already a part of the transaction.
2212 *
2213 * The on-disk copy of the inode will have been added to the list
2214 * of unlinked inodes in the AGI. We need to remove the inode from
2215 * that list atomically with respect to freeing it here.
2216 */
2217int
2218xfs_ifree(
2219        xfs_trans_t        *tp,
2220        xfs_inode_t        *ip,
2221        xfs_bmap_free_t        *flist)
2222{
2223        int                        error;
2224        int                        delete;
2225        xfs_ino_t                first_ino;
2226        xfs_dinode_t            *dip;
2227        xfs_buf_t               *ibp;
2228
2229        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2230        ASSERT(ip->i_transp == tp);
2231        ASSERT(ip->i_d.di_nlink == 0);
2232        ASSERT(ip->i_d.di_nextents == 0);
2233        ASSERT(ip->i_d.di_anextents == 0);
2234        ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
2235               ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2236        ASSERT(ip->i_d.di_nblocks == 0);
2237
2238        /*
2239         * Pull the on-disk inode from the AGI unlinked list.
2240         */
2241        error = xfs_iunlink_remove(tp, ip);
2242        if (error != 0) {
2243                return error;
2244        }
2245
2246        error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2247        if (error != 0) {
2248                return error;
2249        }
2250        ip->i_d.di_mode = 0;                /* mark incore inode as free */
2251        ip->i_d.di_flags = 0;
2252        ip->i_d.di_dmevmask = 0;
2253        ip->i_d.di_forkoff = 0;                /* mark the attr fork not in use */
2254        ip->i_df.if_ext_max =
2255                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2256        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2257        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2258        /*
2259         * Bump the generation count so no one will be confused
2260         * by reincarnations of this inode.
2261         */
2262        ip->i_d.di_gen++;
2263
2264        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2265
2266        error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2267        if (error)
2268                return error;
2269
2270        /*
2271         * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2272         * from picking up this inode when it is reclaimed (its incore state
2273         * initialized but not flushed to disk yet). The in-core di_mode is
2274         * already cleared and a corresponding transaction logged.
2275         * The hack here just synchronizes the in-core to on-disk
2276         * di_mode value in advance before the actual inode sync to disk.
2277         * This is OK because the inode is already unlinked and would never
2278         * change its di_mode again for this inode generation.
2279         * This is a temporary hack that would require a proper fix
2280         * in the future.
2281         */
2282        dip->di_core.di_mode = 0;
2283
2284        if (delete) {
2285                xfs_ifree_cluster(ip, tp, first_ino);
2286        }
2287
2288        return 0;
2289}
2290
2291/*
2292 * Reallocate the space for if_broot based on the number of records
2293 * being added or deleted as indicated in rec_diff.  Move the records
2294 * and pointers in if_broot to fit the new size.  When shrinking this
2295 * will eliminate holes between the records and pointers created by
2296 * the caller.  When growing this will create holes to be filled in
2297 * by the caller.
2298 *
2299 * The caller must not request to add more records than would fit in
2300 * the on-disk inode root.  If the if_broot is currently NULL, then
2301 * if we are adding records one will be allocated.  The caller must also
2302 * not request that the number of records go below zero, although
2303 * it can go to zero.
2304 *
2305 * ip -- the inode whose if_broot area is changing
2306 * rec_diff -- the change in the number of records, positive or negative,
2307 *         requested for the if_broot array.
2308 */
2309void
2310xfs_iroot_realloc(
2311        xfs_inode_t                *ip,
2312        int                        rec_diff,
2313        int                        whichfork)
2314{
2315        int                        cur_max;
2316        xfs_ifork_t                *ifp;
2317        xfs_bmbt_block_t        *new_broot;
2318        int                        new_max;
2319        size_t                        new_size;
2320        char                        *np;
2321        char                        *op;
2322
2323        /*
2324         * Handle the degenerate case quietly.
2325         */
2326        if (rec_diff == 0) {
2327                return;
2328        }
2329
2330        ifp = XFS_IFORK_PTR(ip, whichfork);
2331        if (rec_diff > 0) {
2332                /*
2333                 * If there wasn't any memory allocated before, just
2334                 * allocate it now and get out.
2335                 */
2336                if (ifp->if_broot_bytes == 0) {
2337                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2338                        ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2339                                                                     KM_SLEEP);
2340                        ifp->if_broot_bytes = (int)new_size;
2341                        return;
2342                }
2343
2344                /*
2345                 * If there is already an existing if_broot, then we need
2346                 * to realloc() it and shift the pointers to their new
2347                 * location.  The records don't change location because
2348                 * they are kept butted up against the btree block header.
2349                 */
2350                cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2351                new_max = cur_max + rec_diff;
2352                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2353                ifp->if_broot = (xfs_bmbt_block_t *)
2354                  kmem_realloc(ifp->if_broot,
2355                                new_size,
2356                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2357                                KM_SLEEP);
2358                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2359                                                      ifp->if_broot_bytes);
2360                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2361                                                      (int)new_size);
2362                ifp->if_broot_bytes = (int)new_size;
2363                ASSERT(ifp->if_broot_bytes <=
2364                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2365                memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2366                return;
2367        }
2368
2369        /*
2370         * rec_diff is less than 0.  In this case, we are shrinking the
2371         * if_broot buffer.  It must already exist.  If we go to zero
2372         * records, just get rid of the root and clear the status bit.
2373         */
2374        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2375        cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2376        new_max = cur_max + rec_diff;
2377        ASSERT(new_max >= 0);
2378        if (new_max > 0)
2379                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2380        else
2381                new_size = 0;
2382        if (new_size > 0) {
2383                new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
2384                /*
2385                 * First copy over the btree block header.
2386                 */
2387                memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
2388        } else {
2389                new_broot = NULL;
2390                ifp->if_flags &= ~XFS_IFBROOT;
2391        }
2392
2393        /*
2394         * Only copy the records and pointers if there are any.
2395         */
2396        if (new_max > 0) {
2397                /*
2398                 * First copy the records.
2399                 */
2400                op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
2401                                                     ifp->if_broot_bytes);
2402                np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2403                                                     (int)new_size);
2404                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2405
2406                /*
2407                 * Then copy the pointers.
2408                 */
2409                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2410                                                     ifp->if_broot_bytes);
2411                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
2412                                                     (int)new_size);
2413                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2414        }
2415        kmem_free(ifp->if_broot);
2416        ifp->if_broot = new_broot;
2417        ifp->if_broot_bytes = (int)new_size;
2418        ASSERT(ifp->if_broot_bytes <=
2419                XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2420        return;
2421}
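
In the if_broot layout the records stay butted against the block header while the pointer array floats at the end of the buffer, which is why the grow path realloc()s and then memmove()s only the pointers. A toy model of that shuffle (sizes are parameters here; the kernel derives them from the XFS_BMAP_BROOT_* macros):

    #include <stdlib.h>
    #include <string.h>

    struct broot_model {
            char    *buf;           /* header | records | pointers */
            size_t  hdr, recsz, ptrsz;
            int     nrecs;
    };

    static void broot_grow(struct broot_model *b, int rec_diff)
    {
            int    new_max  = b->nrecs + rec_diff;
            size_t new_size = b->hdr + new_max * (b->recsz + b->ptrsz);
            char   *p = realloc(b->buf, new_size);

            if (!p)
                    return;         /* sketch: keep the old state on failure */
            b->buf = p;
            /* Records stay put; only the pointer array moves outward. */
            memmove(b->buf + b->hdr + new_max * b->recsz,
                    b->buf + b->hdr + b->nrecs * b->recsz,
                    b->nrecs * b->ptrsz);
            b->nrecs = new_max;
    }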
2422
2423
2424/*
2425 * This is called when the amount of space needed for if_data
2426 * is increased or decreased.  The change in size is indicated by
2427 * the number of bytes that need to be added or deleted in the
2428 * byte_diff parameter.
2429 *
2430 * If the amount of space needed has decreased below the size of the
2431 * inline buffer, then switch to using the inline buffer.  Otherwise,
2432 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2433 * to what is needed.
2434 *
2435 * ip -- the inode whose if_data area is changing
2436 * byte_diff -- the change in the number of bytes, positive or negative,
2437 *         requested for the if_data array.
2438 */
2439void
2440xfs_idata_realloc(
2441        xfs_inode_t        *ip,
2442        int                byte_diff,
2443        int                whichfork)
2444{
2445        xfs_ifork_t        *ifp;
2446        int                new_size;
2447        int                real_size;
2448
2449        if (byte_diff == 0) {
2450                return;
2451        }
2452
2453        ifp = XFS_IFORK_PTR(ip, whichfork);
2454        new_size = (int)ifp->if_bytes + byte_diff;
2455        ASSERT(new_size >= 0);
2456
2457        if (new_size == 0) {
2458                if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2459                        kmem_free(ifp->if_u1.if_data);
2460                }
2461                ifp->if_u1.if_data = NULL;
2462                real_size = 0;
2463        } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2464                /*
2465                 * If the valid extents/data can fit in if_inline_ext/data,
2466                 * copy them from the malloc'd vector and free it.
2467                 */
2468                if (ifp->if_u1.if_data == NULL) {
2469                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2470                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2471                        ASSERT(ifp->if_real_bytes != 0);
2472                        memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2473                              new_size);
2474                        kmem_free(ifp->if_u1.if_data);
2475                        ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2476                }
2477                real_size = 0;
2478        } else {
2479                /*
2480                 * Stuck with malloc/realloc.
2481                 * For inline data, the underlying buffer must be
2482                 * a multiple of 4 bytes in size so that it can be
2483                 * logged and stay on word boundaries.  We enforce
2484                 * that here.
2485                 */
2486                real_size = roundup(new_size, 4);
2487                if (ifp->if_u1.if_data == NULL) {
2488                        ASSERT(ifp->if_real_bytes == 0);
2489                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2490                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2491                        /*
2492                         * Only do the realloc if the underlying size
2493                         * is really changing.
2494                         */
2495                        if (ifp->if_real_bytes != real_size) {
2496                                ifp->if_u1.if_data =
2497                                        kmem_realloc(ifp->if_u1.if_data,
2498                                                        real_size,
2499                                                        ifp->if_real_bytes,
2500                                                        KM_SLEEP);
2501                        }
2502                } else {
2503                        ASSERT(ifp->if_real_bytes == 0);
2504                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2505                        memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2506                                ifp->if_bytes);
2507                }
2508        }
2509        ifp->if_real_bytes = real_size;
2510        ifp->if_bytes = new_size;
2511        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2512}
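
The sizing rule above in one place: data that fits the inline buffer costs no heap allocation at all, and anything on the heap is rounded to a 4-byte multiple so logged regions stay word aligned. A one-function restatement (hypothetical helper):

    /* Heap bytes needed for 'new_size' bytes of inline-fork data. */
    static int data_real_bytes(int new_size, int inline_size)
    {
            if (new_size <= inline_size)
                    return 0;                  /* lives in if_inline_data */
            return (new_size + 3) & ~3;        /* roundup(new_size, 4)   */
    }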
2513
2514
2515
2516
2517/*
2518 * Map inode to disk block and offset.
2519 *
2520 * mp -- the mount point structure for the current file system
2521 * tp -- the current transaction
2522 * ino -- the inode number of the inode to be located
2523 * imap -- this structure is filled in with the information necessary
2524 *         to retrieve the given inode from disk
2525 * flags -- flags to pass to xfs_dilocate indicating whether or not
2526 *         lookups in the inode btree were OK or not
2527 */
2528int
2529xfs_imap(
2530        xfs_mount_t        *mp,
2531        xfs_trans_t        *tp,
2532        xfs_ino_t        ino,
2533        xfs_imap_t        *imap,
2534        uint                flags)
2535{
2536        xfs_fsblock_t        fsbno;
2537        int                len;
2538        int                off;
2539        int                error;
2540
2541        fsbno = imap->im_blkno ?
2542                XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2543        error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2544        if (error)
2545                return error;
2546
2547        imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2548        imap->im_len = XFS_FSB_TO_BB(mp, len);
2549        imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2550        imap->im_ioffset = (ushort)off;
2551        imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2552
2553        /*
2554         * If the inode number maps to a block outside the bounds
2555         * of the file system then return an error rather than calling
2556         * read_buf and panicking when we get an error from the
2557         * driver.
2558         */
2559        if ((imap->im_blkno + imap->im_len) >
2560            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2561                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2562                        "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2563                        " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2564                        (unsigned long long) imap->im_blkno,
2565                        (unsigned long long) imap->im_len,
2566                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2567                return EINVAL;
2568        }
2569        return 0;
2570}
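/*
 * A worked example (illustrative numbers only): with 256-byte inodes
 * (sb_inodelog == 8), an inode that xfs_dilocate() places at offset
 * off == 3 within its cluster gets im_ioffset == 3 and
 * im_boffset == 3 << 8 == 768, i.e. the on-disk inode starts 768 bytes
 * into the buffer described by im_blkno and im_len.
 */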
2571
2572void
2573xfs_idestroy_fork(
2574        xfs_inode_t        *ip,
2575        int                whichfork)
2576{
2577        xfs_ifork_t        *ifp;
2578
2579        ifp = XFS_IFORK_PTR(ip, whichfork);
2580        if (ifp->if_broot != NULL) {
2581                kmem_free(ifp->if_broot);
2582                ifp->if_broot = NULL;
2583        }
2584
2585        /*
2586         * If the format is local, then we can't have an extents
2587         * array so just look for an inline data array.  If we're
2588         * not local then we may or may not have an extents list,
2589         * so check and free it up if we do.
2590         */
2591        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2592                if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2593                    (ifp->if_u1.if_data != NULL)) {
2594                        ASSERT(ifp->if_real_bytes != 0);
2595                        kmem_free(ifp->if_u1.if_data);
2596                        ifp->if_u1.if_data = NULL;
2597                        ifp->if_real_bytes = 0;
2598                }
2599        } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2600                   ((ifp->if_flags & XFS_IFEXTIREC) ||
2601                    ((ifp->if_u1.if_extents != NULL) &&
2602                     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2603                ASSERT(ifp->if_real_bytes != 0);
2604                xfs_iext_destroy(ifp);
2605        }
2606        ASSERT(ifp->if_u1.if_extents == NULL ||
2607               ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2608        ASSERT(ifp->if_real_bytes == 0);
2609        if (whichfork == XFS_ATTR_FORK) {
2610                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2611                ip->i_afp = NULL;
2612        }
2613}
2614
2615/*
2616 * This is called to free all the memory associated with an inode.
2617 * It must free the inode itself and any buffers allocated for
2618 * if_extents/if_data and if_broot.  It must also free the lock
2619 * associated with the inode.
2620 */
2621void
2622xfs_idestroy(
2623        xfs_inode_t        *ip)
2624{
2625        switch (ip->i_d.di_mode & S_IFMT) {
2626        case S_IFREG:
2627        case S_IFDIR:
2628        case S_IFLNK:
2629                xfs_idestroy_fork(ip, XFS_DATA_FORK);
2630                break;
2631        }
2632        if (ip->i_afp)
2633                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2634        mrfree(&ip->i_lock);
2635        mrfree(&ip->i_iolock);
2636
2637#ifdef XFS_INODE_TRACE
2638        ktrace_free(ip->i_trace);
2639#endif
2640#ifdef XFS_BMAP_TRACE
2641        ktrace_free(ip->i_xtrace);
2642#endif
2643#ifdef XFS_BMBT_TRACE
2644        ktrace_free(ip->i_btrace);
2645#endif
2646#ifdef XFS_RW_TRACE
2647        ktrace_free(ip->i_rwtrace);
2648#endif
2649#ifdef XFS_ILOCK_TRACE
2650        ktrace_free(ip->i_lock_trace);
2651#endif
2652#ifdef XFS_DIR2_TRACE
2653        ktrace_free(ip->i_dir_trace);
2654#endif
2655        if (ip->i_itemp) {
2656                /*
2657                 * Only if we are shutting down the fs will we see an
2658                 * inode still in the AIL. If it is there, we should remove
2659                 * it to prevent a use-after-free from occurring.
2660                 */
2661                xfs_mount_t        *mp = ip->i_mount;
2662                xfs_log_item_t        *lip = &ip->i_itemp->ili_item;
2663
2664                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2665                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
2666                if (lip->li_flags & XFS_LI_IN_AIL) {
2667                        spin_lock(&mp->m_ail_lock);
2668                        if (lip->li_flags & XFS_LI_IN_AIL)
2669                                xfs_trans_delete_ail(mp, lip);
2670                        else
2671                                spin_unlock(&mp->m_ail_lock);
2672                }
2673                xfs_inode_item_destroy(ip);
2674        }
2675        kmem_zone_free(xfs_inode_zone, ip);
2676}
2677
2678
2679/*
2680 * Increment the pin count of the given inode.
2681 * The count is kept in the inode's atomic i_pincount field.
2682 */
2683void
2684xfs_ipin(
2685        xfs_inode_t        *ip)
2686{
2687        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2688
2689        atomic_inc(&ip->i_pincount);
2690}
2691
2692/*
2693 * Decrement the pin count of the given inode, and wake up
2694 * anyone in xfs_iunpin_wait() if the count goes to 0.  The
2695 * inode must have been previously pinned with a call to xfs_ipin().
2696 */
2697void
2698xfs_iunpin(
2699        xfs_inode_t        *ip)
2700{
2701        ASSERT(atomic_read(&ip->i_pincount) > 0);
2702
2703        if (atomic_dec_and_test(&ip->i_pincount))
2704                wake_up(&ip->i_ipin_wait);
2705}
2706
2707/*
2708 * This is called to unpin an inode. It can be directed to wait or to return
2709 * immediately without waiting for the inode to be unpinned.  The caller must
2710 * have the inode locked in at least shared mode so that the inode cannot be
2711 * subsequently pinned once someone is waiting for it to be unpinned.
2712 */
2713STATIC void
2714__xfs_iunpin_wait(
2715        xfs_inode_t        *ip,
2716        int                wait)
2717{
2718        xfs_inode_log_item_t        *iip = ip->i_itemp;
2719
2720        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2721        if (atomic_read(&ip->i_pincount) == 0)
2722                return;
2723
2724        /* Give the log a push to start the unpinning I/O */
2725        xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
2726                                iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2727        if (wait)
2728                wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2729}
2730
2731static inline void
2732xfs_iunpin_wait(
2733        xfs_inode_t        *ip)
2734{
2735        __xfs_iunpin_wait(ip, 1);
2736}
2737
2738static inline void
2739xfs_iunpin_nowait(
2740        xfs_inode_t        *ip)
2741{
2742        __xfs_iunpin_wait(ip, 0);
2743}
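/*
 * Illustrative call pattern (a sketch, not taken from this file): a
 * flusher holding the ilock in shared mode and allowed to block calls
 *
 *	xfs_iunpin_wait(ip);
 *
 * which sleeps until i_pincount reaches zero, while a caller that must
 * not block uses xfs_iunpin_nowait(), which only forces the log and
 * returns immediately.
 */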
2744
2745
2746/*
2747 * xfs_iextents_copy()
2748 *
2749 * This is called to copy the REAL extents (as opposed to the delayed
2750 * allocation extents) from the inode into the given buffer.  It
2751 * returns the number of bytes copied into the buffer.
2752 *
2753 * Delayed allocation extents have no on-disk representation, so we
2754 * examine each extent in turn, copying only the real ones and
2755 * skipping those which are delayed.
2756 */
2757int
2758xfs_iextents_copy(
2759        xfs_inode_t                *ip,
2760        xfs_bmbt_rec_t                *dp,
2761        int                        whichfork)
2762{
2763        int                        copied;
2764        int                        i;
2765        xfs_ifork_t                *ifp;
2766        int                        nrecs;
2767        xfs_fsblock_t                start_block;
2768
2769        ifp = XFS_IFORK_PTR(ip, whichfork);
2770        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2771        ASSERT(ifp->if_bytes > 0);
2772
2773        nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2774        XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2775        ASSERT(nrecs > 0);
2776
2777        /*
2778         * Copy the extents one at a time, skipping any delayed
2779         * allocation extents along the way.  Every dirty extent
2780         * fork must contain at least one non-delayed (real)
2781         * extent.
2782         */
2783        copied = 0;
2784        for (i = 0; i < nrecs; i++) {
2785                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2786                start_block = xfs_bmbt_get_startblock(ep);
2787                if (ISNULLSTARTBLOCK(start_block)) {
2788                        /*
2789                         * It's a delayed allocation extent, so skip it.
2790                         */
2791                        continue;
2792                }
2793
2794                /* Translate to on disk format */
2795                put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2796                put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2797                dp++;
2798                copied++;
2799        }
2800        ASSERT(copied != 0);
2801        xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2802
2803        return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2804}
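/*
 * Example of the translation above (hypothetical value): an incore
 * record word ep->l0 == 0x0123456789abcdef is written to the buffer as
 * the byte sequence 01 23 45 67 89 ab cd ef regardless of host
 * endianness, courtesy of cpu_to_be64() and put_unaligned().
 */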
2805
2806/*
2807 * Each of the following cases stores data into the same region
2808 * of the on-disk inode, so only one of them can be valid at
2809 * any given time. While it is possible to have conflicting formats
2810 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2811 * in EXTENTS format, this can only happen when the fork has
2812 * changed formats after being modified but before being flushed.
2813 * In these cases, the format always takes precedence, because the
2814 * format indicates the current state of the fork.
2815 */
2816/*ARGSUSED*/
2817STATIC void
2818xfs_iflush_fork(
2819        xfs_inode_t                *ip,
2820        xfs_dinode_t                *dip,
2821        xfs_inode_log_item_t        *iip,
2822        int                        whichfork,
2823        xfs_buf_t                *bp)
2824{
2825        char                        *cp;
2826        xfs_ifork_t                *ifp;
2827        xfs_mount_t                *mp;
2828#ifdef XFS_TRANS_DEBUG
2829        int                        first;
2830#endif
2831        static const short        brootflag[2] =
2832                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2833        static const short        dataflag[2] =
2834                { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2835        static const short        extflag[2] =
2836                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2837
2838        if (!iip)
2839                return;
2840        ifp = XFS_IFORK_PTR(ip, whichfork);
2841        /*
2842         * This can happen if we gave up in iformat on an error path
2843         * for the attribute fork.
2844         */
2845        if (!ifp) {
2846                ASSERT(whichfork == XFS_ATTR_FORK);
2847                return;
2848        }
2849        cp = XFS_DFORK_PTR(dip, whichfork);
2850        mp = ip->i_mount;
2851        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2852        case XFS_DINODE_FMT_LOCAL:
2853                if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
2854                    (ifp->if_bytes > 0)) {
2855                        ASSERT(ifp->if_u1.if_data != NULL);
2856                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2857                        memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2858                }
2859                break;
2860
2861        case XFS_DINODE_FMT_EXTENTS:
2862                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2863                       !(iip->ili_format.ilf_fields & extflag[whichfork]));
2864                ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2865                        (ifp->if_bytes == 0));
2866                ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2867                        (ifp->if_bytes > 0));
2868                if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2869                    (ifp->if_bytes > 0)) {
2870                        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2871                        (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2872                                whichfork);
2873                }
2874                break;
2875
2876        case XFS_DINODE_FMT_BTREE:
2877                if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
2878                    (ifp->if_broot_bytes > 0)) {
2879                        ASSERT(ifp->if_broot != NULL);
2880                        ASSERT(ifp->if_broot_bytes <=
2881                               (XFS_IFORK_SIZE(ip, whichfork) +
2882                                XFS_BROOT_SIZE_ADJ));
2883                        xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
2884                                (xfs_bmdr_block_t *)cp,
2885                                XFS_DFORK_SIZE(dip, mp, whichfork));
2886                }
2887                break;
2888
2889        case XFS_DINODE_FMT_DEV:
2890                if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2891                        ASSERT(whichfork == XFS_DATA_FORK);
2892                        dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev);
2893                }
2894                break;
2895
2896        case XFS_DINODE_FMT_UUID:
2897                if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2898                        ASSERT(whichfork == XFS_DATA_FORK);
2899                        memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
2900                                sizeof(uuid_t));
2901                }
2902                break;
2903
2904        default:
2905                ASSERT(0);
2906                break;
2907        }
2908}
2909
2910STATIC int
2911xfs_iflush_cluster(
2912        xfs_inode_t        *ip,
2913        xfs_buf_t        *bp)
2914{
2915        xfs_mount_t                *mp = ip->i_mount;
2916        xfs_perag_t                *pag = xfs_get_perag(mp, ip->i_ino);
2917        unsigned long                first_index, mask;
2918        unsigned long                inodes_per_cluster;
2919        int                        ilist_size;
2920        xfs_inode_t                **ilist;
2921        xfs_inode_t                *iq;
2922        int                        nr_found;
2923        int                        clcount = 0;
2924        int                        bufwasdelwri;
2925        int                        i;
2926
2927        ASSERT(pag->pagi_inodeok);
2928        ASSERT(pag->pag_ici_init);
2929
2930        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2931        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2932        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2933        if (!ilist)
2934                return 0;
2935
2936        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2937        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2938        read_lock(&pag->pag_ici_lock);
2939        /* really need a gang lookup range call here */
2940        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2941                                        first_index, inodes_per_cluster);
2942        if (nr_found == 0)
2943                goto out_free;
2944
2945        for (i = 0; i < nr_found; i++) {
2946                iq = ilist[i];
2947                if (iq == ip)
2948                        continue;
2949                /* if the inode lies outside this cluster, we're done. */
2950                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
2951                        break;
2952                /*
2953                 * Do an un-protected check to see if the inode is dirty and
2954                 * is a candidate for flushing.  These checks will be repeated
2955                 * later after the appropriate locks are acquired.
2956                 */
2957                if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2958                        continue;
2959
2960                /*
2961                 * Try to get locks.  If any are unavailable or it is pinned,
2962                 * then this inode cannot be flushed and is skipped.
2963                 */
2964
2965                if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2966                        continue;
2967                if (!xfs_iflock_nowait(iq)) {
2968                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2969                        continue;
2970                }
2971                if (xfs_ipincount(iq)) {
2972                        xfs_ifunlock(iq);
2973                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
2974                        continue;
2975                }
2976
2977                /*
2978                 * arriving here means that this inode can be flushed.  First
2979                 * re-check that it's dirty before flushing.
2980                 */
2981                if (!xfs_inode_clean(iq)) {
2982                        int        error;
2983                        error = xfs_iflush_int(iq, bp);
2984                        if (error) {
2985                                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2986                                goto cluster_corrupt_out;
2987                        }
2988                        clcount++;
2989                } else {
2990                        xfs_ifunlock(iq);
2991                }
2992                xfs_iunlock(iq, XFS_ILOCK_SHARED);
2993        }
2994
2995        if (clcount) {
2996                XFS_STATS_INC(xs_icluster_flushcnt);
2997                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2998        }
2999
3000out_free:
3001        read_unlock(&pag->pag_ici_lock);
3002        kmem_free(ilist);
3003        return 0;
3004
3005
3006cluster_corrupt_out:
3007        /*
3008         * Corruption detected in the clustering loop.  Invalidate the
3009         * inode buffer and shut down the filesystem.
3010         */
3011        read_unlock(&pag->pag_ici_lock);
3012        /*
3013         * Clean up the buffer.  If it was B_DELWRI, just release it --
3014         * brelse can handle it with no problems.  If not, shut down the
3015         * filesystem before releasing the buffer.
3016         */
3017        bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
3018        if (bufwasdelwri)
3019                xfs_buf_relse(bp);
3020
3021        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3022
3023        if (!bufwasdelwri) {
3024                /*
3025                 * Just like incore_relse: if we have b_iodone functions,
3026                 * mark the buffer as an error and call them.  Otherwise
3027                 * mark it as stale and brelse.
3028                 */
3029                if (XFS_BUF_IODONE_FUNC(bp)) {
3030                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3031                        XFS_BUF_UNDONE(bp);
3032                        XFS_BUF_STALE(bp);
3033                        XFS_BUF_SHUT(bp);
3034                        XFS_BUF_ERROR(bp,EIO);
3035                        xfs_biodone(bp);
3036                } else {
3037                        XFS_BUF_STALE(bp);
3038                        xfs_buf_relse(bp);
3039                }
3040        }
3041
3042        /*
3043         * Unlocks the flush lock
3044         */
3045        xfs_iflush_abort(iq);
3046        kmem_free(ilist);
3047        return XFS_ERROR(EFSCORRUPTED);
3048}
3049
3050/*
3051 * xfs_iflush() will write a modified inode's changes out to the
3052 * inode's on disk home.  The caller must have the inode lock held
3053 * in at least shared mode and the inode flush completion must be
3054 * active as well.  The inode lock will still be held upon return from
3055 * the call and the caller is free to unlock it.
3056 * The inode flush will be completed when the inode reaches the disk.
3057 * The flags indicate how the inode's buffer should be written out.
3058 */
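/*
 * A minimal usage sketch (assumed caller, not taken from this file):
 *
 *	xfs_ilock(ip, XFS_ILOCK_SHARED);
 *	xfs_iflock(ip);
 *	error = xfs_iflush(ip, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 *
 * The flush lock taken by xfs_iflock() is released on I/O completion,
 * or immediately on the clean and error paths below.
 */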
3059int
3060xfs_iflush(
3061        xfs_inode_t                *ip,
3062        uint                        flags)
3063{
3064        xfs_inode_log_item_t        *iip;
3065        xfs_buf_t                *bp;
3066        xfs_dinode_t                *dip;
3067        xfs_mount_t                *mp;
3068        int                        error;
3069        int                        noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
3070        enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3071
3072        XFS_STATS_INC(xs_iflush_count);
3073
3074        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3075        ASSERT(!completion_done(&ip->i_flush));
3076        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3077               ip->i_d.di_nextents > ip->i_df.if_ext_max);
3078
3079        iip = ip->i_itemp;
3080        mp = ip->i_mount;
3081
3082        /*
3083         * If the inode isn't dirty, then just release the inode
3084         * flush lock and do nothing.
3085         */
3086        if (xfs_inode_clean(ip)) {
3087                xfs_ifunlock(ip);
3088                return 0;
3089        }
3090
3091        /*
3092         * We can't flush the inode until it is unpinned, so wait for it if we
3093         * are allowed to block.  We know no one new can pin it, because we are
3094         * holding the inode lock shared and you need to hold it exclusively to
3095         * pin the inode.
3096         *
3097         * If we are not allowed to block, force the log out asynchronously so
3098         * that when we come back the inode will be unpinned. If other inodes
3099         * in the same cluster are dirty, they will probably write the inode
3100         * out for us if they occur after the log force completes.
3101         */
3102        if (noblock && xfs_ipincount(ip)) {
3103                xfs_iunpin_nowait(ip);
3104                xfs_ifunlock(ip);
3105                return EAGAIN;
3106        }
3107        xfs_iunpin_wait(ip);
3108
3109        /*
3110         * This may have been unpinned because the filesystem is shutting
3111         * down forcibly. If that's the case we must not write this inode
3112         * to disk, because the log record didn't make it to disk!
3113         */
3114        if (XFS_FORCED_SHUTDOWN(mp)) {
3115                ip->i_update_core = 0;
3116                if (iip)
3117                        iip->ili_format.ilf_fields = 0;
3118                xfs_ifunlock(ip);
3119                return XFS_ERROR(EIO);
3120        }
3121
3122        /*
3123         * Decide how buffer will be flushed out.  This is done before
3124         * the call to xfs_iflush_int because this field is zeroed by it.
3125         */
3126        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3127                /*
3128                 * Flush out the inode buffer according to the directions
3129                 * of the caller.  In the cases where the caller has given
3130                 * us a choice, choose the non-delwri case.  This is because
3131                 * the inode is in the AIL and we need to get it out soon.
3132                 */
3133                switch (flags) {
3134                case XFS_IFLUSH_SYNC:
3135                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3136                        flags = 0;
3137                        break;
3138                case XFS_IFLUSH_ASYNC_NOBLOCK:
3139                case XFS_IFLUSH_ASYNC:
3140                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3141                        flags = INT_ASYNC;
3142                        break;
3143                case XFS_IFLUSH_DELWRI:
3144                        flags = INT_DELWRI;
3145                        break;
3146                default:
3147                        ASSERT(0);
3148                        flags = 0;
3149                        break;
3150                }
3151        } else {
3152                switch (flags) {
3153                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3154                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3155                case XFS_IFLUSH_DELWRI:
3156                        flags = INT_DELWRI;
3157                        break;
3158                case XFS_IFLUSH_ASYNC_NOBLOCK:
3159                case XFS_IFLUSH_ASYNC:
3160                        flags = INT_ASYNC;
3161                        break;
3162                case XFS_IFLUSH_SYNC:
3163                        flags = 0;
3164                        break;
3165                default:
3166                        ASSERT(0);
3167                        flags = 0;
3168                        break;
3169                }
3170        }
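        /*
         * To restate the two switches above: SYNC always maps to a
         * synchronous write (flags == 0), ASYNC and ASYNC_NOBLOCK to
         * INT_ASYNC, and DELWRI to INT_DELWRI; the DELWRI_ELSE_*
         * variants pick INT_DELWRI only when the inode has no logged
         * fields, and otherwise fall back to sync or async.
         */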
3171
3172        /*
3173         * Get the buffer containing the on-disk inode.
3174         */
3175        error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
3176                                noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3177        if (error || !bp) {
3178                xfs_ifunlock(ip);
3179                return error;
3180        }
3181
3182        /*
3183         * First flush out the inode that xfs_iflush was called with.
3184         */
3185        error = xfs_iflush_int(ip, bp);
3186        if (error)
3187                goto corrupt_out;
3188
3189        /*
3190         * If the buffer is pinned then push on the log now so we won't
3191         * get stuck waiting in the write for too long.
3192         */
3193        if (XFS_BUF_ISPINNED(bp))
3194                xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3195
3196        /*
3197         * inode clustering:
3198         * see if other inodes can be gathered into this write
3199         */
3200        error = xfs_iflush_cluster(ip, bp);
3201        if (error)
3202                goto cluster_corrupt_out;
3203
3204        if (flags & INT_DELWRI) {
3205                xfs_bdwrite(mp, bp);
3206        } else if (flags & INT_ASYNC) {
3207                error = xfs_bawrite(mp, bp);
3208        } else {
3209                error = xfs_bwrite(mp, bp);
3210        }
3211        return error;
3212
3213corrupt_out:
3214        xfs_buf_relse(bp);
3215        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3216cluster_corrupt_out:
3217        /*
3218         * Unlocks the flush lock
3219         */
3220        xfs_iflush_abort(ip);
3221        return XFS_ERROR(EFSCORRUPTED);
3222}
3223
3224
3225STATIC int
3226xfs_iflush_int(
3227        xfs_inode_t                *ip,
3228        xfs_buf_t                *bp)
3229{
3230        xfs_inode_log_item_t        *iip;
3231        xfs_dinode_t                *dip;
3232        xfs_mount_t                *mp;
3233#ifdef XFS_TRANS_DEBUG
3234        int                        first;
3235#endif
3236
3237        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3238        ASSERT(!completion_done(&ip->i_flush));
3239        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3240               ip->i_d.di_nextents > ip->i_df.if_ext_max);
3241
3242        iip = ip->i_itemp;
3243        mp = ip->i_mount;
3244
3245
3246        /*
3247         * If the inode isn't dirty, then just release the inode
3248         * flush lock and do nothing.
3249         */
3250        if (xfs_inode_clean(ip)) {
3251                xfs_ifunlock(ip);
3252                return 0;
3253        }
3254
3255        /* set *dip = inode's place in the buffer */
3256        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
3257
3258        /*
3259         * Clear i_update_core before copying out the data.
3260         * This is for coordination with our timestamp updates
3261         * that don't hold the inode lock. They will always
3262         * update the timestamps BEFORE setting i_update_core,
3263         * so if we clear i_update_core after they set it we
3264         * are guaranteed to see their updates to the timestamps.
3265         * I believe that this depends on strongly ordered memory
3266         * semantics, but we have that.  We use the SYNCHRONIZE
3267         * macro to make sure that the compiler does not reorder
3268         * the i_update_core access below the data copy below.
3269         */
3270        ip->i_update_core = 0;
3271        SYNCHRONIZE();
3272
3273        /*
3274         * Make sure to get the latest atime from the Linux inode.
3275         */
3276        xfs_synchronize_atime(ip);
3277
3278        if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC,
3279                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3280                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3281                    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3282                        ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip);
3283                goto corrupt_out;
3284        }
3285        if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3286                                mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3287                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3288                        "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3289                        ip->i_ino, ip, ip->i_d.di_magic);
3290                goto corrupt_out;
3291        }
3292        if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3293                if (XFS_TEST_ERROR(
3294                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3295                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3296                    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3297                        xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3298                                "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3299                                ip->i_ino, ip);
3300                        goto corrupt_out;
3301                }
3302        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
3303                if (XFS_TEST_ERROR(
3304                    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3305                    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3306                    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3307                    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3308                        xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3309                                "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3310                                ip->i_ino, ip);
3311                        goto corrupt_out;
3312                }
3313        }
3314        if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3315                                ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3316                                XFS_RANDOM_IFLUSH_5)) {
3317                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3318                        "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3319                        ip->i_ino,
3320                        ip->i_d.di_nextents + ip->i_d.di_anextents,
3321                        ip->i_d.di_nblocks,
3322                        ip);
3323                goto corrupt_out;
3324        }
3325        if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3326                                mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3327                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3328                        "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3329                        ip->i_ino, ip->i_d.di_forkoff, ip);
3330                goto corrupt_out;
3331        }
3332        /*
3333         * bump the flush iteration count, used to detect flushes which
3334         * postdate a log record during recovery.
3335         */
3336
3337        ip->i_d.di_flushiter++;
3338
3339        /*
3340         * Copy the dirty parts of the inode into the on-disk
3341         * inode.  We always copy out the core of the inode,
3342         * because if the inode is dirty at all the core must
3343         * be.
3344         */
3345        xfs_dinode_to_disk(&dip->di_core, &ip->i_d);
3346
3347        /* Wrap, we never let the log put out DI_MAX_FLUSH */
3348        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3349                ip->i_d.di_flushiter = 0;
3350
3351        /*
3352         * If this is really an old format inode and the superblock version
3353         * has not been updated to support only new format inodes, then
3354         * convert back to the old inode format.  If the superblock version
3355         * has been updated, then make the conversion permanent.
3356         */
3357        ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
3358               xfs_sb_version_hasnlink(&mp->m_sb));
3359        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3360                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3361                        /*
3362                         * Convert it back.
3363                         */
3364                        ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3365                        dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3366                } else {
3367                        /*
3368                         * The superblock version has already been bumped,
3369                         * so just make the conversion to the new inode
3370                         * format permanent.
3371                         */
3372                        ip->i_d.di_version = XFS_DINODE_VERSION_2;
3373                        dip->di_core.di_version = XFS_DINODE_VERSION_2;
3374                        ip->i_d.di_onlink = 0;
3375                        dip->di_core.di_onlink = 0;
3376                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3377                        memset(&(dip->di_core.di_pad[0]), 0,
3378                              sizeof(dip->di_core.di_pad));
3379                        ASSERT(ip->i_d.di_projid == 0);
3380                }
3381        }
3382
3383        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
3384        if (XFS_IFORK_Q(ip))
3385                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3386        xfs_inobp_check(mp, bp);
3387
3388        /*
3389         * We've recorded everything logged in the inode, so we'd
3390         * like to clear the ilf_fields bits so we don't log and
3391         * flush things unnecessarily.  However, we can't stop
3392         * logging all this information until the data we've copied
3393         * into the disk buffer is written to disk.  If we did we might
3394         * overwrite the copy of the inode in the log with all the
3395         * data after re-logging only part of it, and in the face of
3396         * a crash we wouldn't have all the data we need to recover.
3397         *
3398         * What we do is move the bits to the ili_last_fields field.
3399         * When logging the inode, these bits are moved back to the
3400         * ilf_fields field.  In the xfs_iflush_done() routine we
3401         * clear ili_last_fields, since we know that the information
3402         * those bits represent is permanently on disk.  As long as
3403         * the flush completes before the inode is logged again, then
3404         * both ilf_fields and ili_last_fields will be cleared.
3405         *
3406         * We can play with the ilf_fields bits here, because the inode
3407         * lock must be held exclusively in order to set bits there
3408         * and the flush lock protects the ili_last_fields bits.
3409         * Set ili_logged so the flush done
3410         * routine can tell whether or not to look in the AIL.
3411         * Also, store the current LSN of the inode so that we can tell
3412         * whether the item has moved in the AIL from xfs_iflush_done().
3413         * In order to read the lsn we need the AIL lock, because
3414         * it is a 64 bit value that cannot be read atomically.
3415         */
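        /*
         * Concretely (an illustrative case): if XFS_ILOG_DDATA is set
         * in ilf_fields when we get here, it is parked in
         * ili_last_fields below and cleared by xfs_iflush_done() once
         * the buffer I/O completes, at which point the data fork
         * change is known to be safely on disk.
         */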
3416        if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3417                iip->ili_last_fields = iip->ili_format.ilf_fields;
3418                iip->ili_format.ilf_fields = 0;
3419                iip->ili_logged = 1;
3420
3421                ASSERT(sizeof(xfs_lsn_t) == 8);        /* don't lock if it shrinks */
3422                spin_lock(&mp->m_ail_lock);
3423                iip->ili_flush_lsn = iip->ili_item.li_lsn;
3424                spin_unlock(&mp->m_ail_lock);
3425
3426                /*
3427                 * Attach the function xfs_iflush_done to the inode's
3428                 * buffer.  This will remove the inode from the AIL
3429                 * and unlock the inode's flush lock when the inode is
3430                 * completely written to disk.
3431                 */
3432                xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3433                                      xfs_iflush_done, (xfs_log_item_t *)iip);
3434
3435                ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3436                ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3437        } else {
3438                /*
3439                 * We're flushing an inode which is not in the AIL and has
3440                 * not been logged but has i_update_core set.  For this
3441                 * case we can use a B_DELWRI flush and immediately drop
3442                 * the inode flush lock because we can avoid the whole
3443                 * AIL state thing.  It's OK to drop the flush lock now,
3444                 * because we've already locked the buffer and to do anything
3445                 * you really need both.
3446                 */
3447                if (iip != NULL) {
3448                        ASSERT(iip->ili_logged == 0);
3449                        ASSERT(iip->ili_last_fields == 0);
3450                        ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3451                }
3452                xfs_ifunlock(ip);
3453        }
3454
3455        return 0;
3456
3457corrupt_out:
3458        return XFS_ERROR(EFSCORRUPTED);
3459}
3460
3461
3462/*
3463 * Flush all inactive inodes in mp.
3464 */
3465void
3466xfs_iflush_all(
3467        xfs_mount_t        *mp)
3468{
3469        xfs_inode_t        *ip;
3470
3471 again:
3472        XFS_MOUNT_ILOCK(mp);
3473        ip = mp->m_inodes;
3474        if (ip == NULL)
3475                goto out;
3476
3477        do {
3478                /* Make sure we skip markers inserted by sync */
3479                if (ip->i_mount == NULL) {
3480                        ip = ip->i_mnext;
3481                        continue;
3482                }
3483
3484                if (!VFS_I(ip)) {
3485                        XFS_MOUNT_IUNLOCK(mp);
3486                        xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487                        goto again;
3488                }
3489
3490                ASSERT(vn_count(VFS_I(ip)) == 0);
3491
3492                ip = ip->i_mnext;
3493        } while (ip != mp->m_inodes);
3494 out:
3495        XFS_MOUNT_IUNLOCK(mp);
3496}
3497
3498#ifdef XFS_ILOCK_TRACE
3499ktrace_t        *xfs_ilock_trace_buf;
3500
3501void
3502xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3503{
3504        ktrace_enter(ip->i_lock_trace,
3505                     (void *)ip,
3506                     (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
3507                     (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
3508                     (void *)ra,                /* caller of ilock */
3509                     (void *)(unsigned long)current_cpu(),
3510                     (void *)(unsigned long)current_pid(),
3511                     NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
3512}
3513#endif
3514
3515/*
3516 * Return a pointer to the extent record at file index idx.
3517 */
3518xfs_bmbt_rec_host_t *
3519xfs_iext_get_ext(
3520        xfs_ifork_t        *ifp,                /* inode fork pointer */
3521        xfs_extnum_t        idx)                /* index of target extent */
3522{
3523        ASSERT(idx >= 0);
3524        if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3525                return ifp->if_u1.if_ext_irec->er_extbuf;
3526        } else if (ifp->if_flags & XFS_IFEXTIREC) {
3527                xfs_ext_irec_t        *erp;                /* irec pointer */
3528                int                erp_idx = 0;        /* irec index */
3529                xfs_extnum_t        page_idx = idx;        /* ext index in target list */
3530
3531                erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3532                return &erp->er_extbuf[page_idx];
3533        } else if (ifp->if_bytes) {
3534                return &ifp->if_u1.if_extents[idx];
3535        } else {
3536                return NULL;
3537        }
3538}
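/*
 * Index translation example (assuming XFS_IEXT_BUFSZ of 4096 and a
 * 16-byte xfs_bmbt_rec_t, i.e. XFS_LINEAR_EXTS == 256): in indirect
 * (XFS_IFEXTIREC) mode a request for idx == 300 resolves via
 * xfs_iext_idx_to_irec() to the second extent list with
 * page_idx == 44, returning &erp->er_extbuf[44].
 */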
3539
3540/*
3541 * Insert new item(s) into the extent records for incore inode
3542 * fork 'ifp'.  'count' new items are inserted at index 'idx'.
3543 */
3544void
3545xfs_iext_insert(
3546        xfs_ifork_t        *ifp,                /* inode fork pointer */
3547        xfs_extnum_t        idx,                /* starting index of new items */
3548        xfs_extnum_t        count,                /* number of inserted items */
3549        xfs_bmbt_irec_t        *new)                /* items to insert */
3550{
3551        xfs_extnum_t        i;                /* extent record index */
3552
3553        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3554        xfs_iext_add(ifp, idx, count);
3555        for (i = idx; i < idx + count; i++, new++)
3556                xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
3557}
3558
3559/*
3560 * This is called when the amount of space required for incore file
3561 * extents needs to be increased. The ext_diff parameter stores the
3562 * number of new extents being added and the idx parameter contains
3563 * the extent index where the new extents will be added. If the new
3564 * extents are being appended, then we just need to (re)allocate and
3565 * initialize the space. Otherwise, if the new extents are being
3566 * inserted into the middle of the existing entries, a bit more work
3567 * is required to make room for the new extents to be inserted. The
3568 * caller is responsible for filling in the new extent entries upon
3569 * return.
3570 */
3571void
3572xfs_iext_add(
3573        xfs_ifork_t        *ifp,                /* inode fork pointer */
3574        xfs_extnum_t        idx,                /* index to begin adding exts */
3575        int                ext_diff)        /* number of extents to add */
3576{
3577        int                byte_diff;        /* new bytes being added */
3578        int                new_size;        /* size of extents after adding */
3579        xfs_extnum_t        nextents;        /* number of extents in file */
3580
3581        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3582        ASSERT((idx >= 0) && (idx <= nextents));
3583        byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
3584        new_size = ifp->if_bytes + byte_diff;
3585        /*
3586         * If the new number of extents (nextents + ext_diff)
3587         * fits inside the inode, then continue to use the inline
3588         * extent buffer.
3589         */
3590        if (nextents + ext_diff <= XFS_INLINE_EXTS) {
3591                if (idx < nextents) {
3592                        memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
3593                                &ifp->if_u2.if_inline_ext[idx],
3594                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3595                        memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3596                }
3597                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3598                ifp->if_real_bytes = 0;
3599                ifp->if_lastex = nextents + ext_diff;
3600        }
3601        /*
3602         * Otherwise use a linear (direct) extent list.
3603         * If the extents are currently inside the inode,
3604         * xfs_iext_realloc_direct will switch us from
3605         * inline to direct extent allocation mode.
3606         */
3607        else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3608                xfs_iext_realloc_direct(ifp, new_size);
3609                if (idx < nextents) {
3610                        memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3611                                &ifp->if_u1.if_extents[idx],
3612                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3613                        memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3614                }
3615        }
3616        /* Indirection array */
3617        else {
3618                xfs_ext_irec_t        *erp;
3619                int                erp_idx = 0;
3620                int                page_idx = idx;
3621
3622                ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
3623                if (ifp->if_flags & XFS_IFEXTIREC) {
3624                        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3625                } else {
3626                        xfs_iext_irec_init(ifp);
3627                        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3628                        erp = ifp->if_u1.if_ext_irec;
3629                }
3630                /* Extents fit in target extent page */
3631                if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3632                        if (page_idx < erp->er_extcount) {
3633                                memmove(&erp->er_extbuf[page_idx + ext_diff],
3634                                        &erp->er_extbuf[page_idx],
3635                                        (erp->er_extcount - page_idx) *
3636                                        sizeof(xfs_bmbt_rec_t));
3637                                memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3638                        }
3639                        erp->er_extcount += ext_diff;
3640                        xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3641                }
3642                /* Insert a new extent page */
3643                else if (erp) {
3644                        xfs_iext_add_indirect_multi(ifp,
3645                                erp_idx, page_idx, ext_diff);
3646                }
3647                /*
3648                 * If extent(s) are being appended to the last page in
3649                 * the indirection array and the new extent(s) don't fit
3650                 * in the page, then erp is NULL and erp_idx is set to
3651                 * the next index needed in the indirection array.
3652                 */
3653                else {
3654                        int        count = ext_diff;
3655
3656                        while (count) {
3657                                erp = xfs_iext_irec_new(ifp, erp_idx);
3658                                erp->er_extcount = count;
3659                                count -= MIN(count, (int)XFS_LINEAR_EXTS);
3660                                if (count) {
3661                                        erp_idx++;
3662                                }
3663                        }
3664                }
3665        }
3666        ifp->if_bytes = new_size;
3667}
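/*
 * The three storage regimes chosen above, in order of growth: inline
 * (up to XFS_INLINE_EXTS records held directly in the fork), direct
 * (a single reallocated buffer holding up to XFS_LINEAR_EXTS records),
 * and the indirection array of extent pages beyond that.
 */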
3668
3669/*
3670 * This is called when incore extents are being added to the indirection
3671 * array and the new extents do not fit in the target extent list. The
3672 * erp_idx parameter contains the irec index for the target extent list
3673 * in the indirection array, and the idx parameter contains the extent
3674 * index within the list. The number of extents being added is stored
3675 * in the count parameter.
3676 *
3677 *    |-------|   |-------|
3678 *    |       |   |       |    idx - number of extents before idx
3679 *    |  idx  |   | count |
3680 *    |       |   |       |    count - number of extents being inserted at idx
3681 *    |-------|   |-------|
3682 *    | count |   | nex2  |    nex2 - number of extents after idx + count
3683 *    |-------|   |-------|
3684 */
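/*
 * Example (hypothetical numbers): inserting count == 3 extents at
 * idx == 2 into a target list that currently holds 6 gives nex2 == 4;
 * the four saved records are replayed after the new ones and end up
 * at indices 5..8.
 */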
3685void
3686xfs_iext_add_indirect_multi(
3687        xfs_ifork_t        *ifp,                        /* inode fork pointer */
3688        int                erp_idx,                /* target extent irec index */
3689        xfs_extnum_t        idx,                        /* index within target list */
3690        int                count)                        /* new extents being added */
3691{
3692        int                byte_diff;                /* new bytes being added */
3693        xfs_ext_irec_t        *erp;                        /* pointer to irec entry */
3694        xfs_extnum_t        ext_diff;                /* number of extents to add */
3695        xfs_extnum_t        ext_cnt;                /* new extents still needed */
3696        xfs_extnum_t        nex2;                        /* extents after idx + count */
3697        xfs_bmbt_rec_t        *nex2_ep = NULL;        /* temp list for nex2 extents */
3698        int                nlists;                        /* number of irec's (lists) */
3699
3700        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3701        erp = &ifp->if_u1.if_ext_irec[erp_idx];
3702        nex2 = erp->er_extcount - idx;
3703        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3704
3705        /*
3706         * Save second part of target extent list
3707         * (all extents past idx). */
3708        if (nex2) {
3709                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3710                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3711                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3712                erp->er_extcount -= nex2;
3713                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3714                memset(&erp->er_extbuf[idx], 0, byte_diff);
3715        }
3716
3717        /*
3718         * Add the new extents to the end of the target
3719         * list, then allocate new irec record(s) and
3720         * extent buffer(s) as needed to store the rest
3721         * of the new extents.
3722         */
3723        ext_cnt = count;
3724        ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3725        if (ext_diff) {
3726                erp->er_extcount += ext_diff;
3727                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3728                ext_cnt -= ext_diff;
3729        }
3730        while (ext_cnt) {
3731                erp_idx++;
3732                erp = xfs_iext_irec_new(ifp, erp_idx);
3733                ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3734                erp->er_extcount = ext_diff;
3735                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3736                ext_cnt -= ext_diff;
3737        }
3738
3739        /* Add nex2 extents back to indirection array */
3740        if (nex2) {
3741                xfs_extnum_t        ext_avail;
3742                int                i;
3743
3744                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3745                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3746                i = 0;
3747                /*
3748                 * If nex2 extents fit in the current page, append
3749                 * nex2_ep after the new extents.
3750                 */
3751                if (nex2 <= ext_avail) {
3752                        i = erp->er_extcount;
3753                }
3754                /*
3755                 * Otherwise, check if space is available in the
3756                 * next page.
3757                 */
3758                else if ((erp_idx < nlists - 1) &&
3759                         (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3760                          ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3761                        erp_idx++;
3762                        erp++;
3763                        /* Create a hole for nex2 extents */
3764                        memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3765                                erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3766                }
3767                /*
3768                 * Final choice, create a new extent page for
3769                 * nex2 extents.
3770                 */
3771                else {
3772                        erp_idx++;
3773                        erp = xfs_iext_irec_new(ifp, erp_idx);
3774                }
3775                memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3776                kmem_free(nex2_ep);
3777                erp->er_extcount += nex2;
3778                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3779        }
3780}
3781
3782/*
3783 * This is called when the amount of space required for incore file
3784 * extents needs to be decreased. The ext_diff parameter stores the
3785 * number of extents to be removed and the idx parameter contains
3786 * the extent index where the extents will be removed from.
3787 *
3788 * If the amount of space needed has decreased below the linear
3789 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3790 * extent array.  Otherwise, use kmem_realloc() to adjust the
3791 * size to what is needed.
3792 */
3793void
3794xfs_iext_remove(
3795        xfs_ifork_t        *ifp,                /* inode fork pointer */
3796        xfs_extnum_t        idx,                /* index to begin removing exts */
3797        int                ext_diff)        /* number of extents to remove */
3798{
3799        xfs_extnum_t        nextents;        /* number of extents in file */
3800        int                new_size;        /* size of extents after removal */
3801
3802        ASSERT(ext_diff > 0);
3803        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3804        new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3805
3806        if (new_size == 0) {
3807                xfs_iext_destroy(ifp);
3808        } else if (ifp->if_flags & XFS_IFEXTIREC) {
3809                xfs_iext_remove_indirect(ifp, idx, ext_diff);
3810        } else if (ifp->if_real_bytes) {
3811                xfs_iext_remove_direct(ifp, idx, ext_diff);
3812        } else {
3813                xfs_iext_remove_inline(ifp, idx, ext_diff);
3814        }
3815        ifp->if_bytes = new_size;
3816}
3817
3818/*
3819 * This removes ext_diff extents from the inline buffer, beginning
3820 * at extent index idx.
3821 */
3822void
3823xfs_iext_remove_inline(
3824        xfs_ifork_t        *ifp,                /* inode fork pointer */
3825        xfs_extnum_t        idx,                /* index to begin removing exts */
3826        int                ext_diff)        /* number of extents to remove */
3827{
3828        int                nextents;        /* number of extents in file */
3829
3830        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3831        ASSERT(idx < XFS_INLINE_EXTS);
3832        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3833        ASSERT(((nextents - ext_diff) > 0) &&
3834                (nextents - ext_diff) < XFS_INLINE_EXTS);
3835
3836        if (idx + ext_diff < nextents) {
3837                memmove(&ifp->if_u2.if_inline_ext[idx],
3838                        &ifp->if_u2.if_inline_ext[idx + ext_diff],
3839                        (nextents - (idx + ext_diff)) *
3840                         sizeof(xfs_bmbt_rec_t));
3841                memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3842                        0, ext_diff * sizeof(xfs_bmbt_rec_t));
3843        } else {
3844                memset(&ifp->if_u2.if_inline_ext[idx], 0,
3845                        ext_diff * sizeof(xfs_bmbt_rec_t));
3846        }
3847}
3848
3849/*
3850 * This removes ext_diff extents from a linear (direct) extent list,
3851 * beginning at extent index idx. If the extents are being removed
3852 * from the end of the list (ie. truncate) then we just need to re-
3853 * allocate the list to remove the extra space. Otherwise, if the
3854 * extents are being removed from the middle of the existing extent
3855 * entries, then we first need to move the extent records beginning
3856 * at idx + ext_diff up in the list to overwrite the records being
3857 * removed, then remove the extra space via kmem_realloc.
3858 */
3859void
3860xfs_iext_remove_direct(
3861        xfs_ifork_t        *ifp,                /* inode fork pointer */
3862        xfs_extnum_t        idx,                /* index to begin removing exts */
3863        int                ext_diff)        /* number of extents to remove */
3864{
3865        xfs_extnum_t        nextents;        /* number of extents in file */
3866        int                new_size;        /* size of extents after removal */
3867
3868        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3869        new_size = ifp->if_bytes -
3870                (ext_diff * sizeof(xfs_bmbt_rec_t));
3871        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3872
3873        if (new_size == 0) {
3874                xfs_iext_destroy(ifp);
3875                return;
3876        }
3877        /* Move extents up in the list (if needed) */
3878        if (idx + ext_diff < nextents) {
3879                memmove(&ifp->if_u1.if_extents[idx],
3880                        &ifp->if_u1.if_extents[idx + ext_diff],
3881                        (nextents - (idx + ext_diff)) *
3882                         sizeof(xfs_bmbt_rec_t));
3883        }
3884        memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3885                0, ext_diff * sizeof(xfs_bmbt_rec_t));
3886        /*
3887         * Reallocate the direct extent list. If the extents
3888         * will fit inside the inode then xfs_iext_realloc_direct
3889         * will switch from direct to inline extent allocation
3890         * mode for us.
3891         */
3892        xfs_iext_realloc_direct(ifp, new_size);
3893        ifp->if_bytes = new_size;
3894}
3895
3896/*
3897 * This is called when incore extents are being removed from the
3898 * indirection array and the extents being removed span multiple extent
3899 * buffers. The idx parameter contains the file extent index where we
3900 * want to begin removing extents, and the count parameter contains
3901 * how many extents need to be removed.
3902 *
3903 *    |-------|   |-------|
3904 *    | nex1  |   |       |    nex1 - number of extents before idx
3905 *    |-------|   | count |
3906 *    |       |   |       |    count - number of extents being removed at idx
3907 *    | count |   |-------|
3908 *    |       |   | nex2  |    nex2 - number of extents after idx + count
3909 *    |-------|   |-------|
3910 */
3911void
3912xfs_iext_remove_indirect(
3913        xfs_ifork_t        *ifp,                /* inode fork pointer */
3914        xfs_extnum_t        idx,                /* index to begin removing extents */
3915        int                count)                /* number of extents to remove */
3916{
3917        xfs_ext_irec_t        *erp;                /* indirection array pointer */
3918        int                erp_idx = 0;        /* indirection array index */
3919        xfs_extnum_t        ext_cnt;        /* extents left to remove */
3920        xfs_extnum_t        ext_diff;        /* extents to remove in current list */
3921        xfs_extnum_t        nex1;                /* number of extents before idx */
3922        xfs_extnum_t        nex2;                /* extents after idx + count */
3923        int                nlists;                /* entries in indirection array */
3924        int                page_idx = idx;        /* index in target extent list */
3925
3926        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3927        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3928        ASSERT(erp != NULL);
3929        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3930        nex1 = page_idx;
3931        ext_cnt = count;
3932        while (ext_cnt) {
3933                nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3934                ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3935                /*
3936                 * Check for deletion of entire list;
3937                 * xfs_iext_irec_remove() updates extent offsets.
3938                 */
3939                if (ext_diff == erp->er_extcount) {
3940                        xfs_iext_irec_remove(ifp, erp_idx);
3941                        ext_cnt -= ext_diff;
3942                        nex1 = 0;
3943                        if (ext_cnt) {
3944                                ASSERT(erp_idx < ifp->if_real_bytes /
3945                                        XFS_IEXT_BUFSZ);
3946                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
3947                                nex1 = 0;
3948                                continue;
3949                        } else {
3950                                break;
3951                        }
3952                }
3953                /* Move extents up (if needed) */
3954                if (nex2) {
3955                        memmove(&erp->er_extbuf[nex1],
3956                                &erp->er_extbuf[nex1 + ext_diff],
3957                                nex2 * sizeof(xfs_bmbt_rec_t));
3958                }
3959                /* Zero out rest of page */
3960                memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3961                        ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3962                /* Update remaining counters */
3963                erp->er_extcount -= ext_diff;
3964                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3965                ext_cnt -= ext_diff;
3966                nex1 = 0;
3967                erp_idx++;
3968                erp++;
3969        }
3970        ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3971        xfs_iext_irec_compact(ifp);
3972}
3973
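The loop above walks one extent page at a time, peeling off as many records as the current page can give up (ext_diff), keeping nex1 records in front of the hole and nex2 behind it, exactly as in the diagram. The same bookkeeping can be exercised in isolation. The sketch below uses a toy fixed-capacity page with illustrative types only; unlike xfs_iext_irec_remove(), it leaves fully emptied pages in place rather than unlinking them:

#include <assert.h>
#include <string.h>

#define PAGE_CAP        4

struct extpage {
        int     n;              /* records in use (er_extcount stand-in) */
        int     rec[PAGE_CAP];
};

/* Remove 'count' records starting at global record index 'idx'. */
static void page_remove(struct extpage *pg, int idx, int count)
{
        int p = 0;

        while (idx >= pg[p].n) {        /* find the page containing idx */
                idx -= pg[p].n;
                p++;
        }
        while (count) {
                int nex1 = idx;                 /* kept in front of the hole */
                int diff = count < pg[p].n - nex1 ? count : pg[p].n - nex1;
                int nex2 = pg[p].n - (nex1 + diff);     /* kept behind it */

                if (nex2)
                        memmove(&pg[p].rec[nex1], &pg[p].rec[nex1 + diff],
                                nex2 * sizeof(int));
                memset(&pg[p].rec[nex1 + nex2], 0,
                       (PAGE_CAP - (nex1 + nex2)) * sizeof(int));
                pg[p].n -= diff;
                count -= diff;
                idx = 0;        /* later pages are trimmed from the front */
                p++;
        }
}

int main(void)
{
        struct extpage pg[2] = {
                { 4, { 1, 2, 3, 4 } },
                { 3, { 5, 6, 7 } },
        };

        page_remove(pg, 2, 3);          /* remove records 3, 4 and 5 */
        assert(pg[0].n == 2 && pg[0].rec[0] == 1 && pg[0].rec[1] == 2);
        assert(pg[1].n == 2 && pg[1].rec[0] == 6 && pg[1].rec[1] == 7);
        return 0;
}
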
3974/*
3975 * Create, destroy, or resize a linear (direct) block of extents.
3976 */
3977void
3978xfs_iext_realloc_direct(
3979        xfs_ifork_t        *ifp,                /* inode fork pointer */
3980        int                new_size)        /* new size of extent list, in bytes */
3981{
3982        int                rnew_size;        /* real new size of extents */
3983
3984        rnew_size = new_size;
3985
3986        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3987                ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3988                 (new_size != ifp->if_real_bytes)));
3989
3990        /* Free extent records */
3991        if (new_size == 0) {
3992                xfs_iext_destroy(ifp);
3993        }
3994        /* Resize direct extent list and zero any new bytes */
3995        else if (ifp->if_real_bytes) {
3996                /* Check if extents will fit inside the inode */
3997                if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3998                        xfs_iext_direct_to_inline(ifp, new_size /
3999                                (uint)sizeof(xfs_bmbt_rec_t));
4000                        ifp->if_bytes = new_size;
4001                        return;
4002                }
4003                if (!is_power_of_2(new_size)) {
4004                        rnew_size = roundup_pow_of_two(new_size);
4005                }
4006                if (rnew_size != ifp->if_real_bytes) {
4007                        ifp->if_u1.if_extents =
4008                                kmem_realloc(ifp->if_u1.if_extents,
4009                                                rnew_size,
4010                                                ifp->if_real_bytes, KM_NOFS);
4011                }
4012                if (rnew_size > ifp->if_real_bytes) {
4013                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
4014                                (uint)sizeof(xfs_bmbt_rec_t)], 0,
4015                                rnew_size - ifp->if_real_bytes);
4016                }
4017        }
4018        /*
4019         * Switch from the inline extent buffer to a direct
4020         * extent list. Callers pass a new_size that already
4021         * includes the bytes used by the inline extents, so
4022         * use it as is.
4023         */
4024        else {
4025                if (!is_power_of_2(new_size)) {
4026                        rnew_size = roundup_pow_of_two(new_size);
4027                }
4028                xfs_iext_inline_to_direct(ifp, rnew_size);
4029        }
4030        ifp->if_real_bytes = rnew_size;
4031        ifp->if_bytes = new_size;
4032}
4033
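Rounding the direct list up to the next power of two means a run of single-extent additions costs O(log n) reallocations rather than one per extent, at the price of up to 2x slack. A userspace stand-in for this sizing policy (a simple loop, not the kernel's roundup_pow_of_two() bit-twiddling):

#include <assert.h>

/* Illustrative stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int x)
{
        unsigned int p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        assert(roundup_pow2(48) == 64);
        assert(roundup_pow2(64) == 64);         /* powers of two are kept */
        assert(roundup_pow2(65) == 128);
        return 0;
}
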
4034/*
4035 * Switch from linear (direct) extent records to inline buffer.
4036 */
4037void
4038xfs_iext_direct_to_inline(
4039        xfs_ifork_t        *ifp,                /* inode fork pointer */
4040        xfs_extnum_t        nextents)        /* number of extents in file */
4041{
4042        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
4043        ASSERT(nextents <= XFS_INLINE_EXTS);
4044        /*
4045         * The inline buffer was zeroed when we switched
4046         * from inline to direct extent allocation mode,
4047         * so we don't need to clear it here.
4048         */
4049        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
4050                nextents * sizeof(xfs_bmbt_rec_t));
4051        kmem_free(ifp->if_u1.if_extents);
4052        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
4053        ifp->if_real_bytes = 0;
4054}
4055
4056/*
4057 * Switch from inline buffer to linear (direct) extent records.
4058 * new_size should already be rounded up to the next power of 2
4059 * by the caller (when appropriate), so use new_size as it is.
4060 * However, since new_size may be rounded up, we can't update
4061 * if_bytes here. It is the caller's responsibility to update
4062 * if_bytes upon return.
4063 */
4064void
4065xfs_iext_inline_to_direct(
4066        xfs_ifork_t        *ifp,                /* inode fork pointer */
4067        int                new_size)        /* new size of extent list, in bytes */
4068{
4069        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
4070        memset(ifp->if_u1.if_extents, 0, new_size);
4071        if (ifp->if_bytes) {
4072                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
4073                        ifp->if_bytes);
4074                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
4075                        sizeof(xfs_bmbt_rec_t));
4076        }
4077        ifp->if_real_bytes = new_size;
4078}
4079
4080/*
4081 * Resize an extent indirection array to new_size bytes.
4082 */
4083void
4084xfs_iext_realloc_indirect(
4085        xfs_ifork_t        *ifp,                /* inode fork pointer */
4086        int                new_size)        /* new indirection array size */
4087{
4088        int                nlists;                /* number of irec's (ex lists) */
4089        int                size;                /* current indirection array size */
4090
4091        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4092        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4093        size = nlists * sizeof(xfs_ext_irec_t);
4094        ASSERT(ifp->if_real_bytes);
4095        ASSERT((new_size >= 0) && (new_size != size));
4096        if (new_size == 0) {
4097                xfs_iext_destroy(ifp);
4098        } else {
4099                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
4100                        kmem_realloc(ifp->if_u1.if_ext_irec,
4101                                new_size, size, KM_NOFS);
4102        }
4103}
4104
4105/*
4106 * Switch from indirection array to linear (direct) extent allocations.
4107 */
4108void
4109xfs_iext_indirect_to_direct(
4110        xfs_ifork_t        *ifp)                /* inode fork pointer */
4111{
4112        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
4113        xfs_extnum_t        nextents;        /* number of extents in file */
4114        int                size;                /* size of file extents */
4115
4116        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4117        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4118        ASSERT(nextents <= XFS_LINEAR_EXTS);
4119        size = nextents * sizeof(xfs_bmbt_rec_t);
4120
4121        xfs_iext_irec_compact_pages(ifp);
4122        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
4123
4124        ep = ifp->if_u1.if_ext_irec->er_extbuf;
4125        kmem_free(ifp->if_u1.if_ext_irec);
4126        ifp->if_flags &= ~XFS_IFEXTIREC;
4127        ifp->if_u1.if_extents = ep;
4128        ifp->if_bytes = size;
4129        if (nextents < XFS_LINEAR_EXTS) {
4130                xfs_iext_realloc_direct(ifp, size);
4131        }
4132}
4133
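Note the design choice above: after compacting all records into the first page, the conversion steals that page's buffer as the new direct list and frees only the indirection array, so the extent records themselves are never copied. A sketch of that pointer hand-off with illustrative types (allocation checks omitted):

#include <stdlib.h>

struct pagehdr { int *buf; int n; };

struct minifork {
        union {
                struct pagehdr        *pages;        /* indirect: page headers */
                int                *direct;        /* direct: flat record buffer */
        } u;
        int npages;
};

/* Steal the (already compacted) first page's buffer as the direct list. */
static void indirect_to_direct(struct minifork *f)
{
        int *buf = f->u.pages[0].buf;

        free(f->u.pages);        /* only the page headers are freed... */
        f->u.direct = buf;        /* ...the records themselves move for free */
        f->npages = 0;
}

int main(void)
{
        struct minifork f;

        f.u.pages = malloc(sizeof(*f.u.pages));
        f.u.pages[0].buf = calloc(4, sizeof(int));
        f.u.pages[0].n = 0;
        f.npages = 1;
        indirect_to_direct(&f);
        free(f.u.direct);
        return 0;
}
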
4134/*
4135 * Free incore file extents.
4136 */
4137void
4138xfs_iext_destroy(
4139        xfs_ifork_t        *ifp)                /* inode fork pointer */
4140{
4141        if (ifp->if_flags & XFS_IFEXTIREC) {
4142                int        erp_idx;
4143                int        nlists;
4144
4145                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4146                for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
4147                        xfs_iext_irec_remove(ifp, erp_idx);
4148                }
4149                ifp->if_flags &= ~XFS_IFEXTIREC;
4150        } else if (ifp->if_real_bytes) {
4151                kmem_free(ifp->if_u1.if_extents);
4152        } else if (ifp->if_bytes) {
4153                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
4154                        sizeof(xfs_bmbt_rec_t));
4155        }
4156        ifp->if_u1.if_extents = NULL;
4157        ifp->if_real_bytes = 0;
4158        ifp->if_bytes = 0;
4159}
4160
4161/*
4162 * Return a pointer to the extent record for file system block bno.
4163 */
4164xfs_bmbt_rec_host_t *                        /* pointer to found extent record */
4165xfs_iext_bno_to_ext(
4166        xfs_ifork_t        *ifp,                /* inode fork pointer */
4167        xfs_fileoff_t        bno,                /* block number to search for */
4168        xfs_extnum_t        *idxp)                /* index of target extent */
4169{
4170        xfs_bmbt_rec_host_t *base;        /* pointer to first extent */
4171        xfs_filblks_t        blockcount = 0;        /* number of blocks in extent */
4172        xfs_bmbt_rec_host_t *ep = NULL;        /* pointer to target extent */
4173        xfs_ext_irec_t        *erp = NULL;        /* indirection array pointer */
4174        int                high;                /* upper boundary in search */
4175        xfs_extnum_t        idx = 0;        /* index of target extent */
4176        int                low;                /* lower boundary in search */
4177        xfs_extnum_t        nextents;        /* number of file extents */
4178        xfs_fileoff_t        startoff = 0;        /* start offset of extent */
4179
4180        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4181        if (nextents == 0) {
4182                *idxp = 0;
4183                return NULL;
4184        }
4185        low = 0;
4186        if (ifp->if_flags & XFS_IFEXTIREC) {
4187                /* Find target extent list */
4188                int        erp_idx = 0;
4189                erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
4190                base = erp->er_extbuf;
4191                high = erp->er_extcount - 1;
4192        } else {
4193                base = ifp->if_u1.if_extents;
4194                high = nextents - 1;
4195        }
4196        /* Binary search extent records */
4197        while (low <= high) {
4198                idx = (low + high) >> 1;
4199                ep = base + idx;
4200                startoff = xfs_bmbt_get_startoff(ep);
4201                blockcount = xfs_bmbt_get_blockcount(ep);
4202                if (bno < startoff) {
4203                        high = idx - 1;
4204                } else if (bno >= startoff + blockcount) {
4205                        low = idx + 1;
4206                } else {
4207                        /* Convert back to file-based extent index */
4208                        if (ifp->if_flags & XFS_IFEXTIREC) {
4209                                idx += erp->er_extoff;
4210                        }
4211                        *idxp = idx;
4212                        return ep;
4213                }
4214        }
4215        /* Convert back to file-based extent index */
4216        if (ifp->if_flags & XFS_IFEXTIREC) {
4217                idx += erp->er_extoff;
4218        }
4219        if (bno >= startoff + blockcount) {
4220                if (++idx == nextents) {
4221                        ep = NULL;
4222                } else {
4223                        ep = xfs_iext_get_ext(ifp, idx);
4224                }
4225        }
4226        *idxp = idx;
4227        return ep;
4228}
4229
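The lookup above is a textbook binary search over half-open ranges [startoff, startoff + blockcount); the code after the loop handles bno landing in a hole, where the kernel hands back the next extent instead. A self-contained sketch of the core search (illustrative types; this version simply returns -1 for holes rather than the following extent):

#include <assert.h>

struct ext { unsigned long start, len; };

/* Return the index of the extent containing bno, or -1 for a hole. */
static int ext_lookup(const struct ext *e, int n, unsigned long bno)
{
        int low = 0, high = n - 1;

        while (low <= high) {
                int mid = (low + high) >> 1;

                if (bno < e[mid].start)
                        high = mid - 1;
                else if (bno >= e[mid].start + e[mid].len)
                        low = mid + 1;
                else
                        return mid;
        }
        return -1;
}

int main(void)
{
        struct ext e[] = { { 0, 4 }, { 10, 2 }, { 20, 8 } };

        assert(ext_lookup(e, 3, 3) == 0);
        assert(ext_lookup(e, 3, 11) == 1);
        assert(ext_lookup(e, 3, 5) == -1);        /* hole between extents */
        return 0;
}
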
4230/*
4231 * Return a pointer to the indirection array entry containing the
4232 * extent record for filesystem block bno. Store the index of the
4233 * target irec in *erp_idxp.
4234 */
4235xfs_ext_irec_t *                        /* pointer to found extent record */
4236xfs_iext_bno_to_irec(
4237        xfs_ifork_t        *ifp,                /* inode fork pointer */
4238        xfs_fileoff_t        bno,                /* block number to search for */
4239        int                *erp_idxp)        /* irec index of target ext list */
4240{
4241        xfs_ext_irec_t        *erp = NULL;        /* indirection array pointer */
4242        xfs_ext_irec_t        *erp_next;        /* next indirection array entry */
4243        int                erp_idx;        /* indirection array index */
4244        int                nlists;                /* number of extent irec's (lists) */
4245        int                high;                /* binary search upper limit */
4246        int                low;                /* binary search lower limit */
4247
4248        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4249        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4250        erp_idx = 0;
4251        low = 0;
4252        high = nlists - 1;
4253        while (low <= high) {
4254                erp_idx = (low + high) >> 1;
4255                erp = &ifp->if_u1.if_ext_irec[erp_idx];
4256                erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
4257                if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
4258                        high = erp_idx - 1;
4259                } else if (erp_next && bno >=
4260                           xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
4261                        low = erp_idx + 1;
4262                } else {
4263                        break;
4264                }
4265        }
4266        *erp_idxp = erp_idx;
4267        return erp;
4268}
4269
4270/*
4271 * Return a pointer to the indirection array entry containing the
4272 * extent record at file extent index *idxp. Store the index of the
4273 * target irec in *erp_idxp and store the page index of the target
4274 * extent record in *idxp.
4275 */
4276xfs_ext_irec_t *
4277xfs_iext_idx_to_irec(
4278        xfs_ifork_t        *ifp,                /* inode fork pointer */
4279        xfs_extnum_t        *idxp,                /* extent index (file -> page) */
4280        int                *erp_idxp,        /* pointer to target irec */
4281        int                realloc)        /* new bytes were just added */
4282{
4283        xfs_ext_irec_t        *prev;                /* pointer to previous irec */
4284        xfs_ext_irec_t        *erp = NULL;        /* pointer to current irec */
4285        int                erp_idx;        /* indirection array index */
4286        int                nlists;                /* number of irec's (ex lists) */
4287        int                high;                /* binary search upper limit */
4288        int                low;                /* binary search lower limit */
4289        xfs_extnum_t        page_idx = *idxp; /* extent index in target list */
4290
4291        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4292        ASSERT(page_idx >= 0 && page_idx <=
4293                ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
4294        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4295        erp_idx = 0;
4296        low = 0;
4297        high = nlists - 1;
4298
4299        /* Binary search extent irec's */
4300        while (low <= high) {
4301                erp_idx = (low + high) >> 1;
4302                erp = &ifp->if_u1.if_ext_irec[erp_idx];
4303                prev = erp_idx > 0 ? erp - 1 : NULL;
4304                if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
4305                     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
4306                        high = erp_idx - 1;
4307                } else if (page_idx > erp->er_extoff + erp->er_extcount ||
4308                           (page_idx == erp->er_extoff + erp->er_extcount &&
4309                            !realloc)) {
4310                        low = erp_idx + 1;
4311                } else if (page_idx == erp->er_extoff + erp->er_extcount &&
4312                           erp->er_extcount == XFS_LINEAR_EXTS) {
4313                        ASSERT(realloc);
4314                        page_idx = 0;
4315                        erp_idx++;
4316                        erp = erp_idx < nlists ? erp + 1 : NULL;
4317                        break;
4318                } else {
4319                        page_idx -= erp->er_extoff;
4320                        break;
4321                }
4322        }
4323        *idxp = page_idx;
4324        *erp_idxp = erp_idx;
4325        return erp;
4326}
4327
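Because every page carries its cumulative starting offset (er_extoff), translating a file-wide extent index into a (page, in-page offset) pair is another binary search; the realloc tie-breaking at page boundaries above is the one subtlety the following sketch omits (names are illustrative):

#include <assert.h>

struct pg { int off, n; };        /* er_extoff / er_extcount stand-ins */

/* Map a file-wide index to a page; rewrite *idx as the in-page offset. */
static int idx_to_page(const struct pg *p, int npages, int *idx)
{
        int low = 0, high = npages - 1, mid = 0;

        while (low <= high) {
                mid = (low + high) >> 1;
                if (*idx < p[mid].off)
                        high = mid - 1;
                else if (*idx >= p[mid].off + p[mid].n)
                        low = mid + 1;
                else
                        break;
        }
        *idx -= p[mid].off;
        return mid;
}

int main(void)
{
        struct pg p[] = { { 0, 4 }, { 4, 4 }, { 8, 2 } };
        int idx = 5;

        assert(idx_to_page(p, 3, &idx) == 1);
        assert(idx == 1);        /* record 5 is the second entry of page 1 */
        return 0;
}
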
4328/*
4329 * Allocate and initialize an indirection array once the space needed
4330 * for incore extents increases above XFS_IEXT_BUFSZ.
4331 */
4332void
4333xfs_iext_irec_init(
4334        xfs_ifork_t        *ifp)                /* inode fork pointer */
4335{
4336        xfs_ext_irec_t        *erp;                /* indirection array pointer */
4337        xfs_extnum_t        nextents;        /* number of extents in file */
4338
4339        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
4340        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4341        ASSERT(nextents <= XFS_LINEAR_EXTS);
4342
4343        erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
4344
4345        if (nextents == 0) {
4346                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4347        } else if (!ifp->if_real_bytes) {
4348                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
4349        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
4350                xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
4351        }
4352        erp->er_extbuf = ifp->if_u1.if_extents;
4353        erp->er_extcount = nextents;
4354        erp->er_extoff = 0;
4355
4356        ifp->if_flags |= XFS_IFEXTIREC;
4357        ifp->if_real_bytes = XFS_IEXT_BUFSZ;
4358        ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
4359        ifp->if_u1.if_ext_irec = erp;
4360
4361        return;
4362}
4363
4364/*
4365 * Allocate and initialize a new entry in the indirection array.
4366 */
4367xfs_ext_irec_t *
4368xfs_iext_irec_new(
4369        xfs_ifork_t        *ifp,                /* inode fork pointer */
4370        int                erp_idx)        /* index for new irec */
4371{
4372        xfs_ext_irec_t        *erp;                /* indirection array pointer */
4373        int                i;                /* loop counter */
4374        int                nlists;                /* number of irec's (ex lists) */
4375
4376        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4377        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4378
4379        /* Resize indirection array */
4380        xfs_iext_realloc_indirect(ifp, ++nlists *
4381                                  sizeof(xfs_ext_irec_t));
4382        /*
4383         * Move records down in the array so the
4384         * new page can use erp_idx.
4385         */
4386        erp = ifp->if_u1.if_ext_irec;
4387        for (i = nlists - 1; i > erp_idx; i--) {
4388                memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
4389        }
4390        ASSERT(i == erp_idx);
4391
4392        /* Initialize new extent record */
4393        erp = ifp->if_u1.if_ext_irec;
4394        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4395        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4396        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
4397        erp[erp_idx].er_extcount = 0;
4398        erp[erp_idx].er_extoff = erp_idx > 0 ?
4399                erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
4400        return &erp[erp_idx];
4401}
4402
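Opening a slot in a dense array is the mirror image of the removal routines earlier in this file. The loop above issues one memmove() per entry; a single bulk memmove() over the whole tail is equivalent, since memmove() tolerates overlap. An illustrative sketch (the caller must guarantee spare capacity):

#include <assert.h>
#include <string.h>

/* Shift arr[idx..used) up one slot and clear the opened slot. */
static void open_slot(int *arr, int used, int idx)
{
        memmove(&arr[idx + 1], &arr[idx], (used - idx) * sizeof(int));
        arr[idx] = 0;
}

int main(void)
{
        int arr[4] = { 1, 2, 3 };

        open_slot(arr, 3, 1);
        assert(arr[0] == 1 && arr[1] == 0);
        assert(arr[2] == 2 && arr[3] == 3);
        return 0;
}
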
4403/*
4404 * Remove a record from the indirection array.
4405 */
4406void
4407xfs_iext_irec_remove(
4408        xfs_ifork_t        *ifp,                /* inode fork pointer */
4409        int                erp_idx)        /* irec index to remove */
4410{
4411        xfs_ext_irec_t        *erp;                /* indirection array pointer */
4412        int                i;                /* loop counter */
4413        int                nlists;                /* number of irec's (ex lists) */
4414
4415        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4416        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4417        erp = &ifp->if_u1.if_ext_irec[erp_idx];
4418        if (erp->er_extbuf) {
4419                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
4420                        -erp->er_extcount);
4421                kmem_free(erp->er_extbuf);
4422        }
4423        /* Compact extent records */
4424        erp = ifp->if_u1.if_ext_irec;
4425        for (i = erp_idx; i < nlists - 1; i++) {
4426                memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
4427        }
4428        /*
4429         * Manually free the last extent record from the indirection
4430         * array.  A call to xfs_iext_realloc_indirect() with a size
4431         * of zero would result in a call to xfs_iext_destroy() which
4432         * would in turn call this function again, creating a nasty
4433         * infinite loop.
4434         */
4435        if (--nlists) {
4436                xfs_iext_realloc_indirect(ifp,
4437                        nlists * sizeof(xfs_ext_irec_t));
4438        } else {
4439                kmem_free(ifp->if_u1.if_ext_irec);
4440        }
4441        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4442}
4443
4444/*
4445 * This is called to clean up large amounts of unused memory allocated
4446 * by the indirection array.  Before compacting anything though, verify
4447 * that the indirection array is still needed and switch back to the
4448 * linear extent list (or even the inline buffer) if possible.  The
4449 * compaction policy is as follows:
4450 *
4451 *    Full Compaction: Extents fit into a single page (or inline buffer)
4452 * Partial Compaction: Extents occupy less than 50% of allocated space
4453 *      No Compaction: Extents occupy at least 50% of allocated space
4454 */
4455void
4456xfs_iext_irec_compact(
4457        xfs_ifork_t        *ifp)                /* inode fork pointer */
4458{
4459        xfs_extnum_t        nextents;        /* number of extents in file */
4460        int                nlists;                /* number of irec's (ex lists) */
4461
4462        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4463        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4464        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4465
4466        if (nextents == 0) {
4467                xfs_iext_destroy(ifp);
4468        } else if (nextents <= XFS_INLINE_EXTS) {
4469                xfs_iext_indirect_to_direct(ifp);
4470                xfs_iext_direct_to_inline(ifp, nextents);
4471        } else if (nextents <= XFS_LINEAR_EXTS) {
4472                xfs_iext_indirect_to_direct(ifp);
4473        } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4474                xfs_iext_irec_compact_pages(ifp);
4475        }
4476}
4477
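The three thresholds in the comment map directly onto the if/else ladder above; in particular, the '>> 1' implements the 50%-occupancy test for partial compaction. A standalone rendering of the policy, with stand-in constants for XFS_LINEAR_EXTS and XFS_INLINE_EXTS (names here are hypothetical):

#include <assert.h>
#include <string.h>

#define PAGE_CAP        256        /* stand-in for XFS_LINEAR_EXTS */
#define INLINE_CAP        2        /* stand-in for XFS_INLINE_EXTS */

static const char *compact_policy(int nrecs, int npages)
{
        if (nrecs == 0)
                return "destroy";
        if (nrecs <= INLINE_CAP)
                return "indirect -> inline";
        if (nrecs <= PAGE_CAP)
                return "indirect -> direct";
        if (nrecs < (npages * PAGE_CAP) >> 1)
                return "merge pages";        /* below 50% occupancy */
        return "leave alone";
}

int main(void)
{
        assert(!strcmp(compact_policy(300, 4), "merge pages"));
        assert(!strcmp(compact_policy(600, 4), "leave alone"));
        return 0;
}
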
4478/*
4479 * Combine extents from neighboring extent pages.
4480 */
4481void
4482xfs_iext_irec_compact_pages(
4483        xfs_ifork_t        *ifp)                /* inode fork pointer */
4484{
4485        xfs_ext_irec_t        *erp, *erp_next; /* pointers to irec entries */
4486        int                erp_idx = 0;        /* indirection array index */
4487        int                nlists;                /* number of irec's (ex lists) */
4488
4489        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4490        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4491        while (erp_idx < nlists - 1) {
4492                erp = &ifp->if_u1.if_ext_irec[erp_idx];
4493                erp_next = erp + 1;
4494                if (erp_next->er_extcount <=
4495                    (XFS_LINEAR_EXTS - erp->er_extcount)) {
4496                        memcpy(&erp->er_extbuf[erp->er_extcount],
4497                                erp_next->er_extbuf, erp_next->er_extcount *
4498                                sizeof(xfs_bmbt_rec_t));
4499                        erp->er_extcount += erp_next->er_extcount;
4500                        /*
4501                         * Free page before removing extent record
4502                         * so er_extoffs don't get modified in
4503                         * xfs_iext_irec_remove.
4504                         */
4505                        kmem_free(erp_next->er_extbuf);
4506                        erp_next->er_extbuf = NULL;
4507                        xfs_iext_irec_remove(ifp, erp_idx + 1);
4508                        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4509                } else {
4510                        erp_idx++;
4511                }
4512        }
4513}
4514
4515/*
4516 * This is called to update the er_extoff field in the indirection
4517 * array when extents have been added or removed from one of the
4518 * extent lists. erp_idx contains the irec index to begin updating
4519 * at and ext_diff contains the number of extents that were added
4520 * or removed.
4521 */
4522void
4523xfs_iext_irec_update_extoffs(
4524        xfs_ifork_t        *ifp,                /* inode fork pointer */
4525        int                erp_idx,        /* irec index to update */
4526        int                ext_diff)        /* number of new extents */
4527{
4528        int                i;                /* loop counter */
4529        int                nlists;                /* number of irec's (ex lists) */
4530
4531        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4532        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4533        for (i = erp_idx; i < nlists; i++) {
4534                ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
4535        }
4536}