Showing error 1009

User: Jiri Slaby
Error type: Leaving function in locked state
Error type description: A lock is acquired but not released on every path out of a function, so the function can return with the lock still held and the lock is leaked (illustrative sketch below)
File location: fs/xfs/xfs_log_recover.c
Line in file: 2750
Project: Linux Kernel
Project version: 2.6.28
Tools: Stanse (1.2)
Entered: 2012-03-02 21:35:18 UTC
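
For reference, the error class looks like the following minimal sketch: a
lock taken at function entry is not released on an early error return.
This fragment is illustrative only (hypothetical names, not the code
flagged at line 2750):

    static int example(struct foo *f)
    {
            int error;

            spin_lock(&f->f_lock);

            error = do_work(f);
            if (error)
                    return error;   /* BUG: returns with f_lock held */

            spin_unlock(&f->f_lock);
            return 0;
    }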


Source:

   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_types.h"
  21#include "xfs_bit.h"
  22#include "xfs_log.h"
  23#include "xfs_inum.h"
  24#include "xfs_trans.h"
  25#include "xfs_sb.h"
  26#include "xfs_ag.h"
  27#include "xfs_dir2.h"
  28#include "xfs_dmapi.h"
  29#include "xfs_mount.h"
  30#include "xfs_error.h"
  31#include "xfs_bmap_btree.h"
  32#include "xfs_alloc_btree.h"
  33#include "xfs_ialloc_btree.h"
  34#include "xfs_dir2_sf.h"
  35#include "xfs_attr_sf.h"
  36#include "xfs_dinode.h"
  37#include "xfs_inode.h"
  38#include "xfs_inode_item.h"
  39#include "xfs_imap.h"
  40#include "xfs_alloc.h"
  41#include "xfs_ialloc.h"
  42#include "xfs_log_priv.h"
  43#include "xfs_buf_item.h"
  44#include "xfs_log_recover.h"
  45#include "xfs_extfree_item.h"
  46#include "xfs_trans_priv.h"
  47#include "xfs_quota.h"
  48#include "xfs_rw.h"
  49#include "xfs_utils.h"
  50
  51STATIC int        xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
  52STATIC int        xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
  53STATIC void        xlog_recover_insert_item_backq(xlog_recover_item_t **q,
  54                                               xlog_recover_item_t *item);
  55#if defined(DEBUG)
  56STATIC void        xlog_recover_check_summary(xlog_t *);
  57STATIC void        xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
  58#else
  59#define        xlog_recover_check_summary(log)
  60#define        xlog_recover_check_ail(mp, lip, gen)
  61#endif
  62
  63
  64/*
  65 * Sector aligned buffer routines for buffer create/read/write/access
  66 */
  67
  68#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs)        \
  69        ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
  70        ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
  71#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)        ((bno) & ~(log)->l_sectbb_mask)
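/*
 * Worked example (assumed geometry, not from the original source): with
 * 4096-byte sectors there are 8 basic blocks per sector, so l_sectbb_mask
 * is 7.  XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 9) gives (9 + 7 + 1) & ~7 == 16,
 * XLOG_SECTOR_ROUNDDOWN_BLKNO(log, 9) gives 9 & ~7 == 8, and a count that
 * is already sector-aligned is returned unchanged.
 */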
  72
  73xfs_buf_t *
  74xlog_get_bp(
  75        xlog_t                *log,
  76        int                num_bblks)
  77{
  78        ASSERT(num_bblks > 0);
  79
  80        if (log->l_sectbb_log) {
  81                if (num_bblks > 1)
  82                        num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
  83                num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
  84        }
  85        return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
  86}
  87
  88void
  89xlog_put_bp(
  90        xfs_buf_t        *bp)
  91{
  92        xfs_buf_free(bp);
  93}
  94
  95
  96/*
  97 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
  98 */
  99int
 100xlog_bread(
 101        xlog_t                *log,
 102        xfs_daddr_t        blk_no,
 103        int                nbblks,
 104        xfs_buf_t        *bp)
 105{
 106        int                error;
 107
 108        if (log->l_sectbb_log) {
 109                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 110                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
 111        }
 112
 113        ASSERT(nbblks > 0);
 114        ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
 115        ASSERT(bp);
 116
 117        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 118        XFS_BUF_READ(bp);
 119        XFS_BUF_BUSY(bp);
 120        XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
 121        XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 122
 123        xfsbdstrat(log->l_mp, bp);
 124        error = xfs_iowait(bp);
 125        if (error)
 126                xfs_ioerror_alert("xlog_bread", log->l_mp,
 127                                  bp, XFS_BUF_ADDR(bp));
 128        return error;
 129}
 130
 131/*
 132 * Write out the buffer at the given block for the given number of blocks.
 133 * The buffer is kept locked across the write and is returned locked.
 134 * This can only be used for synchronous log writes.
 135 */
 136STATIC int
 137xlog_bwrite(
 138        xlog_t                *log,
 139        xfs_daddr_t        blk_no,
 140        int                nbblks,
 141        xfs_buf_t        *bp)
 142{
 143        int                error;
 144
 145        if (log->l_sectbb_log) {
 146                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 147                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
 148        }
 149
 150        ASSERT(nbblks > 0);
 151        ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
 152
 153        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 154        XFS_BUF_ZEROFLAGS(bp);
 155        XFS_BUF_BUSY(bp);
 156        XFS_BUF_HOLD(bp);
 157        XFS_BUF_PSEMA(bp, PRIBIO);
 158        XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
 159        XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 160
 161        if ((error = xfs_bwrite(log->l_mp, bp)))
 162                xfs_ioerror_alert("xlog_bwrite", log->l_mp,
 163                                  bp, XFS_BUF_ADDR(bp));
 164        return error;
 165}
 166
 167STATIC xfs_caddr_t
 168xlog_align(
 169        xlog_t                *log,
 170        xfs_daddr_t        blk_no,
 171        int                nbblks,
 172        xfs_buf_t        *bp)
 173{
 174        xfs_caddr_t        ptr;
 175
 176        if (!log->l_sectbb_log)
 177                return XFS_BUF_PTR(bp);
 178
 179        ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
 180        ASSERT(XFS_BUF_SIZE(bp) >=
 181                BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
 182        return ptr;
 183}
 184
 185#ifdef DEBUG
 186/*
 187 * dump debug superblock and log record information
 188 */
 189STATIC void
 190xlog_header_check_dump(
 191        xfs_mount_t                *mp,
 192        xlog_rec_header_t        *head)
 193{
 194        int                        b;
 195
 196        cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
 197        for (b = 0; b < 16; b++)
 198                cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
 199        cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
 200        cmn_err(CE_DEBUG, "    log : uuid = ");
 201        for (b = 0; b < 16; b++)
 202                cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
 203        cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
 204}
 205#else
 206#define xlog_header_check_dump(mp, head)
 207#endif
 208
 209/*
 210 * check log record header for recovery
 211 */
 212STATIC int
 213xlog_header_check_recover(
 214        xfs_mount_t                *mp,
 215        xlog_rec_header_t        *head)
 216{
 217        ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
 218
 219        /*
 220         * IRIX doesn't write the h_fmt field and leaves it zeroed
 221         * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 222         * a dirty log created in IRIX.
 223         */
 224        if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
 225                xlog_warn(
 226        "XFS: dirty log written in incompatible format - can't recover");
 227                xlog_header_check_dump(mp, head);
 228                XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 229                                 XFS_ERRLEVEL_HIGH, mp);
 230                return XFS_ERROR(EFSCORRUPTED);
 231        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 232                xlog_warn(
 233        "XFS: dirty log entry has mismatched uuid - can't recover");
 234                xlog_header_check_dump(mp, head);
 235                XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 236                                 XFS_ERRLEVEL_HIGH, mp);
 237                return XFS_ERROR(EFSCORRUPTED);
 238        }
 239        return 0;
 240}
 241
 242/*
 243 * read the head block of the log and check the header
 244 */
 245STATIC int
 246xlog_header_check_mount(
 247        xfs_mount_t                *mp,
 248        xlog_rec_header_t        *head)
 249{
 250        ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
 251
 252        if (uuid_is_nil(&head->h_fs_uuid)) {
 253                /*
 254                 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 255                 * h_fs_uuid is nil, we assume this log was last mounted
 256                 * by IRIX and continue.
 257                 */
 258                xlog_warn("XFS: nil uuid in log - IRIX style log");
 259        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 260                xlog_warn("XFS: log has mismatched uuid - can't recover");
 261                xlog_header_check_dump(mp, head);
 262                XFS_ERROR_REPORT("xlog_header_check_mount",
 263                                 XFS_ERRLEVEL_HIGH, mp);
 264                return XFS_ERROR(EFSCORRUPTED);
 265        }
 266        return 0;
 267}
 268
 269STATIC void
 270xlog_recover_iodone(
 271        struct xfs_buf        *bp)
 272{
 273        xfs_mount_t        *mp;
 274
 275        ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
 276
 277        if (XFS_BUF_GETERROR(bp)) {
 278                /*
 279                 * We're not going to bother about retrying
 280                 * this during recovery. One strike!
 281                 */
 282                mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
 283                xfs_ioerror_alert("xlog_recover_iodone",
 284                                  mp, bp, XFS_BUF_ADDR(bp));
 285                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 286        }
 287        XFS_BUF_SET_FSPRIVATE(bp, NULL);
 288        XFS_BUF_CLR_IODONE_FUNC(bp);
 289        xfs_biodone(bp);
 290}
 291
 292/*
 293 * This routine finds (to an approximation) the first block in the physical
 294 * log which contains the given cycle.  It uses a binary search algorithm.
 295 * Note that the algorithm can not be perfect because the disk will not
 296 * necessarily be perfect.
 297 */
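/*
 * Illustrative walk-through (hypothetical values): with first_blk = 0,
 * *last_blk = 8 and the given cycle first appearing at block 5, the loop
 * probes mid blocks 4, 6, 5 and converges with first_blk == 4 and
 * *last_blk == 5, the first block stamped with that cycle.
 */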
 298STATIC int
 299xlog_find_cycle_start(
 300        xlog_t                *log,
 301        xfs_buf_t        *bp,
 302        xfs_daddr_t        first_blk,
 303        xfs_daddr_t        *last_blk,
 304        uint                cycle)
 305{
 306        xfs_caddr_t        offset;
 307        xfs_daddr_t        mid_blk;
 308        uint                mid_cycle;
 309        int                error;
 310
 311        mid_blk = BLK_AVG(first_blk, *last_blk);
 312        while (mid_blk != first_blk && mid_blk != *last_blk) {
 313                if ((error = xlog_bread(log, mid_blk, 1, bp)))
 314                        return error;
 315                offset = xlog_align(log, mid_blk, 1, bp);
 316                mid_cycle = xlog_get_cycle(offset);
 317                if (mid_cycle == cycle) {
 318                        *last_blk = mid_blk;
 319                        /* last_half_cycle == mid_cycle */
 320                } else {
 321                        first_blk = mid_blk;
 322                        /* first_half_cycle == mid_cycle */
 323                }
 324                mid_blk = BLK_AVG(first_blk, *last_blk);
 325        }
 326        ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
 327               (mid_blk == *last_blk && mid_blk-1 == first_blk));
 328
 329        return 0;
 330}
 331
 332/*
 333 * Check that the range of blocks does not contain the cycle number
 334 * given.  The scan needs to occur from front to back and the ptr into the
 335 * region must be updated since a later routine will need to perform another
 336 * test.  If the region is completely good, we end up returning the same
 337 * last block number.
 338 *
 339 * Set blkno to -1 if we encounter no errors.  This is an invalid block number
 340 * since we don't ever expect logs to get this large.
 341 */
 342STATIC int
 343xlog_find_verify_cycle(
 344        xlog_t                *log,
 345        xfs_daddr_t        start_blk,
 346        int                nbblks,
 347        uint                stop_on_cycle_no,
 348        xfs_daddr_t        *new_blk)
 349{
 350        xfs_daddr_t        i, j;
 351        uint                cycle;
 352        xfs_buf_t        *bp;
 353        xfs_daddr_t        bufblks;
 354        xfs_caddr_t        buf = NULL;
 355        int                error = 0;
 356
 357        bufblks = 1 << ffs(nbblks);
 358
 359        while (!(bp = xlog_get_bp(log, bufblks))) {
 360                /* can't get enough memory to do everything in one big buffer */
 361                bufblks >>= 1;
 362                if (bufblks <= log->l_sectbb_log)
 363                        return ENOMEM;
 364        }
 365
 366        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 367                int        bcount;
 368
 369                bcount = min(bufblks, (start_blk + nbblks - i));
 370
 371                if ((error = xlog_bread(log, i, bcount, bp)))
 372                        goto out;
 373
 374                buf = xlog_align(log, i, bcount, bp);
 375                for (j = 0; j < bcount; j++) {
 376                        cycle = xlog_get_cycle(buf);
 377                        if (cycle == stop_on_cycle_no) {
 378                                *new_blk = i+j;
 379                                goto out;
 380                        }
 381
 382                        buf += BBSIZE;
 383                }
 384        }
 385
 386        *new_blk = -1;
 387
 388out:
 389        xlog_put_bp(bp);
 390        return error;
 391}
 392
 393/*
 394 * Potentially backup over partial log record write.
 395 *
 396 * In the typical case, last_blk is the number of the block directly after
 397 * a good log record.  Therefore, we subtract one to get the block number
 398 * of the last block in the given buffer.  extra_bblks contains the number
 399 * of blocks we would have read on a previous read.  This happens when the
 400 * last log record is split over the end of the physical log.
 401 *
 402 * extra_bblks is the number of blocks potentially verified on a previous
 403 * call to this routine.
 404 */
 405STATIC int
 406xlog_find_verify_log_record(
 407        xlog_t                        *log,
 408        xfs_daddr_t                start_blk,
 409        xfs_daddr_t                *last_blk,
 410        int                        extra_bblks)
 411{
 412        xfs_daddr_t                i;
 413        xfs_buf_t                *bp;
 414        xfs_caddr_t                offset = NULL;
 415        xlog_rec_header_t        *head = NULL;
 416        int                        error = 0;
 417        int                        smallmem = 0;
 418        int                        num_blks = *last_blk - start_blk;
 419        int                        xhdrs;
 420
 421        ASSERT(start_blk != 0 || *last_blk != start_blk);
 422
 423        if (!(bp = xlog_get_bp(log, num_blks))) {
 424                if (!(bp = xlog_get_bp(log, 1)))
 425                        return ENOMEM;
 426                smallmem = 1;
 427        } else {
 428                if ((error = xlog_bread(log, start_blk, num_blks, bp)))
 429                        goto out;
 430                offset = xlog_align(log, start_blk, num_blks, bp);
 431                offset += ((num_blks - 1) << BBSHIFT);
 432        }
 433
 434        for (i = (*last_blk) - 1; i >= 0; i--) {
 435                if (i < start_blk) {
 436                        /* valid log record not found */
 437                        xlog_warn(
 438                "XFS: Log inconsistent (didn't find previous header)");
 439                        ASSERT(0);
 440                        error = XFS_ERROR(EIO);
 441                        goto out;
 442                }
 443
 444                if (smallmem) {
 445                        if ((error = xlog_bread(log, i, 1, bp)))
 446                                goto out;
 447                        offset = xlog_align(log, i, 1, bp);
 448                }
 449
 450                head = (xlog_rec_header_t *)offset;
 451
 452                if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
 453                        break;
 454
 455                if (!smallmem)
 456                        offset -= BBSIZE;
 457        }
 458
 459        /*
 460         * We hit the beginning of the physical log & still no header.  Return
 461         * to caller.  If caller can handle a return of -1, then this routine
 462         * will be called again for the end of the physical log.
 463         */
 464        if (i == -1) {
 465                error = -1;
 466                goto out;
 467        }
 468
 469        /*
 470         * We have the final block of the good log (the first block
  471 * of the log record _before_ the head). So we check the uuid.
 472         */
 473        if ((error = xlog_header_check_mount(log->l_mp, head)))
 474                goto out;
 475
 476        /*
 477         * We may have found a log record header before we expected one.
 478         * last_blk will be the 1st block # with a given cycle #.  We may end
 479         * up reading an entire log record.  In this case, we don't want to
 480         * reset last_blk.  Only when last_blk points in the middle of a log
 481         * record do we update last_blk.
 482         */
 483        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 484                uint        h_size = be32_to_cpu(head->h_size);
 485
 486                xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 487                if (h_size % XLOG_HEADER_CYCLE_SIZE)
 488                        xhdrs++;
 489        } else {
 490                xhdrs = 1;
 491        }
 492
 493        if (*last_blk - i + extra_bblks !=
 494            BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 495                *last_blk = i;
 496
 497out:
 498        xlog_put_bp(bp);
 499        return error;
 500}
 501
 502/*
  503 * Head is defined to be the point of the log where the next log
  504 * write could go.  This means that incomplete LR writes at the end are
  505 * eliminated when calculating the head.  We aren't guaranteed that previous
  506 * LRs have complete transactions.  We only know that a cycle number of
 507 * current cycle number -1 won't be present in the log if we start writing
 508 * from our current block number.
 509 *
 510 * last_blk contains the block number of the first block with a given
 511 * cycle number.
 512 *
 513 * Return: zero if normal, non-zero if error.
 514 */
 515STATIC int
 516xlog_find_head(
 517        xlog_t                 *log,
 518        xfs_daddr_t        *return_head_blk)
 519{
 520        xfs_buf_t        *bp;
 521        xfs_caddr_t        offset;
 522        xfs_daddr_t        new_blk, first_blk, start_blk, last_blk, head_blk;
 523        int                num_scan_bblks;
 524        uint                first_half_cycle, last_half_cycle;
 525        uint                stop_on_cycle;
 526        int                error, log_bbnum = log->l_logBBsize;
 527
 528        /* Is the end of the log device zeroed? */
 529        if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
 530                *return_head_blk = first_blk;
 531
 532                /* Is the whole lot zeroed? */
 533                if (!first_blk) {
 534                        /* Linux XFS shouldn't generate totally zeroed logs -
 535                         * mkfs etc write a dummy unmount record to a fresh
 536                         * log so we can store the uuid in there
 537                         */
 538                        xlog_warn("XFS: totally zeroed log");
 539                }
 540
 541                return 0;
 542        } else if (error) {
 543                xlog_warn("XFS: empty log check failed");
 544                return error;
 545        }
 546
 547        first_blk = 0;                        /* get cycle # of 1st block */
 548        bp = xlog_get_bp(log, 1);
 549        if (!bp)
 550                return ENOMEM;
 551        if ((error = xlog_bread(log, 0, 1, bp)))
 552                goto bp_err;
 553        offset = xlog_align(log, 0, 1, bp);
 554        first_half_cycle = xlog_get_cycle(offset);
 555
 556        last_blk = head_blk = log_bbnum - 1;        /* get cycle # of last block */
 557        if ((error = xlog_bread(log, last_blk, 1, bp)))
 558                goto bp_err;
 559        offset = xlog_align(log, last_blk, 1, bp);
 560        last_half_cycle = xlog_get_cycle(offset);
 561        ASSERT(last_half_cycle != 0);
 562
 563        /*
 564         * If the 1st half cycle number is equal to the last half cycle number,
 565         * then the entire log is stamped with the same cycle number.  In this
 566         * case, head_blk can't be set to zero (which makes sense).  The below
 567         * math doesn't work out properly with head_blk equal to zero.  Instead,
 568         * we set it to log_bbnum which is an invalid block number, but this
  569 * value makes the math correct.  If head_blk doesn't change through
 570         * all the tests below, *head_blk is set to zero at the very end rather
 571         * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 572         * in a circular file.
 573         */
 574        if (first_half_cycle == last_half_cycle) {
 575                /*
 576                 * In this case we believe that the entire log should have
 577                 * cycle number last_half_cycle.  We need to scan backwards
 578                 * from the end verifying that there are no holes still
 579                 * containing last_half_cycle - 1.  If we find such a hole,
 580                 * then the start of that hole will be the new head.  The
 581                 * simple case looks like
 582                 *        x | x ... | x - 1 | x
 583                 * Another case that fits this picture would be
 584                 *        x | x + 1 | x ... | x
 585                 * In this case the head really is somewhere at the end of the
 586                 * log, as one of the latest writes at the beginning was
 587                 * incomplete.
 588                 * One more case is
 589                 *        x | x + 1 | x ... | x - 1 | x
 590                 * This is really the combination of the above two cases, and
 591                 * the head has to end up at the start of the x-1 hole at the
 592                 * end of the log.
 593                 *
 594                 * In the 256k log case, we will read from the beginning to the
 595                 * end of the log and search for cycle numbers equal to x-1.
 596                 * We don't worry about the x+1 blocks that we encounter,
 597                 * because we know that they cannot be the head since the log
 598                 * started with x.
 599                 */
 600                head_blk = log_bbnum;
 601                stop_on_cycle = last_half_cycle - 1;
 602        } else {
 603                /*
 604                 * In this case we want to find the first block with cycle
 605                 * number matching last_half_cycle.  We expect the log to be
 606                 * some variation on
 607                 *        x + 1 ... | x ...
 608                 * The first block with cycle number x (last_half_cycle) will
 609                 * be where the new head belongs.  First we do a binary search
 610                 * for the first occurrence of last_half_cycle.  The binary
 611                 * search may not be totally accurate, so then we scan back
 612                 * from there looking for occurrences of last_half_cycle before
 613                 * us.  If that backwards scan wraps around the beginning of
 614                 * the log, then we look for occurrences of last_half_cycle - 1
 615                 * at the end of the log.  The cases we're looking for look
 616                 * like
 617                 *        x + 1 ... | x | x + 1 | x ...
 618                 *                               ^ binary search stopped here
 619                 * or
 620                 *        x + 1 ... | x ... | x - 1 | x
 621                 *        <---------> less than scan distance
 622                 */
 623                stop_on_cycle = last_half_cycle;
 624                if ((error = xlog_find_cycle_start(log, bp, first_blk,
 625                                                &head_blk, last_half_cycle)))
 626                        goto bp_err;
 627        }
 628
 629        /*
 630         * Now validate the answer.  Scan back some number of maximum possible
 631         * blocks and make sure each one has the expected cycle number.  The
 632         * maximum is determined by the total possible amount of buffering
 633         * in the in-core log.  The following number can be made tighter if
 634         * we actually look at the block size of the filesystem.
 635         */
 636        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 637        if (head_blk >= num_scan_bblks) {
 638                /*
 639                 * We are guaranteed that the entire check can be performed
 640                 * in one buffer.
 641                 */
 642                start_blk = head_blk - num_scan_bblks;
 643                if ((error = xlog_find_verify_cycle(log,
 644                                                start_blk, num_scan_bblks,
 645                                                stop_on_cycle, &new_blk)))
 646                        goto bp_err;
 647                if (new_blk != -1)
 648                        head_blk = new_blk;
 649        } else {                /* need to read 2 parts of log */
 650                /*
 651                 * We are going to scan backwards in the log in two parts.
 652                 * First we scan the physical end of the log.  In this part
 653                 * of the log, we are looking for blocks with cycle number
 654                 * last_half_cycle - 1.
 655                 * If we find one, then we know that the log starts there, as
 656                 * we've found a hole that didn't get written in going around
 657                 * the end of the physical log.  The simple case for this is
 658                 *        x + 1 ... | x ... | x - 1 | x
 659                 *        <---------> less than scan distance
 660                 * If all of the blocks at the end of the log have cycle number
 661                 * last_half_cycle, then we check the blocks at the start of
 662                 * the log looking for occurrences of last_half_cycle.  If we
 663                 * find one, then our current estimate for the location of the
 664                 * first occurrence of last_half_cycle is wrong and we move
 665                 * back to the hole we've found.  This case looks like
 666                 *        x + 1 ... | x | x + 1 | x ...
 667                 *                               ^ binary search stopped here
 668                 * Another case we need to handle that only occurs in 256k
 669                 * logs is
 670                 *        x + 1 ... | x ... | x+1 | x ...
 671                 *                   ^ binary search stops here
 672                 * In a 256k log, the scan at the end of the log will see the
 673                 * x + 1 blocks.  We need to skip past those since that is
 674                 * certainly not the head of the log.  By searching for
 675                 * last_half_cycle-1 we accomplish that.
 676                 */
 677                start_blk = log_bbnum - num_scan_bblks + head_blk;
 678                ASSERT(head_blk <= INT_MAX &&
 679                        (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
 680                if ((error = xlog_find_verify_cycle(log, start_blk,
 681                                        num_scan_bblks - (int)head_blk,
 682                                        (stop_on_cycle - 1), &new_blk)))
 683                        goto bp_err;
 684                if (new_blk != -1) {
 685                        head_blk = new_blk;
 686                        goto bad_blk;
 687                }
 688
 689                /*
 690                 * Scan beginning of log now.  The last part of the physical
 691                 * log is good.  This scan needs to verify that it doesn't find
 692                 * the last_half_cycle.
 693                 */
 694                start_blk = 0;
 695                ASSERT(head_blk <= INT_MAX);
 696                if ((error = xlog_find_verify_cycle(log,
 697                                        start_blk, (int)head_blk,
 698                                        stop_on_cycle, &new_blk)))
 699                        goto bp_err;
 700                if (new_blk != -1)
 701                        head_blk = new_blk;
 702        }
 703
 704 bad_blk:
 705        /*
 706         * Now we need to make sure head_blk is not pointing to a block in
 707         * the middle of a log record.
 708         */
 709        num_scan_bblks = XLOG_REC_SHIFT(log);
 710        if (head_blk >= num_scan_bblks) {
 711                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 712
 713                /* start ptr at last block ptr before head_blk */
 714                if ((error = xlog_find_verify_log_record(log, start_blk,
 715                                                        &head_blk, 0)) == -1) {
 716                        error = XFS_ERROR(EIO);
 717                        goto bp_err;
 718                } else if (error)
 719                        goto bp_err;
 720        } else {
 721                start_blk = 0;
 722                ASSERT(head_blk <= INT_MAX);
 723                if ((error = xlog_find_verify_log_record(log, start_blk,
 724                                                        &head_blk, 0)) == -1) {
 725                        /* We hit the beginning of the log during our search */
 726                        start_blk = log_bbnum - num_scan_bblks + head_blk;
 727                        new_blk = log_bbnum;
 728                        ASSERT(start_blk <= INT_MAX &&
 729                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
 730                        ASSERT(head_blk <= INT_MAX);
 731                        if ((error = xlog_find_verify_log_record(log,
 732                                                        start_blk, &new_blk,
 733                                                        (int)head_blk)) == -1) {
 734                                error = XFS_ERROR(EIO);
 735                                goto bp_err;
 736                        } else if (error)
 737                                goto bp_err;
 738                        if (new_blk != log_bbnum)
 739                                head_blk = new_blk;
 740                } else if (error)
 741                        goto bp_err;
 742        }
 743
 744        xlog_put_bp(bp);
 745        if (head_blk == log_bbnum)
 746                *return_head_blk = 0;
 747        else
 748                *return_head_blk = head_blk;
 749        /*
 750         * When returning here, we have a good block number.  Bad block
 751         * means that during a previous crash, we didn't have a clean break
 752         * from cycle number N to cycle number N-1.  In this case, we need
 753         * to find the first block with cycle number N-1.
 754         */
 755        return 0;
 756
 757 bp_err:
 758        xlog_put_bp(bp);
 759
 760        if (error)
 761            xlog_warn("XFS: failed to find log head");
 762        return error;
 763}
 764
 765/*
 766 * Find the sync block number or the tail of the log.
 767 *
 768 * This will be the block number of the last record to have its
 769 * associated buffers synced to disk.  Every log record header has
 770 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 771 * to get a sync block number.  The only concern is to figure out which
 772 * log record header to believe.
 773 *
 774 * The following algorithm uses the log record header with the largest
 775 * lsn.  The entire log record does not need to be valid.  We only care
 776 * that the header is valid.
 777 *
 778 * We could speed up search by using current head_blk buffer, but it is not
 779 * available.
 780 */
 781int
 782xlog_find_tail(
 783        xlog_t                        *log,
 784        xfs_daddr_t                *head_blk,
 785        xfs_daddr_t                *tail_blk)
 786{
 787        xlog_rec_header_t        *rhead;
 788        xlog_op_header_t        *op_head;
 789        xfs_caddr_t                offset = NULL;
 790        xfs_buf_t                *bp;
 791        int                        error, i, found;
 792        xfs_daddr_t                umount_data_blk;
 793        xfs_daddr_t                after_umount_blk;
 794        xfs_lsn_t                tail_lsn;
 795        int                        hblks;
 796
 797        found = 0;
 798
 799        /*
 800         * Find previous log record
 801         */
 802        if ((error = xlog_find_head(log, head_blk)))
 803                return error;
 804
 805        bp = xlog_get_bp(log, 1);
 806        if (!bp)
 807                return ENOMEM;
 808        if (*head_blk == 0) {                                /* special case */
 809                if ((error = xlog_bread(log, 0, 1, bp)))
 810                        goto bread_err;
 811                offset = xlog_align(log, 0, 1, bp);
 812                if (xlog_get_cycle(offset) == 0) {
 813                        *tail_blk = 0;
 814                        /* leave all other log inited values alone */
 815                        goto exit;
 816                }
 817        }
 818
 819        /*
 820         * Search backwards looking for log record header block
 821         */
 822        ASSERT(*head_blk < INT_MAX);
 823        for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 824                if ((error = xlog_bread(log, i, 1, bp)))
 825                        goto bread_err;
 826                offset = xlog_align(log, i, 1, bp);
 827                if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
 828                        found = 1;
 829                        break;
 830                }
 831        }
 832        /*
 833         * If we haven't found the log record header block, start looking
 834         * again from the end of the physical log.  XXXmiken: There should be
 835         * a check here to make sure we didn't search more than N blocks in
 836         * the previous code.
 837         */
 838        if (!found) {
 839                for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 840                        if ((error = xlog_bread(log, i, 1, bp)))
 841                                goto bread_err;
 842                        offset = xlog_align(log, i, 1, bp);
 843                        if (XLOG_HEADER_MAGIC_NUM ==
 844                            be32_to_cpu(*(__be32 *)offset)) {
 845                                found = 2;
 846                                break;
 847                        }
 848                }
 849        }
 850        if (!found) {
 851                xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
 852                ASSERT(0);
 853                return XFS_ERROR(EIO);
 854        }
 855
 856        /* find blk_no of tail of log */
 857        rhead = (xlog_rec_header_t *)offset;
 858        *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 859
 860        /*
 861         * Reset log values according to the state of the log when we
 862         * crashed.  In the case where head_blk == 0, we bump curr_cycle
 863         * one because the next write starts a new cycle rather than
 864         * continuing the cycle of the last good log record.  At this
 865         * point we have guaranteed that all partial log records have been
 866         * accounted for.  Therefore, we know that the last good log record
 867         * written was complete and ended exactly on the end boundary
 868         * of the physical log.
 869         */
 870        log->l_prev_block = i;
 871        log->l_curr_block = (int)*head_blk;
 872        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 873        if (found == 2)
 874                log->l_curr_cycle++;
 875        log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
 876        log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
 877        log->l_grant_reserve_cycle = log->l_curr_cycle;
 878        log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
 879        log->l_grant_write_cycle = log->l_curr_cycle;
 880        log->l_grant_write_bytes = BBTOB(log->l_curr_block);
 881
 882        /*
 883         * Look for unmount record.  If we find it, then we know there
 884         * was a clean unmount.  Since 'i' could be the last block in
 885         * the physical log, we convert to a log block before comparing
 886         * to the head_blk.
 887         *
  888 * Save the current tail lsn to pass to
 889         * xlog_clear_stale_blocks() below.  We won't want to clear the
 890         * unmount record if there is one, so we pass the lsn of the
 891         * unmount record rather than the block after it.
 892         */
 893        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 894                int        h_size = be32_to_cpu(rhead->h_size);
 895                int        h_version = be32_to_cpu(rhead->h_version);
 896
 897                if ((h_version & XLOG_VERSION_2) &&
 898                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
 899                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
 900                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
 901                                hblks++;
 902                } else {
 903                        hblks = 1;
 904                }
 905        } else {
 906                hblks = 1;
 907        }
 908        after_umount_blk = (i + hblks + (int)
 909                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
 910        tail_lsn = log->l_tail_lsn;
 911        if (*head_blk == after_umount_blk &&
 912            be32_to_cpu(rhead->h_num_logops) == 1) {
 913                umount_data_blk = (i + hblks) % log->l_logBBsize;
 914                if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
 915                        goto bread_err;
 916                }
 917                offset = xlog_align(log, umount_data_blk, 1, bp);
 918                op_head = (xlog_op_header_t *)offset;
 919                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
 920                        /*
 921                         * Set tail and last sync so that newly written
 922                         * log records will point recovery to after the
 923                         * current unmount record.
 924                         */
 925                        log->l_tail_lsn =
 926                                xlog_assign_lsn(log->l_curr_cycle,
 927                                                after_umount_blk);
 928                        log->l_last_sync_lsn =
 929                                xlog_assign_lsn(log->l_curr_cycle,
 930                                                after_umount_blk);
 931                        *tail_blk = after_umount_blk;
 932
 933                        /*
 934                         * Note that the unmount was clean. If the unmount
 935                         * was not clean, we need to know this to rebuild the
 936                         * superblock counters from the perag headers if we
 937                         * have a filesystem using non-persistent counters.
 938                         */
 939                        log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
 940                }
 941        }
 942
 943        /*
 944         * Make sure that there are no blocks in front of the head
 945         * with the same cycle number as the head.  This can happen
 946         * because we allow multiple outstanding log writes concurrently,
 947         * and the later writes might make it out before earlier ones.
 948         *
 949         * We use the lsn from before modifying it so that we'll never
 950         * overwrite the unmount record after a clean unmount.
 951         *
 952         * Do this only if we are going to recover the filesystem
 953         *
 954         * NOTE: This used to say "if (!readonly)"
 955         * However on Linux, we can & do recover a read-only filesystem.
 956         * We only skip recovery if NORECOVERY is specified on mount,
 957         * in which case we would not be here.
 958         *
 959         * But... if the -device- itself is readonly, just skip this.
 960         * We can't recover this device anyway, so it won't matter.
 961         */
 962        if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
 963                error = xlog_clear_stale_blocks(log, tail_lsn);
 964        }
 965
 966bread_err:
 967exit:
 968        xlog_put_bp(bp);
 969
 970        if (error)
 971                xlog_warn("XFS: failed to locate log tail");
 972        return error;
 973}
 974
 975/*
 976 * Is the log zeroed at all?
 977 *
 978 * The last binary search should be changed to perform an X block read
 979 * once X becomes small enough.  You can then search linearly through
 980 * the X blocks.  This will cut down on the number of reads we need to do.
 981 *
 982 * If the log is partially zeroed, this routine will pass back the blkno
 983 * of the first block with cycle number 0.  It won't have a complete LR
 984 * preceding it.
 985 *
 986 * Return:
 987 *        0  => the log is completely written to
 988 *        -1 => use *blk_no as the first block of the log
 989 *        >0 => error has occurred
 990 */
 991STATIC int
 992xlog_find_zeroed(
 993        xlog_t                *log,
 994        xfs_daddr_t        *blk_no)
 995{
 996        xfs_buf_t        *bp;
 997        xfs_caddr_t        offset;
 998        uint                first_cycle, last_cycle;
 999        xfs_daddr_t        new_blk, last_blk, start_blk;
1000        xfs_daddr_t     num_scan_bblks;
1001        int                error, log_bbnum = log->l_logBBsize;
1002
1003        *blk_no = 0;
1004
1005        /* check totally zeroed log */
1006        bp = xlog_get_bp(log, 1);
1007        if (!bp)
1008                return ENOMEM;
1009        if ((error = xlog_bread(log, 0, 1, bp)))
1010                goto bp_err;
1011        offset = xlog_align(log, 0, 1, bp);
1012        first_cycle = xlog_get_cycle(offset);
1013        if (first_cycle == 0) {                /* completely zeroed log */
1014                *blk_no = 0;
1015                xlog_put_bp(bp);
1016                return -1;
1017        }
1018
1019        /* check partially zeroed log */
1020        if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1021                goto bp_err;
1022        offset = xlog_align(log, log_bbnum-1, 1, bp);
1023        last_cycle = xlog_get_cycle(offset);
1024        if (last_cycle != 0) {                /* log completely written to */
1025                xlog_put_bp(bp);
1026                return 0;
1027        } else if (first_cycle != 1) {
1028                /*
1029                 * If the cycle of the last block is zero, the cycle of
1030                 * the first block must be 1. If it's not, maybe we're
1031                 * not looking at a log... Bail out.
1032                 */
1033                xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1034                return XFS_ERROR(EINVAL);
1035        }
1036
1037        /* we have a partially zeroed log */
1038        last_blk = log_bbnum-1;
1039        if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1040                goto bp_err;
1041
1042        /*
1043         * Validate the answer.  Because there is no way to guarantee that
1044         * the entire log is made up of log records which are the same size,
1045         * we scan over the defined maximum blocks.  At this point, the maximum
1046         * is not chosen to mean anything special.   XXXmiken
1047         */
1048        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1049        ASSERT(num_scan_bblks <= INT_MAX);
1050
1051        if (last_blk < num_scan_bblks)
1052                num_scan_bblks = last_blk;
1053        start_blk = last_blk - num_scan_bblks;
1054
1055        /*
1056         * We search for any instances of cycle number 0 that occur before
1057         * our current estimate of the head.  What we're trying to detect is
1058         *        1 ... | 0 | 1 | 0...
1059         *                       ^ binary search ends here
1060         */
1061        if ((error = xlog_find_verify_cycle(log, start_blk,
1062                                         (int)num_scan_bblks, 0, &new_blk)))
1063                goto bp_err;
1064        if (new_blk != -1)
1065                last_blk = new_blk;
1066
1067        /*
1068         * Potentially backup over partial log record write.  We don't need
1069         * to search the end of the log because we know it is zero.
1070         */
1071        if ((error = xlog_find_verify_log_record(log, start_blk,
1072                                &last_blk, 0)) == -1) {
1073            error = XFS_ERROR(EIO);
1074            goto bp_err;
1075        } else if (error)
1076            goto bp_err;
1077
1078        *blk_no = last_blk;
1079bp_err:
1080        xlog_put_bp(bp);
1081        if (error)
1082                return error;
1083        return -1;
1084}
1085
1086/*
1087 * These are simple subroutines used by xlog_clear_stale_blocks() below
1088 * to initialize a buffer full of empty log record headers and write
1089 * them into the log.
1090 */
1091STATIC void
1092xlog_add_record(
1093        xlog_t                        *log,
1094        xfs_caddr_t                buf,
1095        int                        cycle,
1096        int                        block,
1097        int                        tail_cycle,
1098        int                        tail_block)
1099{
1100        xlog_rec_header_t        *recp = (xlog_rec_header_t *)buf;
1101
1102        memset(buf, 0, BBSIZE);
1103        recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1104        recp->h_cycle = cpu_to_be32(cycle);
1105        recp->h_version = cpu_to_be32(
1106                        xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1107        recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1108        recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1109        recp->h_fmt = cpu_to_be32(XLOG_FMT);
1110        memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1111}
1112
1113STATIC int
1114xlog_write_log_records(
1115        xlog_t                *log,
1116        int                cycle,
1117        int                start_block,
1118        int                blocks,
1119        int                tail_cycle,
1120        int                tail_block)
1121{
1122        xfs_caddr_t        offset;
1123        xfs_buf_t        *bp;
1124        int                balign, ealign;
1125        int                sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1126        int                end_block = start_block + blocks;
1127        int                bufblks;
1128        int                error = 0;
1129        int                i, j = 0;
1130
1131        bufblks = 1 << ffs(blocks);
1132        while (!(bp = xlog_get_bp(log, bufblks))) {
1133                bufblks >>= 1;
1134                if (bufblks <= log->l_sectbb_log)
1135                        return ENOMEM;
1136        }
1137
1138        /* We may need to do a read at the start to fill in part of
1139         * the buffer in the starting sector not covered by the first
1140         * write below.
1141         */
1142        balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1143        if (balign != start_block) {
1144                if ((error = xlog_bread(log, start_block, 1, bp))) {
1145                        xlog_put_bp(bp);
1146                        return error;
1147                }
1148                j = start_block - balign;
1149        }
1150
1151        for (i = start_block; i < end_block; i += bufblks) {
1152                int                bcount, endcount;
1153
1154                bcount = min(bufblks, end_block - start_block);
1155                endcount = bcount - j;
1156
1157                /* We may need to do a read at the end to fill in part of
1158                 * the buffer in the final sector not covered by the write.
1159                 * If this is the same sector as the above read, skip it.
1160                 */
1161                ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1162                if (j == 0 && (start_block + endcount > ealign)) {
1163                        offset = XFS_BUF_PTR(bp);
1164                        balign = BBTOB(ealign - start_block);
1165                        error = XFS_BUF_SET_PTR(bp, offset + balign,
1166                                                BBTOB(sectbb));
1167                        if (!error)
1168                                error = xlog_bread(log, ealign, sectbb, bp);
1169                        if (!error)
1170                                error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1171                        if (error)
1172                                break;
1173                }
1174
1175                offset = xlog_align(log, start_block, endcount, bp);
1176                for (; j < endcount; j++) {
1177                        xlog_add_record(log, offset, cycle, i+j,
1178                                        tail_cycle, tail_block);
1179                        offset += BBSIZE;
1180                }
1181                error = xlog_bwrite(log, start_block, endcount, bp);
1182                if (error)
1183                        break;
1184                start_block += endcount;
1185                j = 0;
1186        }
1187        xlog_put_bp(bp);
1188        return error;
1189}
1190
1191/*
1192 * This routine is called to blow away any incomplete log writes out
1193 * in front of the log head.  We do this so that we won't become confused
1194 * if we come up, write only a little bit more, and then crash again.
1195 * If we leave the partial log records out there, this situation could
1196 * cause us to think those partial writes are valid blocks since they
1197 * have the current cycle number.  We get rid of them by overwriting them
1198 * with empty log records with the old cycle number rather than the
1199 * current one.
1200 *
1201 * The tail lsn is passed in rather than taken from
1202 * the log so that we will not write over the unmount record after a
1203 * clean unmount in a 512 block log.  Doing so would leave the log without
1204 * any valid log records in it until a new one was written.  If we crashed
1205 * during that time we would not be able to recover.
1206 */
1207STATIC int
1208xlog_clear_stale_blocks(
1209        xlog_t                *log,
1210        xfs_lsn_t        tail_lsn)
1211{
1212        int                tail_cycle, head_cycle;
1213        int                tail_block, head_block;
1214        int                tail_distance, max_distance;
1215        int                distance;
1216        int                error;
1217
1218        tail_cycle = CYCLE_LSN(tail_lsn);
1219        tail_block = BLOCK_LSN(tail_lsn);
1220        head_cycle = log->l_curr_cycle;
1221        head_block = log->l_curr_block;
1222
1223        /*
1224         * Figure out the distance between the new head of the log
1225         * and the tail.  We want to write over any blocks beyond the
1226         * head that we may have written just before the crash, but
1227         * we don't want to overwrite the tail of the log.
1228         */
1229        if (head_cycle == tail_cycle) {
1230                /*
1231                 * The tail is behind the head in the physical log,
1232                 * so the distance from the head to the tail is the
1233                 * distance from the head to the end of the log plus
1234                 * the distance from the beginning of the log to the
1235                 * tail.
1236                 */
1237                if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1238                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1239                                         XFS_ERRLEVEL_LOW, log->l_mp);
1240                        return XFS_ERROR(EFSCORRUPTED);
1241                }
1242                tail_distance = tail_block + (log->l_logBBsize - head_block);
1243        } else {
1244                /*
1245                 * The head is behind the tail in the physical log,
1246                 * so the distance from the head to the tail is just
1247                 * the tail block minus the head block.
1248                 */
1249                if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1250                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1251                                         XFS_ERRLEVEL_LOW, log->l_mp);
1252                        return XFS_ERROR(EFSCORRUPTED);
1253                }
1254                tail_distance = tail_block - head_block;
1255        }
1256
1257        /*
1258         * If the head is right up against the tail, we can't clear
1259         * anything.
1260         */
1261        if (tail_distance <= 0) {
1262                ASSERT(tail_distance == 0);
1263                return 0;
1264        }
1265
1266        max_distance = XLOG_TOTAL_REC_SHIFT(log);
1267        /*
1268         * Take the smaller of the maximum amount of outstanding I/O
1269         * we could have and the distance to the tail to clear out.
1270         * We take the smaller so that we don't overwrite the tail and
1271         * we don't waste all day writing from the head to the tail
1272         * for no reason.
1273         */
1274        max_distance = MIN(max_distance, tail_distance);
1275
1276        if ((head_block + max_distance) <= log->l_logBBsize) {
1277                /*
1278                 * We can stomp all the blocks we need to without
1279                 * wrapping around the end of the log.  Just do it
1280                 * in a single write.  Use the cycle number of the
1281                 * current cycle minus one so that the log will look like:
1282                 *     n ... | n - 1 ...
1283                 */
1284                error = xlog_write_log_records(log, (head_cycle - 1),
1285                                head_block, max_distance, tail_cycle,
1286                                tail_block);
1287                if (error)
1288                        return error;
1289        } else {
1290                /*
1291                 * We need to wrap around the end of the physical log in
1292                 * order to clear all the blocks.  Do it in two separate
1293                 * I/Os.  The first write should be from the head to the
1294                 * end of the physical log, and it should use the current
1295                 * cycle number minus one just like above.
1296                 */
1297                distance = log->l_logBBsize - head_block;
1298                error = xlog_write_log_records(log, (head_cycle - 1),
1299                                head_block, distance, tail_cycle,
1300                                tail_block);
1301
1302                if (error)
1303                        return error;
1304
1305                /*
1306                 * Now write the blocks at the start of the physical log.
1307                 * This writes the remainder of the blocks we want to clear.
1308                 * It uses the current cycle number since we're now on the
1309                 * same cycle as the head so that we get:
1310                 *    n ... n ... | n - 1 ...
1311                 *    ^^^^^ blocks we're writing
1312                 */
1313                distance = max_distance - (log->l_logBBsize - head_block);
1314                error = xlog_write_log_records(log, head_cycle, 0, distance,
1315                                tail_cycle, tail_block);
1316                if (error)
1317                        return error;
1318        }
1319
1320        return 0;
1321}
1322
1323/******************************************************************************
1324 *
1325 *                Log recover routines
1326 *
1327 ******************************************************************************
1328 */
1329
1330STATIC xlog_recover_t *
1331xlog_recover_find_tid(
1332        xlog_recover_t                *q,
1333        xlog_tid_t                tid)
1334{
1335        xlog_recover_t                *p = q;
1336
1337        while (p != NULL) {
1338                if (p->r_log_tid == tid)
1339                    break;
1340                p = p->r_next;
1341        }
1342        return p;
1343}
1344
1345STATIC void
1346xlog_recover_put_hashq(
1347        xlog_recover_t                **q,
1348        xlog_recover_t                *trans)
1349{
1350        trans->r_next = *q;
1351        *q = trans;
1352}
1353
1354STATIC void
1355xlog_recover_add_item(
1356        xlog_recover_item_t        **itemq)
1357{
1358        xlog_recover_item_t        *item;
1359
1360        item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1361        xlog_recover_insert_item_backq(itemq, item);
1362}
1363
1364STATIC int
1365xlog_recover_add_to_cont_trans(
1366        xlog_recover_t                *trans,
1367        xfs_caddr_t                dp,
1368        int                        len)
1369{
1370        xlog_recover_item_t        *item;
1371        xfs_caddr_t                ptr, old_ptr;
1372        int                        old_len;
1373
1374        item = trans->r_itemq;
1375        if (item == NULL) {
1376                /* finish copying rest of trans header */
1377                xlog_recover_add_item(&trans->r_itemq);
1378                ptr = (xfs_caddr_t) &trans->r_theader +
1379                                sizeof(xfs_trans_header_t) - len;
1380                memcpy(ptr, dp, len); /* d, s, l */
1381                return 0;
1382        }
1383        item = item->ri_prev;
1384
1385        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1386        old_len = item->ri_buf[item->ri_cnt-1].i_len;
1387
1388        ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1389        memcpy(&ptr[old_len], dp, len); /* d, s, l */
1390        item->ri_buf[item->ri_cnt-1].i_len += len;
1391        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1392        return 0;
1393}
1394
1395/*
1396 * The next region to add is the start of a new region.  It could be
1397 * a whole region or it could be the first part of a new region.  Because
1398 * of this, the assumption here is that the type and size fields of all
1399 * format structures fit into the first 32 bits of the structure.
1400 *
1401 * This works because all regions must be 32 bit aligned.  Therefore, we
1402 * either have both fields or we have neither field.  In the case we have
1403 * neither field, the data part of the region is zero length.  We only have
1404 * a log_op_header and can throw away the header since a new one will appear
1405 * later.  If we have at least 4 bytes, then we can determine how many regions
1406 * will appear in the current log item.
1407 */
1408STATIC int
1409xlog_recover_add_to_trans(
1410        xlog_recover_t                *trans,
1411        xfs_caddr_t                dp,
1412        int                        len)
1413{
1414        xfs_inode_log_format_t        *in_f;                        /* any will do */
1415        xlog_recover_item_t        *item;
1416        xfs_caddr_t                ptr;
1417
1418        if (!len)
1419                return 0;
1420        item = trans->r_itemq;
1421        if (item == NULL) {
1422                /* we need to catch log corruptions here */
1423                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1424                        xlog_warn("XFS: xlog_recover_add_to_trans: "
1425                                  "bad header magic number");
1426                        ASSERT(0);
1427                        return XFS_ERROR(EIO);
1428                }
1429                if (len == sizeof(xfs_trans_header_t))
1430                        xlog_recover_add_item(&trans->r_itemq);
1431                memcpy(&trans->r_theader, dp, len); /* d, s, l */
1432                return 0;
1433        }
1434
1435        ptr = kmem_alloc(len, KM_SLEEP);
1436        memcpy(ptr, dp, len);
1437        in_f = (xfs_inode_log_format_t *)ptr;
1438
1439        if (item->ri_prev->ri_total != 0 &&
1440             item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1441                xlog_recover_add_item(&trans->r_itemq);
1442        }
1443        item = trans->r_itemq;
1444        item = item->ri_prev;
1445
1446        if (item->ri_total == 0) {                /* first region to be added */
1447                item->ri_total        = in_f->ilf_size;
1448                ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1449                item->ri_buf = kmem_zalloc((item->ri_total *
1450                                            sizeof(xfs_log_iovec_t)), KM_SLEEP);
1451        }
1452        ASSERT(item->ri_total > item->ri_cnt);
1453        /* Description region is ri_buf[0] */
1454        item->ri_buf[item->ri_cnt].i_addr = ptr;
1455        item->ri_buf[item->ri_cnt].i_len  = len;
1456        item->ri_cnt++;
1457        return 0;
1458}
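/*
 * A minimal userspace sketch of the "first 32 bits" assumption
 * described above: every *_log_format structure begins with a 16-bit
 * type and a 16-bit size, so any of them can be used to peek at how
 * many regions the new item will carry.  The struct is a stand-in,
 * not the real xfs_inode_log_format_t.
 */
#include <stdint.h>
#include <string.h>

struct fmt_hdr {                /* common prefix of the format structs */
        uint16_t type;          /* log item type */
        uint16_t size;          /* number of regions in the item */
};

static uint16_t peek_region_count(const char *dp, int len)
{
        struct fmt_hdr hdr;

        if (len < (int)sizeof(hdr))
                return 0;       /* zero-length data part: header only */
        memcpy(&hdr, dp, sizeof(hdr));
        return hdr.size;
}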
1459
1460STATIC void
1461xlog_recover_new_tid(
1462        xlog_recover_t                **q,
1463        xlog_tid_t                tid,
1464        xfs_lsn_t                lsn)
1465{
1466        xlog_recover_t                *trans;
1467
1468        trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1469        trans->r_log_tid   = tid;
1470        trans->r_lsn           = lsn;
1471        xlog_recover_put_hashq(q, trans);
1472}
1473
1474STATIC int
1475xlog_recover_unlink_tid(
1476        xlog_recover_t                **q,
1477        xlog_recover_t                *trans)
1478{
1479        xlog_recover_t                *tp;
1480        int                        found = 0;
1481
1482        ASSERT(trans != NULL);
1483        if (trans == *q) {
1484                *q = (*q)->r_next;
1485        } else {
1486                tp = *q;
1487                while (tp) {
1488                        if (tp->r_next == trans) {
1489                                found = 1;
1490                                break;
1491                        }
1492                        tp = tp->r_next;
1493                }
1494                if (!found) {
1495                        xlog_warn(
1496                             "XFS: xlog_recover_unlink_tid: trans not found");
1497                        ASSERT(0);
1498                        return XFS_ERROR(EIO);
1499                }
1500                tp->r_next = tp->r_next->r_next;
1501        }
1502        return 0;
1503}
1504
1505STATIC void
1506xlog_recover_insert_item_backq(
1507        xlog_recover_item_t        **q,
1508        xlog_recover_item_t        *item)
1509{
1510        if (*q == NULL) {
1511                item->ri_prev = item->ri_next = item;
1512                *q = item;
1513        } else {
1514                item->ri_next                = *q;
1515                item->ri_prev                = (*q)->ri_prev;
1516                (*q)->ri_prev                = item;
1517                item->ri_prev->ri_next        = item;
1518        }
1519}
1520
1521STATIC void
1522xlog_recover_insert_item_frontq(
1523        xlog_recover_item_t        **q,
1524        xlog_recover_item_t        *item)
1525{
1526        xlog_recover_insert_item_backq(q, item);
1527        *q = item;
1528}
1529
1530STATIC int
1531xlog_recover_reorder_trans(
1532        xlog_recover_t                *trans)
1533{
1534        xlog_recover_item_t        *first_item, *itemq, *itemq_next;
1535        xfs_buf_log_format_t        *buf_f;
1536        ushort                        flags = 0;
1537
1538        first_item = itemq = trans->r_itemq;
1539        trans->r_itemq = NULL;
1540        do {
1541                itemq_next = itemq->ri_next;
1542                buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1543
1544                switch (ITEM_TYPE(itemq)) {
1545                case XFS_LI_BUF:
1546                        flags = buf_f->blf_flags;
1547                        if (!(flags & XFS_BLI_CANCEL)) {
1548                                xlog_recover_insert_item_frontq(&trans->r_itemq,
1549                                                                itemq);
1550                                break;
1551                        }
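                        /* fall through: cancelled buffers are queued at the back */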
1552                case XFS_LI_INODE:
1553                case XFS_LI_DQUOT:
1554                case XFS_LI_QUOTAOFF:
1555                case XFS_LI_EFD:
1556                case XFS_LI_EFI:
1557                        xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1558                        break;
1559                default:
1560                        xlog_warn(
1561        "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1562                        ASSERT(0);
1563                        return XFS_ERROR(EIO);
1564                }
1565                itemq = itemq_next;
1566        } while (first_item != itemq);
1567        return 0;
1568}
1569
1570/*
1571 * Build up the table of buf cancel records so that we don't replay
1572 * cancelled data in the second pass.  For buffer records that are
1573 * not cancel records, there is nothing to do here so we just return.
1574 *
1575 * If we get a cancel record which is already in the table, this indicates
1576 * that the buffer was cancelled multiple times.  In order to ensure
1577 * that during pass 2 we keep the record in the table until we reach its
1578 * last occurrence in the log, we keep a reference count in the cancel
1579 * record in the table to tell us how many times we expect to see this
1580 * record during the second pass.
1581 */
1582STATIC void
1583xlog_recover_do_buffer_pass1(
1584        xlog_t                        *log,
1585        xfs_buf_log_format_t        *buf_f)
1586{
1587        xfs_buf_cancel_t        *bcp;
1588        xfs_buf_cancel_t        *nextp;
1589        xfs_buf_cancel_t        *prevp;
1590        xfs_buf_cancel_t        **bucket;
1591        xfs_daddr_t                blkno = 0;
1592        uint                        len = 0;
1593        ushort                        flags = 0;
1594
1595        switch (buf_f->blf_type) {
1596        case XFS_LI_BUF:
1597                blkno = buf_f->blf_blkno;
1598                len = buf_f->blf_len;
1599                flags = buf_f->blf_flags;
1600                break;
1601        }
1602
1603        /*
1604         * If this isn't a cancel buffer item, then just return.
1605         */
1606        if (!(flags & XFS_BLI_CANCEL))
1607                return;
1608
1609        /*
1610         * Insert an xfs_buf_cancel record into the hash table of
1611         * them.  If there is already an identical record, bump
1612         * its reference count.
1613         */
1614        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1615                                          XLOG_BC_TABLE_SIZE];
1616        /*
1617         * If the hash bucket is empty then just insert a new record into
1618         * the bucket.
1619         */
1620        if (*bucket == NULL) {
1621                bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1622                                                     KM_SLEEP);
1623                bcp->bc_blkno = blkno;
1624                bcp->bc_len = len;
1625                bcp->bc_refcount = 1;
1626                bcp->bc_next = NULL;
1627                *bucket = bcp;
1628                return;
1629        }
1630
1631        /*
1632         * The hash bucket is not empty, so search for duplicates of our
1633         * record.  If we find one, then just bump its refcount.  If not
1634         * then add us at the end of the list.
1635         */
1636        prevp = NULL;
1637        nextp = *bucket;
1638        while (nextp != NULL) {
1639                if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1640                        nextp->bc_refcount++;
1641                        return;
1642                }
1643                prevp = nextp;
1644                nextp = nextp->bc_next;
1645        }
1646        ASSERT(prevp != NULL);
1647        bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1648                                             KM_SLEEP);
1649        bcp->bc_blkno = blkno;
1650        bcp->bc_len = len;
1651        bcp->bc_refcount = 1;
1652        bcp->bc_next = NULL;
1653        prevp->bc_next = bcp;
1654}
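/*
 * A simplified userspace model of the refcounted cancel table built
 * above.  xfs_buf_cancel_t and XLOG_BC_TABLE_SIZE are replaced by
 * stand-ins; the kernel appends new records at the tail, while this
 * sketch prepends, which does not affect lookups.
 */
#include <stdlib.h>

#define BC_TABLE_SIZE 64        /* stand-in for XLOG_BC_TABLE_SIZE */

struct bcancel {
        long long        blkno;
        unsigned int     len;
        int              refcount;
        struct bcancel  *next;
};

static struct bcancel *bc_table[BC_TABLE_SIZE];

static void bc_add(long long blkno, unsigned int len)
{
        struct bcancel **bucket = &bc_table[(unsigned long long)blkno %
                                            BC_TABLE_SIZE];
        struct bcancel *bcp;

        for (bcp = *bucket; bcp != NULL; bcp = bcp->next) {
                if (bcp->blkno == blkno && bcp->len == len) {
                        bcp->refcount++;        /* cancelled again */
                        return;
                }
        }
        bcp = calloc(1, sizeof(*bcp));
        bcp->blkno = blkno;
        bcp->len = len;
        bcp->refcount = 1;
        bcp->next = *bucket;
        *bucket = bcp;
}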
1655
1656/*
1657 * Check to see whether the buffer being recovered has a corresponding
1658 * entry in the buffer cancel record table.  If it does then return 1
1659 * so that it will be cancelled, otherwise return 0.  If the buffer is
1660 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1661 * the refcount on the entry in the table and remove it from the table
1662 * if this is the last reference.
1663 *
1664 * We remove the cancel record from the table when we encounter its
1665 * last occurrence in the log so that if the same buffer is re-used
1666 * again after its last cancellation we actually replay the changes
1667 * made at that point.
1668 */
1669STATIC int
1670xlog_check_buffer_cancelled(
1671        xlog_t                        *log,
1672        xfs_daddr_t                blkno,
1673        uint                        len,
1674        ushort                        flags)
1675{
1676        xfs_buf_cancel_t        *bcp;
1677        xfs_buf_cancel_t        *prevp;
1678        xfs_buf_cancel_t        **bucket;
1679
1680        if (log->l_buf_cancel_table == NULL) {
1681                /*
1682                 * There is nothing in the table built in pass one,
1683                 * so this buffer must not be cancelled.
1684                 */
1685                ASSERT(!(flags & XFS_BLI_CANCEL));
1686                return 0;
1687        }
1688
1689        bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1690                                          XLOG_BC_TABLE_SIZE];
1691        bcp = *bucket;
1692        if (bcp == NULL) {
1693                /*
1694                 * There is no corresponding entry in the table built
1695                 * in pass one, so this buffer has not been cancelled.
1696                 */
1697                ASSERT(!(flags & XFS_BLI_CANCEL));
1698                return 0;
1699        }
1700
1701        /*
1702         * Search for an entry in the buffer cancel table that
1703         * matches our buffer.
1704         */
1705        prevp = NULL;
1706        while (bcp != NULL) {
1707                if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1708                        /*
1709                         * We've got a match, so return 1 so that the
1710                         * recovery of this buffer is cancelled.
1711                         * If this buffer is actually a buffer cancel
1712                         * log item, then decrement the refcount on the
1713                         * one in the table and remove it if this is the
1714                         * last reference.
1715                         */
1716                        if (flags & XFS_BLI_CANCEL) {
1717                                bcp->bc_refcount--;
1718                                if (bcp->bc_refcount == 0) {
1719                                        if (prevp == NULL) {
1720                                                *bucket = bcp->bc_next;
1721                                        } else {
1722                                                prevp->bc_next = bcp->bc_next;
1723                                        }
1724                                        kmem_free(bcp);
1725                                }
1726                        }
1727                        return 1;
1728                }
1729                prevp = bcp;
1730                bcp = bcp->bc_next;
1731        }
1732        /*
1733         * We didn't find a corresponding entry in the table, so
1734         * return 0 so that the buffer is NOT cancelled.
1735         */
1736        ASSERT(!(flags & XFS_BLI_CANCEL));
1737        return 0;
1738}
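/*
 * Continuing the userspace sketch: the pass-two lookup.  Returns 1 if
 * the buffer should be skipped; when the caller itself holds a cancel
 * item (is_cancel != 0), one reference is dropped and the record is
 * unlinked on the last one, mirroring the refcount logic above.
 */
static int bc_check(long long blkno, unsigned int len, int is_cancel)
{
        struct bcancel **bucket = &bc_table[(unsigned long long)blkno %
                                            BC_TABLE_SIZE];
        struct bcancel *bcp, *prev = NULL;

        for (bcp = *bucket; bcp != NULL; prev = bcp, bcp = bcp->next) {
                if (bcp->blkno != blkno || bcp->len != len)
                        continue;
                if (is_cancel && --bcp->refcount == 0) {
                        if (prev)
                                prev->next = bcp->next;
                        else
                                *bucket = bcp->next;
                        free(bcp);
                }
                return 1;       /* cancelled: do not replay */
        }
        return 0;               /* no entry: replay the buffer */
}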
1739
1740STATIC int
1741xlog_recover_do_buffer_pass2(
1742        xlog_t                        *log,
1743        xfs_buf_log_format_t        *buf_f)
1744{
1745        xfs_daddr_t                blkno = 0;
1746        ushort                        flags = 0;
1747        uint                        len = 0;
1748
1749        switch (buf_f->blf_type) {
1750        case XFS_LI_BUF:
1751                blkno = buf_f->blf_blkno;
1752                flags = buf_f->blf_flags;
1753                len = buf_f->blf_len;
1754                break;
1755        }
1756
1757        return xlog_check_buffer_cancelled(log, blkno, len, flags);
1758}
1759
1760/*
1761 * Perform recovery for a buffer full of inodes.  In these buffers,
1762 * the only data which should be recovered is that which corresponds
1763 * to the di_next_unlinked pointers in the on disk inode structures.
1764 * The rest of the data for the inodes is always logged through the
1765 * inodes themselves rather than the inode buffer and is recovered
1766 * in xlog_recover_do_inode_trans().
1767 *
1768 * The only time when buffers full of inodes are fully recovered is
1769 * when the buffer is full of newly allocated inodes.  In this case
1770 * the buffer will not be marked as an inode buffer and so will be
1771 * sent to xlog_recover_do_reg_buffer() below during recovery.
1772 */
1773STATIC int
1774xlog_recover_do_inode_buffer(
1775        xfs_mount_t                *mp,
1776        xlog_recover_item_t        *item,
1777        xfs_buf_t                *bp,
1778        xfs_buf_log_format_t        *buf_f)
1779{
1780        int                        i;
1781        int                        item_index;
1782        int                        bit;
1783        int                        nbits;
1784        int                        reg_buf_offset;
1785        int                        reg_buf_bytes;
1786        int                        next_unlinked_offset;
1787        int                        inodes_per_buf;
1788        xfs_agino_t                *logged_nextp;
1789        xfs_agino_t                *buffer_nextp;
1790        unsigned int                *data_map = NULL;
1791        unsigned int                map_size = 0;
1792
1793        switch (buf_f->blf_type) {
1794        case XFS_LI_BUF:
1795                data_map = buf_f->blf_data_map;
1796                map_size = buf_f->blf_map_size;
1797                break;
1798        }
1799        /*
1800         * Set the variables corresponding to the current region to
1801         * 0 so that we'll initialize them on the first pass through
1802         * the loop.
1803         */
1804        reg_buf_offset = 0;
1805        reg_buf_bytes = 0;
1806        bit = 0;
1807        nbits = 0;
1808        item_index = 0;
1809        inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1810        for (i = 0; i < inodes_per_buf; i++) {
1811                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1812                        offsetof(xfs_dinode_t, di_next_unlinked);
1813
1814                while (next_unlinked_offset >=
1815                       (reg_buf_offset + reg_buf_bytes)) {
1816                        /*
1817                         * The next di_next_unlinked field is beyond
1818                         * the current logged region.  Find the next
1819                         * logged region that contains or is beyond
1820                         * the current di_next_unlinked field.
1821                         */
1822                        bit += nbits;
1823                        bit = xfs_next_bit(data_map, map_size, bit);
1824
1825                        /*
1826                         * If there are no more logged regions in the
1827                         * buffer, then we're done.
1828                         */
1829                        if (bit == -1) {
1830                                return 0;
1831                        }
1832
1833                        nbits = xfs_contig_bits(data_map, map_size,
1834                                                         bit);
1835                        ASSERT(nbits > 0);
1836                        reg_buf_offset = bit << XFS_BLI_SHIFT;
1837                        reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1838                        item_index++;
1839                }
1840
1841                /*
1842                 * If the current logged region starts after the current
1843                 * di_next_unlinked field, then move on to the next
1844                 * di_next_unlinked field.
1845                 */
1846                if (next_unlinked_offset < reg_buf_offset) {
1847                        continue;
1848                }
1849
1850                ASSERT(item->ri_buf[item_index].i_addr != NULL);
1851                ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1852                ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1853
1854                /*
1855                 * The current logged region contains a copy of the
1856                 * current di_next_unlinked field.  Extract its value
1857                 * and copy it to the buffer copy.
1858                 */
1859                logged_nextp = (xfs_agino_t *)
1860                               ((char *)(item->ri_buf[item_index].i_addr) +
1861                                (next_unlinked_offset - reg_buf_offset));
1862                if (unlikely(*logged_nextp == 0)) {
1863                        xfs_fs_cmn_err(CE_ALERT, mp,
1864                                "bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
1865                                item, bp);
1866                        XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1867                                         XFS_ERRLEVEL_LOW, mp);
1868                        return XFS_ERROR(EFSCORRUPTED);
1869                }
1870
1871                buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1872                                              next_unlinked_offset);
1873                *buffer_nextp = *logged_nextp;
1874        }
1875
1876        return 0;
1877}
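/*
 * The offset arithmetic the loop above depends on, in isolation.  The
 * struct is a stand-in for xfs_dinode_t; only the position of
 * di_next_unlinked matters, not the real on-disk layout.
 */
#include <stddef.h>
#include <stdint.h>

struct dinode_stub {
        uint16_t di_magic;
        /* ... core fields elided ... */
        uint32_t di_next_unlinked;
};

/* byte offset of inode i's next_unlinked pointer within the buffer */
static int next_unlinked_off(int i, int inodesize)
{
        return i * inodesize +
               offsetof(struct dinode_stub, di_next_unlinked);
}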
1878
1879/*
1880 * Perform a 'normal' buffer recovery.  Each logged region of the
1881 * buffer should be copied over the corresponding region in the
1882 * given buffer.  The bitmap in the buf log format structure indicates
1883 * where to place the logged data.
1884 */
1885/*ARGSUSED*/
1886STATIC void
1887xlog_recover_do_reg_buffer(
1888        xlog_recover_item_t        *item,
1889        xfs_buf_t                *bp,
1890        xfs_buf_log_format_t        *buf_f)
1891{
1892        int                        i;
1893        int                        bit;
1894        int                        nbits;
1895        unsigned int                *data_map = NULL;
1896        unsigned int                map_size = 0;
1897        int                     error;
1898
1899        switch (buf_f->blf_type) {
1900        case XFS_LI_BUF:
1901                data_map = buf_f->blf_data_map;
1902                map_size = buf_f->blf_map_size;
1903                break;
1904        }
1905        bit = 0;
1906        i = 1;  /* 0 is the buf format structure */
1907        while (1) {
1908                bit = xfs_next_bit(data_map, map_size, bit);
1909                if (bit == -1)
1910                        break;
1911                nbits = xfs_contig_bits(data_map, map_size, bit);
1912                ASSERT(nbits > 0);
1913                ASSERT(item->ri_buf[i].i_addr != NULL);
1914                ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1915                ASSERT(XFS_BUF_COUNT(bp) >=
1916                       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1917
1918                /*
1919                 * Do a sanity check if this is a dquot buffer. Just checking
1920                 * the first dquot in the buffer should do.  XXX: this is
1921                 * probably a good thing to do for other buf types also.
1922                 */
1923                error = 0;
1924                if (buf_f->blf_flags &
1925                   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1926                        error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1927                                               item->ri_buf[i].i_addr,
1928                                               -1, 0, XFS_QMOPT_DOWARN,
1929                                               "dquot_buf_recover");
1930                }
1931                if (!error)
1932                        memcpy(xfs_buf_offset(bp,
1933                                (uint)bit << XFS_BLI_SHIFT),        /* dest */
1934                                item->ri_buf[i].i_addr,                /* source */
1935                                nbits<<XFS_BLI_SHIFT);                /* length */
1936                i++;
1937                bit += nbits;
1938        }
1939
1940        /* Shouldn't be any more regions */
1941        ASSERT(i == item->ri_total);
1942}
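/*
 * A sketch of the bitmap walk that drives the copy loop above: each
 * set bit marks one logged chunk of the buffer.  CHUNK_SHIFT of 7
 * mirrors XFS_BLI_SHIFT (128-byte chunks); next_bit()/contig_bits()
 * model xfs_next_bit()/xfs_contig_bits().
 */
#include <string.h>

#define CHUNK_SHIFT 7

static int next_bit(const unsigned int *map, int nbits, int start)
{
        int b;

        for (b = start; b < nbits; b++)
                if (map[b / 32] & (1u << (b % 32)))
                        return b;
        return -1;
}

static int contig_bits(const unsigned int *map, int nbits, int start)
{
        int b = start;

        while (b < nbits && (map[b / 32] & (1u << (b % 32))))
                b++;
        return b - start;
}

/* copy each logged region over the matching range of the buffer */
static void replay_regions(char *buf, const unsigned int *map, int nbits,
                           char *const regions[])
{
        int bit = 0, i = 0, run;

        while ((bit = next_bit(map, nbits, bit)) != -1) {
                run = contig_bits(map, nbits, bit);
                memcpy(buf + (bit << CHUNK_SHIFT), regions[i++],
                       run << CHUNK_SHIFT);
                bit += run;
        }
}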
1943
1944/*
1945 * Do some primitive error checking on ondisk dquot data structures.
1946 */
1947int
1948xfs_qm_dqcheck(
1949        xfs_disk_dquot_t *ddq,
1950        xfs_dqid_t         id,
1951        uint                 type,          /* used only when IO_dorepair is true */
1952        uint                 flags,
1953        char                 *str)
1954{
1955        xfs_dqblk_t         *d = (xfs_dqblk_t *)ddq;
1956        int                errs = 0;
1957
1958        /*
1959         * We can encounter an uninitialized dquot buffer for 2 reasons:
1960         * 1. If we crash while deleting the quotainode(s), and those blks got
1961         *    used for user data. This is because we take the path of regular
1962         *    file deletion; however, the size field of quotainodes is never
1963         *    updated, so all the tricks that we play in itruncate_finish
1964         *    don't quite matter.
1965         *
1966         * 2. We don't play the quota buffers when there's a quotaoff logitem.
1967         *    But the allocation will be replayed so we'll end up with an
1968         *    uninitialized quota block.
1969         *
1970         * This is all fine; things are still consistent, and we haven't lost
1971         * any quota information. Just don't complain about bad dquot blks.
1972         */
1973        if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1974                if (flags & XFS_QMOPT_DOWARN)
1975                        cmn_err(CE_ALERT,
1976                        "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1977                        str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1978                errs++;
1979        }
1980        if (ddq->d_version != XFS_DQUOT_VERSION) {
1981                if (flags & XFS_QMOPT_DOWARN)
1982                        cmn_err(CE_ALERT,
1983                        "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1984                        str, id, ddq->d_version, XFS_DQUOT_VERSION);
1985                errs++;
1986        }
1987
1988        if (ddq->d_flags != XFS_DQ_USER &&
1989            ddq->d_flags != XFS_DQ_PROJ &&
1990            ddq->d_flags != XFS_DQ_GROUP) {
1991                if (flags & XFS_QMOPT_DOWARN)
1992                        cmn_err(CE_ALERT,
1993                        "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1994                        str, id, ddq->d_flags);
1995                errs++;
1996        }
1997
1998        if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1999                if (flags & XFS_QMOPT_DOWARN)
2000                        cmn_err(CE_ALERT,
2001                        "%s : ondisk-dquot 0x%p, ID mismatch: "
2002                        "0x%x expected, found id 0x%x",
2003                        str, ddq, id, be32_to_cpu(ddq->d_id));
2004                errs++;
2005        }
2006
2007        if (!errs && ddq->d_id) {
2008                if (ddq->d_blk_softlimit &&
2009                    be64_to_cpu(ddq->d_bcount) >=
2010                                be64_to_cpu(ddq->d_blk_softlimit)) {
2011                        if (!ddq->d_btimer) {
2012                                if (flags & XFS_QMOPT_DOWARN)
2013                                        cmn_err(CE_ALERT,
2014                                        "%s : Dquot ID 0x%x (0x%p) "
2015                                        "BLK TIMER NOT STARTED",
2016                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2017                                errs++;
2018                        }
2019                }
2020                if (ddq->d_ino_softlimit &&
2021                    be64_to_cpu(ddq->d_icount) >=
2022                                be64_to_cpu(ddq->d_ino_softlimit)) {
2023                        if (!ddq->d_itimer) {
2024                                if (flags & XFS_QMOPT_DOWARN)
2025                                        cmn_err(CE_ALERT,
2026                                        "%s : Dquot ID 0x%x (0x%p) "
2027                                        "INODE TIMER NOT STARTED",
2028                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2029                                errs++;
2030                        }
2031                }
2032                if (ddq->d_rtb_softlimit &&
2033                    be64_to_cpu(ddq->d_rtbcount) >=
2034                                be64_to_cpu(ddq->d_rtb_softlimit)) {
2035                        if (!ddq->d_rtbtimer) {
2036                                if (flags & XFS_QMOPT_DOWARN)
2037                                        cmn_err(CE_ALERT,
2038                                        "%s : Dquot ID 0x%x (0x%p) "
2039                                        "RTBLK TIMER NOT STARTED",
2040                                        str, (int)be32_to_cpu(ddq->d_id), ddq);
2041                                errs++;
2042                        }
2043                }
2044        }
2045
2046        if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2047                return errs;
2048
2049        if (flags & XFS_QMOPT_DOWARN)
2050                cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2051
2052        /*
2053         * Typically, a repair is only requested by quotacheck.
2054         */
2055        ASSERT(id != -1);
2056        ASSERT(flags & XFS_QMOPT_DQREPAIR);
2057        memset(d, 0, sizeof(xfs_dqblk_t));
2058
2059        d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2060        d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2061        d->dd_diskdq.d_flags = type;
2062        d->dd_diskdq.d_id = cpu_to_be32(id);
2063
2064        return errs;
2065}
2066
2067/*
2068 * Perform a dquot buffer recovery.
2069 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2070 * (i.e. USR or GRP), then just toss this buffer away; don't recover it.
2071 * Else, treat it as a regular buffer and do recovery.
2072 */
2073STATIC void
2074xlog_recover_do_dquot_buffer(
2075        xfs_mount_t                *mp,
2076        xlog_t                        *log,
2077        xlog_recover_item_t        *item,
2078        xfs_buf_t                *bp,
2079        xfs_buf_log_format_t        *buf_f)
2080{
2081        uint                        type;
2082
2083        /*
2084         * Filesystems are required to send in quota flags at mount time.
2085         */
2086        if (mp->m_qflags == 0) {
2087                return;
2088        }
2089
2090        type = 0;
2091        if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2092                type |= XFS_DQ_USER;
2093        if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2094                type |= XFS_DQ_PROJ;
2095        if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2096                type |= XFS_DQ_GROUP;
2097        /*
2098         * This type of quota was turned off, so ignore this buffer
2099         */
2100        if (log->l_quotaoffs_flag & type)
2101                return;
2102
2103        xlog_recover_do_reg_buffer(item, bp, buf_f);
2104}
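/*
 * The type mask computed above, in isolation: a dquot buffer is
 * thrown away when a QUOTAOFF log item of the same type was already
 * seen.  Flag values are stand-ins for the XFS_BLI_*DQUOT_BUF and
 * XFS_DQ_* constants.
 */
#define BLI_UDQUOT_BUF  0x1
#define BLI_PDQUOT_BUF  0x2
#define BLI_GDQUOT_BUF  0x4
#define DQ_USER         0x1
#define DQ_PROJ         0x2
#define DQ_GROUP        0x4

static unsigned int dquot_type_mask(unsigned int blf_flags)
{
        unsigned int type = 0;

        if (blf_flags & BLI_UDQUOT_BUF)
                type |= DQ_USER;
        if (blf_flags & BLI_PDQUOT_BUF)
                type |= DQ_PROJ;
        if (blf_flags & BLI_GDQUOT_BUF)
                type |= DQ_GROUP;
        return type;
}

/* skip the buffer when this quota type has been turned off */
static int dquot_buffer_skipped(unsigned int quotaoffs_flag,
                                unsigned int blf_flags)
{
        return (quotaoffs_flag & dquot_type_mask(blf_flags)) != 0;
}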
2105
2106/*
2107 * This routine replays a modification made to a buffer at runtime.
2108 * There are actually two types of buffer, regular and inode, which
2109 * are handled differently.  Inode buffers are handled differently
2110 * in that we only recover a specific set of data from them, namely
2111 * the inode di_next_unlinked fields.  This is because all other inode
2112 * data is actually logged via inode records and any data we replay
2113 * here which overlaps that may be stale.
2114 *
2115 * When meta-data buffers are freed at run time we log a buffer item
2116 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2117 * of the buffer in the log should not be replayed at recovery time.
2118 * This is so that if the blocks covered by the buffer are reused for
2119 * file data before we crash we don't end up replaying old, freed
2120 * meta-data into a user's file.
2121 *
2122 * To handle the cancellation of buffer log items, we make two passes
2123 * over the log during recovery.  During the first we build a table of
2124 * those buffers which have been cancelled, and during the second we
2125 * only replay those buffers which do not have corresponding cancel
2126 * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2127 * for more details on the implementation of the table of cancel records.
2128 */
2129STATIC int
2130xlog_recover_do_buffer_trans(
2131        xlog_t                        *log,
2132        xlog_recover_item_t        *item,
2133        int                        pass)
2134{
2135        xfs_buf_log_format_t        *buf_f;
2136        xfs_mount_t                *mp;
2137        xfs_buf_t                *bp;
2138        int                        error;
2139        int                        cancel;
2140        xfs_daddr_t                blkno;
2141        int                        len;
2142        ushort                        flags;
2143
2144        buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2145
2146        if (pass == XLOG_RECOVER_PASS1) {
2147                /*
2148                 * In this pass we're only looking for buf items
2149                 * with the XFS_BLI_CANCEL bit set.
2150                 */
2151                xlog_recover_do_buffer_pass1(log, buf_f);
2152                return 0;
2153        } else {
2154                /*
2155                 * In this pass we want to recover all the buffers
2156                 * which have not been cancelled and are not
2157                 * cancellation buffers themselves.  The routine
2158                 * we call here will tell us whether or not to
2159                 * continue with the replay of this buffer.
2160                 */
2161                cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2162                if (cancel) {
2163                        return 0;
2164                }
2165        }
2166        switch (buf_f->blf_type) {
2167        case XFS_LI_BUF:
2168                blkno = buf_f->blf_blkno;
2169                len = buf_f->blf_len;
2170                flags = buf_f->blf_flags;
2171                break;
2172        default:
2173                xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2174                        "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2175                        buf_f->blf_type, log->l_mp->m_logname ?
2176                        log->l_mp->m_logname : "internal");
2177                XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2178                                 XFS_ERRLEVEL_LOW, log->l_mp);
2179                return XFS_ERROR(EFSCORRUPTED);
2180        }
2181
2182        mp = log->l_mp;
2183        if (flags & XFS_BLI_INODE_BUF) {
2184                bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2185                                                                XFS_BUF_LOCK);
2186        } else {
2187                bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2188        }
2189        if (XFS_BUF_ISERROR(bp)) {
2190                xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2191                                  bp, blkno);
2192                error = XFS_BUF_GETERROR(bp);
2193                xfs_buf_relse(bp);
2194                return error;
2195        }
2196
2197        error = 0;
2198        if (flags & XFS_BLI_INODE_BUF) {
2199                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2200        } else if (flags &
2201                  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2202                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2203        } else {
2204                xlog_recover_do_reg_buffer(item, bp, buf_f);
2205        }
2206        if (error)
2207                return XFS_ERROR(error);
2208
2209        /*
2210         * Perform delayed write on the buffer.  Asynchronous writes will be
2211         * slower when taking into account all the buffers to be flushed.
2212         *
2213         * Also make sure that only inode buffers with good sizes stay in
2214         * the buffer cache.  The kernel moves inodes in buffers of 1 block
2215         * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2216         * buffers in the log can be a different size if the log was generated
2217         * by an older kernel using unclustered inode buffers or a newer kernel
2218         * running with a different inode cluster size.  Regardless, if the
2219         * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2220         * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2221         * the buffer out of the buffer cache so that the buffer won't
2222         * overlap with future reads of those inodes.
2223         */
2224        if (XFS_DINODE_MAGIC ==
2225            be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2226            (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2227                        (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2228                XFS_BUF_STALE(bp);
2229                error = xfs_bwrite(mp, bp);
2230        } else {
2231                ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2232                       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2233                XFS_BUF_SET_FSPRIVATE(bp, mp);
2234                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2235                xfs_bdwrite(mp, bp);
2236        }
2237
2238        return (error);
2239}
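/*
 * The two-pass structure described in the comment above, reduced to a
 * driver sketch that reuses the bc_add()/bc_check() models from
 * earlier.  Names are illustrative, not the kernel's.
 */
enum { RECOVER_PASS1, RECOVER_PASS2 };

static int replay_buffer_item(int pass, long long blkno, unsigned int len,
                              int is_cancel)
{
        if (pass == RECOVER_PASS1) {
                if (is_cancel)
                        bc_add(blkno, len);     /* build the cancel table */
                return 0;
        }
        if (bc_check(blkno, len, is_cancel))
                return 0;                       /* cancelled: skip replay */
        /* ... read the buffer and replay its logged regions ... */
        return 0;
}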
2240
2241STATIC int
2242xlog_recover_do_inode_trans(
2243        xlog_t                        *log,
2244        xlog_recover_item_t        *item,
2245        int                        pass)
2246{
2247        xfs_inode_log_format_t        *in_f;
2248        xfs_mount_t                *mp;
2249        xfs_buf_t                *bp;
2250        xfs_imap_t                imap;
2251        xfs_dinode_t                *dip;
2252        xfs_ino_t                ino;
2253        int                        len;
2254        xfs_caddr_t                src;
2255        xfs_caddr_t                dest;
2256        int                        error;
2257        int                        attr_index;
2258        uint                        fields;
2259        xfs_icdinode_t                *dicp;
2260        int                        need_free = 0;
2261
2262        if (pass == XLOG_RECOVER_PASS1) {
2263                return 0;
2264        }
2265
2266        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2267                in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2268        } else {
2269                in_f = (xfs_inode_log_format_t *)kmem_alloc(
2270                        sizeof(xfs_inode_log_format_t), KM_SLEEP);
2271                need_free = 1;
2272                error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2273                if (error)
2274                        goto error;
2275        }
2276        ino = in_f->ilf_ino;
2277        mp = log->l_mp;
2278        if (ITEM_TYPE(item) == XFS_LI_INODE) {
2279                imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2280                imap.im_len = in_f->ilf_len;
2281                imap.im_boffset = in_f->ilf_boffset;
2282        } else {
2283                /*
2284                 * It's an old inode format record.  We don't know where
2285                 * its cluster is located on disk, and we can't allow
2286                 * xfs_imap() to figure it out because the inode btrees
2287                 * are not ready to be used.  Therefore do not pass the
2288                 * XFS_IMAP_LOOKUP flag to xfs_imap().  This will give
2289                 * us only the single block in which the inode lives
2290                 * rather than its cluster, so we must make sure to
2291                 * invalidate the buffer when we write it out below.
2292                 */
2293                imap.im_blkno = 0;
2294                error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2295                if (error)
2296                        goto error;
2297        }
2298
2299        /*
2300         * Inode buffers can be freed, look out for it,
2301         * and do not replay the inode.
2302         */
2303        if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
2304                error = 0;
2305                goto error;
2306        }
2307
2308        bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
2309                                                                XFS_BUF_LOCK);
2310        if (XFS_BUF_ISERROR(bp)) {
2311                xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2312                                  bp, imap.im_blkno);
2313                error = XFS_BUF_GETERROR(bp);
2314                xfs_buf_relse(bp);
2315                goto error;
2316        }
2317        error = 0;
2318        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2319        dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
2320
2321        /*
2322         * Make sure the place we're flushing out to really looks
2323         * like an inode!
2324         */
2325        if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) {
2326                xfs_buf_relse(bp);
2327                xfs_fs_cmn_err(CE_ALERT, mp,
2328                        "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2329                        dip, bp, ino);
2330                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2331                                 XFS_ERRLEVEL_LOW, mp);
2332                error = EFSCORRUPTED;
2333                goto error;
2334        }
2335        dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
2336        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2337                xfs_buf_relse(bp);
2338                xfs_fs_cmn_err(CE_ALERT, mp,
2339                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2340                        item, ino);
2341                XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2342                                 XFS_ERRLEVEL_LOW, mp);
2343                error = EFSCORRUPTED;
2344                goto error;
2345        }
2346
2347        /* Skip replay when the on disk inode is newer than the log one */
2348        if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) {
2349                /*
2350                 * Deal with the wrap case: the counter wraps at DI_MAX_FLUSH,
2351                 * so a small log value is newer than a saturated on-disk one.
2352                 */
2353                if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH &&
2354                    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2355                        /* do nothing */
2356                } else {
2357                        xfs_buf_relse(bp);
2358                        error = 0;
2359                        goto error;
2360                }
2361        }
2362        /* Take the opportunity to reset the flush iteration count */
2363        dicp->di_flushiter = 0;
2364
2365        if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2366                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2367                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2368                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2369                                         XFS_ERRLEVEL_LOW, mp, dicp);
2370                        xfs_buf_relse(bp);
2371                        xfs_fs_cmn_err(CE_ALERT, mp,
2372                                "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2373                                item, dip, bp, ino);
2374                        error = EFSCORRUPTED;
2375                        goto error;
2376                }
2377        } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2378                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2379                    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2380                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2381                        XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2382                                             XFS_ERRLEVEL_LOW, mp, dicp);
2383                        xfs_buf_relse(bp);
2384                        xfs_fs_cmn_err(CE_ALERT, mp,
2385                                "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2386                                item, dip, bp, ino);
2387                        error = EFSCORRUPTED;
2388                        goto error;
2389                }
2390        }
2391        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2392                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2393                                     XFS_ERRLEVEL_LOW, mp, dicp);
2394                xfs_buf_relse(bp);
2395                xfs_fs_cmn_err(CE_ALERT, mp,
2396                        "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2397                        item, dip, bp, ino,
2398                        dicp->di_nextents + dicp->di_anextents,
2399                        dicp->di_nblocks);
2400                error = EFSCORRUPTED;
2401                goto error;
2402        }
2403        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2404                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2405                                     XFS_ERRLEVEL_LOW, mp, dicp);
2406                xfs_buf_relse(bp);
2407                xfs_fs_cmn_err(CE_ALERT, mp,
2408                        "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2409                        item, dip, bp, ino, dicp->di_forkoff);
2410                error = EFSCORRUPTED;
2411                goto error;
2412        }
2413        if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
2414                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2415                                     XFS_ERRLEVEL_LOW, mp, dicp);
2416                xfs_buf_relse(bp);
2417                xfs_fs_cmn_err(CE_ALERT, mp,
2418                        "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2419                        item->ri_buf[1].i_len, item);
2420                error = EFSCORRUPTED;
2421                goto error;
2422        }
2423
2424        /* The core is in in-core format */
2425        xfs_dinode_to_disk(&dip->di_core,
2426                (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2427
2428        /* the rest is in on-disk format */
2429        if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
2430                memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
2431                        item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
2432                        item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
2433        }
2434
2435        fields = in_f->ilf_fields;
2436        switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2437        case XFS_ILOG_DEV:
2438                dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev);
2439                break;
2440        case XFS_ILOG_UUID:
2441                dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
2442                break;
2443        }
2444
2445        if (in_f->ilf_size == 2)
2446                goto write_inode_buffer;
2447        len = item->ri_buf[2].i_len;
2448        src = item->ri_buf[2].i_addr;
2449        ASSERT(in_f->ilf_size <= 4);
2450        ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2451        ASSERT(!(fields & XFS_ILOG_DFORK) ||
2452               (len == in_f->ilf_dsize));
2453
2454        switch (fields & XFS_ILOG_DFORK) {
2455        case XFS_ILOG_DDATA:
2456        case XFS_ILOG_DEXT:
2457                memcpy(&dip->di_u, src, len);
2458                break;
2459
2460        case XFS_ILOG_DBROOT:
2461                xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2462                                 &(dip->di_u.di_bmbt),
2463                                 XFS_DFORK_DSIZE(dip, mp));
2464                break;
2465
2466        default:
2467                /*
2468                 * There are no data fork flags set.
2469                 */
2470                ASSERT((fields & XFS_ILOG_DFORK) == 0);
2471                break;
2472        }
2473
2474        /*
2475         * If we logged any attribute data, recover it.  There may or
2476         * may not have been any other non-core data logged in this
2477         * transaction.
2478         */
2479        if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2480                if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2481                        attr_index = 3;
2482                } else {
2483                        attr_index = 2;
2484                }
2485                len = item->ri_buf[attr_index].i_len;
2486                src = item->ri_buf[attr_index].i_addr;
2487                ASSERT(len == in_f->ilf_asize);
2488
2489                switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2490                case XFS_ILOG_ADATA:
2491                case XFS_ILOG_AEXT:
2492                        dest = XFS_DFORK_APTR(dip);
2493                        ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2494                        memcpy(dest, src, len);
2495                        break;
2496
2497                case XFS_ILOG_ABROOT:
2498                        dest = XFS_DFORK_APTR(dip);
2499                        xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2500                                         (xfs_bmdr_block_t*)dest,
2501                                         XFS_DFORK_ASIZE(dip, mp));
2502                        break;
2503
2504                default:
2505                        xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2506                        ASSERT(0);
2507                        xfs_buf_relse(bp);
2508                        error = EIO;
2509                        goto error;
2510                }
2511        }
2512
2513write_inode_buffer:
2514        if (ITEM_TYPE(item) == XFS_LI_INODE) {
2515                ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2516                       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2517                XFS_BUF_SET_FSPRIVATE(bp, mp);
2518                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2519                xfs_bdwrite(mp, bp);
2520        } else {
2521                XFS_BUF_STALE(bp);
2522                error = xfs_bwrite(mp, bp);
2523        }
2524
2525error:
2526        if (need_free)
2527                kmem_free(in_f);
2528        return XFS_ERROR(error);
2529}
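/*
 * The flush-iteration wrap test above, in isolation.  DI_MAX_FLUSH is
 * 0xffff in this kernel: once the counter wraps, a small value in the
 * log is actually newer than a saturated value on disk.
 */
#define MAX_FLUSH 0xffffu       /* stand-in for DI_MAX_FLUSH */

static int replay_is_stale(unsigned int log_iter, unsigned int disk_iter)
{
        if (log_iter >= disk_iter)
                return 0;                       /* log copy is newer */
        if (disk_iter == MAX_FLUSH && log_iter < (MAX_FLUSH >> 1))
                return 0;                       /* counter wrapped */
        return 1;                               /* on-disk inode is newer */
}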
2530
2531/*
2532 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2533 * structure, so that we know not to do any dquot item or dquot buffer
2534 * recovery of that type.
2535 */
2536STATIC int
2537xlog_recover_do_quotaoff_trans(
2538        xlog_t                        *log,
2539        xlog_recover_item_t        *item,
2540        int                        pass)
2541{
2542        xfs_qoff_logformat_t        *qoff_f;
2543
2544        if (pass == XLOG_RECOVER_PASS2) {
2545                return (0);
2546        }
2547
2548        qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2549        ASSERT(qoff_f);
2550
2551        /*
2552         * The logitem format's flag tells us if this was user quotaoff,
2553         * group/project quotaoff or both.
2554         */
2555        if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2556                log->l_quotaoffs_flag |= XFS_DQ_USER;
2557        if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2558                log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2559        if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2560                log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2561
2562        return (0);
2563}
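/*
 * The note-taking half of the quotaoff mechanism, pairing with the
 * dquot_buffer_skipped() sketch earlier: pass one accumulates the
 * quotaoff types into one mask that later dquot recovery consults.
 * The *QUOTA_ACCT values are stand-ins; DQ_* are reused from the
 * earlier sketch.
 */
#define UQUOTA_ACCT     0x1
#define PQUOTA_ACCT     0x2
#define GQUOTA_ACCT     0x4

static unsigned int note_quotaoff(unsigned int quotaoffs_flag,
                                  unsigned int qf_flags)
{
        if (qf_flags & UQUOTA_ACCT)
                quotaoffs_flag |= DQ_USER;
        if (qf_flags & PQUOTA_ACCT)
                quotaoffs_flag |= DQ_PROJ;
        if (qf_flags & GQUOTA_ACCT)
                quotaoffs_flag |= DQ_GROUP;
        return quotaoffs_flag;
}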
2564
2565/*
2566 * Recover a dquot record
2567 */
2568STATIC int
2569xlog_recover_do_dquot_trans(
2570        xlog_t                        *log,
2571        xlog_recover_item_t        *item,
2572        int                        pass)
2573{
2574        xfs_mount_t                *mp;
2575        xfs_buf_t                *bp;
2576        struct xfs_disk_dquot        *ddq, *recddq;
2577        int                        error;
2578        xfs_dq_logformat_t        *dq_f;
2579        uint                        type;
2580
2581        if (pass == XLOG_RECOVER_PASS1) {
2582                return 0;
2583        }
2584        mp = log->l_mp;
2585
2586        /*
2587         * Filesystems are required to send in quota flags at mount time.
2588         */
2589        if (mp->m_qflags == 0)
2590                return (0);
2591
2592        recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2593        ASSERT(recddq);
2594        /*
2595         * This type of quota was turned off, so ignore this record.
2596         */
2597        type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2598        ASSERT(type);
2599        if (log->l_quotaoffs_flag & type)
2600                return (0);
2601
2602        /*
2603         * At this point we know that quota was _not_ turned off.
2604         * Since the mount flags are not indicating to us otherwise, this
2605         * must mean that quota is on, and the dquot needs to be replayed.
2606         * Remember that we may not have fully recovered the superblock yet,
2607         * so we can't do the usual trick of looking at the SB quota bits.
2608         *
2609         * The other possibility, of course, is that the quota subsystem was
2610         * removed since the last mount - ENOSYS.
2611         */
2612        dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2613        ASSERT(dq_f);
2614        if ((error = xfs_qm_dqcheck(recddq,
2615                           dq_f->qlf_id,
2616                           0, XFS_QMOPT_DOWARN,
2617                           "xlog_recover_do_dquot_trans (log copy)"))) {
2618                return XFS_ERROR(EIO);
2619        }
2620        ASSERT(dq_f->qlf_len == 1);
2621
2622        error = xfs_read_buf(mp, mp->m_ddev_targp,
2623                             dq_f->qlf_blkno,
2624                             XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2625                             0, &bp);
2626        if (error) {
2627                xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2628                                  bp, dq_f->qlf_blkno);
2629                return error;
2630        }
2631        ASSERT(bp);
2632        ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2633
2634        /*
2635         * At least the magic num portion should be on disk because this
2636         * was among a chunk of dquots created earlier, and we did some
2637         * minimal initialization then.
2638         */
2639        if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2640                           "xlog_recover_do_dquot_trans")) {
2641                xfs_buf_relse(bp);
2642                return XFS_ERROR(EIO);
2643        }
2644
2645        memcpy(ddq, recddq, item->ri_buf[1].i_len);
2646
2647        ASSERT(dq_f->qlf_size == 2);
2648        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2649               XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2650        XFS_BUF_SET_FSPRIVATE(bp, mp);
2651        XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2652        xfs_bdwrite(mp, bp);
2653
2654        return (0);
2655}
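
/*
 * The tail of the dquot replay above is the standard recovery
 * delayed-write pattern: validate the logged copy, validate the
 * on-disk copy, memcpy() the logged image over it, then queue the
 * buffer with the recovery I/O-done callback attached:
 *
 *     XFS_BUF_SET_FSPRIVATE(bp, mp);
 *     XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
 *     xfs_bdwrite(mp, bp);
 *
 * The buffer is not written synchronously; it goes out with the rest
 * of the delayed-write queue, which is flushed before recovery ends.
 */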
2656
2657/*
2658 * This routine is called to create an in-core extent free intent
2659 * item from the efi format structure which was logged on disk.
2660 * It allocates an in-core efi, copies the extents from the format
2661 * structure into it, and adds the efi to the AIL with the given
2662 * LSN.
2663 */
2664STATIC int
2665xlog_recover_do_efi_trans(
2666        xlog_t                        *log,
2667        xlog_recover_item_t        *item,
2668        xfs_lsn_t                lsn,
2669        int                        pass)
2670{
2671        int                        error;
2672        xfs_mount_t                *mp;
2673        xfs_efi_log_item_t        *efip;
2674        xfs_efi_log_format_t        *efi_formatp;
2675
2676        if (pass == XLOG_RECOVER_PASS1) {
2677                return 0;
2678        }
2679
2680        efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2681
2682        mp = log->l_mp;
2683        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2684        if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2685                                         &(efip->efi_format)))) {
2686                xfs_efi_item_free(efip);
2687                return error;
2688        }
2689        efip->efi_next_extent = efi_formatp->efi_nextents;
2690        efip->efi_flags |= XFS_EFI_COMMITTED;
2691
2692        spin_lock(&mp->m_ail_lock);
2693        /*
2694         * xfs_trans_update_ail() drops the AIL lock.
2695         */
2696        xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn);
2697        return 0;
2698}
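
/*
 * Note the asymmetric locking just above: m_ail_lock is acquired
 * here, but the matching release happens inside
 * xfs_trans_update_ail(), so this function correctly returns
 * unlocked even though no spin_unlock() is visible.  The contract,
 * in isolation:
 *
 *     spin_lock(&mp->m_ail_lock);
 *     xfs_trans_update_ail(mp, lip, lsn);     -- returns unlocked
 *
 * Lock checkers that only pair lock/unlock calls within one function
 * body will flag this pattern.
 */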
2699
2700
2701/*
2702 * This routine is called when an efd format structure is found in
2703 * a committed transaction in the log.  Its purpose is to cancel
2704 * the corresponding efi if it was still in the log.  To do this
2705 * it searches the AIL for the efi with an id equal to that in the
2706 * efd format structure.  If we find it, we remove the efi from the
2707 * AIL and free it.
2708 */
2709STATIC void
2710xlog_recover_do_efd_trans(
2711        xlog_t                        *log,
2712        xlog_recover_item_t        *item,
2713        int                        pass)
2714{
2715        xfs_mount_t                *mp;
2716        xfs_efd_log_format_t        *efd_formatp;
2717        xfs_efi_log_item_t        *efip = NULL;
2718        xfs_log_item_t                *lip;
2719        int                        gen;
2720        __uint64_t                efi_id;
2721
2722        if (pass == XLOG_RECOVER_PASS1) {
2723                return;
2724        }
2725
2726        efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2727        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2728                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2729               (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2730                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2731        efi_id = efd_formatp->efd_efi_id;
2732
2733        /*
2734         * Search for the efi with the id in the efd format structure
2735         * in the AIL.
2736         */
2737        mp = log->l_mp;
2738        spin_lock(&mp->m_ail_lock);
2739        lip = xfs_trans_first_ail(mp, &gen);
2740        while (lip != NULL) {
2741                if (lip->li_type == XFS_LI_EFI) {
2742                        efip = (xfs_efi_log_item_t *)lip;
2743                        if (efip->efi_format.efi_id == efi_id) {
2744                                /*
2745                                 * xfs_trans_delete_ail() drops the
2746                                 * AIL lock.
2747                                 */
2748                                xfs_trans_delete_ail(mp, lip);
2749                                xfs_efi_item_free(efip);
2750                                return;
2751                        }
2752                }
2753                lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
2754        }
2755        spin_unlock(&mp->m_ail_lock);
2756}
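
/*
 * This function has two exit paths, and both leave m_ail_lock
 * released: the fall-through spin_unlock() at the end, and the early
 * return at file line 2750, which relies on xfs_trans_delete_ail()
 * dropping the lock on its caller's behalf (the same convention as
 * xfs_trans_update_ail() above).  That early return is the site
 * reported as "leaving function in locked state"; the imbalance is
 * visible only to an analysis that pairs lock and unlock calls
 * within a single function, not in the runtime behaviour.  Sketched:
 *
 *     spin_lock(&mp->m_ail_lock);
 *     while (walking the AIL) {
 *             if (efi matches) {
 *                     xfs_trans_delete_ail(mp, lip);  -- drops lock
 *                     xfs_efi_item_free(efip);
 *                     return;                         -- already unlocked
 *             }
 *     }
 *     spin_unlock(&mp->m_ail_lock);
 */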
2757
2758/*
2759 * Perform the transaction
2760 *
2761 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2762 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2763 */
2764STATIC int
2765xlog_recover_do_trans(
2766        xlog_t                        *log,
2767        xlog_recover_t                *trans,
2768        int                        pass)
2769{
2770        int                        error = 0;
2771        xlog_recover_item_t        *item, *first_item;
2772
2773        if ((error = xlog_recover_reorder_trans(trans)))
2774                return error;
2775        first_item = item = trans->r_itemq;
2776        do {
2777                /*
2778                 * we don't need to worry about the block number being
2779                 * truncated in > 1 TB buffers because in user-land,
2780                 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2781                 * the blknos will get through the user-mode buffer
2782                 * cache properly.  The only bad case is o32 kernels
2783                 * where xfs_daddr_t is 32-bits but mount will warn us
2784                 * off a > 1 TB filesystem before we get here.
2785                 */
2786                if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
2787                        if  ((error = xlog_recover_do_buffer_trans(log, item,
2788                                                                 pass)))
2789                                break;
2790                } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2791                        if ((error = xlog_recover_do_inode_trans(log, item,
2792                                                                pass)))
2793                                break;
2794                } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2795                        if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2796                                                  pass)))
2797                                break;
2798                } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2799                        xlog_recover_do_efd_trans(log, item, pass);
2800                } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2801                        if ((error = xlog_recover_do_dquot_trans(log, item,
2802                                                                   pass)))
2803                                break;
2804                } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2805                        if ((error = xlog_recover_do_quotaoff_trans(log, item,
2806                                                                   pass)))
2807                                break;
2808                } else {
2809                        xlog_warn("XFS: xlog_recover_do_trans");
2810                        ASSERT(0);
2811                        error = XFS_ERROR(EIO);
2812                        break;
2813                }
2814                item = item->ri_next;
2815        } while (first_item != item);
2816
2817        return error;
2818}
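
/*
 * trans->r_itemq is a circular singly linked list, which is why the
 * dispatch loop above (and the free loop below) is a do/while that
 * runs until the walk returns to the first item rather than until a
 * NULL link:
 *
 *     first_item = item = trans->r_itemq;
 *     do {
 *             ... handle item ...
 *             item = item->ri_next;
 *     } while (first_item != item);
 */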
2819
2820/*
2821 * Free up any resources allocated by the transaction
2822 *
2823 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2824 */
2825STATIC void
2826xlog_recover_free_trans(
2827        xlog_recover_t                *trans)
2828{
2829        xlog_recover_item_t        *first_item, *item, *free_item;
2830        int                        i;
2831
2832        item = first_item = trans->r_itemq;
2833        do {
2834                free_item = item;
2835                item = item->ri_next;
2836                 /* Free the regions in the item. */
2837                for (i = 0; i < free_item->ri_cnt; i++) {
2838                        kmem_free(free_item->ri_buf[i].i_addr);
2839                }
2840                /* Free the item itself */
2841                kmem_free(free_item->ri_buf);
2842                kmem_free(free_item);
2843        } while (first_item != item);
2844        /* Free the transaction recover structure */
2845        kmem_free(trans);
2846}
2847
2848STATIC int
2849xlog_recover_commit_trans(
2850        xlog_t                        *log,
2851        xlog_recover_t                **q,
2852        xlog_recover_t                *trans,
2853        int                        pass)
2854{
2855        int                        error;
2856
2857        if ((error = xlog_recover_unlink_tid(q, trans)))
2858                return error;
2859        if ((error = xlog_recover_do_trans(log, trans, pass)))
2860                return error;
2861        xlog_recover_free_trans(trans);                        /* no error */
2862        return 0;
2863}
2864
2865STATIC int
2866xlog_recover_unmount_trans(
2867        xlog_recover_t                *trans)
2868{
2869        /* Do nothing now */
2870        xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2871        return 0;
2872}
2873
2874/*
2875 * There are two valid states of the r_state field.  0 indicates that the
2876 * transaction structure is in a normal state.  We have either seen the
2877 * start of the transaction or the last operation we added was not a partial
2878 * operation.  If the last operation we added to the transaction was a
2879 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2880 *
2881 * NOTE: skip LRs with 0 data length.
2882 */
2883STATIC int
2884xlog_recover_process_data(
2885        xlog_t                        *log,
2886        xlog_recover_t                *rhash[],
2887        xlog_rec_header_t        *rhead,
2888        xfs_caddr_t                dp,
2889        int                        pass)
2890{
2891        xfs_caddr_t                lp;
2892        int                        num_logops;
2893        xlog_op_header_t        *ohead;
2894        xlog_recover_t                *trans;
2895        xlog_tid_t                tid;
2896        int                        error;
2897        unsigned long                hash;
2898        uint                        flags;
2899
2900        lp = dp + be32_to_cpu(rhead->h_len);
2901        num_logops = be32_to_cpu(rhead->h_num_logops);
2902
2903        /* check the log format matches our own - else we can't recover */
2904        if (xlog_header_check_recover(log->l_mp, rhead))
2905                return (XFS_ERROR(EIO));
2906
2907        while ((dp < lp) && num_logops) {
2908                ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2909                ohead = (xlog_op_header_t *)dp;
2910                dp += sizeof(xlog_op_header_t);
2911                if (ohead->oh_clientid != XFS_TRANSACTION &&
2912                    ohead->oh_clientid != XFS_LOG) {
2913                        xlog_warn(
2914                "XFS: xlog_recover_process_data: bad clientid");
2915                        ASSERT(0);
2916                        return (XFS_ERROR(EIO));
2917                }
2918                tid = be32_to_cpu(ohead->oh_tid);
2919                hash = XLOG_RHASH(tid);
2920                trans = xlog_recover_find_tid(rhash[hash], tid);
2921                if (trans == NULL) {                   /* not found; add new tid */
2922                        if (ohead->oh_flags & XLOG_START_TRANS)
2923                                xlog_recover_new_tid(&rhash[hash], tid,
2924                                        be64_to_cpu(rhead->h_lsn));
2925                } else {
2926                        if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2927                                xlog_warn(
2928                        "XFS: xlog_recover_process_data: bad length");
2929                                WARN_ON(1);
2930                                return (XFS_ERROR(EIO));
2931                        }
2932                        flags = ohead->oh_flags & ~XLOG_END_TRANS;
2933                        if (flags & XLOG_WAS_CONT_TRANS)
2934                                flags &= ~XLOG_CONTINUE_TRANS;
2935                        switch (flags) {
2936                        case XLOG_COMMIT_TRANS:
2937                                error = xlog_recover_commit_trans(log,
2938                                                &rhash[hash], trans, pass);
2939                                break;
2940                        case XLOG_UNMOUNT_TRANS:
2941                                error = xlog_recover_unmount_trans(trans);
2942                                break;
2943                        case XLOG_WAS_CONT_TRANS:
2944                                error = xlog_recover_add_to_cont_trans(trans,
2945                                                dp, be32_to_cpu(ohead->oh_len));
2946                                break;
2947                        case XLOG_START_TRANS:
2948                                xlog_warn(
2949                        "XFS: xlog_recover_process_data: bad transaction");
2950                                ASSERT(0);
2951                                error = XFS_ERROR(EIO);
2952                                break;
2953                        case 0:
2954                        case XLOG_CONTINUE_TRANS:
2955                                error = xlog_recover_add_to_trans(trans,
2956                                                dp, be32_to_cpu(ohead->oh_len));
2957                                break;
2958                        default:
2959                                xlog_warn(
2960                        "XFS: xlog_recover_process_data: bad flag");
2961                                ASSERT(0);
2962                                error = XFS_ERROR(EIO);
2963                                break;
2964                        }
2965                        if (error)
2966                                return error;
2967                }
2968                dp += be32_to_cpu(ohead->oh_len);
2969                num_logops--;
2970        }
2971        return 0;
2972}
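
/*
 * Flag normalization above, spelled out: XLOG_END_TRANS is masked
 * off unconditionally, and when the previous op left this
 * transaction partial (XLOG_WAS_CONT_TRANS), XLOG_CONTINUE_TRANS is
 * also cleared so the data is appended to the partial region instead
 * of starting a new one.  The resulting dispatch:
 *
 *     XLOG_COMMIT_TRANS      -- commit and free the transaction
 *     XLOG_UNMOUNT_TRANS     -- unmount record; currently a no-op
 *     XLOG_WAS_CONT_TRANS    -- append to the continued region
 *     XLOG_START_TRANS       -- invalid: the tid is already known
 *     0, XLOG_CONTINUE_TRANS -- add a new region to the transaction
 */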
2973
2974/*
2975 * Process an extent free intent item that was recovered from
2976 * the log.  We need to free the extents that it describes.
2977 */
2978STATIC int
2979xlog_recover_process_efi(
2980        xfs_mount_t                *mp,
2981        xfs_efi_log_item_t        *efip)
2982{
2983        xfs_efd_log_item_t        *efdp;
2984        xfs_trans_t                *tp;
2985        int                        i;
2986        int                        error = 0;
2987        xfs_extent_t                *extp;
2988        xfs_fsblock_t                startblock_fsb;
2989
2990        ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
2991
2992        /*
2993         * First check the validity of the extents described by the
2994         * EFI.  If any are bad, then assume that all are bad and
2995         * just toss the EFI.
2996         */
2997        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2998                extp = &(efip->efi_format.efi_extents[i]);
2999                startblock_fsb = XFS_BB_TO_FSB(mp,
3000                                   XFS_FSB_TO_DADDR(mp, extp->ext_start));
3001                if ((startblock_fsb == 0) ||
3002                    (extp->ext_len == 0) ||
3003                    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3004                    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3005                        /*
3006                         * This will pull the EFI from the AIL and
3007                         * free the memory associated with it.
3008                         */
3009                        xfs_efi_release(efip, efip->efi_format.efi_nextents);
3010                        return XFS_ERROR(EIO);
3011                }
3012        }
3013
3014        tp = xfs_trans_alloc(mp, 0);
3015        error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3016        if (error)
3017                goto abort_error;
3018        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3019
3020        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3021                extp = &(efip->efi_format.efi_extents[i]);
3022                error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3023                if (error)
3024                        goto abort_error;
3025                xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3026                                         extp->ext_len);
3027        }
3028
3029        efip->efi_flags |= XFS_EFI_RECOVERED;
3030        error = xfs_trans_commit(tp, 0);
3031        return error;
3032
3033abort_error:
3034        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3035        return error;
3036}
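
/*
 * The extent checks above bound every logged extent against the
 * superblock geometry: a zero start block, a zero length, a start at
 * or beyond sb_dblocks, or a length of a whole AG or more
 * (>= sb_agblocks, since no extent can span an AG) each mark the EFI
 * as corrupt.  A single bad extent condemns the entire EFI: it is
 * released via xfs_efi_release() and EIO returned without freeing
 * any of the extents it describes.
 */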
3037
3038/*
3039 * Verify that, once we've encountered something other than an EFI
3040 * in the AIL, there are no more EFIs in the AIL.
3041 */
3042#if defined(DEBUG)
3043STATIC void
3044xlog_recover_check_ail(
3045        xfs_mount_t                *mp,
3046        xfs_log_item_t                *lip,
3047        int                        gen)
3048{
3049        int                        orig_gen = gen;
3050
3051        do {
3052                ASSERT(lip->li_type != XFS_LI_EFI);
3053                lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3054                /*
3055                 * The check will be bogus if we restart from the
3056                 * beginning of the AIL, so ASSERT that we don't.
3057                 * We never should since we're holding the AIL lock
3058                 * the entire time.
3059                 */
3060                ASSERT(gen == orig_gen);
3061        } while (lip != NULL);
3062}
3063#endif        /* DEBUG */
3064
3065/*
3066 * When this is called, all of the EFIs which did not have
3067 * corresponding EFDs should be in the AIL.  What we do now
3068 * is free the extents associated with each one.
3069 *
3070 * Since we process the EFIs in normal transactions, they
3071 * will be removed at some point after the commit.  This prevents
3072 * us from just walking down the list processing each one.
3073 * We'll use a flag in the EFI to skip those that we've already
3074 * processed and use the AIL iteration mechanism's generation
3075 * count to try to speed this up at least a bit.
3076 *
3077 * When we start, we know that the EFIs are the only things in
3078 * the AIL.  As we process them, however, other items are added
3079 * to the AIL.  Since everything added to the AIL must come after
3080 * everything already in the AIL, we stop processing as soon as
3081 * we see something other than an EFI in the AIL.
3082 */
3083STATIC int
3084xlog_recover_process_efis(
3085        xlog_t                        *log)
3086{
3087        xfs_log_item_t                *lip;
3088        xfs_efi_log_item_t        *efip;
3089        int                        gen;
3090        xfs_mount_t                *mp;
3091        int                        error = 0;
3092
3093        mp = log->l_mp;
3094        spin_lock(&mp->m_ail_lock);
3095
3096        lip = xfs_trans_first_ail(mp, &gen);
3097        while (lip != NULL) {
3098                /*
3099                 * We're done when we see something other than an EFI.
3100                 */
3101                if (lip->li_type != XFS_LI_EFI) {
3102                        xlog_recover_check_ail(mp, lip, gen);
3103                        break;
3104                }
3105
3106                /*
3107                 * Skip EFIs that we've already processed.
3108                 */
3109                efip = (xfs_efi_log_item_t *)lip;
3110                if (efip->efi_flags & XFS_EFI_RECOVERED) {
3111                        lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3112                        continue;
3113                }
3114
3115                spin_unlock(&mp->m_ail_lock);
3116                error = xlog_recover_process_efi(mp, efip);
3117                if (error)
3118                        return error;
3119                spin_lock(&mp->m_ail_lock);
3120                lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3121        }
3122        spin_unlock(&mp->m_ail_lock);
3123        return error;
3124}
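
/*
 * m_ail_lock is a spinlock, so it cannot be held across
 * xlog_recover_process_efi(), which reserves and commits a
 * transaction and may sleep; hence the unlock/relock pair around
 * that call.  The generation count threaded through
 * xfs_trans_first_ail()/xfs_trans_next_ail() is what lets the walk
 * resume after the lock has been dropped and retaken.  Note that the
 * early "return error" above executes while the lock is already
 * released, so no unlock is owed on that path.
 */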
3125
3126/*
3127 * This routine performs a transaction to null out a bad inode pointer
3128 * in an agi unlinked inode hash bucket.
3129 */
3130STATIC void
3131xlog_recover_clear_agi_bucket(
3132        xfs_mount_t        *mp,
3133        xfs_agnumber_t        agno,
3134        int                bucket)
3135{
3136        xfs_trans_t        *tp;
3137        xfs_agi_t        *agi;
3138        xfs_buf_t        *agibp;
3139        int                offset;
3140        int                error;
3141
3142        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3143        error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3144        if (!error)
3145                error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3146                                   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3147                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3148        if (error)
3149                goto out_abort;
3150
3151        error = EINVAL;
3152        agi = XFS_BUF_TO_AGI(agibp);
3153        if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3154                goto out_abort;
3155
3156        agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3157        offset = offsetof(xfs_agi_t, agi_unlinked) +
3158                 (sizeof(xfs_agino_t) * bucket);
3159        xfs_trans_log_buf(tp, agibp, offset,
3160                          (offset + sizeof(xfs_agino_t) - 1));
3161
3162        error = xfs_trans_commit(tp, 0);
3163        if (error)
3164                goto out_error;
3165        return;
3166
3167out_abort:
3168        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3169out_error:
3170        xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3171                        "failed to clear agi %d. Continuing.", agno);
3172        return;
3173}
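
/*
 * The range logged above covers exactly the one modified bucket slot
 * in the AGI:
 *
 *     offset = offsetof(xfs_agi_t, agi_unlinked)
 *              + bucket * sizeof(xfs_agino_t);
 *     xfs_trans_log_buf(tp, agibp, offset,
 *                       offset + sizeof(xfs_agino_t) - 1);
 *
 * so replaying this transaction later touches nothing else in the
 * AGI header.
 */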
3174
3175/*
3176 * xlog_iunlink_recover
3177 *
3178 * This is called during recovery to process any inodes which
3179 * were unlinked but not freed when the system crashed.  These
3180 * inodes will be on the lists in the AGI blocks.  What we do
3181 * here is scan all the AGIs and fully truncate and free any
3182 * inodes found on the lists.  Each inode is removed from the
3183 * lists when it has been fully truncated and is freed.  The
3184 * freeing of the inode and its removal from the list must be
3185 * atomic.
3186 */
3187void
3188xlog_recover_process_iunlinks(
3189        xlog_t                *log)
3190{
3191        xfs_mount_t        *mp;
3192        xfs_agnumber_t        agno;
3193        xfs_agi_t        *agi;
3194        xfs_buf_t        *agibp;
3195        xfs_buf_t        *ibp;
3196        xfs_dinode_t        *dip;
3197        xfs_inode_t        *ip;
3198        xfs_agino_t        agino;
3199        xfs_ino_t        ino;
3200        int                bucket;
3201        int                error;
3202        uint                mp_dmevmask;
3203
3204        mp = log->l_mp;
3205
3206        /*
3207         * Prevent any DMAPI event from being sent while in this function.
3208         */
3209        mp_dmevmask = mp->m_dmevmask;
3210        mp->m_dmevmask = 0;
3211
3212        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3213                /*
3214                 * Find the agi for this ag.
3215                 */
3216                agibp = xfs_buf_read(mp->m_ddev_targp,
3217                                XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3218                                XFS_FSS_TO_BB(mp, 1), 0);
3219                if (XFS_BUF_ISERROR(agibp)) {
3220                        xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
3221                                log->l_mp, agibp,
3222                                XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
3223                }
3224                agi = XFS_BUF_TO_AGI(agibp);
3225                ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3226
3227                for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3228
3229                        agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3230                        while (agino != NULLAGINO) {
3231
3232                                /*
3233                                 * Release the agi buffer so that it can
3234                                 * be acquired in the normal course of the
3235                                 * transaction to truncate and free the inode.
3236                                 */
3237                                xfs_buf_relse(agibp);
3238
3239                                ino = XFS_AGINO_TO_INO(mp, agno, agino);
3240                                error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3241                                ASSERT(error || (ip != NULL));
3242
3243                                if (!error) {
3244                                        /*
3245                                         * Get the on disk inode to find the
3246                                         * next inode in the bucket.
3247                                         */
3248                                        error = xfs_itobp(mp, NULL, ip, &dip,
3249                                                        &ibp, 0, 0,
3250                                                        XFS_BUF_LOCK);
3251                                        ASSERT(error || (dip != NULL));
3252                                }
3253
3254                                if (!error) {
3255                                        ASSERT(ip->i_d.di_nlink == 0);
3256
3257                                        /* setup for the next pass */
3258                                        agino = be32_to_cpu(
3259                                                        dip->di_next_unlinked);
3260                                        xfs_buf_relse(ibp);
3261                                        /*
3262                                         * Prevent any DMAPI event from
3263                                         * being sent when the
3264                                         * reference on the inode is
3265                                         * dropped.
3266                                         */
3267                                        ip->i_d.di_dmevmask = 0;
3268
3269                                        /*
3270                                         * If this is a new inode, handle
3271                                         * it specially.  Otherwise,
3272                                         * just drop our reference to the
3273                                         * inode.  If there are no
3274                                         * other references, this will
3275                                         * send the inode to
3276                                         * xfs_inactive() which will
3277                                         * truncate the file and free
3278                                         * the inode.
3279                                         */
3280                                        if (ip->i_d.di_mode == 0)
3281                                                xfs_iput_new(ip, 0);
3282                                        else
3283                                                IRELE(ip);
3284                                } else {
3285                                        /*
3286                                         * We can't read in the inode
3287                                         * this bucket points to, or
3288                                         * this inode is messed up.  Just
3289                                         * ditch this bucket of inodes.  We
3290                                         * will lose some inodes and space,
3291                                         * but at least we won't hang.  Call
3292                                         * xlog_recover_clear_agi_bucket()
3293                                         * to perform a transaction to clear
3294                                         * the inode pointer in the bucket.
3295                                         */
3296                                        xlog_recover_clear_agi_bucket(mp, agno,
3297                                                        bucket);
3298
3299                                        agino = NULLAGINO;
3300                                }
3301
3302                                /*
3303                                 * Reacquire the agi buffer and continue around
3304                                 * the loop.
3305                                 */
3306                                agibp = xfs_buf_read(mp->m_ddev_targp,
3307                                                XFS_AG_DADDR(mp, agno,
3308                                                        XFS_AGI_DADDR(mp)),
3309                                                XFS_FSS_TO_BB(mp, 1), 0);
3310                                if (XFS_BUF_ISERROR(agibp)) {
3311                                        xfs_ioerror_alert(
3312                                "xlog_recover_process_iunlinks(#2)",
3313                                                log->l_mp, agibp,
3314                                                XFS_AG_DADDR(mp, agno,
3315                                                        XFS_AGI_DADDR(mp)));
3316                                }
3317                                agi = XFS_BUF_TO_AGI(agibp);
3318                                ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3319                                        agi->agi_magicnum));
3320                        }
3321                }
3322
3323                /*
3324                 * Release the buffer for the current agi so we can
3325                 * go on to the next one.
3326                 */
3327                xfs_buf_relse(agibp);
3328        }
3329
3330        mp->m_dmevmask = mp_dmevmask;
3331}
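
/*
 * Each AGI bucket above is the head of a singly linked list threaded
 * through the on-disk inodes' di_next_unlinked fields, so the inner
 * walk has this shape:
 *
 *     agino = be32_to_cpu(agi->agi_unlinked[bucket]);
 *     while (agino != NULLAGINO) {
 *             ino = XFS_AGINO_TO_INO(mp, agno, agino);
 *             ... xfs_iget()/xfs_itobp() the inode ...
 *             agino = be32_to_cpu(dip->di_next_unlinked);
 *             ... drop the reference; inactivation frees it ...
 *     }
 *
 * The AGI buffer is released before each inode is processed and
 * re-read afterwards because freeing the inode runs an ordinary
 * transaction that may need to lock that same AGI buffer itself.
 */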
3332
3333
3334#ifdef DEBUG
3335STATIC void
3336xlog_pack_data_checksum(
3337        xlog_t                *log,
3338        xlog_in_core_t        *iclog,
3339        int                size)
3340{
3341        int                i;
3342        __be32                *up;
3343        uint                chksum = 0;
3344
3345        up = (__be32 *)iclog->ic_datap;
3346        /* divide length by 4 to get # words */
3347        for (i = 0; i < (size >> 2); i++) {
3348                chksum ^= be32_to_cpu(*up);
3349                up++;
3350        }
3351        iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3352}
3353#else
3354#define xlog_pack_data_checksum(log, iclog, size)
3355#endif
3356
3357/*
3358 * Stamp cycle number in every block
3359 */
3360void
3361xlog_pack_data(
3362        xlog_t                        *log,
3363        xlog_in_core_t                *iclog,
3364        int                        roundoff)
3365{
3366        int                        i, j, k;
3367        int                        size = iclog->ic_offset + roundoff;
3368        __be32                        cycle_lsn;
3369        xfs_caddr_t                dp;
3370        xlog_in_core_2_t        *xhdr;
3371
3372        xlog_pack_data_checksum(log, iclog, size);
3373
3374        cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3375
3376        dp = iclog->ic_datap;
3377        for (i = 0; i < BTOBB(size) &&
3378                i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3379                iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3380                *(__be32 *)dp = cycle_lsn;
3381                dp += BBSIZE;
3382        }
3383
3384        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3385                xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3386                for ( ; i < BTOBB(size); i++) {
3387                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3388                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3389                        xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3390                        *(__be32 *)dp = cycle_lsn;
3391                        dp += BBSIZE;
3392                }
3393
3394                for (i = 1; i < log->l_iclog_heads; i++) {
3395                        xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3396                }
3397        }
3398}
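
/*
 * A worked example of the stamping above, assuming the usual 32k
 * XLOG_HEADER_CYCLE_SIZE (64 basic blocks per header): for a 16k
 * iclog, BTOBB(size) is 32, so the first loop saves the leading
 * four bytes of each of the 32 data blocks into h_cycle_data[0..31]
 * and overwrites them with the cycle number.  Only v2 logs with
 * iclogs larger than 32k enter the xhdr loop, which spills the saved
 * words into the extended headers at 64-block strides -- that is the
 * j/k index arithmetic.
 */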
3399
3400#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3401STATIC void
3402xlog_unpack_data_checksum(
3403        xlog_rec_header_t        *rhead,
3404        xfs_caddr_t                dp,
3405        xlog_t                        *log)
3406{
3407        __be32                        *up = (__be32 *)dp;
3408        uint                        chksum = 0;
3409        int                        i;
3410
3411        /* divide length by 4 to get # words */
3412        for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3413                chksum ^= be32_to_cpu(*up);
3414                up++;
3415        }
3416        if (chksum != be32_to_cpu(rhead->h_chksum)) {
3417            if (rhead->h_chksum ||
3418                ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3419                    cmn_err(CE_DEBUG,
3420                        "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3421                            be32_to_cpu(rhead->h_chksum), chksum);
3422                    cmn_err(CE_DEBUG,
3423"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3424                    if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3425                            cmn_err(CE_DEBUG,
3426                                "XFS: LogR this is a LogV2 filesystem\n");
3427                    }
3428                    log->l_flags |= XLOG_CHKSUM_MISMATCH;
3429            }
3430        }
3431}
3432#else
3433#define xlog_unpack_data_checksum(rhead, dp, log)
3434#endif
3435
3436STATIC void
3437xlog_unpack_data(
3438        xlog_rec_header_t        *rhead,
3439        xfs_caddr_t                dp,
3440        xlog_t                        *log)
3441{
3442        int                        i, j, k;
3443        xlog_in_core_2_t        *xhdr;
3444
3445        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3446                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3447                *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3448                dp += BBSIZE;
3449        }
3450
3451        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3452                xhdr = (xlog_in_core_2_t *)rhead;
3453                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3454                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3455                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3456                        *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3457                        dp += BBSIZE;
3458                }
3459        }
3460
3461        xlog_unpack_data_checksum(rhead, dp, log);
3462}
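
/*
 * xlog_unpack_data() is the exact inverse of xlog_pack_data(): the
 * saved head words are copied back out of h_cycle_data[] (and out of
 * the extended headers for v2 logs) over the cycle stamps, restoring
 * the record body byte for byte before it is handed to
 * xlog_recover_process_data().
 */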
3463
3464STATIC int
3465xlog_valid_rec_header(
3466        xlog_t                        *log,
3467        xlog_rec_header_t        *rhead,
3468        xfs_daddr_t                blkno)
3469{
3470        int                        hlen;
3471
3472        if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3473                XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3474                                XFS_ERRLEVEL_LOW, log->l_mp);
3475                return XFS_ERROR(EFSCORRUPTED);
3476        }
3477        if (unlikely(
3478            (!rhead->h_version ||
3479            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3480                xlog_warn("XFS: %s: unrecognised log version (%d).",
3481                        __func__, be32_to_cpu(rhead->h_version));
3482                return XFS_ERROR(EIO);
3483        }
3484
3485        /* LR body must have data or it wouldn't have been written */
3486        hlen = be32_to_cpu(rhead->h_len);
3487        if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3488                XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3489                                XFS_ERRLEVEL_LOW, log->l_mp);
3490                return XFS_ERROR(EFSCORRUPTED);
3491        }
3492        if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3493                XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3494                                XFS_ERRLEVEL_LOW, log->l_mp);
3495                return XFS_ERROR(EFSCORRUPTED);
3496        }
3497        return 0;
3498}
3499
3500/*
3501 * Read the log from tail to head and process the log records found.
3502 * Handle the two cases where the tail and head are in the same cycle
3503 * and where the active portion of the log wraps around the end of
3504 * the physical log separately.  The pass parameter is passed through
3505 * to the routines called to process the data and is not looked at
3506 * here.
3507 */
3508STATIC int
3509xlog_do_recovery_pass(
3510        xlog_t                        *log,
3511        xfs_daddr_t                head_blk,
3512        xfs_daddr_t                tail_blk,
3513        int                        pass)
3514{
3515        xlog_rec_header_t        *rhead;
3516        xfs_daddr_t                blk_no;
3517        xfs_caddr_t                bufaddr, offset;
3518        xfs_buf_t                *hbp, *dbp;
3519        int                        error = 0, h_size;
3520        int                        bblks, split_bblks;
3521        int                        hblks, split_hblks, wrapped_hblks;
3522        xlog_recover_t                *rhash[XLOG_RHASH_SIZE];
3523
3524        ASSERT(head_blk != tail_blk);
3525
3526        /*
3527         * Read the header of the tail block and get the iclog buffer size from
3528         * h_size.  Use this to tell how many sectors make up the log header.
3529         */
3530        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3531                /*
3532                 * When using variable length iclogs, read first sector of
3533                 * iclog header and extract the header size from it.  Get a
3534                 * new hbp that is the correct size.
3535                 */
3536                hbp = xlog_get_bp(log, 1);
3537                if (!hbp)
3538                        return ENOMEM;
3539                if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3540                        goto bread_err1;
3541                offset = xlog_align(log, tail_blk, 1, hbp);
3542                rhead = (xlog_rec_header_t *)offset;
3543                error = xlog_valid_rec_header(log, rhead, tail_blk);
3544                if (error)
3545                        goto bread_err1;
3546                h_size = be32_to_cpu(rhead->h_size);
3547                if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3548                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3549                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3550                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
3551                                hblks++;
3552                        xlog_put_bp(hbp);
3553                        hbp = xlog_get_bp(log, hblks);
3554                } else {
3555                        hblks = 1;
3556                }
3557        } else {
3558                ASSERT(log->l_sectbb_log == 0);
3559                hblks = 1;
3560                hbp = xlog_get_bp(log, 1);
3561                h_size = XLOG_BIG_RECORD_BSIZE;
3562        }
3563
3564        if (!hbp)
3565                return ENOMEM;
3566        dbp = xlog_get_bp(log, BTOBB(h_size));
3567        if (!dbp) {
3568                xlog_put_bp(hbp);
3569                return ENOMEM;
3570        }
3571
3572        memset(rhash, 0, sizeof(rhash));
3573        if (tail_blk <= head_blk) {
3574                for (blk_no = tail_blk; blk_no < head_blk; ) {
3575                        if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3576                                goto bread_err2;
3577                        offset = xlog_align(log, blk_no, hblks, hbp);
3578                        rhead = (xlog_rec_header_t *)offset;
3579                        error = xlog_valid_rec_header(log, rhead, blk_no);
3580                        if (error)
3581                                goto bread_err2;
3582
3583                        /* blocks in data section */
3584                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3585                        error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3586                        if (error)
3587                                goto bread_err2;
3588                        offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3589                        xlog_unpack_data(rhead, offset, log);
3590                        if ((error = xlog_recover_process_data(log,
3591                                                rhash, rhead, offset, pass)))
3592                                goto bread_err2;
3593                        blk_no += bblks + hblks;
3594                }
3595        } else {
3596                /*
3597                 * Perform recovery around the end of the physical log.
3598                 * When the head is not on the same cycle number as the tail,
3599                 * we can't do a sequential recovery as above.
3600                 */
3601                blk_no = tail_blk;
3602                while (blk_no < log->l_logBBsize) {
3603                        /*
3604                         * Check for header wrapping around physical end-of-log
3605                         */
3606                        offset = NULL;
3607                        split_hblks = 0;
3608                        wrapped_hblks = 0;
3609                        if (blk_no + hblks <= log->l_logBBsize) {
3610                                /* Read header in one read */
3611                                error = xlog_bread(log, blk_no, hblks, hbp);
3612                                if (error)
3613                                        goto bread_err2;
3614                                offset = xlog_align(log, blk_no, hblks, hbp);
3615                        } else {
3616                                /* This LR is split across physical log end */
3617                                if (blk_no != log->l_logBBsize) {
3618                                        /* some data before physical log end */
3619                                        ASSERT(blk_no <= INT_MAX);
3620                                        split_hblks = log->l_logBBsize - (int)blk_no;
3621                                        ASSERT(split_hblks > 0);
3622                                        if ((error = xlog_bread(log, blk_no,
3623                                                        split_hblks, hbp)))
3624                                                goto bread_err2;
3625                                        offset = xlog_align(log, blk_no,
3626                                                        split_hblks, hbp);
3627                                }
3628                                /*
3629                                 * Note: this black magic still works with
3630                                 * large sector sizes (non-512) only because:
3631                                 * - we increased the buffer size originally
3632                                 *   by 1 sector giving us enough extra space
3633                                 *   for the second read;
3634                                 * - the log start is guaranteed to be sector
3635                                 *   aligned;
3636                                 * - we read the log end (LR header start)
3637                                 *   _first_, then the log start (LR header end)
3638                                 *   - order is important.
3639                                 */
3640                                wrapped_hblks = hblks - split_hblks;
3641                                bufaddr = XFS_BUF_PTR(hbp);
3642                                error = XFS_BUF_SET_PTR(hbp,
3643                                                bufaddr + BBTOB(split_hblks),
3644                                                BBTOB(hblks - split_hblks));
3645                                if (!error)
3646                                        error = xlog_bread(log, 0,
3647                                                        wrapped_hblks, hbp);
3648                                if (!error)
3649                                        error = XFS_BUF_SET_PTR(hbp, bufaddr,
3650                                                        BBTOB(hblks));
3651                                if (error)
3652                                        goto bread_err2;
3653                                if (!offset)
3654                                        offset = xlog_align(log, 0,
3655                                                        wrapped_hblks, hbp);
3656                        }
3657                        rhead = (xlog_rec_header_t *)offset;
3658                        error = xlog_valid_rec_header(log, rhead,
3659                                                split_hblks ? blk_no : 0);
3660                        if (error)
3661                                goto bread_err2;
3662
3663                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3664                        blk_no += hblks;
3665
3666                        /* Read in data for log record */
3667                        if (blk_no + bblks <= log->l_logBBsize) {
3668                                error = xlog_bread(log, blk_no, bblks, dbp);
3669                                if (error)
3670                                        goto bread_err2;
3671                                offset = xlog_align(log, blk_no, bblks, dbp);
3672                        } else {
3673                                /* This log record is split across the
3674                                 * physical end of log */
3675                                offset = NULL;
3676                                split_bblks = 0;
3677                                if (blk_no != log->l_logBBsize) {
3678                                        /* some data is before the physical
3679                                         * end of log */
3680                                        ASSERT(!wrapped_hblks);
3681                                        ASSERT(blk_no <= INT_MAX);
3682                                        split_bblks =
3683                                                log->l_logBBsize - (int)blk_no;
3684                                        ASSERT(split_bblks > 0);
3685                                        if ((error = xlog_bread(log, blk_no,
3686                                                        split_bblks, dbp)))
3687                                                goto bread_err2;
3688                                        offset = xlog_align(log, blk_no,
3689                                                        split_bblks, dbp);
3690                                }
3691                                /*
3692                                 * Note: this black magic still works with
3693                                 * large sector sizes (non-512) only because:
3694                                 * - we increased the buffer size originally
3695                                 *   by 1 sector giving us enough extra space
3696                                 *   for the second read;
3697                                 * - the log start is guaranteed to be sector
3698                                 *   aligned;
3699                                 * - we read the log end (LR header start)
3700                                 *   _first_, then the log start (LR header end)
3701                                 *   - order is important.
3702                                 */
3703                                bufaddr = XFS_BUF_PTR(dbp);
3704                                error = XFS_BUF_SET_PTR(dbp,
3705                                                bufaddr + BBTOB(split_bblks),
3706                                                BBTOB(bblks - split_bblks));
3707                                if (!error)
3708                                        error = xlog_bread(log, wrapped_hblks,
3709                                                        bblks - split_bblks,
3710                                                        dbp);
3711                                if (!error)
3712                                        error = XFS_BUF_SET_PTR(dbp, bufaddr,
3713                                                        h_size);
3714                                if (error)
3715                                        goto bread_err2;
3716                                if (!offset)
3717                                        offset = xlog_align(log, wrapped_hblks,
3718                                                bblks - split_bblks, dbp);
3719                        }
3720                        xlog_unpack_data(rhead, offset, log);
3721                        if ((error = xlog_recover_process_data(log, rhash,
3722                                                        rhead, offset, pass)))
3723                                goto bread_err2;
3724                        blk_no += bblks;
3725                }
3726
3727                ASSERT(blk_no >= log->l_logBBsize);
3728                blk_no -= log->l_logBBsize;
3729
3730                /* read first part of physical log */
3731                while (blk_no < head_blk) {
3732                        if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3733                                goto bread_err2;
3734                        offset = xlog_align(log, blk_no, hblks, hbp);
3735                        rhead = (xlog_rec_header_t *)offset;
3736                        error = xlog_valid_rec_header(log, rhead, blk_no);
3737                        if (error)
3738                                goto bread_err2;
3739                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3740                        if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3741                                goto bread_err2;
3742                        offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3743                        xlog_unpack_data(rhead, offset, log);
3744                        if ((error = xlog_recover_process_data(log, rhash,
3745                                                        rhead, offset, pass)))
3746                                goto bread_err2;
3747                        blk_no += bblks + hblks;
3748                }
3749        }
3750
3751 bread_err2:
3752        xlog_put_bp(dbp);
3753 bread_err1:
3754        xlog_put_bp(hbp);
3755        return error;
3756}
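
/*
 * The "black magic" wrap handling above assembles one contiguous
 * record from two device reads into a single buffer.  Reduced to the
 * header-path variant (the data path is identical in shape):
 *
 *     bufaddr = XFS_BUF_PTR(hbp);
 *     error = XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks),
 *                             BBTOB(hblks - split_hblks));
 *     if (!error)
 *             error = xlog_bread(log, 0, wrapped_hblks, hbp);
 *     if (!error)
 *             error = XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
 *
 * The un-wrapped front portion was read into the start of the buffer
 * first, so once the pointer is restored the buffer holds the whole
 * record in order.
 */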
3757
3758/*
3759 * Do the recovery of the log.  We actually do this in two phases.
3760 * The two passes are necessary in order to implement the function
3761 * of cancelling a record written into the log.  The first pass
3762 * determines those things which have been cancelled, and the
3763 * second pass replays log items normally except for those which
3764 * have been cancelled.  The handling of the replay and cancellations
3765 * takes place in the log item type specific routines.
3766 *
3767 * The table of items which have cancel records in the log is allocated
3768 * and freed at this level, since only here do we know when all of
3769 * the log recovery has been completed.
3770 */
3771STATIC int
3772xlog_do_log_recovery(
3773        xlog_t                *log,
3774        xfs_daddr_t        head_blk,
3775        xfs_daddr_t        tail_blk)
3776{
3777        int                error;
3778
3779        ASSERT(head_blk != tail_blk);
3780
3781        /*
3782         * First do a pass to find all of the cancelled buf log items.
3783         * Store them in the buf_cancel_table for use in the second pass.
3784         */
3785        log->l_buf_cancel_table =
3786                (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3787                                                 sizeof(xfs_buf_cancel_t*),
3788                                                 KM_SLEEP);
3789        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3790                                      XLOG_RECOVER_PASS1);
3791        if (error != 0) {
3792                kmem_free(log->l_buf_cancel_table);
3793                log->l_buf_cancel_table = NULL;
3794                return error;
3795        }
3796        /*
3797         * Then do a second pass to actually recover the items in the log.
3798         * When it is complete free the table of buf cancel items.
3799         */
3800        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3801                                      XLOG_RECOVER_PASS2);
3802#ifdef DEBUG
3803        if (!error) {
3804                int        i;
3805
3806                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3807                        ASSERT(log->l_buf_cancel_table[i] == NULL);
3808        }
3809#endif        /* DEBUG */
3810
3811        kmem_free(log->l_buf_cancel_table);
3812        log->l_buf_cancel_table = NULL;
3813
3814        return error;
3815}
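
/*
 * Concretely: pass 1 walks the entire log and, for every buffer log
 * item that carries a cancel flag, notes the buffer in
 * l_buf_cancel_table; pass 2 walks the same records again and checks
 * that table before replaying any buffer, skipping those a later
 * cancel record invalidated.  The DEBUG loop asserts the accounting
 * balanced: every entry added in pass 1 should have been consumed by
 * the end of pass 2, leaving all XLOG_BC_TABLE_SIZE buckets empty.
 */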
3816
3817/*
3818 * Do the actual recovery
3819 */
3820STATIC int
3821xlog_do_recover(
3822        xlog_t                *log,
3823        xfs_daddr_t        head_blk,
3824        xfs_daddr_t        tail_blk)
3825{
3826        int                error;
3827        xfs_buf_t        *bp;
3828        xfs_sb_t        *sbp;
3829
3830        /*
3831         * First replay the images in the log.
3832         */
3833        error = xlog_do_log_recovery(log, head_blk, tail_blk);
3834        if (error) {
3835                return error;
3836        }
3837
3838        XFS_bflush(log->l_mp->m_ddev_targp);
3839
3840        /*
3841         * If IO errors happened during recovery, bail out.
3842         */
3843        if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3844                return (EIO);
3845        }
3846
3847        /*
3848         * We now update the tail_lsn since much of the recovery has completed
3849         * and there may be space available to use.  If there were no extent
3850         * or iunlinks, we can free up the entire log and set the tail_lsn to
3851         * be the last_sync_lsn.  This was set in xlog_find_tail to be the
3852         * lsn of the last known good LR on disk.  If there are extent frees
3853         * or iunlinks they will have some entries in the AIL; so we look at
3854         * the AIL to determine how to set the tail_lsn.
3855         */
3856        xlog_assign_tail_lsn(log->l_mp);
3857
3858        /*
3859         * Now that we've finished replaying all buffer and inode
3860         * updates, re-read in the superblock.
3861         */
3862        bp = xfs_getsb(log->l_mp, 0);
3863        XFS_BUF_UNDONE(bp);
3864        ASSERT(!(XFS_BUF_ISWRITE(bp)));
3865        ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3866        XFS_BUF_READ(bp);
3867        XFS_BUF_UNASYNC(bp);
3868        xfsbdstrat(log->l_mp, bp);
3869        error = xfs_iowait(bp);
3870        if (error) {
3871                xfs_ioerror_alert("xlog_do_recover",
3872                                  log->l_mp, bp, XFS_BUF_ADDR(bp));
3873                ASSERT(0);
3874                xfs_buf_relse(bp);
3875                return error;
3876        }
3877
3878        /* Convert superblock from on-disk format */
3879        sbp = &log->l_mp->m_sb;
3880        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3881        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3882        ASSERT(xfs_sb_good_version(sbp));
3883        xfs_buf_relse(bp);
3884
3885        /* We've re-read the superblock so re-initialize per-cpu counters */
3886        xfs_icsb_reinit_counters(log->l_mp);
3887
3888        xlog_recover_check_summary(log);
3889
3890        /* Normal transactions can now occur */
3891        log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3892        return 0;
3893}
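
/*
 * The buffer setup above (UNDONE + READ + UNASYNC, then xfsbdstrat()
 * and xfs_iowait()) amounts to a synchronous re-read of the
 * superblock into the buffer returned by xfs_getsb().  Replay may
 * have rewritten the on-disk superblock, so the in-core copy and the
 * per-cpu counters derived from it are rebuilt from what is actually
 * on disk now.
 */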
3894
3895/*
3896 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3897 *
3898 * Return error or zero.
3899 */
3900int
3901xlog_recover(
3902        xlog_t                *log)
3903{
3904        xfs_daddr_t        head_blk, tail_blk;
3905        int                error;
3906
3907        /* find the tail of the log */
3908        if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3909                return error;
3910
3911        if (tail_blk != head_blk) {
3912                /* There used to be a comment here:
3913                 *
3914                 * disallow recovery on read-only mounts.  note -- mount
3915                 * checks for ENOSPC and turns it into an intelligent
3916                 * error message.
3917                 * ...but this is no longer true.  Now, unless you specify
3918                 * NORECOVERY (in which case this function would never be
3919                 * called), we just go ahead and recover.  We do this all
3920                 * under the vfs layer, so we can get away with it unless
3921                 * the device itself is read-only, in which case we fail.
3922                 */
3923                if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3924                        return error;
3925                }
3926
3927                cmn_err(CE_NOTE,
3928                        "Starting XFS recovery on filesystem: %s (logdev: %s)",
3929                        log->l_mp->m_fsname, log->l_mp->m_logname ?
3930                        log->l_mp->m_logname : "internal");
3931
3932                error = xlog_do_recover(log, head_blk, tail_blk);
3933                log->l_flags |= XLOG_RECOVERY_NEEDED;
3934        }
3935        return error;
3936}
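
The xfs_dev_is_read_only() call is what makes the comment above hold: recovery proceeds even on a read-only mount, and only a read-only device stops it. A rough userspace analogue of that gate is sketched below, purely for illustration; the kernel helper asks the block layer directly, so the open()-based probe and the function name here are assumptions made for this sketch.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Return 1 if the device cannot be written, 0 otherwise. */
static int dev_is_read_only(const char *path, const char *why)
{
        int fd = open(path, O_RDWR);

        if (fd < 0 && errno == EROFS) {
                fprintf(stderr, "device %s is read-only, cannot %s\n",
                        path, why);
                return 1;
        }
        if (fd >= 0)
                close(fd);
        return 0;
}

int main(int argc, char **argv)
{
        return argc > 1 ? dev_is_read_only(argv[1], "recover") : 0;
}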
3937
3938/*
3939 * In the first part of recovery we replay inodes and buffers and build
3940 * up the list of extent free items which need to be processed.  Here
3941 * we process the extent free items and clean up the on disk unlinked
3942 * inode lists.  This is separated from the first part of recovery so
3943 * that the root and real-time bitmap inodes can be read in from disk in
3944 * between the two stages.  This is necessary so that we can free space
3945 * in the real-time portion of the file system.
3946 */
3947int
3948xlog_recover_finish(
3949        xlog_t                *log)
3950{
3951        /*
3952         * Now we're ready to do the transactions needed for the
3953         * rest of recovery.  Start with completing all the extent
3954         * free intent records and then process the unlinked inode
3955         * lists.  At this point, we essentially run in normal mode
3956         * except that we're still performing recovery actions
3957         * rather than accepting new requests.
3958         */
3959        if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3960                int        error;
3961                error = xlog_recover_process_efis(log);
3962                if (error) {
3963                        cmn_err(CE_ALERT,
3964                                "Failed to recover EFIs on filesystem: %s",
3965                                log->l_mp->m_fsname);
3966                        return error;
3967                }
3968                /*
3969                 * Sync the log to get all the EFIs out of the AIL.
3970                 * This isn't absolutely necessary, but it helps in
3971         * case the unlink transactions have trouble
3972                 * pushing the EFIs out of the way.
3973                 */
3974                xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3975                              (XFS_LOG_FORCE | XFS_LOG_SYNC));
3976
3977                xlog_recover_process_iunlinks(log);
3978
3979                xlog_recover_check_summary(log);
3980
3981                cmn_err(CE_NOTE,
3982                        "Ending XFS recovery on filesystem: %s (logdev: %s)",
3983                        log->l_mp->m_fsname, log->l_mp->m_logname ?
3984                        log->l_mp->m_logname : "internal");
3985                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3986        } else {
3987                cmn_err(CE_DEBUG,
3988                        "!Ending clean XFS mount for filesystem: %s\n",
3989                        log->l_mp->m_fsname);
3990        }
3991        return 0;
3992}
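
The comment above xlog_recover_finish explains why recovery is split in two; the sketch below shows how a mount path might string the stages together. Only xlog_recover() and xlog_recover_finish() are real; the driver function and read_root_and_rt_inodes() are hypothetical stand-ins for the work the mount code does between the stages.

/* Hypothetical driver, sketching the two-stage shape described above. */
STATIC int
mount_log_recovery(
        xlog_t                *log)
{
        int                error;

        /* stage 1: replay buffer and inode images from the log */
        error = xlog_recover(log);
        if (error)
                return error;

        /* read in the root and real-time bitmap inodes (hypothetical) */
        error = read_root_and_rt_inodes(log->l_mp);
        if (error)
                return error;

        /* stage 2: finish EFIs and clean up unlinked inode lists */
        return xlog_recover_finish(log);
}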
3993
3994
3995#if defined(DEBUG)
3996/*
3997 * Read all of the agf and agi counters and check that they
3998 * are consistent with the superblock counters.
3999 */
4000void
4001xlog_recover_check_summary(
4002        xlog_t                *log)
4003{
4004        xfs_mount_t        *mp;
4005        xfs_agf_t        *agfp;
4006        xfs_agi_t        *agip;
4007        xfs_buf_t        *agfbp;
4008        xfs_buf_t        *agibp;
4009        xfs_daddr_t        agfdaddr;
4010        xfs_daddr_t        agidaddr;
4011        xfs_buf_t        *sbbp;
4012#ifdef XFS_LOUD_RECOVERY
4013        xfs_sb_t        *sbp;
4014#endif
4015        xfs_agnumber_t        agno;
4016        __uint64_t        freeblks;
4017        __uint64_t        itotal;
4018        __uint64_t        ifree;
4019
4020        mp = log->l_mp;
4021
4022        freeblks = 0LL;
4023        itotal = 0LL;
4024        ifree = 0LL;
4025        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4026                agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
4027                agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
4028                                XFS_FSS_TO_BB(mp, 1), 0);
4029                if (XFS_BUF_ISERROR(agfbp)) {
4030                        xfs_ioerror_alert("xlog_recover_check_summary(agf)",
4031                                                mp, agfbp, agfdaddr);
4032                }
4033                agfp = XFS_BUF_TO_AGF(agfbp);
4034                ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum));
4035                ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum)));
4036                ASSERT(be32_to_cpu(agfp->agf_seqno) == agno);
4037
4038                freeblks += be32_to_cpu(agfp->agf_freeblks) +
4039                            be32_to_cpu(agfp->agf_flcount);
4040                xfs_buf_relse(agfbp);
4041
4042                agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4043                agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4044                                XFS_FSS_TO_BB(mp, 1), 0);
4045                if (XFS_BUF_ISERROR(agibp)) {
4046                        xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4047                                          mp, agibp, agidaddr);
4048                }
4049                agip = XFS_BUF_TO_AGI(agibp);
4050                ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4051                ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4052                ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4053
4054                itotal += be32_to_cpu(agip->agi_count);
4055                ifree += be32_to_cpu(agip->agi_freecount);
4056                xfs_buf_relse(agibp);
4057        }
4058
4059        sbbp = xfs_getsb(mp, 0);
4060#ifdef XFS_LOUD_RECOVERY
4061        sbp = &mp->m_sb;
4062        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
4063        cmn_err(CE_NOTE,
4064                "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
4065                sbp->sb_icount, itotal);
4066        cmn_err(CE_NOTE,
4067                "xlog_recover_check_summary: sb_ifree %Lu ifree %Lu",
4068                sbp->sb_ifree, ifree);
4069        cmn_err(CE_NOTE,
4070                "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4071                sbp->sb_fdblocks, freeblks);
4072#if 0
4073        /*
4074         * This is turned off until I account for the allocation
4075         * btree blocks which live in free space.
4076         */
4077        ASSERT(sbp->sb_icount == itotal);
4078        ASSERT(sbp->sb_ifree == ifree);
4079        ASSERT(sbp->sb_fdblocks == freeblks);
4080#endif
4081#endif
4082        xfs_buf_relse(sbbp);
4083}
4084#endif /* DEBUG */
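
The check above is worth seeing in miniature: sum the per-AG counters and compare the totals against the superblock copies. Two details stand out in the kernel version: an I/O error on an AGF or AGI buffer only raises an alert, after which the (possibly stale) buffer contents still feed the ASSERTs, and the free-block assert stays under #if 0 because allocation btree blocks live in free space. The sketch below uses invented sim_* types and is not kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct sim_ag {                 /* per-allocation-group counters */
        uint32_t freeblks;      /* agf_freeblks + agf_flcount */
        uint32_t icount;        /* agi_count */
        uint32_t ifree;         /* agi_freecount */
};

struct sim_sb {                 /* global counters from the superblock */
        uint64_t fdblocks;
        uint64_t icount;
        uint64_t ifree;
};

static void check_summary(const struct sim_sb *sb,
                          const struct sim_ag *ags, int agcount)
{
        uint64_t freeblks = 0, itotal = 0, ifree = 0;
        int agno;

        for (agno = 0; agno < agcount; agno++) {
                freeblks += ags[agno].freeblks;
                itotal   += ags[agno].icount;
                ifree    += ags[agno].ifree;
        }
        printf("sb_fdblocks %llu freeblks %llu\n",
               (unsigned long long)sb->fdblocks,
               (unsigned long long)freeblks);
        assert(sb->icount == itotal);
        assert(sb->ifree == ifree);
        /* no freeblks assert: btree blocks live in free space */
}

int main(void)
{
        struct sim_ag ags[2] = { { 100, 32, 8 }, { 50, 16, 4 } };
        struct sim_sb sb = { 150, 48, 12 };

        check_summary(&sb, ags, 2);
        return 0;
}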