ClabureDB: Classified Bug-Reports Database

Project: Linux Kernel

Showing error 589

User:	Jiri Slaby
Error type:	Double Lock
Error type description:	Some lock is locked twice unintentionally in a sequence
File location:	fs/jbd/commit.c
Line in file:	463
Project:	Linux Kernel
Project version:	2.6.28
Tools:	Stanse (1.2) Smatch (1.59)
Entered:	2011-11-07 22:19:59 UTC
Source:

  1/*
  2 * linux/fs/jbd/commit.c
  3 *
  4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  5 *
  6 * Copyright 1998 Red Hat corp --- All Rights Reserved
  7 *
  8 * This file is part of the Linux kernel and is made available under
  9 * the terms of the GNU General Public License, version 2, or at your
 10 * option, any later version, incorporated herein by reference.
 11 *
 12 * Journal commit routines for the generic filesystem journaling code;
 13 * part of the ext2fs journaling system.
 14 */
 15
 16#include <linux/time.h>
 17#include <linux/fs.h>
 18#include <linux/jbd.h>
 19#include <linux/errno.h>
 20#include <linux/slab.h>
 21#include <linux/mm.h>
 22#include <linux/pagemap.h>
 23
 24/*
 25 * Completion (b_end_io) handler for the temporary BJ_IO buffer_heads.
 26 */
 27static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 28{
 29        BUFFER_TRACE(bh, "");
 30        if (!uptodate)
 31                clear_buffer_uptodate(bh);
 32        else
 33                set_buffer_uptodate(bh);
 34        unlock_buffer(bh);
 35}
 36
 37/*
 38 * When an ext3-ordered file is truncated, it is possible that many pages are
 39 * not successfully freed, because they are attached to a committing transaction.
 40 * After the transaction commits, these pages are left on the LRU, with no
 41 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 43 * the numbers in /proc/meminfo look odd.
 44 *
 45 * So here, we have a buffer which has just come off the forget list.  Look to
 46 * see if we can strip all buffers from the backing page.
 47 *
 48 * Called under journal->j_list_lock.  The caller provided us with a ref
 49 * against the buffer, and we drop that here.
 50 */
 51static void release_buffer_page(struct buffer_head *bh)
 52{
 53        struct page *page;
 54
 55        if (buffer_dirty(bh))
 56                goto nope;
 57        if (atomic_read(&bh->b_count) != 1)
 58                goto nope;
 59        page = bh->b_page;
 60        if (!page)
 61                goto nope;
 62        if (page->mapping)
 63                goto nope;
 64
 65        /* OK, it's a truncated page */
 66        if (!trylock_page(page))
 67                goto nope;
 68
 69        page_cache_get(page);
 70        __brelse(bh);
 71        try_to_free_buffers(page);
 72        unlock_page(page);
 73        page_cache_release(page);
 74        return;
 75
 76nope:
 77        __brelse(bh);
 78}
 79
 80/*
 81 * Drop a data buffer reference; BH_Freed buffers go via release_buffer_page().
 82 */
 83static void release_data_buffer(struct buffer_head *bh)
 84{
 85        if (!buffer_freed(bh)) {
 86                put_bh(bh);
 87                return;
 88        }
 89        clear_buffer_freed(bh);
 90        release_buffer_page(bh);
 91}
 92
 93/*
 94 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 95 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 96 * return 0.  j_list_lock is dropped in this case.
 97 */
 98static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 99{
100        if (jbd_trylock_bh_state(bh))
101                return 1;        /* got it; j_list_lock still held */
102        /* Lost: release j_list_lock and yield before reporting failure. */
103        spin_unlock(&journal->j_list_lock);
104        schedule();
105        return 0;
106}
107
108/* Done it all: now write the commit record.  We should have
109 * cleaned up our previous buffers by now, so if we are in abort
110 * mode we can now just skip the rest of the journal write
111 * entirely.
112 *
113 * Returns 1 if the journal needs to be aborted or 0 on success
114 */
115static int journal_write_commit_record(journal_t *journal,
116                                        transaction_t *commit_transaction)
117{
118        struct journal_head *commit_jh;
119        struct buffer_head *commit_bh;
120        journal_header_t *hdr;
121        int used_barrier;
122        int rc;
123
124        if (is_journal_aborted(journal))
125                return 0;
126
127        commit_jh = journal_get_descriptor_buffer(journal);
128        if (!commit_jh)
129                return 1;
130        commit_bh = jh2bh(commit_jh);
131
132        /* Fill in the on-disk header of the commit block. */
133        hdr = (journal_header_t *)(commit_bh->b_data);
134        hdr->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
135        hdr->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
136        hdr->h_sequence = cpu_to_be32(commit_transaction->t_tid);
137
138        JBUFFER_TRACE(commit_jh, "write commit block");
139        set_buffer_dirty(commit_bh);
140        /* Mark the buffer ordered for the write when barriers are enabled. */
141        used_barrier = (journal->j_flags & JFS_BARRIER) != 0;
142        if (used_barrier)
143                set_buffer_ordered(commit_bh);
144        rc = sync_dirty_buffer(commit_bh);
145        if (used_barrier)
146                clear_buffer_ordered(commit_bh);
147
148        /*
149         * Test our own used_barrier, not j_flags: another commit could fail
150         * at about the same time and clear JFS_BARRIER behind our back.
151         */
152        if (rc == -EOPNOTSUPP && used_barrier) {
153                char devname[BDEVNAME_SIZE];
154
155                printk(KERN_WARNING
156                        "JBD: barrier-based sync failed on %s - "
157                        "disabling barriers\n",
158                        bdevname(journal->j_dev, devname));
159                spin_lock(&journal->j_state_lock);
160                journal->j_flags &= ~JFS_BARRIER;
161                spin_unlock(&journal->j_state_lock);
162
163                /* Redo the write, this time without the ordered flag. */
164                set_buffer_uptodate(commit_bh);
165                set_buffer_dirty(commit_bh);
166                rc = sync_dirty_buffer(commit_bh);
167        }
168        put_bh(commit_bh);        /* pairs with the getblk() reference */
169        journal_put_journal_head(commit_jh);
170
171        return rc == -EIO;
172}
173
174static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
175{
176        /* Walk the array in order; each bh completes via end_buffer_write_sync. */
177        while (bufs-- > 0) {
178                struct buffer_head *bh = *wbuf++;
179
180                bh->b_end_io = end_buffer_write_sync;
181                submit_bh(WRITE, bh);        /* consumes our safety reference */
182        }
183}
184
185/*
186 *  Submit the ->t_sync_datalist data buffers to disk; returns 0 or -EIO.
187 */
188static int journal_submit_data_buffers(journal_t *journal,
189                                transaction_t *commit_transaction)
190{
191        struct journal_head *jh;
192        struct buffer_head *bh;
193        int locked;        /* nonzero once we hold the buffer lock on bh */
194        int bufs = 0;        /* entries batched in wbuf[] awaiting submission */
195        struct buffer_head **wbuf = journal->j_wbuf;
196        int err = 0;        /* set to -EIO if an unfiled buffer is !uptodate */
197
198        /*
199         * Whenever we unlock the journal and sleep, things can get added
200         * onto ->t_sync_datalist, so we have to keep looping back to
201         * write_out_data until we *know* that the list is empty.
202         *
203         * Cleanup any flushed data buffers from the data list.  Even in
204         * abort mode, we want to flush this out as soon as possible.
205         */
206write_out_data:
207        cond_resched();
208        spin_lock(&journal->j_list_lock);
209
210        while (commit_transaction->t_sync_datalist) {
211                jh = commit_transaction->t_sync_datalist;
212                bh = jh2bh(jh);
213                locked = 0;
214
215                /* Get reference just to make sure buffer does not disappear
216                 * when we are forced to drop various locks */
217                get_bh(bh);
218                /* If the buffer is dirty, we need to submit IO and hence
219                 * we need the buffer lock. We try to lock the buffer without
220                 * blocking. If we fail, we need to drop j_list_lock and do
221                 * blocking lock_buffer().
222                 */
223                if (buffer_dirty(bh)) {
224                        if (!trylock_buffer(bh)) {
225                                BUFFER_TRACE(bh, "needs blocking lock");
226                                spin_unlock(&journal->j_list_lock);
227                                /* Write out all data to prevent deadlocks */
228                                journal_do_submit_data(wbuf, bufs);
229                                bufs = 0;        /* batch is empty again */
230                                lock_buffer(bh);
231                                spin_lock(&journal->j_list_lock);
232                        }
233                        locked = 1;
234                }
235                /* We have to get bh_state lock. Again out of order, sigh. */
236                if (!inverted_lock(journal, bh)) {        /* lost: j_list_lock was dropped */
237                        jbd_lock_bh_state(bh);
238                        spin_lock(&journal->j_list_lock);
239                }
240                /* Someone already cleaned up the buffer? */
241                if (!buffer_jbd(bh)
242                        || jh->b_transaction != commit_transaction
243                        || jh->b_jlist != BJ_SyncData) {
244                        jbd_unlock_bh_state(bh);
245                        if (locked)
246                                unlock_buffer(bh);
247                        BUFFER_TRACE(bh, "already cleaned up");
248                        release_data_buffer(bh);        /* drops the get_bh() reference */
249                        continue;
250                }
251                if (locked && test_clear_buffer_dirty(bh)) {
252                        BUFFER_TRACE(bh, "needs writeout, adding to array");
253                        wbuf[bufs++] = bh;        /* reference kept until submit_bh() */
254                        __journal_file_buffer(jh, commit_transaction,
255                                                BJ_Locked);
256                        jbd_unlock_bh_state(bh);
257                        if (bufs == journal->j_wbufsize) {        /* batch array is full */
258                                spin_unlock(&journal->j_list_lock);
259                                journal_do_submit_data(wbuf, bufs);
260                                bufs = 0;
261                                goto write_out_data;        /* label retakes j_list_lock */
262                        }
263                } else if (!locked && buffer_locked(bh)) {        /* locked by someone else */
264                        __journal_file_buffer(jh, commit_transaction,
265                                                BJ_Locked);
266                        jbd_unlock_bh_state(bh);
267                        put_bh(bh);        /* drop our safety reference */
268                } else {
269                        BUFFER_TRACE(bh, "writeout complete: unfile");
270                        if (unlikely(!buffer_uptodate(bh)))
271                                err = -EIO;
272                        __journal_unfile_buffer(jh);
273                        jbd_unlock_bh_state(bh);
274                        if (locked)
275                                unlock_buffer(bh);
276                        journal_remove_journal_head(bh);
277                        /* One for our safety reference, other for
278                         * journal_remove_journal_head() */
279                        put_bh(bh);
280                        release_data_buffer(bh);
281                }
282
283                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
284                        spin_unlock(&journal->j_list_lock);
285                        goto write_out_data;        /* cond_resched() there, then relock */
286                }
287        }
288        spin_unlock(&journal->j_list_lock);
289        journal_do_submit_data(wbuf, bufs);        /* submit whatever is still batched */
290
291        return err;
292}
293
294/*
295 * journal_commit_transaction
296 *
297 * The primary function for committing a transaction to the log.  This
298 * function is called by the journal thread to begin a complete commit.
299 */
300void journal_commit_transaction(journal_t *journal)
301{
302        transaction_t *commit_transaction;
303        struct journal_head *jh, *new_jh, *descriptor;
304        struct buffer_head **wbuf = journal->j_wbuf;
305        int bufs;
306        int flags;
307        int err;
308        unsigned long blocknr;
309        char *tagp = NULL;
310        journal_header_t *header;
311        journal_block_tag_t *tag = NULL;
312        int space_left = 0;
313        int first_tag = 0;
314        int tag_flag;
315        int i;
316
317        /*
318         * First job: lock down the current transaction and wait for
319         * all outstanding updates to complete.
320         */
321
322#ifdef COMMIT_STATS
323        spin_lock(&journal->j_list_lock);
324        summarise_journal_usage(journal);
325        spin_unlock(&journal->j_list_lock);
326#endif
327
328        /* Do we need to erase the effects of a prior journal_flush? */
329        if (journal->j_flags & JFS_FLUSHED) {
330                jbd_debug(3, "super block updated\n");
331                journal_update_superblock(journal, 1);
332        } else {
333                jbd_debug(3, "superblock not updated\n");
334        }
335
336        J_ASSERT(journal->j_running_transaction != NULL);
337        J_ASSERT(journal->j_committing_transaction == NULL);
338
339        commit_transaction = journal->j_running_transaction;
340        J_ASSERT(commit_transaction->t_state == T_RUNNING);
341
342        jbd_debug(1, "JBD: starting commit of transaction %d\n",
343                        commit_transaction->t_tid);
344
345        spin_lock(&journal->j_state_lock);
346        commit_transaction->t_state = T_LOCKED;
347
348        spin_lock(&commit_transaction->t_handle_lock);
349        while (commit_transaction->t_updates) {
350                DEFINE_WAIT(wait);
351
352                prepare_to_wait(&journal->j_wait_updates, &wait,
353                                        TASK_UNINTERRUPTIBLE);
354                if (commit_transaction->t_updates) {
355                        spin_unlock(&commit_transaction->t_handle_lock);
356                        spin_unlock(&journal->j_state_lock);
357                        schedule();
358                        spin_lock(&journal->j_state_lock);
359                        spin_lock(&commit_transaction->t_handle_lock);
360                }
361                finish_wait(&journal->j_wait_updates, &wait);
362        }
363        spin_unlock(&commit_transaction->t_handle_lock);
364
365        J_ASSERT (commit_transaction->t_outstanding_credits <=
366                        journal->j_max_transaction_buffers);
367
368        /*
369         * First thing we are allowed to do is to discard any remaining
370         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
371         * that there are no such buffers: if a large filesystem
372         * operation like a truncate needs to split itself over multiple
373         * transactions, then it may try to do a journal_restart() while
374         * there are still BJ_Reserved buffers outstanding.  These must
375         * be released cleanly from the current transaction.
376         *
377         * In this case, the filesystem must still reserve write access
378         * again before modifying the buffer in the new transaction, but
379         * we do not require it to remember exactly which old buffers it
380         * has reserved.  This is consistent with the existing behaviour
381         * that multiple journal_get_write_access() calls to the same
382         * buffer are perfectly permissible.
383         */
384        while (commit_transaction->t_reserved_list) {
385                jh = commit_transaction->t_reserved_list;
386                JBUFFER_TRACE(jh, "reserved, unused: refile");
387                /*
388                 * A journal_get_undo_access()+journal_release_buffer() may
389                 * leave undo-committed data.
390                 */
391                if (jh->b_committed_data) {
392                        struct buffer_head *bh = jh2bh(jh);
393
394                        jbd_lock_bh_state(bh);
395                        jbd_free(jh->b_committed_data, bh->b_size);
396                        jh->b_committed_data = NULL;
397                        jbd_unlock_bh_state(bh);
398                }
399                journal_refile_buffer(journal, jh);
400        }
401
402        /*
403         * Now try to drop any written-back buffers from the journal's
404         * checkpoint lists.  We do this *before* commit because it potentially
405         * frees some memory
406         */
407        spin_lock(&journal->j_list_lock);
408        __journal_clean_checkpoint_list(journal);
409        spin_unlock(&journal->j_list_lock);
410
411        jbd_debug (3, "JBD: commit phase 1\n");
412
413        /*
414         * Switch to a new revoke table.
415         */
416        journal_switch_revoke_table(journal);
417
418        commit_transaction->t_state = T_FLUSH;
419        journal->j_committing_transaction = commit_transaction;
420        journal->j_running_transaction = NULL;
421        commit_transaction->t_log_start = journal->j_head;
422        wake_up(&journal->j_wait_transaction_locked);
423        spin_unlock(&journal->j_state_lock);
424
425        jbd_debug (3, "JBD: commit phase 2\n");
426
427        /*
428         * Now start flushing things to disk, in the order they appear
429         * on the transaction lists.  Data blocks go first.
430         */
431        err = journal_submit_data_buffers(journal, commit_transaction);
432
433        /*
434         * Wait for all previously submitted IO to complete.
435         */
436        spin_lock(&journal->j_list_lock);
437        while (commit_transaction->t_locked_list) {
438                struct buffer_head *bh;
439
440                jh = commit_transaction->t_locked_list->b_tprev;
441                bh = jh2bh(jh);
442                get_bh(bh);
443                if (buffer_locked(bh)) {
444                        spin_unlock(&journal->j_list_lock);
445                        wait_on_buffer(bh);
446                        spin_lock(&journal->j_list_lock);
447                }
448                if (unlikely(!buffer_uptodate(bh))) {
449                        if (!trylock_page(bh->b_page)) {
450                                spin_unlock(&journal->j_list_lock);
451                                lock_page(bh->b_page);
452                                spin_lock(&journal->j_list_lock);
453                        }
454                        if (bh->b_page->mapping)
455                                set_bit(AS_EIO, &bh->b_page->mapping->flags);
456
457                        unlock_page(bh->b_page);
458                        SetPageError(bh->b_page);
459                        err = -EIO;
460                }
461                if (!inverted_lock(journal, bh)) {
462                        put_bh(bh);
463                        spin_lock(&journal->j_list_lock);
464                        continue;
465                }
466                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
467                        __journal_unfile_buffer(jh);
468                        jbd_unlock_bh_state(bh);
469                        journal_remove_journal_head(bh);
470                        put_bh(bh);
471                } else {
472                        jbd_unlock_bh_state(bh);
473                }
474                release_data_buffer(bh);
475                cond_resched_lock(&journal->j_list_lock);
476        }
477        spin_unlock(&journal->j_list_lock);
478
479        if (err) {
480                char b[BDEVNAME_SIZE];
481
482                printk(KERN_WARNING
483                        "JBD: Detected IO errors while flushing file data "
484                        "on %s\n", bdevname(journal->j_fs_dev, b));
485                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
486                        journal_abort(journal, err);
487                err = 0;
488        }
489
490        journal_write_revoke_records(journal, commit_transaction);
491
492        /*
493         * If we found any dirty or locked buffers, then we should have
494         * looped back up to the write_out_data label.  If there weren't
495         * any then journal_clean_data_list should have wiped the list
496         * clean by now, so check that it is in fact empty.
497         */
498        J_ASSERT (commit_transaction->t_sync_datalist == NULL);
499
500        jbd_debug (3, "JBD: commit phase 3\n");
501
502        /*
503         * Way to go: we have now written out all of the data for a
504         * transaction!  Now comes the tricky part: we need to write out
505         * metadata.  Loop over the transaction's entire buffer list:
506         */
507        spin_lock(&journal->j_state_lock);
508        commit_transaction->t_state = T_COMMIT;
509        spin_unlock(&journal->j_state_lock);
510
511        J_ASSERT(commit_transaction->t_nr_buffers <=
512                 commit_transaction->t_outstanding_credits);
513
514        descriptor = NULL;
515        bufs = 0;
516        while (commit_transaction->t_buffers) {
517
518                /* Find the next buffer to be journaled... */
519
520                jh = commit_transaction->t_buffers;
521
522                /* If we're in abort mode, we just un-journal the buffer and
523                   release it. */
524
525                if (is_journal_aborted(journal)) {
526                        clear_buffer_jbddirty(jh2bh(jh));
527                        JBUFFER_TRACE(jh, "journal is aborting: refile");
528                        journal_refile_buffer(journal, jh);
529                        /* If that was the last one, we need to clean up
530                         * any descriptor buffers which may have been
531                         * already allocated, even if we are now
532                         * aborting. */
533                        if (!commit_transaction->t_buffers)
534                                goto start_journal_io;
535                        continue;
536                }
537
538                /* Make sure we have a descriptor block in which to
539                   record the metadata buffer. */
540
541                if (!descriptor) {
542                        struct buffer_head *bh;
543
544                        J_ASSERT (bufs == 0);
545
546                        jbd_debug(4, "JBD: get descriptor\n");
547
548                        descriptor = journal_get_descriptor_buffer(journal);
549                        if (!descriptor) {
550                                journal_abort(journal, -EIO);
551                                continue;
552                        }
553
554                        bh = jh2bh(descriptor);
555                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
556                                (unsigned long long)bh->b_blocknr, bh->b_data);
557                        header = (journal_header_t *)&bh->b_data[0];
558                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
559                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
560                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
561
562                        tagp = &bh->b_data[sizeof(journal_header_t)];
563                        space_left = bh->b_size - sizeof(journal_header_t);
564                        first_tag = 1;
565                        set_buffer_jwrite(bh);
566                        set_buffer_dirty(bh);
567                        wbuf[bufs++] = bh;
568
569                        /* Record it so that we can wait for IO
570                           completion later */
571                        BUFFER_TRACE(bh, "ph3: file as descriptor");
572                        journal_file_buffer(descriptor, commit_transaction,
573                                        BJ_LogCtl);
574                }
575
576                /* Where is the buffer to be written? */
577
578                err = journal_next_log_block(journal, &blocknr);
579                /* If the block mapping failed, just abandon the buffer
580                   and repeat this loop: we'll fall into the
581                   refile-on-abort condition above. */
582                if (err) {
583                        journal_abort(journal, err);
584                        continue;
585                }
586
587                /*
588                 * start_this_handle() uses t_outstanding_credits to determine
589                 * the free space in the log, but this counter is changed
590                 * by journal_next_log_block() also.
591                 */
592                commit_transaction->t_outstanding_credits--;
593
594                /* Bump b_count to prevent truncate from stumbling over
595                   the shadowed buffer!  @@@ This can go if we ever get
596                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
597                atomic_inc(&jh2bh(jh)->b_count);
598
599                /* Make a temporary IO buffer with which to write it out
600                   (this will requeue both the metadata buffer and the
601                   temporary IO buffer). new_bh goes on BJ_IO*/
602
603                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
604                /*
605                 * akpm: journal_write_metadata_buffer() sets
606                 * new_bh->b_transaction to commit_transaction.
607                 * We need to clean this up before we release new_bh
608                 * (which is of type BJ_IO)
609                 */
610                JBUFFER_TRACE(jh, "ph3: write metadata");
611                flags = journal_write_metadata_buffer(commit_transaction,
612                                                      jh, &new_jh, blocknr);
613                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
614                wbuf[bufs++] = jh2bh(new_jh);
615
616                /* Record the new block's tag in the current descriptor
617                   buffer */
618
619                tag_flag = 0;
620                if (flags & 1)
621                        tag_flag |= JFS_FLAG_ESCAPE;
622                if (!first_tag)
623                        tag_flag |= JFS_FLAG_SAME_UUID;
624
625                tag = (journal_block_tag_t *) tagp;
626                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
627                tag->t_flags = cpu_to_be32(tag_flag);
628                tagp += sizeof(journal_block_tag_t);
629                space_left -= sizeof(journal_block_tag_t);
630
631                if (first_tag) {
632                        memcpy (tagp, journal->j_uuid, 16);
633                        tagp += 16;
634                        space_left -= 16;
635                        first_tag = 0;
636                }
637
638                /* If there's no more to do, or if the descriptor is full,
639                   let the IO rip! */
640
641                if (bufs == journal->j_wbufsize ||
642                    commit_transaction->t_buffers == NULL ||
643                    space_left < sizeof(journal_block_tag_t) + 16) {
644
645                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
646
647                        /* Write an end-of-descriptor marker before
648                           submitting the IOs.  "tag" still points to
649                           the last tag we set up. */
650
651                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
652
653start_journal_io:
654                        for (i = 0; i < bufs; i++) {
655                                struct buffer_head *bh = wbuf[i];
656                                lock_buffer(bh);
657                                clear_buffer_dirty(bh);
658                                set_buffer_uptodate(bh);
659                                bh->b_end_io = journal_end_buffer_io_sync;
660                                submit_bh(WRITE, bh);
661                        }
662                        cond_resched();
663
664                        /* Force a new descriptor to be generated next
665                           time round the loop. */
666                        descriptor = NULL;
667                        bufs = 0;
668                }
669        }
670
671        /* Lo and behold: we have just managed to send a transaction to
672           the log.  Before we can commit it, wait for the IO so far to
673           complete.  Control buffers being written are on the
674           transaction's t_log_list queue, and metadata buffers are on
675           the t_iobuf_list queue.
676
677           Wait for the buffers in reverse order.  That way we are
678           less likely to be woken up until all IOs have completed, and
679           so we incur less scheduling load.
680        */
681
682        jbd_debug(3, "JBD: commit phase 4\n");
683
684        /*
685         * akpm: these are BJ_IO, and j_list_lock is not needed.
686         * See __journal_try_to_free_buffer.
687         */
688wait_for_iobuf:
689        while (commit_transaction->t_iobuf_list != NULL) {
690                struct buffer_head *bh;
691
692                jh = commit_transaction->t_iobuf_list->b_tprev;
693                bh = jh2bh(jh);
694                if (buffer_locked(bh)) {
695                        wait_on_buffer(bh);
696                        goto wait_for_iobuf;
697                }
698                if (cond_resched())
699                        goto wait_for_iobuf;
700
701                if (unlikely(!buffer_uptodate(bh)))
702                        err = -EIO;
703
704                clear_buffer_jwrite(bh);
705
706                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
707                journal_unfile_buffer(journal, jh);
708
709                /*
710                 * ->t_iobuf_list should contain only dummy buffer_heads
711                 * which were created by journal_write_metadata_buffer().
712                 */
713                BUFFER_TRACE(bh, "dumping temporary bh");
714                journal_put_journal_head(jh);
715                __brelse(bh);
716                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
717                free_buffer_head(bh);
718
719                /* We also have to unlock and free the corresponding
720                   shadowed buffer */
721                jh = commit_transaction->t_shadow_list->b_tprev;
722                bh = jh2bh(jh);
723                clear_bit(BH_JWrite, &bh->b_state);
724                J_ASSERT_BH(bh, buffer_jbddirty(bh));
725
726                /* The metadata is now released for reuse, but we need
727                   to remember it against this transaction so that when
728                   we finally commit, we can do any checkpointing
729                   required. */
730                JBUFFER_TRACE(jh, "file as BJ_Forget");
731                journal_file_buffer(jh, commit_transaction, BJ_Forget);
732                /* Wake up any transactions which were waiting for this
733                   IO to complete */
734                wake_up_bit(&bh->b_state, BH_Unshadow);
735                JBUFFER_TRACE(jh, "brelse shadowed buffer");
736                __brelse(bh);
737        }
738
739        J_ASSERT (commit_transaction->t_shadow_list == NULL);
740
741        jbd_debug(3, "JBD: commit phase 5\n");
742
743        /* Here we wait for the revoke record and descriptor record buffers */
744 wait_for_ctlbuf:
745        while (commit_transaction->t_log_list != NULL) {
746                struct buffer_head *bh;
747
748                jh = commit_transaction->t_log_list->b_tprev;
749                bh = jh2bh(jh);
750                if (buffer_locked(bh)) {
751                        wait_on_buffer(bh);
752                        goto wait_for_ctlbuf;
753                }
754                if (cond_resched())
755                        goto wait_for_ctlbuf;
756
757                if (unlikely(!buffer_uptodate(bh)))
758                        err = -EIO;
759
760                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
761                clear_buffer_jwrite(bh);
762                journal_unfile_buffer(journal, jh);
763                journal_put_journal_head(jh);
764                __brelse(bh);                /* One for getblk */
765                /* AKPM: bforget here */
766        }
767
768        if (err)
769                journal_abort(journal, err);
770
771        jbd_debug(3, "JBD: commit phase 6\n");
772
773        if (journal_write_commit_record(journal, commit_transaction))
774                err = -EIO;
775
776        if (err)
777                journal_abort(journal, err);
778
779        /* End of a transaction!  Finally, we can do checkpoint
780           processing: any buffers committed as a result of this
781           transaction can be removed from any checkpoint lists they
782           were on before. */
783
784        jbd_debug(3, "JBD: commit phase 7\n");
785
786        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
787        J_ASSERT(commit_transaction->t_buffers == NULL);
788        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
789        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
790        J_ASSERT(commit_transaction->t_shadow_list == NULL);
791        J_ASSERT(commit_transaction->t_log_list == NULL);
792
793restart_loop:
794        /*
795         * As there are other places (journal_unmap_buffer()) adding buffers
796         * to this list we have to be careful and hold the j_list_lock.
797         */
798        spin_lock(&journal->j_list_lock);
799        while (commit_transaction->t_forget) {
800                transaction_t *cp_transaction;
801                struct buffer_head *bh;
802
803                jh = commit_transaction->t_forget;
804                spin_unlock(&journal->j_list_lock);
805                bh = jh2bh(jh);
806                jbd_lock_bh_state(bh);
807                J_ASSERT_JH(jh,        jh->b_transaction == commit_transaction ||
808                        jh->b_transaction == journal->j_running_transaction);
809
810                /*
811                 * If there is undo-protected committed data against
812                 * this buffer, then we can remove it now.  If it is a
813                 * buffer needing such protection, the old frozen_data
814                 * field now points to a committed version of the
815                 * buffer, so rotate that field to the new committed
816                 * data.
817                 *
818                 * Otherwise, we can just throw away the frozen data now.
819                 */
820                if (jh->b_committed_data) {
821                        jbd_free(jh->b_committed_data, bh->b_size);
822                        jh->b_committed_data = NULL;
823                        if (jh->b_frozen_data) {
824                                jh->b_committed_data = jh->b_frozen_data;
825                                jh->b_frozen_data = NULL;
826                        }
827                } else if (jh->b_frozen_data) {
828                        jbd_free(jh->b_frozen_data, bh->b_size);
829                        jh->b_frozen_data = NULL;
830                }
831
832                spin_lock(&journal->j_list_lock);
833                cp_transaction = jh->b_cp_transaction;
834                if (cp_transaction) {
835                        JBUFFER_TRACE(jh, "remove from old cp transaction");
836                        __journal_remove_checkpoint(jh);
837                }
838
839                /* Only re-checkpoint the buffer_head if it is marked
840                 * dirty.  If the buffer was added to the BJ_Forget list
841                 * by journal_forget, it may no longer be dirty and
842                 * there's no point in keeping a checkpoint record for
843                 * it. */
844
845                /* A buffer which has been freed while still being
846                 * journaled by a previous transaction may end up still
847                 * being dirty here, but we want to avoid writing back
848                 * that buffer in the future now that the last use has
849                 * been committed.  That's not only a performance gain,
850                 * it also stops aliasing problems if the buffer is left
851                 * behind for writeback and gets reallocated for another
852                 * use in a different page. */
853                if (buffer_freed(bh)) {
854                        clear_buffer_freed(bh);
855                        clear_buffer_jbddirty(bh);
856                }
857
858                if (buffer_jbddirty(bh)) {
859                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
860                        __journal_insert_checkpoint(jh, commit_transaction);
861                        if (is_journal_aborted(journal))
862                                clear_buffer_jbddirty(bh);
863                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
864                        __journal_refile_buffer(jh);
865                        jbd_unlock_bh_state(bh);
866                } else {
867                        J_ASSERT_BH(bh, !buffer_dirty(bh));
868                        /* A buffer that is on the BJ_Forget list but is not
869                         * jbddirty has been freed by this transaction, hence it
870                         * could not have been reallocated until this
871                         * transaction has committed. *BUT* it could be
872                         * reallocated once we have written all the data to
873                         * disk and before we process the buffer on the
874                         * BJ_Forget list. */
875                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
876                        __journal_refile_buffer(jh);
877                        if (!jh->b_transaction) {
878                                jbd_unlock_bh_state(bh);
879                                 /* needs a brelse */
880                                journal_remove_journal_head(bh);
881                                release_buffer_page(bh);
882                        } else
883                                jbd_unlock_bh_state(bh);
884                }
885                cond_resched_lock(&journal->j_list_lock);
886        }
887        spin_unlock(&journal->j_list_lock);
888        /*
889         * This is a bit sleazy.  We use j_list_lock to protect transition
890         * of a transaction into T_FINISHED state and calling
891         * __journal_drop_transaction(). Otherwise we could race with
892         * other checkpointing code processing the transaction...
893         */
894        spin_lock(&journal->j_state_lock);
895        spin_lock(&journal->j_list_lock);
896        /*
897         * Now recheck whether any buffers got attached to the transaction
898         * while the lock was dropped...
899         */
900        if (commit_transaction->t_forget) {
901                spin_unlock(&journal->j_list_lock);
902                spin_unlock(&journal->j_state_lock);
903                goto restart_loop;
904        }
905
906        /* Done with this transaction! */
907
908        jbd_debug(3, "JBD: commit phase 8\n");
909
910        J_ASSERT(commit_transaction->t_state == T_COMMIT);
911
912        commit_transaction->t_state = T_FINISHED;
913        J_ASSERT(commit_transaction == journal->j_committing_transaction);
914        journal->j_commit_sequence = commit_transaction->t_tid;
915        journal->j_committing_transaction = NULL;
916        spin_unlock(&journal->j_state_lock);
917
918        if (commit_transaction->t_checkpoint_list == NULL &&
919            commit_transaction->t_checkpoint_io_list == NULL) {
920                __journal_drop_transaction(journal, commit_transaction);
921        } else {
922                if (journal->j_checkpoint_transactions == NULL) {
923                        journal->j_checkpoint_transactions = commit_transaction;
924                        commit_transaction->t_cpnext = commit_transaction;
925                        commit_transaction->t_cpprev = commit_transaction;
926                } else {
927                        commit_transaction->t_cpnext =
928                                journal->j_checkpoint_transactions;
929                        commit_transaction->t_cpprev =
930                                commit_transaction->t_cpnext->t_cpprev;
931                        commit_transaction->t_cpnext->t_cpprev =
932                                commit_transaction;
933                        commit_transaction->t_cpprev->t_cpnext =
934                                commit_transaction;
935                }
936        }
937        spin_unlock(&journal->j_list_lock);
938
939        jbd_debug(1, "JBD: commit %d complete, head %d\n",
940                  journal->j_commit_sequence, journal->j_tail_sequence);
941
942        wake_up(&journal->j_wait_done_commit);
943}