Showing error 1207

User: Jiri Slaby
Error type: Double Unlock
Error type description: A lock is released twice along one execution path without an intervening lock operation (illustrated in the sketch below)
File location: fs/xfs/linux-2.6/xfs_lrw.c
Line in file: 815
Project: Linux Kernel
Project version: 2.6.28
Tools: Stanse (1.2)
Entered: 2012-04-30 10:52:00 UTC
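
For context, a double unlock is reported when an unlock of the same lock object is reachable twice along a single execution path with no lock operation in between. A minimal sketch of the pattern the checker flags (hypothetical kernel-style code, not taken from the file below):

        #include <linux/mutex.h>

        static DEFINE_MUTEX(m);

        static int example(int err)
        {
                mutex_lock(&m);
                if (err)
                        mutex_unlock(&m);       /* early release on the error path */
                /* ... more work ... */
                mutex_unlock(&m);               /* BUG: released again when err != 0 */
                return err;
        }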


Source:

  1/*
  2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
  3 * All Rights Reserved.
  4 *
  5 * This program is free software; you can redistribute it and/or
  6 * modify it under the terms of the GNU General Public License as
  7 * published by the Free Software Foundation.
  8 *
  9 * This program is distributed in the hope that it would be useful,
 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 * GNU General Public License for more details.
 13 *
 14 * You should have received a copy of the GNU General Public License
 15 * along with this program; if not, write the Free Software Foundation,
 16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 17 */
 18#include "xfs.h"
 19#include "xfs_fs.h"
 20#include "xfs_bit.h"
 21#include "xfs_log.h"
 22#include "xfs_inum.h"
 23#include "xfs_trans.h"
 24#include "xfs_sb.h"
 25#include "xfs_ag.h"
 26#include "xfs_dir2.h"
 27#include "xfs_alloc.h"
 28#include "xfs_dmapi.h"
 29#include "xfs_quota.h"
 30#include "xfs_mount.h"
 31#include "xfs_bmap_btree.h"
 32#include "xfs_alloc_btree.h"
 33#include "xfs_ialloc_btree.h"
 34#include "xfs_dir2_sf.h"
 35#include "xfs_attr_sf.h"
 36#include "xfs_dinode.h"
 37#include "xfs_inode.h"
 38#include "xfs_bmap.h"
 39#include "xfs_btree.h"
 40#include "xfs_ialloc.h"
 41#include "xfs_rtalloc.h"
 42#include "xfs_error.h"
 43#include "xfs_itable.h"
 44#include "xfs_rw.h"
 45#include "xfs_acl.h"
 46#include "xfs_attr.h"
 47#include "xfs_inode_item.h"
 48#include "xfs_buf_item.h"
 49#include "xfs_utils.h"
 50#include "xfs_iomap.h"
 51#include "xfs_vnodeops.h"
 52
 53#include <linux/capability.h>
 54#include <linux/mount.h>
 55#include <linux/writeback.h>
 56
 57
 58#if defined(XFS_RW_TRACE)
 59void
 60xfs_rw_enter_trace(
 61        int                        tag,
 62        xfs_inode_t                *ip,
 63        void                        *data,
 64        size_t                        segs,
 65        loff_t                        offset,
 66        int                        ioflags)
 67{
 68        if (ip->i_rwtrace == NULL)
 69                return;
 70        ktrace_enter(ip->i_rwtrace,
 71                (void *)(unsigned long)tag,
 72                (void *)ip,
 73                (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
 74                (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
 75                (void *)data,
 76                (void *)((unsigned long)segs),
 77                (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 78                (void *)((unsigned long)(offset & 0xffffffff)),
 79                (void *)((unsigned long)ioflags),
 80                (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
 81                (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
 82                (void *)((unsigned long)current_pid()),
 83                (void *)NULL,
 84                (void *)NULL,
 85                (void *)NULL,
 86                (void *)NULL);
 87}
 88
 89void
 90xfs_inval_cached_trace(
 91        xfs_inode_t        *ip,
 92        xfs_off_t        offset,
 93        xfs_off_t        len,
 94        xfs_off_t        first,
 95        xfs_off_t        last)
 96{
 97
 98        if (ip->i_rwtrace == NULL)
 99                return;
100        ktrace_enter(ip->i_rwtrace,
101                (void *)(__psint_t)XFS_INVAL_CACHED,
102                (void *)ip,
103                (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
104                (void *)((unsigned long)(offset & 0xffffffff)),
105                (void *)((unsigned long)((len >> 32) & 0xffffffff)),
106                (void *)((unsigned long)(len & 0xffffffff)),
107                (void *)((unsigned long)((first >> 32) & 0xffffffff)),
108                (void *)((unsigned long)(first & 0xffffffff)),
109                (void *)((unsigned long)((last >> 32) & 0xffffffff)),
110                (void *)((unsigned long)(last & 0xffffffff)),
111                (void *)((unsigned long)current_pid()),
112                (void *)NULL,
113                (void *)NULL,
114                (void *)NULL,
115                (void *)NULL,
116                (void *)NULL);
117}
118#endif
119
120/*
121 *        xfs_iozero
122 *
123 *        xfs_iozero clears the specified range of buffer supplied,
124 *        and marks all the affected blocks as valid and modified.  If
125 *        an affected block is not allocated, it will be allocated.  If
126 *        an affected block is not completely overwritten, and is not
127 *        valid before the operation, it will be read from disk before
128 *        being partially zeroed.
129 */
130STATIC int
131xfs_iozero(
132        struct xfs_inode        *ip,        /* inode                        */
133        loff_t                        pos,        /* offset in file                */
134        size_t                        count)        /* size of data to zero                */
135{
136        struct page                *page;
137        struct address_space        *mapping;
138        int                        status;
139
140        mapping = VFS_I(ip)->i_mapping;
141        do {
142                unsigned offset, bytes;
143                void *fsdata;
144
145                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
146                bytes = PAGE_CACHE_SIZE - offset;
147                if (bytes > count)
148                        bytes = count;
149
150                status = pagecache_write_begin(NULL, mapping, pos, bytes,
151                                        AOP_FLAG_UNINTERRUPTIBLE,
152                                        &page, &fsdata);
153                if (status)
154                        break;
155
156                zero_user(page, offset, bytes);
157
158                status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
159                                        page, fsdata);
160                WARN_ON(status <= 0); /* can't return less than zero! */
161                pos += bytes;
162                count -= bytes;
163                status = 0;
164        } while (count);
165
166        return (-status);
167}
168
169ssize_t                        /* bytes read, or (-)  error */
170xfs_read(
171        xfs_inode_t                *ip,
172        struct kiocb                *iocb,
173        const struct iovec        *iovp,
174        unsigned int                segs,
175        loff_t                        *offset,
176        int                        ioflags)
177{
178        struct file                *file = iocb->ki_filp;
179        struct inode                *inode = file->f_mapping->host;
180        xfs_mount_t                *mp = ip->i_mount;
181        size_t                        size = 0;
182        ssize_t                        ret = 0;
183        xfs_fsize_t                n;
184        unsigned long                seg;
185
186
187        XFS_STATS_INC(xs_read_calls);
188
189        /* START copy & waste from filemap.c */
190        for (seg = 0; seg < segs; seg++) {
191                const struct iovec *iv = &iovp[seg];
192
193                /*
194                 * If any segment has a negative length, or the cumulative
195                 * length ever wraps negative then return -EINVAL.
196                 */
197                size += iv->iov_len;
198                if (unlikely((ssize_t)(size|iv->iov_len) < 0))
199                        return XFS_ERROR(-EINVAL);
200        }
201        /* END copy & waste from filemap.c */
202
203        if (unlikely(ioflags & IO_ISDIRECT)) {
204                xfs_buftarg_t        *target =
205                        XFS_IS_REALTIME_INODE(ip) ?
206                                mp->m_rtdev_targp : mp->m_ddev_targp;
207                if ((*offset & target->bt_smask) ||
208                    (size & target->bt_smask)) {
209                        if (*offset == ip->i_size) {
210                                return (0);
211                        }
212                        return -XFS_ERROR(EINVAL);
213                }
214        }
215
216        n = XFS_MAXIOFFSET(mp) - *offset;
217        if ((n <= 0) || (size == 0))
218                return 0;
219
220        if (n < size)
221                size = n;
222
223        if (XFS_FORCED_SHUTDOWN(mp))
224                return -EIO;
225
226        if (unlikely(ioflags & IO_ISDIRECT))
227                mutex_lock(&inode->i_mutex);
228        xfs_ilock(ip, XFS_IOLOCK_SHARED);
229
230        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
231                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
232                int iolock = XFS_IOLOCK_SHARED;
233
234                ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
235                                        dmflags, &iolock);
236                if (ret) {
237                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
238                        if (unlikely(ioflags & IO_ISDIRECT))
239                                mutex_unlock(&inode->i_mutex);
240                        return ret;
241                }
242        }
243
244        if (unlikely(ioflags & IO_ISDIRECT)) {
245                if (inode->i_mapping->nrpages)
246                        ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247                                                    -1, FI_REMAPF_LOCKED);
248                mutex_unlock(&inode->i_mutex);
249                if (ret) {
250                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
251                        return ret;
252                }
253        }
254
255        xfs_rw_enter_trace(XFS_READ_ENTER, ip,
256                                (void *)iovp, segs, *offset, ioflags);
257
258        iocb->ki_pos = *offset;
259        ret = generic_file_aio_read(iocb, iovp, segs, *offset);
260        if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
261                ret = wait_on_sync_kiocb(iocb);
262        if (ret > 0)
263                XFS_STATS_ADD(xs_read_bytes, ret);
264
265        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
266        return ret;
267}
268
269ssize_t
270xfs_splice_read(
271        xfs_inode_t                *ip,
272        struct file                *infilp,
273        loff_t                        *ppos,
274        struct pipe_inode_info        *pipe,
275        size_t                        count,
276        int                        flags,
277        int                        ioflags)
278{
279        xfs_mount_t                *mp = ip->i_mount;
280        ssize_t                        ret;
281
282        XFS_STATS_INC(xs_read_calls);
283        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
284                return -EIO;
285
286        xfs_ilock(ip, XFS_IOLOCK_SHARED);
287
288        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
289                int iolock = XFS_IOLOCK_SHARED;
290                int error;
291
292                error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
293                                        FILP_DELAY_FLAG(infilp), &iolock);
294                if (error) {
295                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
296                        return -error;
297                }
298        }
299        xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
300                           pipe, count, *ppos, ioflags);
301        ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
302        if (ret > 0)
303                XFS_STATS_ADD(xs_read_bytes, ret);
304
305        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
306        return ret;
307}
308
309ssize_t
310xfs_splice_write(
311        xfs_inode_t                *ip,
312        struct pipe_inode_info        *pipe,
313        struct file                *outfilp,
314        loff_t                        *ppos,
315        size_t                        count,
316        int                        flags,
317        int                        ioflags)
318{
319        xfs_mount_t                *mp = ip->i_mount;
320        ssize_t                        ret;
321        struct inode                *inode = outfilp->f_mapping->host;
322        xfs_fsize_t                isize, new_size;
323
324        XFS_STATS_INC(xs_write_calls);
325        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
326                return -EIO;
327
328        xfs_ilock(ip, XFS_IOLOCK_EXCL);
329
330        if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
331                int iolock = XFS_IOLOCK_EXCL;
332                int error;
333
334                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
335                                        FILP_DELAY_FLAG(outfilp), &iolock);
336                if (error) {
337                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
338                        return -error;
339                }
340        }
341
342        new_size = *ppos + count;
343
344        xfs_ilock(ip, XFS_ILOCK_EXCL);
345        if (new_size > ip->i_size)
346                ip->i_new_size = new_size;
347        xfs_iunlock(ip, XFS_ILOCK_EXCL);
348
349        xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
350                           pipe, count, *ppos, ioflags);
351        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
352        if (ret > 0)
353                XFS_STATS_ADD(xs_write_bytes, ret);
354
355        isize = i_size_read(inode);
356        if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
357                *ppos = isize;
358
359        if (*ppos > ip->i_size) {
360                xfs_ilock(ip, XFS_ILOCK_EXCL);
361                if (*ppos > ip->i_size)
362                        ip->i_size = *ppos;
363                xfs_iunlock(ip, XFS_ILOCK_EXCL);
364        }
365
366        if (ip->i_new_size) {
367                xfs_ilock(ip, XFS_ILOCK_EXCL);
368                ip->i_new_size = 0;
369                if (ip->i_d.di_size > ip->i_size)
370                        ip->i_d.di_size = ip->i_size;
371                xfs_iunlock(ip, XFS_ILOCK_EXCL);
372        }
373        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
374        return ret;
375}
376
377/*
378 * This routine is called to handle zeroing any space in the last
379 * block of the file that is beyond the EOF.  We do this since the
380 * size is being increased without writing anything to that block
381 * and we don't want anyone to read the garbage on the disk.
382 */
383STATIC int                                /* error (positive) */
384xfs_zero_last_block(
385        xfs_inode_t        *ip,
386        xfs_fsize_t        offset,
387        xfs_fsize_t        isize)
388{
389        xfs_fileoff_t        last_fsb;
390        xfs_mount_t        *mp = ip->i_mount;
391        int                nimaps;
392        int                zero_offset;
393        int                zero_len;
394        int                error = 0;
395        xfs_bmbt_irec_t        imap;
396
397        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
398
399        zero_offset = XFS_B_FSB_OFFSET(mp, isize);
400        if (zero_offset == 0) {
401                /*
402                 * There are no extra bytes in the last block on disk to
403                 * zero, so return.
404                 */
405                return 0;
406        }
407
408        last_fsb = XFS_B_TO_FSBT(mp, isize);
409        nimaps = 1;
410        error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
411                          &nimaps, NULL, NULL);
412        if (error) {
413                return error;
414        }
415        ASSERT(nimaps > 0);
416        /*
417         * If the block underlying isize is just a hole, then there
418         * is nothing to zero.
419         */
420        if (imap.br_startblock == HOLESTARTBLOCK) {
421                return 0;
422        }
423        /*
424         * Zero the part of the last block beyond the EOF, and write it
425         * out sync.  We need to drop the ilock while we do this so we
426         * don't deadlock when the buffer cache calls back to us.
427         */
428        xfs_iunlock(ip, XFS_ILOCK_EXCL);
429
430        zero_len = mp->m_sb.sb_blocksize - zero_offset;
431        if (isize + zero_len > offset)
432                zero_len = offset - isize;
433        error = xfs_iozero(ip, isize, zero_len);
434
435        xfs_ilock(ip, XFS_ILOCK_EXCL);
436        ASSERT(error >= 0);
437        return error;
438}
439
440/*
441 * Zero any on disk space between the current EOF and the new,
442 * larger EOF.  This handles the normal case of zeroing the remainder
443 * of the last block in the file and the unusual case of zeroing blocks
444 * out beyond the size of the file.  This second case only happens
445 * with fixed size extents and when the system crashes before the inode
446 * size was updated but after blocks were allocated.  If fill is set,
447 * then any holes in the range are filled and zeroed.  If not, the holes
448 * are left alone as holes.
449 */
450
451int                                        /* error (positive) */
452xfs_zero_eof(
453        xfs_inode_t        *ip,
454        xfs_off_t        offset,                /* starting I/O offset */
455        xfs_fsize_t        isize)                /* current inode size */
456{
457        xfs_mount_t        *mp = ip->i_mount;
458        xfs_fileoff_t        start_zero_fsb;
459        xfs_fileoff_t        end_zero_fsb;
460        xfs_fileoff_t        zero_count_fsb;
461        xfs_fileoff_t        last_fsb;
462        xfs_fileoff_t        zero_off;
463        xfs_fsize_t        zero_len;
464        int                nimaps;
465        int                error = 0;
466        xfs_bmbt_irec_t        imap;
467
468        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
469        ASSERT(offset > isize);
470
471        /*
472         * First handle zeroing the block on which isize resides.
473         * We only zero a part of that block so it is handled specially.
474         */
475        error = xfs_zero_last_block(ip, offset, isize);
476        if (error) {
477                ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
478                return error;
479        }
480
481        /*
482         * Calculate the range between the new size and the old
483         * where blocks needing to be zeroed may exist.  To get the
484         * block where the last byte in the file currently resides,
485         * we need to subtract one from the size and truncate back
486         * to a block boundary.  We subtract 1 in case the size is
487         * exactly on a block boundary.
488         */
489        last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
490        start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
491        end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
492        ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
493        if (last_fsb == end_zero_fsb) {
494                /*
495                 * The size was only incremented on its last block.
496                 * We took care of that above, so just return.
497                 */
498                return 0;
499        }
500
501        ASSERT(start_zero_fsb <= end_zero_fsb);
502        while (start_zero_fsb <= end_zero_fsb) {
503                nimaps = 1;
504                zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
505                error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
506                                  0, NULL, 0, &imap, &nimaps, NULL, NULL);
507                if (error) {
508                        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
509                        return error;
510                }
511                ASSERT(nimaps > 0);
512
513                if (imap.br_state == XFS_EXT_UNWRITTEN ||
514                    imap.br_startblock == HOLESTARTBLOCK) {
515                        /*
516                         * This loop handles initializing pages that were
517                         * partially initialized by the code below this
518                         * loop. It basically zeroes the part of the page
519                         * that sits on a hole and sets the page as P_HOLE
520                         * and calls remapf if it is a mapped file.
521                         */
522                        start_zero_fsb = imap.br_startoff + imap.br_blockcount;
523                        ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
524                        continue;
525                }
526
527                /*
528                 * There are blocks we need to zero.
529                 * Drop the inode lock while we're doing the I/O.
530                 * We'll still have the iolock to protect us.
531                 */
532                xfs_iunlock(ip, XFS_ILOCK_EXCL);
533
534                zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
535                zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
536
537                if ((zero_off + zero_len) > offset)
538                        zero_len = offset - zero_off;
539
540                error = xfs_iozero(ip, zero_off, zero_len);
541                if (error) {
542                        goto out_lock;
543                }
544
545                start_zero_fsb = imap.br_startoff + imap.br_blockcount;
546                ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
547
548                xfs_ilock(ip, XFS_ILOCK_EXCL);
549        }
550
551        return 0;
552
553out_lock:
554        xfs_ilock(ip, XFS_ILOCK_EXCL);
555        ASSERT(error >= 0);
556        return error;
557}
558
559ssize_t                                /* bytes written, or (-) error */
560xfs_write(
561        struct xfs_inode        *xip,
562        struct kiocb                *iocb,
563        const struct iovec        *iovp,
564        unsigned int                nsegs,
565        loff_t                        *offset,
566        int                        ioflags)
567{
568        struct file                *file = iocb->ki_filp;
569        struct address_space        *mapping = file->f_mapping;
570        struct inode                *inode = mapping->host;
571        unsigned long                segs = nsegs;
572        xfs_mount_t                *mp;
573        ssize_t                        ret = 0, error = 0;
574        xfs_fsize_t                isize, new_size;
575        int                        iolock;
576        int                        eventsent = 0;
577        size_t                        ocount = 0, count;
578        loff_t                        pos;
579        int                        need_i_mutex;
580
581        XFS_STATS_INC(xs_write_calls);
582
583        error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
584        if (error)
585                return error;
586
587        count = ocount;
588        pos = *offset;
589
590        if (count == 0)
591                return 0;
592
593        mp = xip->i_mount;
594
595        xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
596
597        if (XFS_FORCED_SHUTDOWN(mp))
598                return -EIO;
599
600relock:
601        if (ioflags & IO_ISDIRECT) {
602                iolock = XFS_IOLOCK_SHARED;
603                need_i_mutex = 0;
604        } else {
605                iolock = XFS_IOLOCK_EXCL;
606                need_i_mutex = 1;
607                mutex_lock(&inode->i_mutex);
608        }
609
610        xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
611
612start:
613        error = -generic_write_checks(file, &pos, &count,
614                                        S_ISBLK(inode->i_mode));
615        if (error) {
616                xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
617                goto out_unlock_mutex;
618        }
619
620        if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
621            !(ioflags & IO_INVIS) && !eventsent)) {
622                int                dmflags = FILP_DELAY_FLAG(file);
623
624                if (need_i_mutex)
625                        dmflags |= DM_FLAGS_IMUX;
626
627                xfs_iunlock(xip, XFS_ILOCK_EXCL);
628                error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
629                                      pos, count, dmflags, &iolock);
630                if (error) {
631                        goto out_unlock_internal;
632                }
633                xfs_ilock(xip, XFS_ILOCK_EXCL);
634                eventsent = 1;
635
636                /*
637                 * The iolock was dropped and reacquired in XFS_SEND_DATA
638                 * so we have to recheck the size when appending.
639                 * We will only "goto start;" once, since having sent the
640                 * event prevents another call to XFS_SEND_DATA, which is
641                 * what allows the size to change in the first place.
642                 */
643                if ((file->f_flags & O_APPEND) && pos != xip->i_size)
644                        goto start;
645        }
646
647        if (ioflags & IO_ISDIRECT) {
648                xfs_buftarg_t        *target =
649                        XFS_IS_REALTIME_INODE(xip) ?
650                                mp->m_rtdev_targp : mp->m_ddev_targp;
651
652                if ((pos & target->bt_smask) || (count & target->bt_smask)) {
653                        xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
654                        return XFS_ERROR(-EINVAL);
655                }
656
657                if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
658                        xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
659                        iolock = XFS_IOLOCK_EXCL;
660                        need_i_mutex = 1;
661                        mutex_lock(&inode->i_mutex);
662                        xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
663                        goto start;
664                }
665        }
666
667        new_size = pos + count;
668        if (new_size > xip->i_size)
669                xip->i_new_size = new_size;
670
671        /*
672         * We're not supposed to change timestamps in readonly-mounted
673         * filesystems.  Throw it away if anyone asks us.
674         */
675        if (likely(!(ioflags & IO_INVIS) &&
676                   !mnt_want_write(file->f_path.mnt))) {
677                xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678                mnt_drop_write(file->f_path.mnt);
679        }
680
681        /*
682         * If the offset is beyond the size of the file, we have a couple
683         * of things to do. First, if there is already space allocated
684         * we need to either create holes or zero the disk or ...
685         *
686         * If there is a page where the previous size lands, we need
687         * to zero it out up to the new size.
688         */
689
690        if (pos > xip->i_size) {
691                error = xfs_zero_eof(xip, pos, xip->i_size);
692                if (error) {
693                        xfs_iunlock(xip, XFS_ILOCK_EXCL);
694                        goto out_unlock_internal;
695                }
696        }
697        xfs_iunlock(xip, XFS_ILOCK_EXCL);
698
699        /*
700         * If we're writing the file then make sure to clear the
701         * setuid and setgid bits if the process is not being run
702         * by root.  This keeps people from modifying setuid and
703         * setgid binaries.
704         */
705
706        if (((xip->i_d.di_mode & S_ISUID) ||
707            ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
708                (S_ISGID | S_IXGRP))) &&
709             !capable(CAP_FSETID)) {
710                error = xfs_write_clear_setuid(xip);
711                if (likely(!error))
712                        error = -file_remove_suid(file);
713                if (unlikely(error)) {
714                        goto out_unlock_internal;
715                }
716        }
717
718retry:
719        /* We can write back this queue in page reclaim */
720        current->backing_dev_info = mapping->backing_dev_info;
721
722        if ((ioflags & IO_ISDIRECT)) {
723                if (mapping->nrpages) {
724                        WARN_ON(need_i_mutex == 0);
725                        xfs_inval_cached_trace(xip, pos, -1,
726                                        (pos & PAGE_CACHE_MASK), -1);
727                        error = xfs_flushinval_pages(xip,
728                                        (pos & PAGE_CACHE_MASK),
729                                        -1, FI_REMAPF_LOCKED);
730                        if (error)
731                                goto out_unlock_internal;
732                }
733
734                if (need_i_mutex) {
735                        /* demote the lock now the cached pages are gone */
736                        xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
737                        mutex_unlock(&inode->i_mutex);
738
739                        iolock = XFS_IOLOCK_SHARED;
740                        need_i_mutex = 0;
741                }
742
743                 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
744                                *offset, ioflags);
745                ret = generic_file_direct_write(iocb, iovp,
746                                &segs, pos, offset, count, ocount);
747
748                /*
749                 * direct-io write to a hole: fall through to buffered I/O
750                 * for completing the rest of the request.
751                 */
752                if (ret >= 0 && ret != count) {
753                        XFS_STATS_ADD(xs_write_bytes, ret);
754
755                        pos += ret;
756                        count -= ret;
757
758                        ioflags &= ~IO_ISDIRECT;
759                        xfs_iunlock(xip, iolock);
760                        goto relock;
761                }
762        } else {
763                xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
764                                *offset, ioflags);
765                ret = generic_file_buffered_write(iocb, iovp, segs,
766                                pos, offset, count, ret);
767        }
768
769        current->backing_dev_info = NULL;
770
771        if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
772                ret = wait_on_sync_kiocb(iocb);
773
774        if (ret == -ENOSPC &&
775            DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
776                xfs_iunlock(xip, iolock);
777                if (need_i_mutex)
778                        mutex_unlock(&inode->i_mutex);
779                error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
780                                DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
781                                0, 0, 0); /* Delay flag intentionally  unused */
782                if (need_i_mutex)
783                        mutex_lock(&inode->i_mutex);
784                xfs_ilock(xip, iolock);
785                if (error)
786                        goto out_unlock_internal;
787                pos = xip->i_size;
788                ret = 0;
789                goto retry;
790        }
791
792        isize = i_size_read(inode);
793        if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
794                *offset = isize;
795
796        if (*offset > xip->i_size) {
797                xfs_ilock(xip, XFS_ILOCK_EXCL);
798                if (*offset > xip->i_size)
799                        xip->i_size = *offset;
800                xfs_iunlock(xip, XFS_ILOCK_EXCL);
801        }
802
803        error = -ret;
804        if (ret <= 0)
805                goto out_unlock_internal;
806
807        XFS_STATS_ADD(xs_write_bytes, ret);
808
809        /* Handle various SYNC-type writes */
810        if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
811                int error2;
812
813                xfs_iunlock(xip, iolock);
814                if (need_i_mutex)
815                        mutex_unlock(&inode->i_mutex);
816                error2 = sync_page_range(inode, mapping, pos, ret);
817                if (!error)
818                        error = error2;
819                if (need_i_mutex)
820                        mutex_lock(&inode->i_mutex);
821                xfs_ilock(xip, iolock);
822                error2 = xfs_write_sync_logforce(mp, xip);
823                if (!error)
824                        error = error2;
825        }
826
827 out_unlock_internal:
828        if (xip->i_new_size) {
829                xfs_ilock(xip, XFS_ILOCK_EXCL);
830                xip->i_new_size = 0;
831                /*
832                 * If this was a direct or synchronous I/O that failed (such
833                 * as ENOSPC) then part of the I/O may have been written to
 834                 * disk before the error occurred.  In this case the on-disk
835                 * file size may have been adjusted beyond the in-memory file
836                 * size and now needs to be truncated back.
837                 */
838                if (xip->i_d.di_size > xip->i_size)
839                        xip->i_d.di_size = xip->i_size;
840                xfs_iunlock(xip, XFS_ILOCK_EXCL);
841        }
842        xfs_iunlock(xip, iolock);
843 out_unlock_mutex:
844        if (need_i_mutex)
845                mutex_unlock(&inode->i_mutex);
846        return -error;
847}
848
849/*
850 * All xfs metadata buffers except log state machine buffers
851 * get this attached as their b_bdstrat callback function.
852 * This is so that we can catch a buffer
853 * after prematurely unpinning it to forcibly shutdown the filesystem.
854 */
855int
856xfs_bdstrat_cb(struct xfs_buf *bp)
857{
858        xfs_mount_t        *mp;
859
860        mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
861        if (!XFS_FORCED_SHUTDOWN(mp)) {
862                xfs_buf_iorequest(bp);
863                return 0;
864        } else {
865                xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
866                /*
867                 * Metadata write that didn't get logged but
868                 * written delayed anyway. These aren't associated
869                 * with a transaction, and can be ignored.
870                 */
871                if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
872                    (XFS_BUF_ISREAD(bp)) == 0)
873                        return (xfs_bioerror_relse(bp));
874                else
875                        return (xfs_bioerror(bp));
876        }
877}
878
879/*
880 * Wrapper around bdstrat so that we can stop data from going to disk in case
881 * we are shutting down the filesystem.  Typically user data goes thru this
882 * path; one of the exceptions is the superblock.
883 */
884void
885xfsbdstrat(
886        struct xfs_mount        *mp,
887        struct xfs_buf                *bp)
888{
889        ASSERT(mp);
890        if (!XFS_FORCED_SHUTDOWN(mp)) {
891                xfs_buf_iorequest(bp);
892                return;
893        }
894
895        xfs_buftrace("XFSBDSTRAT IOERROR", bp);
896        xfs_bioerror_relse(bp);
897}
898
899/*
900 * If the underlying (data/log/rt) device is readonly, there are some
901 * operations that cannot proceed.
902 */
903int
904xfs_dev_is_read_only(
905        xfs_mount_t                *mp,
906        char                        *message)
907{
908        if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
909            xfs_readonly_buftarg(mp->m_logdev_targp) ||
910            (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
911                cmn_err(CE_NOTE,
912                        "XFS: %s required on read-only device.", message);
913                cmn_err(CE_NOTE,
914                        "XFS: write access unavailable, cannot proceed.");
915                return EROFS;
916        }
917        return 0;
918}
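
Note on the flagged location (a reading of the listing above, not an authoritative diagnosis): reported line 815 is the mutex_unlock(&inode->i_mutex) in the O_SYNC branch of xfs_write(). On that path both the iolock and i_mutex are dropped (lines 813-815) and retaken (lines 819-821) around sync_page_range(), after which the cleanup labels out_unlock_internal (line 827) and out_unlock_mutex (line 843) release them again (lines 842 and 845). The checker presumably pairs the unlock at line 815 with another mutex_unlock reachable on the same path, such as the one at line 845; that sequence is a genuine double unlock only if the need_i_mutex-guarded re-lock at lines 819-820 is not tracked, so this may be a false positive of the analysis rather than a kernel bug.

A common way to keep such paths tractable for both reviewers and tools, which xfs_write() already approximates with its need_i_mutex flag, is to track lock ownership in a local flag and release only when the flag is set. A hypothetical sketch of that pattern, not a proposed patch:

        int locked = 1;                         /* we currently own the mutex */

        mutex_lock(&inode->i_mutex);
        /* ... */
        if (do_sync) {                          /* hypothetical condition */
                mutex_unlock(&inode->i_mutex);
                locked = 0;
                /* ... work that must run without the lock ... */
                mutex_lock(&inode->i_mutex);
                locked = 1;
        }
        /* ... */
        if (locked)
                mutex_unlock(&inode->i_mutex);  /* released at most once */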