/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include <linux/swapops.h>
#include <linux/elf.h>

#include "internal.h"

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* With a single, flat memory model there is one global mem_map. */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;

/* First virtual address above the kernel's direct (lowmem) mapping. */
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);

/*
 * Address-space layout randomization (stack, mmap, brk): 0 disables it;
 * with CONFIG_COMPAT_BRK the legacy, non-randomized brk is kept.
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

/*
 * If a corrupt page-table entry is found while walking the page tables,
 * report it and clear it so the walk can continue.  Called from the
 * p?d_none_or_clear_bad() helpers.
 */
void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Free one page-table page.  Only the page-table page itself is freed
 * here; the pages it mapped were already released when the memory
 * regions were unmapped.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * Free a range of user page tables.  floor and ceiling bound how far
 * the walk may reach when freeing the enclosing pmd/pud/pgd levels.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Round the range out to PMD boundaries, but never beyond floor
	 * and ceiling: a partial pmd at either end may still be needed
	 * by a neighbouring vma, so its page table is left in place.
	 */
	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Unlink the vma from rmap and from its file mapping
		 * before its page tables are freed.
		 */
		anon_vma_unlink(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Gather adjacent non-hugetlb vmas into a single
			 * free_pgd_range() call.
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				anon_vma_unlink(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Make sure the freshly initialized page table is visible to all
	 * CPUs before it is linked into the pmd: smp_wmb() orders the
	 * initialization against pmd_populate() below.
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
	if (!pmd_present(*pmd)) {	/* another thread may have populated it */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb();	/* see comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (!pmd_present(*pmd)) {
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
{
	if (file_rss)
		add_mm_counter(mm, file_rss, file_rss);
	if (anon_rss)
		add_mm_counter(mm, anon_rss, anon_rss);
}

/*
 * Report a pte that is inconsistent with its vma, along with the owning
 * process and the faulting address, and dump the stack.
 */
static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
			  unsigned long vaddr)
{
	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
			"vm_flags = %lx, vaddr = %lx\n",
		(long long)pte_val(pte),
		(vma->vm_mm == current->mm ? current->comm : "???"),
		vma->vm_flags, vaddr);
	dump_stack();
}

static inline int is_cow_mapping(unsigned int flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

/*
 * vm_normal_page -- return the "struct page" backing a pte, or NULL.
 *
 * "Normal" pages are the ones the VM tracks with rmap and a refcount;
 * raw PFN mappings (VM_PFNMAP) and other special mappings have no
 * struct page the VM should touch.  With __HAVE_ARCH_PTE_SPECIAL the
 * architecture marks such ptes directly; otherwise we fall back to
 * inspecting the vma flags and, for COW'ed PFN mappings, the linear
 * pfn/vm_pgoff relation.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn;

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte))) {
			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
			return pte_page(pte);
		}
		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	pfn = pte_pfn(pte);

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	VM_BUG_ON(!pfn_valid(pfn));

out:
	return pfn_to_page(pfn);
}

/*
 * Copy one pte from the parent to the child mm.  Swap entries get their
 * reference counts bumped, COW mappings are write-protected in both mms,
 * and the rss counts for the backing page (if any) are accumulated in
 * rss[].  Called with both page-table locks held.
 */
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			swap_duplicate(entry);
			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (is_write_migration_entry(entry) &&
					is_cow_mapping(vm_flags)) {
				/*
				 * COW mappings require pages in both parent
				 * and child to be set to read.
				 */
				make_migration_entry_read(&entry);
				pte = swp_entry_to_pte(entry);
				set_pte_at(src_mm, addr, src_pte, pte);
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child.
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child.
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
		page_dup_rmap(page, vma, addr);
		rss[!!PageAnon(page)]++;
	}

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
}

static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
	int progress = 0;
	int rss[2];

again:
	rss[1] = rss[0] = 0;
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;
	src_pte = pte_offset_map_nested(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
				break;
		}
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	spin_unlock(src_ptl);
	pte_unmap_nested(src_pte - 1);
	add_mm_rss(dst_mm, rss[0], rss[1]);
	pte_unmap_unlock(dst_pte - 1, dst_ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly:
	 * fork is much cheaper if it skips ordinary mappings that have
	 * no anonymous pages.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	/*
	 * Secondary MMU mappings need invalidating only when the parent's
	 * ptes may be downgraded, which happens only for COW mappings.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
}

static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	pte_t *pte;
	spinlock_t *ptl;
	int file_rss = 0;
	int anon_rss = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * When check_mapping is set, zap only pages
				 * still belonging to that address_space;
				 * private COWed copies are skipped.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * For nonlinear vmas, zap only pages whose
				 * index falls inside the requested range.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				anon_rss--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent))
					SetPageReferenced(page);
				file_rss--;
			}
			page_remove_rmap(page, vma);
			tlb_remove_page(tlb, page);
			continue;
		}
		/*
		 * Non-present entries (swap or file ptes) are left alone
		 * when details are supplied.
		 */
		if (unlikely(details))
			continue;
		if (!pte_file(ptent))
			free_swap_and_cache(pte_to_swp_entry(ptent));
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss(mm, file_rss, anon_rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);
	} while (pmd++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pmd_range(tlb, vma, pud, addr, next,
						zap_work, details);
	} while (pud++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}

static unsigned long unmap_page_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			(*zap_work)--;
			continue;
		}
		next = zap_pud_range(tlb, vma, pgd, addr, next,
						zap_work, details);
	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
	tlb_end_vma(tlb, vma);

	return addr;
}

#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather pointer
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: place number of unmapped VM_ACCOUNT pages here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the address reached, which is the end address or the restart
 * address if the walk was interrupted.  At most ZAP_BLOCK_SIZE worth of
 * work is done between points where the mmu_gather is finished and the
 * caller may be rescheduled (or the i_mmap_lock released).
 */
unsigned long unmap_vmas(struct mmu_gather **tlbp,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	long zap_work = ZAP_BLOCK_SIZE;
	unsigned long tlb_start = 0;	/* for tlb_finish_mmu */
	int tlb_start_valid = 0;
	unsigned long start = start_addr;
	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
	int fullmm = (*tlbp)->fullmm;
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		while (start != end) {
			if (!tlb_start_valid) {
				tlb_start = start;
				tlb_start_valid = 1;
			}

			if (unlikely(is_vm_hugetlb_page(vma))) {
				/*
				 * vm_file is NULL here only on the error
				 * cleanup path of a failed hugetlbfs mmap,
				 * where no ptes were ever set up, so there
				 * is nothing to unmap.
				 */
				if (vma->vm_file) {
					unmap_hugepage_range(vma, start, end, NULL);
					zap_work -= (end - start) /
					pages_per_huge_page(hstate_vma(vma));
				}

				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas, and the
 * entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
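/*
 * follow_page - look up the page mapped at @address in @vma.
 *
 * Walks the page tables and returns the struct page for a present,
 * "normal" mapping (or the zero page when FOLL_ANON allows it).
 * FOLL_WRITE requires the pte to be writable, FOLL_GET takes a
 * reference on the page, and FOLL_TOUCH marks it accessed (and dirty
 * for writes).
 */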
1023struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1024 unsigned int flags)
1025{
1026 pgd_t *pgd;
1027 pud_t *pud;
1028 pmd_t *pmd;
1029 pte_t *ptep, pte;
1030 spinlock_t *ptl;
1031 struct page *page;
1032 struct mm_struct *mm = vma->vm_mm;
1033
1034 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1035 if (!IS_ERR(page)) {
1036 BUG_ON(flags & FOLL_GET);
1037 goto out;
1038 }
1039
1040 page = NULL;
1041 pgd = pgd_offset(mm, address);
1042 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1043 goto no_page_table;
1044
1045 pud = pud_offset(pgd, address);
1046 if (pud_none(*pud))
1047 goto no_page_table;
1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
1056 pmd = pmd_offset(pud, address);
1057 if (pmd_none(*pmd))
1058 goto no_page_table;
1059 if (pmd_huge(*pmd)) {
1060 BUG_ON(flags & FOLL_GET);
1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1062 goto out;
1063 }
1064 if (unlikely(pmd_bad(*pmd)))
1065 goto no_page_table;
1066
1067 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1068
1069 pte = *ptep;
1070 if (!pte_present(pte))
1071 goto no_page;
1072 if ((flags & FOLL_WRITE) && !pte_write(pte))
1073 goto unlock;
1074 page = vm_normal_page(vma, address, pte);
1075 if (unlikely(!page))
1076 goto bad_page;
1077
1078 if (flags & FOLL_GET)
1079 get_page(page);
1080 if (flags & FOLL_TOUCH) {
1081 if ((flags & FOLL_WRITE) &&
1082 !pte_dirty(pte) && !PageDirty(page))
1083 set_page_dirty(page);
1084 mark_page_accessed(page);
1085 }
1086unlock:
1087 pte_unmap_unlock(ptep, ptl);
1088out:
1089 return page;
1090
1091bad_page:
1092 pte_unmap_unlock(ptep, ptl);
1093 return ERR_PTR(-EFAULT);
1094
1095no_page:
1096 pte_unmap_unlock(ptep, ptl);
1097 if (!pte_none(pte))
1098 return page;
1099
1100no_page_table:
1101
1102
1103
1104
1105 if (flags & FOLL_ANON) {
1106 page = ZERO_PAGE(0);
1107 if (flags & FOLL_GET)
1108 get_page(page);
1109 BUG_ON(flags & FOLL_WRITE);
1110 }
1111 return page;
1112}
1113
1114
1115static inline int use_zero_page(struct vm_area_struct *vma)
1116{
1117
1118
1119
1120
1121
1122
1123
1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1125 return 0;
1126
1127
1128
1129 return !vma->vm_ops || !vma->vm_ops->fault;
1130}
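/*
 * __get_user_pages - pin user pages in memory, faulting them in as
 * needed.  Handles the gate (vsyscall) area and hugetlb vmas specially;
 * fills @pages and @vmas (when non-NULL) and returns the number of
 * pages pinned, or a negative errno if no pages could be pinned.
 */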
1134int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1135 unsigned long start, int len, int flags,
1136 struct page **pages, struct vm_area_struct **vmas)
1137{
1138 int i;
1139 unsigned int vm_flags = 0;
1140 int write = !!(flags & GUP_FLAGS_WRITE);
1141 int force = !!(flags & GUP_FLAGS_FORCE);
1142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1143
1144 if (len <= 0)
1145 return 0;
1146
1147
1148
1149
1150 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1151 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1152 i = 0;
1153
1154 do {
1155 struct vm_area_struct *vma;
1156 unsigned int foll_flags;
1157
1158 vma = find_extend_vma(mm, start);
1159 if (!vma && in_gate_area(tsk, start)) {
1160 unsigned long pg = start & PAGE_MASK;
1161 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1162 pgd_t *pgd;
1163 pud_t *pud;
1164 pmd_t *pmd;
1165 pte_t *pte;
1166
1167
1168 if (!ignore && write)
1169 return i ? : -EFAULT;
1170 if (pg > TASK_SIZE)
1171 pgd = pgd_offset_k(pg);
1172 else
1173 pgd = pgd_offset_gate(mm, pg);
1174 BUG_ON(pgd_none(*pgd));
1175 pud = pud_offset(pgd, pg);
1176 BUG_ON(pud_none(*pud));
1177 pmd = pmd_offset(pud, pg);
1178 if (pmd_none(*pmd))
1179 return i ? : -EFAULT;
1180 pte = pte_offset_map(pmd, pg);
1181 if (pte_none(*pte)) {
1182 pte_unmap(pte);
1183 return i ? : -EFAULT;
1184 }
1185 if (pages) {
1186 struct page *page = vm_normal_page(gate_vma, start, *pte);
1187 pages[i] = page;
1188 if (page)
1189 get_page(page);
1190 }
1191 pte_unmap(pte);
1192 if (vmas)
1193 vmas[i] = gate_vma;
1194 i++;
1195 start += PAGE_SIZE;
1196 len--;
1197 continue;
1198 }
1199
1200 if (!vma ||
1201 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1202 (!ignore && !(vm_flags & vma->vm_flags)))
1203 return i ? : -EFAULT;
1204
1205 if (is_vm_hugetlb_page(vma)) {
1206 i = follow_hugetlb_page(mm, vma, pages, vmas,
1207 &start, &len, i, write);
1208 continue;
1209 }
1210
1211 foll_flags = FOLL_TOUCH;
1212 if (pages)
1213 foll_flags |= FOLL_GET;
1214 if (!write && use_zero_page(vma))
1215 foll_flags |= FOLL_ANON;
1216
1217 do {
1218 struct page *page;
1219
1220
1221
1222
1223
1224
1225 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
1226 return i ? i : -ENOMEM;
1227
1228 if (write)
1229 foll_flags |= FOLL_WRITE;
1230
1231 cond_resched();
1232 while (!(page = follow_page(vma, start, foll_flags))) {
1233 int ret;
1234 ret = handle_mm_fault(mm, vma, start,
1235 foll_flags & FOLL_WRITE);
1236 if (ret & VM_FAULT_ERROR) {
1237 if (ret & VM_FAULT_OOM)
1238 return i ? i : -ENOMEM;
1239 else if (ret & VM_FAULT_SIGBUS)
1240 return i ? i : -EFAULT;
1241 BUG();
1242 }
1243 if (ret & VM_FAULT_MAJOR)
1244 tsk->maj_flt++;
1245 else
1246 tsk->min_flt++;
1247
1248
1249
1250
1251
1252
1253
1254
1255 if (ret & VM_FAULT_WRITE)
1256 foll_flags &= ~FOLL_WRITE;
1257
1258 cond_resched();
1259 }
1260 if (IS_ERR(page))
1261 return i ? i : PTR_ERR(page);
1262 if (pages) {
1263 pages[i] = page;
1264
1265 flush_anon_page(vma, page, start);
1266 flush_dcache_page(page);
1267 }
1268 if (vmas)
1269 vmas[i] = vma;
1270 i++;
1271 start += PAGE_SIZE;
1272 len--;
1273 } while (len && start < vma->vm_end);
1274 } while (len);
1275 return i;
1276}
1277
1278int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1279 unsigned long start, int len, int write, int force,
1280 struct page **pages, struct vm_area_struct **vmas)
1281{
1282 int flags = 0;
1283
1284 if (write)
1285 flags |= GUP_FLAGS_WRITE;
1286 if (force)
1287 flags |= GUP_FLAGS_FORCE;
1288
1289 return __get_user_pages(tsk, mm,
1290 start, len, flags,
1291 pages, vmas);
1292}
1293
1294EXPORT_SYMBOL(get_user_pages);
1295
1296pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1297 spinlock_t **ptl)
1298{
1299 pgd_t * pgd = pgd_offset(mm, addr);
1300 pud_t * pud = pud_alloc(mm, pgd, addr);
1301 if (pud) {
1302 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1303 if (pmd)
1304 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1305 }
1306 return NULL;
1307}
1308
1309
1310
1311
1312
1313
1314
1315
1316static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1317 struct page *page, pgprot_t prot)
1318{
1319 struct mm_struct *mm = vma->vm_mm;
1320 int retval;
1321 pte_t *pte;
1322 spinlock_t *ptl;
1323
1324 retval = -EINVAL;
1325 if (PageAnon(page))
1326 goto out;
1327 retval = -ENOMEM;
1328 flush_dcache_page(page);
1329 pte = get_locked_pte(mm, addr, &ptl);
1330 if (!pte)
1331 goto out;
1332 retval = -EBUSY;
1333 if (!pte_none(*pte))
1334 goto out_unlock;
1335
1336
1337 get_page(page);
1338 inc_mm_counter(mm, file_rss);
1339 page_add_file_rmap(page);
1340 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1341
1342 retval = 0;
1343 pte_unmap_unlock(pte, ptl);
1344 return retval;
1345out_unlock:
1346 pte_unmap_unlock(pte, ptl);
1347out:
1348 return retval;
1349}
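/*
 * vm_insert_page - insert an individual kernel-allocated page into a
 * user vma.  The caller keeps its own reference; the page is refcounted
 * by the VM and tracked through rmap like a file page.  The vma is
 * marked VM_INSERTPAGE so that fork knows it must copy these ptes.
 */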
1373int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1374 struct page *page)
1375{
1376 if (addr < vma->vm_start || addr >= vma->vm_end)
1377 return -EFAULT;
1378 if (!page_count(page))
1379 return -EINVAL;
1380 vma->vm_flags |= VM_INSERTPAGE;
1381 return insert_page(vma, addr, page, vma->vm_page_prot);
1382}
1383EXPORT_SYMBOL(vm_insert_page);
1384
1385static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1386 unsigned long pfn, pgprot_t prot)
1387{
1388 struct mm_struct *mm = vma->vm_mm;
1389 int retval;
1390 pte_t *pte, entry;
1391 spinlock_t *ptl;
1392
1393 retval = -ENOMEM;
1394 pte = get_locked_pte(mm, addr, &ptl);
1395 if (!pte)
1396 goto out;
1397 retval = -EBUSY;
1398 if (!pte_none(*pte))
1399 goto out_unlock;
1400
1401
1402 entry = pte_mkspecial(pfn_pte(pfn, prot));
1403 set_pte_at(mm, addr, pte, entry);
1404 update_mmu_cache(vma, addr, entry);
1405
1406 retval = 0;
1407out_unlock:
1408 pte_unmap_unlock(pte, ptl);
1409out:
1410 return retval;
1411}
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1431 unsigned long pfn)
1432{
1433
1434
1435
1436
1437
1438
1439 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1440 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1441 (VM_PFNMAP|VM_MIXEDMAP));
1442 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1443 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1444
1445 if (addr < vma->vm_start || addr >= vma->vm_end)
1446 return -EFAULT;
1447 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1448}
1449EXPORT_SYMBOL(vm_insert_pfn);
1450
1451int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1452 unsigned long pfn)
1453{
1454 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1455
1456 if (addr < vma->vm_start || addr >= vma->vm_end)
1457 return -EFAULT;
1458
1459
1460
1461
1462
1463
1464
1465 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1466 struct page *page;
1467
1468 page = pfn_to_page(pfn);
1469 return insert_page(vma, addr, page, vma->vm_page_prot);
1470 }
1471 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1472}
1473EXPORT_SYMBOL(vm_insert_mixed);
1474
1475
1476
1477
1478
1479
1480static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1481 unsigned long addr, unsigned long end,
1482 unsigned long pfn, pgprot_t prot)
1483{
1484 pte_t *pte;
1485 spinlock_t *ptl;
1486
1487 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1488 if (!pte)
1489 return -ENOMEM;
1490 arch_enter_lazy_mmu_mode();
1491 do {
1492 BUG_ON(!pte_none(*pte));
1493 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1494 pfn++;
1495 } while (pte++, addr += PAGE_SIZE, addr != end);
1496 arch_leave_lazy_mmu_mode();
1497 pte_unmap_unlock(pte - 1, ptl);
1498 return 0;
1499}
1500
1501static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1502 unsigned long addr, unsigned long end,
1503 unsigned long pfn, pgprot_t prot)
1504{
1505 pmd_t *pmd;
1506 unsigned long next;
1507
1508 pfn -= addr >> PAGE_SHIFT;
1509 pmd = pmd_alloc(mm, pud, addr);
1510 if (!pmd)
1511 return -ENOMEM;
1512 do {
1513 next = pmd_addr_end(addr, end);
1514 if (remap_pte_range(mm, pmd, addr, next,
1515 pfn + (addr >> PAGE_SHIFT), prot))
1516 return -ENOMEM;
1517 } while (pmd++, addr = next, addr != end);
1518 return 0;
1519}
1520
1521static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1522 unsigned long addr, unsigned long end,
1523 unsigned long pfn, pgprot_t prot)
1524{
1525 pud_t *pud;
1526 unsigned long next;
1527
1528 pfn -= addr >> PAGE_SHIFT;
1529 pud = pud_alloc(mm, pgd, addr);
1530 if (!pud)
1531 return -ENOMEM;
1532 do {
1533 next = pud_addr_end(addr, end);
1534 if (remap_pmd_range(mm, pud, addr, next,
1535 pfn + (addr >> PAGE_SHIFT), prot))
1536 return -ENOMEM;
1537 } while (pud++, addr = next, addr != end);
1538 return 0;
1539}
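/*
 * remap_pfn_range - map a contiguous range of physical pages, given by
 * their starting pfn, into a user vma with the supplied protection.
 * The vma becomes a VM_IO | VM_RESERVED | VM_PFNMAP mapping; for
 * private (COW) mappings the whole vma must be covered so that
 * vm_pgoff can record the first pfn for vm_normal_page().
 */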
1551int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1552 unsigned long pfn, unsigned long size, pgprot_t prot)
1553{
1554 pgd_t *pgd;
1555 unsigned long next;
1556 unsigned long end = addr + PAGE_ALIGN(size);
1557 struct mm_struct *mm = vma->vm_mm;
1558 int err;
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 if (is_cow_mapping(vma->vm_flags)) {
1579 if (addr != vma->vm_start || end != vma->vm_end)
1580 return -EINVAL;
1581 vma->vm_pgoff = pfn;
1582 }
1583
1584 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1585
1586 BUG_ON(addr >= end);
1587 pfn -= addr >> PAGE_SHIFT;
1588 pgd = pgd_offset(mm, addr);
1589 flush_cache_range(vma, addr, end);
1590 do {
1591 next = pgd_addr_end(addr, end);
1592 err = remap_pud_range(mm, pgd, addr, next,
1593 pfn + (addr >> PAGE_SHIFT), prot);
1594 if (err)
1595 break;
1596 } while (pgd++, addr = next, addr != end);
1597 return err;
1598}
1599EXPORT_SYMBOL(remap_pfn_range);
1600
1601static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1602 unsigned long addr, unsigned long end,
1603 pte_fn_t fn, void *data)
1604{
1605 pte_t *pte;
1606 int err;
1607 pgtable_t token;
1608 spinlock_t *uninitialized_var(ptl);
1609
1610 pte = (mm == &init_mm) ?
1611 pte_alloc_kernel(pmd, addr) :
1612 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1613 if (!pte)
1614 return -ENOMEM;
1615
1616 BUG_ON(pmd_huge(*pmd));
1617
1618 token = pmd_pgtable(*pmd);
1619
1620 do {
1621 err = fn(pte, token, addr, data);
1622 if (err)
1623 break;
1624 } while (pte++, addr += PAGE_SIZE, addr != end);
1625
1626 if (mm != &init_mm)
1627 pte_unmap_unlock(pte-1, ptl);
1628 return err;
1629}
1630
1631static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1632 unsigned long addr, unsigned long end,
1633 pte_fn_t fn, void *data)
1634{
1635 pmd_t *pmd;
1636 unsigned long next;
1637 int err;
1638
1639 BUG_ON(pud_huge(*pud));
1640
1641 pmd = pmd_alloc(mm, pud, addr);
1642 if (!pmd)
1643 return -ENOMEM;
1644 do {
1645 next = pmd_addr_end(addr, end);
1646 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1647 if (err)
1648 break;
1649 } while (pmd++, addr = next, addr != end);
1650 return err;
1651}
1652
1653static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1654 unsigned long addr, unsigned long end,
1655 pte_fn_t fn, void *data)
1656{
1657 pud_t *pud;
1658 unsigned long next;
1659 int err;
1660
1661 pud = pud_alloc(mm, pgd, addr);
1662 if (!pud)
1663 return -ENOMEM;
1664 do {
1665 next = pud_addr_end(addr, end);
1666 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1667 if (err)
1668 break;
1669 } while (pud++, addr = next, addr != end);
1670 return err;
1671}
1672
1673
1674
1675
1676
1677int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1678 unsigned long size, pte_fn_t fn, void *data)
1679{
1680 pgd_t *pgd;
1681 unsigned long next;
1682 unsigned long start = addr, end = addr + size;
1683 int err;
1684
1685 BUG_ON(addr >= end);
1686 mmu_notifier_invalidate_range_start(mm, start, end);
1687 pgd = pgd_offset(mm, addr);
1688 do {
1689 next = pgd_addr_end(addr, end);
1690 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1691 if (err)
1692 break;
1693 } while (pgd++, addr = next, addr != end);
1694 mmu_notifier_invalidate_range_end(mm, start, end);
1695 return err;
1696}
1697EXPORT_SYMBOL_GPL(apply_to_page_range);
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1709 pte_t *page_table, pte_t orig_pte)
1710{
1711 int same = 1;
1712#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1713 if (sizeof(pte_t) > sizeof(unsigned long)) {
1714 spinlock_t *ptl = pte_lockptr(mm, pmd);
1715 spin_lock(ptl);
1716 same = pte_same(*page_table, orig_pte);
1717 spin_unlock(ptl);
1718 }
1719#endif
1720 pte_unmap(page_table);
1721 return same;
1722}
1723
1724
1725
1726
1727
1728
1729
1730static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1731{
1732 if (likely(vma->vm_flags & VM_WRITE))
1733 pte = pte_mkwrite(pte);
1734 return pte;
1735}
1736
1737static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1738{
1739
1740
1741
1742
1743
1744
1745 if (unlikely(!src)) {
1746 void *kaddr = kmap_atomic(dst, KM_USER0);
1747 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1748
1749
1750
1751
1752
1753
1754
1755 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1756 memset(kaddr, 0, PAGE_SIZE);
1757 kunmap_atomic(kaddr, KM_USER0);
1758 flush_dcache_page(dst);
1759 } else
1760 copy_user_highpage(dst, src, va, vma);
1761}
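/*
 * do_wp_page - handle a write fault on a present but write-protected
 * pte.  Shared writable mappings (after an optional ->page_mkwrite
 * callback) and exclusively-owned anonymous pages simply have the pte
 * made writable; otherwise the page is copied (COW) into a new
 * anonymous page.  Called with the pte lock held; returns with the
 * pte unmapped and unlocked.
 */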
1781static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1782 unsigned long address, pte_t *page_table, pmd_t *pmd,
1783 spinlock_t *ptl, pte_t orig_pte)
1784{
1785 struct page *old_page, *new_page;
1786 pte_t entry;
1787 int reuse = 0, ret = 0;
1788 int page_mkwrite = 0;
1789 struct page *dirty_page = NULL;
1790
1791 old_page = vm_normal_page(vma, address, orig_pte);
1792 if (!old_page) {
1793
1794
1795
1796
1797
1798
1799
1800 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1801 (VM_WRITE|VM_SHARED))
1802 goto reuse;
1803 goto gotten;
1804 }
1805
1806
1807
1808
1809
1810 if (PageAnon(old_page)) {
1811 if (trylock_page(old_page)) {
1812 reuse = can_share_swap_page(old_page);
1813 unlock_page(old_page);
1814 }
1815 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1816 (VM_WRITE|VM_SHARED))) {
1817
1818
1819
1820
1821
1822 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1823
1824
1825
1826
1827
1828
1829
1830
1831 page_cache_get(old_page);
1832 pte_unmap_unlock(page_table, ptl);
1833
1834 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1835 goto unwritable_page;
1836
1837
1838
1839
1840
1841
1842
1843 page_table = pte_offset_map_lock(mm, pmd, address,
1844 &ptl);
1845 page_cache_release(old_page);
1846 if (!pte_same(*page_table, orig_pte))
1847 goto unlock;
1848
1849 page_mkwrite = 1;
1850 }
1851 dirty_page = old_page;
1852 get_page(dirty_page);
1853 reuse = 1;
1854 }
1855
1856 if (reuse) {
1857reuse:
1858 flush_cache_page(vma, address, pte_pfn(orig_pte));
1859 entry = pte_mkyoung(orig_pte);
1860 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1861 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1862 update_mmu_cache(vma, address, entry);
1863 ret |= VM_FAULT_WRITE;
1864 goto unlock;
1865 }
1866
1867
1868
1869
1870 page_cache_get(old_page);
1871gotten:
1872 pte_unmap_unlock(page_table, ptl);
1873
1874 if (unlikely(anon_vma_prepare(vma)))
1875 goto oom;
1876 VM_BUG_ON(old_page == ZERO_PAGE(0));
1877 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1878 if (!new_page)
1879 goto oom;
1880
1881
1882
1883
1884 if (vma->vm_flags & VM_LOCKED) {
1885 lock_page(old_page);
1886 clear_page_mlock(old_page);
1887 unlock_page(old_page);
1888 }
1889 cow_user_page(new_page, old_page, address, vma);
1890 __SetPageUptodate(new_page);
1891
1892 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
1893 goto oom_free_new;
1894
1895
1896
1897
1898 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1899 if (likely(pte_same(*page_table, orig_pte))) {
1900 if (old_page) {
1901 if (!PageAnon(old_page)) {
1902 dec_mm_counter(mm, file_rss);
1903 inc_mm_counter(mm, anon_rss);
1904 }
1905 } else
1906 inc_mm_counter(mm, anon_rss);
1907 flush_cache_page(vma, address, pte_pfn(orig_pte));
1908 entry = mk_pte(new_page, vma->vm_page_prot);
1909 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1910
1911
1912
1913
1914
1915
1916 ptep_clear_flush_notify(vma, address, page_table);
1917 SetPageSwapBacked(new_page);
1918 lru_cache_add_active_or_unevictable(new_page, vma);
1919 page_add_new_anon_rmap(new_page, vma, address);
1920
1921
1922 set_pte_at(mm, address, page_table, entry);
1923 update_mmu_cache(vma, address, entry);
1924 if (old_page) {
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947 page_remove_rmap(old_page, vma);
1948 }
1949
1950
1951 new_page = old_page;
1952 ret |= VM_FAULT_WRITE;
1953 } else
1954 mem_cgroup_uncharge_page(new_page);
1955
1956 if (new_page)
1957 page_cache_release(new_page);
1958 if (old_page)
1959 page_cache_release(old_page);
1960unlock:
1961 pte_unmap_unlock(page_table, ptl);
1962 if (dirty_page) {
1963 if (vma->vm_file)
1964 file_update_time(vma->vm_file);
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974 wait_on_page_locked(dirty_page);
1975 set_page_dirty_balance(dirty_page, page_mkwrite);
1976 put_page(dirty_page);
1977 }
1978 return ret;
1979oom_free_new:
1980 page_cache_release(new_page);
1981oom:
1982 if (old_page)
1983 page_cache_release(old_page);
1984 return VM_FAULT_OOM;
1985
1986unwritable_page:
1987 page_cache_release(old_page);
1988 return VM_FAULT_SIGBUS;
1989}
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2024
2025static void reset_vma_truncate_counts(struct address_space *mapping)
2026{
2027 struct vm_area_struct *vma;
2028 struct prio_tree_iter iter;
2029
2030 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2031 vma->vm_truncate_count = 0;
2032 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2033 vma->vm_truncate_count = 0;
2034}
2035
2036static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2037 unsigned long start_addr, unsigned long end_addr,
2038 struct zap_details *details)
2039{
2040 unsigned long restart_addr;
2041 int need_break;
2042
2043
2044
2045
2046
2047
2048
2049
2050again:
2051 restart_addr = vma->vm_truncate_count;
2052 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2053 start_addr = restart_addr;
2054 if (start_addr >= end_addr) {
2055
2056 vma->vm_truncate_count = details->truncate_count;
2057 return 0;
2058 }
2059 }
2060
2061 restart_addr = zap_page_range(vma, start_addr,
2062 end_addr - start_addr, details);
2063 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2064
2065 if (restart_addr >= end_addr) {
2066
2067 vma->vm_truncate_count = details->truncate_count;
2068 if (!need_break)
2069 return 0;
2070 } else {
2071
2072 vma->vm_truncate_count = restart_addr;
2073 if (!need_break)
2074 goto again;
2075 }
2076
2077 spin_unlock(details->i_mmap_lock);
2078 cond_resched();
2079 spin_lock(details->i_mmap_lock);
2080 return -EINTR;
2081}
2082
2083static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2084 struct zap_details *details)
2085{
2086 struct vm_area_struct *vma;
2087 struct prio_tree_iter iter;
2088 pgoff_t vba, vea, zba, zea;
2089
2090restart:
2091 vma_prio_tree_foreach(vma, &iter, root,
2092 details->first_index, details->last_index) {
2093
2094 if (vma->vm_truncate_count == details->truncate_count)
2095 continue;
2096
2097 vba = vma->vm_pgoff;
2098 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2099
2100 zba = details->first_index;
2101 if (zba < vba)
2102 zba = vba;
2103 zea = details->last_index;
2104 if (zea > vea)
2105 zea = vea;
2106
2107 if (unmap_mapping_range_vma(vma,
2108 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2109 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2110 details) < 0)
2111 goto restart;
2112 }
2113}
2114
2115static inline void unmap_mapping_range_list(struct list_head *head,
2116 struct zap_details *details)
2117{
2118 struct vm_area_struct *vma;
2119
2120
2121
2122
2123
2124
2125
2126restart:
2127 list_for_each_entry(vma, head, shared.vm_set.list) {
2128
2129 if (vma->vm_truncate_count == details->truncate_count)
2130 continue;
2131 details->nonlinear_vma = vma;
2132 if (unmap_mapping_range_vma(vma, vma->vm_start,
2133 vma->vm_end, details) < 0)
2134 goto restart;
2135 }
2136}
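/*
 * unmap_mapping_range - unmap user ptes covering the given byte range
 * of an address_space, typically on truncation or hole punching.
 * @even_cows selects whether private COW copies of the affected pages
 * are zapped as well.  The operation is restartable: interrupted vmas
 * are tracked via vm_truncate_count / truncate_count.
 */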
2152void unmap_mapping_range(struct address_space *mapping,
2153 loff_t const holebegin, loff_t const holelen, int even_cows)
2154{
2155 struct zap_details details;
2156 pgoff_t hba = holebegin >> PAGE_SHIFT;
2157 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2158
2159
2160 if (sizeof(holelen) > sizeof(hlen)) {
2161 long long holeend =
2162 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2163 if (holeend & ~(long long)ULONG_MAX)
2164 hlen = ULONG_MAX - hba + 1;
2165 }
2166
2167 details.check_mapping = even_cows? NULL: mapping;
2168 details.nonlinear_vma = NULL;
2169 details.first_index = hba;
2170 details.last_index = hba + hlen - 1;
2171 if (details.last_index < details.first_index)
2172 details.last_index = ULONG_MAX;
2173 details.i_mmap_lock = &mapping->i_mmap_lock;
2174
2175 spin_lock(&mapping->i_mmap_lock);
2176
2177
2178 mapping->truncate_count++;
2179 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2180 if (mapping->truncate_count == 0)
2181 reset_vma_truncate_counts(mapping);
2182 mapping->truncate_count++;
2183 }
2184 details.truncate_count = mapping->truncate_count;
2185
2186 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2187 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2188 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2189 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2190 spin_unlock(&mapping->i_mmap_lock);
2191}
2192EXPORT_SYMBOL(unmap_mapping_range);
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203int vmtruncate(struct inode * inode, loff_t offset)
2204{
2205 if (inode->i_size < offset) {
2206 unsigned long limit;
2207
2208 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2209 if (limit != RLIM_INFINITY && offset > limit)
2210 goto out_sig;
2211 if (offset > inode->i_sb->s_maxbytes)
2212 goto out_big;
2213 i_size_write(inode, offset);
2214 } else {
2215 struct address_space *mapping = inode->i_mapping;
2216
2217
2218
2219
2220
2221
2222 if (IS_SWAPFILE(inode))
2223 return -ETXTBSY;
2224 i_size_write(inode, offset);
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2236 truncate_inode_pages(mapping, offset);
2237 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2238 }
2239
2240 if (inode->i_op && inode->i_op->truncate)
2241 inode->i_op->truncate(inode);
2242 return 0;
2243
2244out_sig:
2245 send_sig(SIGXFSZ, current, 0);
2246out_big:
2247 return -EFBIG;
2248}
2249EXPORT_SYMBOL(vmtruncate);
2250
2251int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2252{
2253 struct address_space *mapping = inode->i_mapping;
2254
2255
2256
2257
2258
2259
2260 if (!inode->i_op || !inode->i_op->truncate_range)
2261 return -ENOSYS;
2262
2263 mutex_lock(&inode->i_mutex);
2264 down_write(&inode->i_alloc_sem);
2265 unmap_mapping_range(mapping, offset, (end - offset), 1);
2266 truncate_inode_pages_range(mapping, offset, end);
2267 unmap_mapping_range(mapping, offset, (end - offset), 1);
2268 inode->i_op->truncate_range(inode, offset, end);
2269 up_write(&inode->i_alloc_sem);
2270 mutex_unlock(&inode->i_mutex);
2271
2272 return 0;
2273}
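/*
 * do_swap_page - fault back in a page that was swapped out: look it up
 * in (or read it into) the swap cache, charge it, map it and free the
 * swap entry.  Entered with mmap_sem held and the pte mapped but not
 * locked; returns with the pte unmapped and unlocked.
 */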
2280static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2281 unsigned long address, pte_t *page_table, pmd_t *pmd,
2282 int write_access, pte_t orig_pte)
2283{
2284 spinlock_t *ptl;
2285 struct page *page;
2286 swp_entry_t entry;
2287 pte_t pte;
2288 int ret = 0;
2289
2290 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2291 goto out;
2292
2293 entry = pte_to_swp_entry(orig_pte);
2294 if (is_migration_entry(entry)) {
2295 migration_entry_wait(mm, pmd, address);
2296 goto out;
2297 }
2298 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2299 page = lookup_swap_cache(entry);
2300 if (!page) {
2301 grab_swap_token();
2302 page = swapin_readahead(entry,
2303 GFP_HIGHUSER_MOVABLE, vma, address);
2304 if (!page) {
2305
2306
2307
2308
2309 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2310 if (likely(pte_same(*page_table, orig_pte)))
2311 ret = VM_FAULT_OOM;
2312 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2313 goto unlock;
2314 }
2315
2316
2317 ret = VM_FAULT_MAJOR;
2318 count_vm_event(PGMAJFAULT);
2319 }
2320
2321 mark_page_accessed(page);
2322
2323 lock_page(page);
2324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2325
2326 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2327 ret = VM_FAULT_OOM;
2328 unlock_page(page);
2329 goto out;
2330 }
2331
2332
2333
2334
2335 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2336 if (unlikely(!pte_same(*page_table, orig_pte)))
2337 goto out_nomap;
2338
2339 if (unlikely(!PageUptodate(page))) {
2340 ret = VM_FAULT_SIGBUS;
2341 goto out_nomap;
2342 }
2343
2344
2345
2346 inc_mm_counter(mm, anon_rss);
2347 pte = mk_pte(page, vma->vm_page_prot);
2348 if (write_access && can_share_swap_page(page)) {
2349 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2350 write_access = 0;
2351 }
2352
2353 flush_icache_page(vma, page);
2354 set_pte_at(mm, address, page_table, pte);
2355 page_add_anon_rmap(page, vma, address);
2356
2357 swap_free(entry);
2358 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2359 remove_exclusive_swap_page(page);
2360 unlock_page(page);
2361
2362 if (write_access) {
2363 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2364 if (ret & VM_FAULT_ERROR)
2365 ret &= VM_FAULT_ERROR;
2366 goto out;
2367 }
2368
2369
2370 update_mmu_cache(vma, address, pte);
2371unlock:
2372 pte_unmap_unlock(page_table, ptl);
2373out:
2374 return ret;
2375out_nomap:
2376 mem_cgroup_uncharge_page(page);
2377 pte_unmap_unlock(page_table, ptl);
2378 unlock_page(page);
2379 page_cache_release(page);
2380 return ret;
2381}
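/*
 * do_anonymous_page - first touch of an anonymous mapping: allocate a
 * zeroed page, charge it to the memory cgroup, add it to the LRU and
 * the anon rmap, and install the pte.  Entered with the pte mapped but
 * not locked; returns with it unmapped and unlocked.
 */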
2388static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2389 unsigned long address, pte_t *page_table, pmd_t *pmd,
2390 int write_access)
2391{
2392 struct page *page;
2393 spinlock_t *ptl;
2394 pte_t entry;
2395
2396
2397 pte_unmap(page_table);
2398
2399 if (unlikely(anon_vma_prepare(vma)))
2400 goto oom;
2401 page = alloc_zeroed_user_highpage_movable(vma, address);
2402 if (!page)
2403 goto oom;
2404 __SetPageUptodate(page);
2405
2406 if (mem_cgroup_charge(page, mm, GFP_KERNEL))
2407 goto oom_free_page;
2408
2409 entry = mk_pte(page, vma->vm_page_prot);
2410 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2411
2412 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2413 if (!pte_none(*page_table))
2414 goto release;
2415 inc_mm_counter(mm, anon_rss);
2416 SetPageSwapBacked(page);
2417 lru_cache_add_active_or_unevictable(page, vma);
2418 page_add_new_anon_rmap(page, vma, address);
2419 set_pte_at(mm, address, page_table, entry);
2420
2421
2422 update_mmu_cache(vma, address, entry);
2423unlock:
2424 pte_unmap_unlock(page_table, ptl);
2425 return 0;
2426release:
2427 mem_cgroup_uncharge_page(page);
2428 page_cache_release(page);
2429 goto unlock;
2430oom_free_page:
2431 page_cache_release(page);
2432oom:
2433 return VM_FAULT_OOM;
2434}
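/*
 * __do_fault - common code for file-backed faults: call the vma's
 * ->fault handler to get the page, make a private COW copy for writes
 * into non-shared mappings (or call ->page_mkwrite for shared ones),
 * then install the pte under the pte lock.
 */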
2449static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2450 unsigned long address, pmd_t *pmd,
2451 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2452{
2453 pte_t *page_table;
2454 spinlock_t *ptl;
2455 struct page *page;
2456 pte_t entry;
2457 int anon = 0;
2458 int charged = 0;
2459 struct page *dirty_page = NULL;
2460 struct vm_fault vmf;
2461 int ret;
2462 int page_mkwrite = 0;
2463
2464 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2465 vmf.pgoff = pgoff;
2466 vmf.flags = flags;
2467 vmf.page = NULL;
2468
2469 ret = vma->vm_ops->fault(vma, &vmf);
2470 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2471 return ret;
2472
2473
2474
2475
2476
2477 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2478 lock_page(vmf.page);
2479 else
2480 VM_BUG_ON(!PageLocked(vmf.page));
2481
2482
2483
2484
2485 page = vmf.page;
2486 if (flags & FAULT_FLAG_WRITE) {
2487 if (!(vma->vm_flags & VM_SHARED)) {
2488 anon = 1;
2489 if (unlikely(anon_vma_prepare(vma))) {
2490 ret = VM_FAULT_OOM;
2491 goto out;
2492 }
2493 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
2494 vma, address);
2495 if (!page) {
2496 ret = VM_FAULT_OOM;
2497 goto out;
2498 }
2499 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2500 ret = VM_FAULT_OOM;
2501 page_cache_release(page);
2502 goto out;
2503 }
2504 charged = 1;
2505
2506
2507
2508
2509 if (vma->vm_flags & VM_LOCKED)
2510 clear_page_mlock(vmf.page);
2511 copy_user_highpage(page, vmf.page, address, vma);
2512 __SetPageUptodate(page);
2513 } else {
2514
2515
2516
2517
2518
2519 if (vma->vm_ops->page_mkwrite) {
2520 unlock_page(page);
2521 if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
2522 ret = VM_FAULT_SIGBUS;
2523 anon = 1;
2524 goto out_unlocked;
2525 }
2526 lock_page(page);
2527
2528
2529
2530
2531
2532
2533
2534 if (!page->mapping) {
2535 ret = 0;
2536 anon = 1;
2537 goto out;
2538 }
2539 page_mkwrite = 1;
2540 }
2541 }
2542
2543 }
2544
2545 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558 if (likely(pte_same(*page_table, orig_pte))) {
2559 flush_icache_page(vma, page);
2560 entry = mk_pte(page, vma->vm_page_prot);
2561 if (flags & FAULT_FLAG_WRITE)
2562 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2563 if (anon) {
2564 inc_mm_counter(mm, anon_rss);
2565 SetPageSwapBacked(page);
2566 lru_cache_add_active_or_unevictable(page, vma);
2567 page_add_new_anon_rmap(page, vma, address);
2568 } else {
2569 inc_mm_counter(mm, file_rss);
2570 page_add_file_rmap(page);
2571 if (flags & FAULT_FLAG_WRITE) {
2572 dirty_page = page;
2573 get_page(dirty_page);
2574 }
2575 }
2576
2577 set_pte_at(mm, address, page_table, entry);
2578
2579
2580 update_mmu_cache(vma, address, entry);
2581 } else {
2582 if (charged)
2583 mem_cgroup_uncharge_page(page);
2584 if (anon)
2585 page_cache_release(page);
2586 else
2587 anon = 1;
2588 }
2589
2590 pte_unmap_unlock(page_table, ptl);
2591
2592out:
2593 unlock_page(vmf.page);
2594out_unlocked:
2595 if (anon)
2596 page_cache_release(vmf.page);
2597 else if (dirty_page) {
2598 if (vma->vm_file)
2599 file_update_time(vma->vm_file);
2600
2601 set_page_dirty_balance(dirty_page, page_mkwrite);
2602 put_page(dirty_page);
2603 }
2604
2605 return ret;
2606}
2607
2608static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2609 unsigned long address, pte_t *page_table, pmd_t *pmd,
2610 int write_access, pte_t orig_pte)
2611{
2612 pgoff_t pgoff = (((address & PAGE_MASK)
2613 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2614 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2615
2616 pte_unmap(page_table);
2617 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2618}
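/*
 * do_nonlinear_fault - fault in a page of a VM_NONLINEAR
 * (remap_file_pages) mapping: the file offset is encoded in the pte
 * itself rather than derived from the vma, so decode it and hand off
 * to __do_fault().
 */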
2629static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2630 unsigned long address, pte_t *page_table, pmd_t *pmd,
2631 int write_access, pte_t orig_pte)
2632{
2633 unsigned int flags = FAULT_FLAG_NONLINEAR |
2634 (write_access ? FAULT_FLAG_WRITE : 0);
2635 pgoff_t pgoff;
2636
2637 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2638 return 0;
2639
2640 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
2641 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2642
2643
2644
2645 print_bad_pte(vma, orig_pte, address);
2646 return VM_FAULT_OOM;
2647 }
2648
2649 pgoff = pte_to_pgoff(orig_pte);
2650 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2651}

/*
 * handle_pte_fault() decides, from the pte contents, how to service the
 * fault: demand-fault a file mapping, create an anonymous page, restore
 * a nonlinear mapping, swap a page back in, or handle a write to a
 * present but write-protected (COW) page.  It also keeps the accessed
 * and dirty bits up to date for architectures that do not do so in
 * hardware.
 *
 * Entered with the pte mapped but not locked; concurrent updates are
 * handled by re-checking the pte under the pte lock.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pmd_t *pmd, int write_access)
{
	pte_t entry;
	spinlock_t *ptl;

	entry = *pte;
	if (!pte_present(entry)) {
		if (pte_none(entry)) {
			if (vma->vm_ops) {
				if (likely(vma->vm_ops->fault))
					return do_linear_fault(mm, vma, address,
						pte, pmd, write_access, entry);
			}
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, write_access);
		}
		if (pte_file(entry))
			return do_nonlinear_fault(mm, vma, address,
					pte, pmd, write_access, entry);
		return do_swap_page(mm, vma, address,
					pte, pmd, write_access, entry);
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address,
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
		update_mmu_cache(vma, address, entry);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (write_access)
			flush_tlb_page(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, int write_access)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	__set_current_state(TASK_RUNNING);

	count_vm_event(PGFAULT);

	if (unlikely(is_vm_hugetlb_page(vma)))
		return hugetlb_fault(mm, vma, address, write_access);

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		return VM_FAULT_OOM;
	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		return VM_FAULT_OOM;
	pte = pte_alloc_map(mm, pmd, address);
	if (!pte)
		return VM_FAULT_OOM;

	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
2750
2751#ifndef __PAGETABLE_PUD_FOLDED
2752
2753
2754
2755
2756int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2757{
2758 pud_t *new = pud_alloc_one(mm, address);
2759 if (!new)
2760 return -ENOMEM;
2761
2762 smp_wmb();
2763
2764 spin_lock(&mm->page_table_lock);
2765 if (pgd_present(*pgd))
2766 pud_free(mm, new);
2767 else
2768 pgd_populate(mm, pgd, new);
2769 spin_unlock(&mm->page_table_lock);
2770 return 0;
2771}
2772#endif
2773
2774#ifndef __PAGETABLE_PMD_FOLDED
2775
2776
2777
2778
2779int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2780{
2781 pmd_t *new = pmd_alloc_one(mm, address);
2782 if (!new)
2783 return -ENOMEM;
2784
2785 smp_wmb();
2786
2787 spin_lock(&mm->page_table_lock);
2788#ifndef __ARCH_HAS_4LEVEL_HACK
2789 if (pud_present(*pud))
2790 pmd_free(mm, new);
2791 else
2792 pud_populate(mm, pud, new);
2793#else
2794 if (pgd_present(*pud))
2795 pmd_free(mm, new);
2796 else
2797 pgd_populate(mm, pud, new);
2798#endif
2799 spin_unlock(&mm->page_table_lock);
2800 return 0;
2801}
2802#endif
2803
2804int make_pages_present(unsigned long addr, unsigned long end)
2805{
2806 int ret, len, write;
2807 struct vm_area_struct * vma;
2808
2809 vma = find_vma(current->mm, addr);
2810 if (!vma)
2811 return -ENOMEM;
2812 write = (vma->vm_flags & VM_WRITE) != 0;
2813 BUG_ON(addr >= end);
2814 BUG_ON(end > vma->vm_end);
2815 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2816 ret = get_user_pages(current, current->mm, addr,
2817 len, write, 0, NULL, NULL);
2818 if (ret < 0)
2819 return ret;
2820 return ret == len ? 0 : -EFAULT;
2821}
2822
2823#if !defined(__HAVE_ARCH_GATE_AREA)
2824
2825#if defined(AT_SYSINFO_EHDR)
2826static struct vm_area_struct gate_vma;
2827
2828static int __init gate_vma_init(void)
2829{
2830 gate_vma.vm_mm = NULL;
2831 gate_vma.vm_start = FIXADDR_USER_START;
2832 gate_vma.vm_end = FIXADDR_USER_END;
2833 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
2834 gate_vma.vm_page_prot = __P101;
2835
2836
2837
2838
2839
2840
2841 gate_vma.vm_flags |= VM_ALWAYSDUMP;
2842 return 0;
2843}
2844__initcall(gate_vma_init);
2845#endif
2846
2847struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2848{
2849#ifdef AT_SYSINFO_EHDR
2850 return &gate_vma;
2851#else
2852 return NULL;
2853#endif
2854}
2855
2856int in_gate_area_no_task(unsigned long addr)
2857{
2858#ifdef AT_SYSINFO_EHDR
2859 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2860 return 1;
2861#endif
2862 return 0;
2863}
2864
2865#endif
2866
2867#ifdef CONFIG_HAVE_IOREMAP_PROT
2868static resource_size_t follow_phys(struct vm_area_struct *vma,
2869 unsigned long address, unsigned int flags,
2870 unsigned long *prot)
2871{
2872 pgd_t *pgd;
2873 pud_t *pud;
2874 pmd_t *pmd;
2875 pte_t *ptep, pte;
2876 spinlock_t *ptl;
2877 resource_size_t phys_addr = 0;
2878 struct mm_struct *mm = vma->vm_mm;
2879
2880 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2881
2882 pgd = pgd_offset(mm, address);
2883 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2884 goto no_page_table;
2885
2886 pud = pud_offset(pgd, address);
2887 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2888 goto no_page_table;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2892 goto no_page_table;
2893
2894
2895 if (pmd_huge(*pmd))
2896 goto no_page_table;
2897
2898 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2899 if (!ptep)
2900 goto out;
2901
2902 pte = *ptep;
2903 if (!pte_present(pte))
2904 goto unlock;
2905 if ((flags & FOLL_WRITE) && !pte_write(pte))
2906 goto unlock;
2907 phys_addr = pte_pfn(pte);
2908 phys_addr <<= PAGE_SHIFT;
2909
2910 *prot = pgprot_val(pte_pgprot(pte));
2911
2912unlock:
2913 pte_unmap_unlock(ptep, ptl);
2914out:
2915 return phys_addr;
2916no_page_table:
2917 return 0;
2918}
2919
2920int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2921 void *buf, int len, int write)
2922{
2923 resource_size_t phys_addr;
2924 unsigned long prot = 0;
2925 void *maddr;
2926 int offset = addr & (PAGE_SIZE-1);
2927
2928 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2929 return -EINVAL;
2930
2931 phys_addr = follow_phys(vma, addr, write, &prot);
2932
2933 if (!phys_addr)
2934 return -EINVAL;
2935
2936 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2937 if (write)
2938 memcpy_toio(maddr + offset, buf, len);
2939 else
2940 memcpy_fromio(buf, maddr + offset, len);
2941 iounmap(maddr);
2942
2943 return len;
2944}
2945#endif
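/*
 * access_process_vm - read from or write to another process's address
 * space, page by page, via get_user_pages() (or the vma's ->access
 * hook for VM_IO mappings).  Used by ptrace and /proc/<pid>/mem.
 */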
2952int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2953{
2954 struct mm_struct *mm;
2955 struct vm_area_struct *vma;
2956 void *old_buf = buf;
2957
2958 mm = get_task_mm(tsk);
2959 if (!mm)
2960 return 0;
2961
2962 down_read(&mm->mmap_sem);
2963
2964 while (len) {
2965 int bytes, ret, offset;
2966 void *maddr;
2967 struct page *page = NULL;
2968
2969 ret = get_user_pages(tsk, mm, addr, 1,
2970 write, 1, &page, &vma);
2971 if (ret <= 0) {
2972
2973
2974
2975
2976#ifdef CONFIG_HAVE_IOREMAP_PROT
2977 vma = find_vma(mm, addr);
2978 if (!vma)
2979 break;
2980 if (vma->vm_ops && vma->vm_ops->access)
2981 ret = vma->vm_ops->access(vma, addr, buf,
2982 len, write);
2983 if (ret <= 0)
2984#endif
2985 break;
2986 bytes = ret;
2987 } else {
2988 bytes = len;
2989 offset = addr & (PAGE_SIZE-1);
2990 if (bytes > PAGE_SIZE-offset)
2991 bytes = PAGE_SIZE-offset;
2992
2993 maddr = kmap(page);
2994 if (write) {
2995 copy_to_user_page(vma, page, addr,
2996 maddr + offset, buf, bytes);
2997 set_page_dirty_lock(page);
2998 } else {
2999 copy_from_user_page(vma, page, addr,
3000 buf, maddr + offset, bytes);
3001 }
3002 kunmap(page);
3003 page_cache_release(page);
3004 }
3005 len -= bytes;
3006 buf += bytes;
3007 addr += bytes;
3008 }
3009 up_read(&mm->mmap_sem);
3010 mmput(mm);
3011
3012 return buf - old_buf;
3013}

/*
 * Print the pathname and range of the vma containing @ip, for fault and
 * crash reports.  Best-effort only: skipped in atomic context because
 * it must take mmap_sem and allocate memory.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * Do not print if we are in atomic contexts (in exception stacks,
	 * etc.):
	 */
	if (preempt_count())
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		char *buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf) {
			char *p, *s;

			p = d_path(&f->f_path, buf, PAGE_SIZE);
			if (IS_ERR(p))
				p = "?";
			s = strrchr(p, '/');
			if (s)
				p = s+1;
			printk("%s%s[%lx+%lx]", prefix, p,
					vma->vm_start,
					vma->vm_end - vma->vm_start);
			free_page((unsigned long)buf);
		}
	}
	up_read(&current->mm->mmap_sem);
}