Showing error 561

User: Jiri Slaby
Error type: Calling function from invalid context
Error type description: A function is called from an inappropriate context, e.g. a function that may sleep is called inside a critical section or an interrupt handler (see the sketch below)
File location: mm/migrate.c
Line in file: 528
Project: Linux Kernel
Project version: 2.6.28
Tools: Stanse (1.2)
Entered: 2011-11-07 22:19:02 UTC
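
Illustration:

A minimal, hypothetical sketch of this error class (it is not taken from the
source below; all identifiers are illustrative): a function that may sleep,
such as mutex_lock(), is called while a spinlock is held, i.e. in atomic
context.

    #include <linux/spinlock.h>
    #include <linux/mutex.h>

    static DEFINE_SPINLOCK(example_lock);
    static DEFINE_MUTEX(example_mutex);

    static void sleep_in_atomic_context(void)
    {
            spin_lock(&example_lock);       /* enter atomic context */

            /*
             * BUG: mutex_lock() may sleep, but sleeping is not allowed while
             * a spinlock is held.  At runtime the kernel's might_sleep()
             * debugging reports "BUG: sleeping function called from invalid
             * context"; a static checker such as Stanse flags the same
             * pattern without running the code.
             */
            mutex_lock(&example_mutex);
            mutex_unlock(&example_mutex);

            spin_unlock(&example_lock);
    }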


Source:

   1/*
   2 * Memory Migration functionality - linux/mm/migration.c
   3 *
   4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5 *
   6 * Page migration was first developed in the context of the memory hotplug
   7 * project. The main authors of the migration code are:
   8 *
   9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10 * Hirokazu Takahashi <taka@valinux.co.jp>
  11 * Dave Hansen <haveblue@us.ibm.com>
  12 * Christoph Lameter
  13 */
  14
  15#include <linux/migrate.h>
  16#include <linux/module.h>
  17#include <linux/swap.h>
  18#include <linux/swapops.h>
  19#include <linux/pagemap.h>
  20#include <linux/buffer_head.h>
  21#include <linux/mm_inline.h>
  22#include <linux/nsproxy.h>
  23#include <linux/pagevec.h>
  24#include <linux/rmap.h>
  25#include <linux/topology.h>
  26#include <linux/cpu.h>
  27#include <linux/cpuset.h>
  28#include <linux/writeback.h>
  29#include <linux/mempolicy.h>
  30#include <linux/vmalloc.h>
  31#include <linux/security.h>
  32#include <linux/memcontrol.h>
  33#include <linux/syscalls.h>
  34
  35#include "internal.h"
  36
  37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  38
  39/*
  40 * migrate_prep() needs to be called before we start compiling a list of pages
  41 * to be migrated using isolate_lru_page().
  42 */
  43int migrate_prep(void)
  44{
  45        /*
  46         * Clear the LRU lists so pages can be isolated.
  47         * Note that pages may be moved off the LRU after we have
  48         * drained them. Those pages will fail to migrate like other
  49         * pages that may be busy.
  50         */
  51        lru_add_drain_all();
  52
  53        return 0;
  54}
  55
  56/*
  57 * Add isolated pages on the list back to the LRU under page lock
   58 * to avoid leaking evictable pages back onto the unevictable list.
  59 *
  60 * returns the number of pages put back.
  61 */
  62int putback_lru_pages(struct list_head *l)
  63{
  64        struct page *page;
  65        struct page *page2;
  66        int count = 0;
  67
  68        list_for_each_entry_safe(page, page2, l, lru) {
  69                list_del(&page->lru);
  70                putback_lru_page(page);
  71                count++;
  72        }
  73        return count;
  74}
  75
  76/*
  77 * Restore a potential migration pte to a working pte entry
  78 */
  79static void remove_migration_pte(struct vm_area_struct *vma,
  80                struct page *old, struct page *new)
  81{
  82        struct mm_struct *mm = vma->vm_mm;
  83        swp_entry_t entry;
  84         pgd_t *pgd;
  85         pud_t *pud;
  86         pmd_t *pmd;
  87        pte_t *ptep, pte;
  88         spinlock_t *ptl;
  89        unsigned long addr = page_address_in_vma(new, vma);
  90
  91        if (addr == -EFAULT)
  92                return;
  93
  94         pgd = pgd_offset(mm, addr);
  95        if (!pgd_present(*pgd))
  96                return;
  97
  98        pud = pud_offset(pgd, addr);
  99        if (!pud_present(*pud))
 100                return;
 101
 102        pmd = pmd_offset(pud, addr);
 103        if (!pmd_present(*pmd))
 104                return;
 105
 106        ptep = pte_offset_map(pmd, addr);
 107
 108        if (!is_swap_pte(*ptep)) {
 109                pte_unmap(ptep);
 110                 return;
 111         }
 112
 113         ptl = pte_lockptr(mm, pmd);
 114         spin_lock(ptl);
 115        pte = *ptep;
 116        if (!is_swap_pte(pte))
 117                goto out;
 118
 119        entry = pte_to_swp_entry(pte);
 120
 121        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 122                goto out;
 123
 124        /*
 125         * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
 126         * Failure is not an option here: we're now expected to remove every
 127         * migration pte, and will cause crashes otherwise.  Normally this
 128         * is not an issue: mem_cgroup_prepare_migration bumped up the old
 129         * page_cgroup count for safety, that's now attached to the new page,
 130         * so this charge should just be another incrementation of the count,
 131         * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
 132         * there's been a force_empty, those reference counts may no longer
 133         * be reliable, and this charge can actually fail: oh well, we don't
 134         * make the situation any worse by proceeding as if it had succeeded.
 135         */
 136        mem_cgroup_charge(new, mm, GFP_ATOMIC);
 137
 138        get_page(new);
 139        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 140        if (is_write_migration_entry(entry))
 141                pte = pte_mkwrite(pte);
 142        flush_cache_page(vma, addr, pte_pfn(pte));
 143        set_pte_at(mm, addr, ptep, pte);
 144
 145        if (PageAnon(new))
 146                page_add_anon_rmap(new, vma, addr);
 147        else
 148                page_add_file_rmap(new);
 149
 150        /* No need to invalidate - it was non-present before */
 151        update_mmu_cache(vma, addr, pte);
 152
 153out:
 154        pte_unmap_unlock(ptep, ptl);
 155}
 156
 157/*
 158 * Note that remove_file_migration_ptes will only work on regular mappings,
 159 * Nonlinear mappings do not use migration entries.
 160 */
 161static void remove_file_migration_ptes(struct page *old, struct page *new)
 162{
 163        struct vm_area_struct *vma;
 164        struct address_space *mapping = page_mapping(new);
 165        struct prio_tree_iter iter;
 166        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 167
 168        if (!mapping)
 169                return;
 170
 171        spin_lock(&mapping->i_mmap_lock);
 172
 173        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 174                remove_migration_pte(vma, old, new);
 175
 176        spin_unlock(&mapping->i_mmap_lock);
 177}
 178
 179/*
 180 * Must hold mmap_sem lock on at least one of the vmas containing
 181 * the page so that the anon_vma cannot vanish.
 182 */
 183static void remove_anon_migration_ptes(struct page *old, struct page *new)
 184{
 185        struct anon_vma *anon_vma;
 186        struct vm_area_struct *vma;
 187        unsigned long mapping;
 188
 189        mapping = (unsigned long)new->mapping;
 190
 191        if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 192                return;
 193
 194        /*
 195         * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
 196         */
 197        anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 198        spin_lock(&anon_vma->lock);
 199
 200        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 201                remove_migration_pte(vma, old, new);
 202
 203        spin_unlock(&anon_vma->lock);
 204}
 205
 206/*
 207 * Get rid of all migration entries and replace them by
 208 * references to the indicated page.
 209 */
 210static void remove_migration_ptes(struct page *old, struct page *new)
 211{
 212        if (PageAnon(new))
 213                remove_anon_migration_ptes(old, new);
 214        else
 215                remove_file_migration_ptes(old, new);
 216}
 217
 218/*
 219 * Something used the pte of a page under migration. We need to
 220 * get to the page and wait until migration is finished.
 221 * When we return from this function the fault will be retried.
 222 *
 223 * This function is called from do_swap_page().
 224 */
 225void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 226                                unsigned long address)
 227{
 228        pte_t *ptep, pte;
 229        spinlock_t *ptl;
 230        swp_entry_t entry;
 231        struct page *page;
 232
 233        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 234        pte = *ptep;
 235        if (!is_swap_pte(pte))
 236                goto out;
 237
 238        entry = pte_to_swp_entry(pte);
 239        if (!is_migration_entry(entry))
 240                goto out;
 241
 242        page = migration_entry_to_page(entry);
 243
 244        /*
  245         * Once the radix-tree replacement for page migration has started,
  246         * page_count *must* be zero. And we don't want to call
  247         * wait_on_page_locked() against a page without get_page().
  248         * So we use get_page_unless_zero() here. Even if it fails, the page
  249         * fault will occur again.
 250         */
 251        if (!get_page_unless_zero(page))
 252                goto out;
 253        pte_unmap_unlock(ptep, ptl);
 254        wait_on_page_locked(page);
 255        put_page(page);
 256        return;
 257out:
 258        pte_unmap_unlock(ptep, ptl);
 259}
 260
 261/*
 262 * Replace the page in the mapping.
 263 *
 264 * The number of remaining references must be:
 265 * 1 for anonymous pages without a mapping
 266 * 2 for pages with a mapping
 267 * 3 for pages with a mapping and PagePrivate set.
 268 */
 269static int migrate_page_move_mapping(struct address_space *mapping,
 270                struct page *newpage, struct page *page)
 271{
 272        int expected_count;
 273        void **pslot;
 274
 275        if (!mapping) {
 276                /* Anonymous page without mapping */
 277                if (page_count(page) != 1)
 278                        return -EAGAIN;
 279                return 0;
 280        }
 281
 282        spin_lock_irq(&mapping->tree_lock);
 283
 284        pslot = radix_tree_lookup_slot(&mapping->page_tree,
 285                                         page_index(page));
 286
 287        expected_count = 2 + !!PagePrivate(page);
 288        if (page_count(page) != expected_count ||
 289                        (struct page *)radix_tree_deref_slot(pslot) != page) {
 290                spin_unlock_irq(&mapping->tree_lock);
 291                return -EAGAIN;
 292        }
 293
 294        if (!page_freeze_refs(page, expected_count)) {
 295                spin_unlock_irq(&mapping->tree_lock);
 296                return -EAGAIN;
 297        }
 298
 299        /*
 300         * Now we know that no one else is looking at the page.
 301         */
 302        get_page(newpage);        /* add cache reference */
 303#ifdef CONFIG_SWAP
 304        if (PageSwapCache(page)) {
 305                SetPageSwapCache(newpage);
 306                set_page_private(newpage, page_private(page));
 307        }
 308#endif
 309
 310        radix_tree_replace_slot(pslot, newpage);
 311
 312        page_unfreeze_refs(page, expected_count);
 313        /*
 314         * Drop cache reference from old page.
 315         * We know this isn't the last reference.
 316         */
 317        __put_page(page);
 318
 319        /*
 320         * If moved to a different zone then also account
 321         * the page for that zone. Other VM counters will be
 322         * taken care of when we establish references to the
 323         * new page and drop references to the old page.
 324         *
 325         * Note that anonymous pages are accounted for
 326         * via NR_FILE_PAGES and NR_ANON_PAGES if they
 327         * are mapped to swap space.
 328         */
 329        __dec_zone_page_state(page, NR_FILE_PAGES);
 330        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 331
 332        spin_unlock_irq(&mapping->tree_lock);
 333
 334        return 0;
 335}
 336
 337/*
 338 * Copy the page to its new location
 339 */
 340static void migrate_page_copy(struct page *newpage, struct page *page)
 341{
 342        int anon;
 343
 344        copy_highpage(newpage, page);
 345
 346        if (PageError(page))
 347                SetPageError(newpage);
 348        if (PageReferenced(page))
 349                SetPageReferenced(newpage);
 350        if (PageUptodate(page))
 351                SetPageUptodate(newpage);
 352        if (TestClearPageActive(page)) {
 353                VM_BUG_ON(PageUnevictable(page));
 354                SetPageActive(newpage);
 355        } else
 356                unevictable_migrate_page(newpage, page);
 357        if (PageChecked(page))
 358                SetPageChecked(newpage);
 359        if (PageMappedToDisk(page))
 360                SetPageMappedToDisk(newpage);
 361
 362        if (PageDirty(page)) {
 363                clear_page_dirty_for_io(page);
 364                /*
 365                 * Want to mark the page and the radix tree as dirty, and
 366                 * redo the accounting that clear_page_dirty_for_io undid,
 367                 * but we can't use set_page_dirty because that function
 368                 * is actually a signal that all of the page has become dirty.
  369                 * Whereas only part of our page may be dirty.
 370                 */
 371                __set_page_dirty_nobuffers(newpage);
 372         }
 373
 374        mlock_migrate_page(newpage, page);
 375
 376#ifdef CONFIG_SWAP
 377        ClearPageSwapCache(page);
 378#endif
 379        ClearPagePrivate(page);
 380        set_page_private(page, 0);
 381        /* page->mapping contains a flag for PageAnon() */
 382        anon = PageAnon(page);
 383        page->mapping = NULL;
 384
 385        if (!anon) /* This page was removed from radix-tree. */
 386                mem_cgroup_uncharge_cache_page(page);
 387
 388        /*
 389         * If any waiters have accumulated on the new page then
 390         * wake them up.
 391         */
 392        if (PageWriteback(newpage))
 393                end_page_writeback(newpage);
 394}
 395
 396/************************************************************
 397 *                    Migration functions
 398 ***********************************************************/
 399
 400/* Always fail migration. Used for mappings that are not movable */
 401int fail_migrate_page(struct address_space *mapping,
 402                        struct page *newpage, struct page *page)
 403{
 404        return -EIO;
 405}
 406EXPORT_SYMBOL(fail_migrate_page);
 407
 408/*
 409 * Common logic to directly migrate a single page suitable for
 410 * pages that do not use PagePrivate.
 411 *
 412 * Pages are locked upon entry and exit.
 413 */
 414int migrate_page(struct address_space *mapping,
 415                struct page *newpage, struct page *page)
 416{
 417        int rc;
 418
 419        BUG_ON(PageWriteback(page));        /* Writeback must be complete */
 420
 421        rc = migrate_page_move_mapping(mapping, newpage, page);
 422
 423        if (rc)
 424                return rc;
 425
 426        migrate_page_copy(newpage, page);
 427        return 0;
 428}
 429EXPORT_SYMBOL(migrate_page);
 430
 431#ifdef CONFIG_BLOCK
 432/*
 433 * Migration function for pages with buffers. This function can only be used
 434 * if the underlying filesystem guarantees that no other references to "page"
 435 * exist.
 436 */
 437int buffer_migrate_page(struct address_space *mapping,
 438                struct page *newpage, struct page *page)
 439{
 440        struct buffer_head *bh, *head;
 441        int rc;
 442
 443        if (!page_has_buffers(page))
 444                return migrate_page(mapping, newpage, page);
 445
 446        head = page_buffers(page);
 447
 448        rc = migrate_page_move_mapping(mapping, newpage, page);
 449
 450        if (rc)
 451                return rc;
 452
 453        bh = head;
 454        do {
 455                get_bh(bh);
 456                lock_buffer(bh);
 457                bh = bh->b_this_page;
 458
 459        } while (bh != head);
 460
 461        ClearPagePrivate(page);
 462        set_page_private(newpage, page_private(page));
 463        set_page_private(page, 0);
 464        put_page(page);
 465        get_page(newpage);
 466
 467        bh = head;
 468        do {
 469                set_bh_page(bh, newpage, bh_offset(bh));
 470                bh = bh->b_this_page;
 471
 472        } while (bh != head);
 473
 474        SetPagePrivate(newpage);
 475
 476        migrate_page_copy(newpage, page);
 477
 478        bh = head;
 479        do {
 480                unlock_buffer(bh);
 481                 put_bh(bh);
 482                bh = bh->b_this_page;
 483
 484        } while (bh != head);
 485
 486        return 0;
 487}
 488EXPORT_SYMBOL(buffer_migrate_page);
 489#endif
 490
 491/*
 492 * Writeback a page to clean the dirty state
 493 */
 494static int writeout(struct address_space *mapping, struct page *page)
 495{
 496        struct writeback_control wbc = {
 497                .sync_mode = WB_SYNC_NONE,
 498                .nr_to_write = 1,
 499                .range_start = 0,
 500                .range_end = LLONG_MAX,
 501                .nonblocking = 1,
 502                .for_reclaim = 1
 503        };
 504        int rc;
 505
 506        if (!mapping->a_ops->writepage)
 507                /* No write method for the address space */
 508                return -EINVAL;
 509
 510        if (!clear_page_dirty_for_io(page))
 511                /* Someone else already triggered a write */
 512                return -EAGAIN;
 513
 514        /*
 515         * A dirty page may imply that the underlying filesystem has
 516         * the page on some queue. So the page must be clean for
  517         * migration. Writeout may mean we lose the lock and the
 518         * page state is no longer what we checked for earlier.
 519         * At this point we know that the migration attempt cannot
 520         * be successful.
 521         */
 522        remove_migration_ptes(page, page);
 523
 524        rc = mapping->a_ops->writepage(page, &wbc);
 525
 526        if (rc != AOP_WRITEPAGE_ACTIVATE)
 527                /* unlocked. Relock */
 528                lock_page(page);
 529
 530        return (rc < 0) ? -EIO : -EAGAIN;
 531}
 532
 533/*
 534 * Default handling if a filesystem does not provide a migration function.
 535 */
 536static int fallback_migrate_page(struct address_space *mapping,
 537        struct page *newpage, struct page *page)
 538{
 539        if (PageDirty(page))
 540                return writeout(mapping, page);
 541
 542        /*
 543         * Buffers may be managed in a filesystem specific way.
 544         * We must have no buffers or drop them.
 545         */
 546        if (PagePrivate(page) &&
 547            !try_to_release_page(page, GFP_KERNEL))
 548                return -EAGAIN;
 549
 550        return migrate_page(mapping, newpage, page);
 551}
 552
 553/*
 554 * Move a page to a newly allocated page
 555 * The page is locked and all ptes have been successfully removed.
 556 *
 557 * The new page will have replaced the old page if this function
 558 * is successful.
 559 *
 560 * Return value:
 561 *   < 0 - error code
 562 *  == 0 - success
 563 */
 564static int move_to_new_page(struct page *newpage, struct page *page)
 565{
 566        struct address_space *mapping;
 567        int rc;
 568
 569        /*
 570         * Block others from accessing the page when we get around to
 571         * establishing additional references. We are the only one
 572         * holding a reference to the new page at this point.
 573         */
 574        if (!trylock_page(newpage))
 575                BUG();
 576
 577        /* Prepare mapping for the new page.*/
 578        newpage->index = page->index;
 579        newpage->mapping = page->mapping;
 580        if (PageSwapBacked(page))
 581                SetPageSwapBacked(newpage);
 582
 583        mapping = page_mapping(page);
 584        if (!mapping)
 585                rc = migrate_page(mapping, newpage, page);
 586        else if (mapping->a_ops->migratepage)
 587                /*
 588                 * Most pages have a mapping and most filesystems
 589                 * should provide a migration function. Anonymous
 590                 * pages are part of swap space which also has its
 591                 * own migration function. This is the most common
 592                 * path for page migration.
 593                 */
 594                rc = mapping->a_ops->migratepage(mapping,
 595                                                newpage, page);
 596        else
 597                rc = fallback_migrate_page(mapping, newpage, page);
 598
 599        if (!rc) {
 600                remove_migration_ptes(page, newpage);
 601        } else
 602                newpage->mapping = NULL;
 603
 604        unlock_page(newpage);
 605
 606        return rc;
 607}
 608
 609/*
 610 * Obtain the lock on page, remove all ptes and migrate the page
 611 * to the newly allocated page in newpage.
 612 */
 613static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 614                        struct page *page, int force)
 615{
 616        int rc = 0;
 617        int *result = NULL;
 618        struct page *newpage = get_new_page(page, private, &result);
 619        int rcu_locked = 0;
 620        int charge = 0;
 621
 622        if (!newpage)
 623                return -ENOMEM;
 624
 625        if (page_count(page) == 1) {
 626                /* page was freed from under us. So we are done. */
 627                goto move_newpage;
 628        }
 629
 630        charge = mem_cgroup_prepare_migration(page, newpage);
 631        if (charge == -ENOMEM) {
 632                rc = -ENOMEM;
 633                goto move_newpage;
 634        }
 635        /* prepare cgroup just returns 0 or -ENOMEM */
 636        BUG_ON(charge);
 637
 638        rc = -EAGAIN;
 639        if (!trylock_page(page)) {
 640                if (!force)
 641                        goto move_newpage;
 642                lock_page(page);
 643        }
 644
 645        if (PageWriteback(page)) {
 646                if (!force)
 647                        goto unlock;
 648                wait_on_page_writeback(page);
 649        }
 650        /*
 651         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
  652         * we cannot notice that anon_vma is freed while we migrate a page.
  653         * This rcu_read_lock() delays freeing the anon_vma pointer until the
  654         * end of migration. File cache pages are no problem because of
  655         * page_lock(); file caches may use write_page() or lock_page() during
  656         * migration, so only anonymous pages need care here.
 657         */
 658        if (PageAnon(page)) {
 659                rcu_read_lock();
 660                rcu_locked = 1;
 661        }
 662
 663        /*
 664         * Corner case handling:
 665         * 1. When a new swap-cache page is read into, it is added to the LRU
 666         * and treated as swapcache but it has no rmap yet.
 667         * Calling try_to_unmap() against a page->mapping==NULL page will
 668         * trigger a BUG.  So handle it here.
 669         * 2. An orphaned page (see truncate_complete_page) might have
 670         * fs-private metadata. The page can be picked up due to memory
 671         * offlining.  Everywhere else except page reclaim, the page is
 672         * invisible to the vm, so the page can not be migrated.  So try to
 673         * free the metadata, so the page can be freed.
 674         */
 675        if (!page->mapping) {
 676                if (!PageAnon(page) && PagePrivate(page)) {
 677                        /*
 678                         * Go direct to try_to_free_buffers() here because
 679                         * a) that's what try_to_release_page() would do anyway
 680                         * b) we may be under rcu_read_lock() here, so we can't
 681                         *    use GFP_KERNEL which is what try_to_release_page()
 682                         *    needs to be effective.
 683                         */
 684                        try_to_free_buffers(page);
 685                }
 686                goto rcu_unlock;
 687        }
 688
 689        /* Establish migration ptes or remove ptes */
 690        try_to_unmap(page, 1);
 691
 692        if (!page_mapped(page))
 693                rc = move_to_new_page(newpage, page);
 694
 695        if (rc)
 696                remove_migration_ptes(page, page);
 697rcu_unlock:
 698        if (rcu_locked)
 699                rcu_read_unlock();
 700
 701unlock:
 702        unlock_page(page);
 703
 704        if (rc != -EAGAIN) {
 705                 /*
 706                  * A page that has been migrated has all references
 707                  * removed and will be freed. A page that has not been
  708                  * migrated will have kept its references and be
 709                  * restored.
 710                  */
 711                 list_del(&page->lru);
 712                putback_lru_page(page);
 713        }
 714
 715move_newpage:
 716        if (!charge)
 717                mem_cgroup_end_migration(newpage);
 718
 719        /*
 720         * Move the new page to the LRU. If migration was not successful
 721         * then this will free the page.
 722         */
 723        putback_lru_page(newpage);
 724
 725        if (result) {
 726                if (rc)
 727                        *result = rc;
 728                else
 729                        *result = page_to_nid(newpage);
 730        }
 731        return rc;
 732}
 733
 734/*
 735 * migrate_pages
 736 *
 737 * The function takes one list of pages to migrate and a function
 738 * that determines from the page to be migrated and the private data
 739 * the target of the move and allocates the page.
 740 *
 741 * The function returns after 10 attempts or if no pages
  742 * are movable anymore because the list has become empty
 743 * or no retryable pages exist anymore. All pages will be
 744 * returned to the LRU or freed.
 745 *
 746 * Return: Number of pages not migrated or error code.
 747 */
 748int migrate_pages(struct list_head *from,
 749                new_page_t get_new_page, unsigned long private)
 750{
 751        int retry = 1;
 752        int nr_failed = 0;
 753        int pass = 0;
 754        struct page *page;
 755        struct page *page2;
 756        int swapwrite = current->flags & PF_SWAPWRITE;
 757        int rc;
 758
 759        if (!swapwrite)
 760                current->flags |= PF_SWAPWRITE;
 761
 762        for(pass = 0; pass < 10 && retry; pass++) {
 763                retry = 0;
 764
 765                list_for_each_entry_safe(page, page2, from, lru) {
 766                        cond_resched();
 767
 768                        rc = unmap_and_move(get_new_page, private,
 769                                                page, pass > 2);
 770
 771                        switch(rc) {
 772                        case -ENOMEM:
 773                                goto out;
 774                        case -EAGAIN:
 775                                retry++;
 776                                break;
 777                        case 0:
 778                                break;
 779                        default:
 780                                /* Permanent failure */
 781                                nr_failed++;
 782                                break;
 783                        }
 784                }
 785        }
 786        rc = 0;
 787out:
 788        if (!swapwrite)
 789                current->flags &= ~PF_SWAPWRITE;
 790
 791        putback_lru_pages(from);
 792
 793        if (rc)
 794                return rc;
 795
 796        return nr_failed + retry;
 797}
 798
 799#ifdef CONFIG_NUMA
 800/*
 801 * Move a list of individual pages
 802 */
 803struct page_to_node {
 804        unsigned long addr;
 805        struct page *page;
 806        int node;
 807        int status;
 808};
 809
 810static struct page *new_page_node(struct page *p, unsigned long private,
 811                int **result)
 812{
 813        struct page_to_node *pm = (struct page_to_node *)private;
 814
 815        while (pm->node != MAX_NUMNODES && pm->page != p)
 816                pm++;
 817
 818        if (pm->node == MAX_NUMNODES)
 819                return NULL;
 820
 821        *result = &pm->status;
 822
 823        return alloc_pages_node(pm->node,
 824                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 825}
 826
 827/*
 828 * Move a set of pages as indicated in the pm array. The addr
 829 * field must be set to the virtual address of the page to be moved
 830 * and the node number must contain a valid target node.
 831 * The pm array ends with node = MAX_NUMNODES.
 832 */
 833static int do_move_page_to_node_array(struct mm_struct *mm,
 834                                      struct page_to_node *pm,
 835                                      int migrate_all)
 836{
 837        int err;
 838        struct page_to_node *pp;
 839        LIST_HEAD(pagelist);
 840
 841        migrate_prep();
 842        down_read(&mm->mmap_sem);
 843
 844        /*
 845         * Build a list of pages to migrate
 846         */
 847        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
 848                struct vm_area_struct *vma;
 849                struct page *page;
 850
 851                /*
 852                 * A valid page pointer that will not match any of the
 853                 * pages that will be moved.
 854                 */
 855                pp->page = ZERO_PAGE(0);
 856
 857                err = -EFAULT;
 858                vma = find_vma(mm, pp->addr);
 859                if (!vma || !vma_migratable(vma))
 860                        goto set_status;
 861
 862                page = follow_page(vma, pp->addr, FOLL_GET);
 863
 864                err = PTR_ERR(page);
 865                if (IS_ERR(page))
 866                        goto set_status;
 867
 868                err = -ENOENT;
 869                if (!page)
 870                        goto set_status;
 871
 872                if (PageReserved(page))                /* Check for zero page */
 873                        goto put_and_set;
 874
 875                pp->page = page;
 876                err = page_to_nid(page);
 877
 878                if (err == pp->node)
 879                        /*
 880                         * Node already in the right place
 881                         */
 882                        goto put_and_set;
 883
 884                err = -EACCES;
 885                if (page_mapcount(page) > 1 &&
 886                                !migrate_all)
 887                        goto put_and_set;
 888
 889                err = isolate_lru_page(page);
 890                if (!err)
 891                        list_add_tail(&page->lru, &pagelist);
 892put_and_set:
 893                /*
 894                 * Either remove the duplicate refcount from
 895                 * isolate_lru_page() or drop the page ref if it was
 896                 * not isolated.
 897                 */
 898                put_page(page);
 899set_status:
 900                pp->status = err;
 901        }
 902
 903        err = 0;
 904        if (!list_empty(&pagelist))
 905                err = migrate_pages(&pagelist, new_page_node,
 906                                (unsigned long)pm);
 907
 908        up_read(&mm->mmap_sem);
 909        return err;
 910}
 911
 912/*
  913 * Migrate an array of page addresses onto an array of nodes and fill
 914 * the corresponding array of status.
 915 */
 916static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 917                         unsigned long nr_pages,
 918                         const void __user * __user *pages,
 919                         const int __user *nodes,
 920                         int __user *status, int flags)
 921{
 922        struct page_to_node *pm = NULL;
 923        nodemask_t task_nodes;
 924        int err = 0;
 925        int i;
 926
 927        task_nodes = cpuset_mems_allowed(task);
 928
 929        /* Limit nr_pages so that the multiplication may not overflow */
 930        if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
 931                err = -E2BIG;
 932                goto out;
 933        }
 934
 935        pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
 936        if (!pm) {
 937                err = -ENOMEM;
 938                goto out;
 939        }
 940
 941        /*
 942         * Get parameters from user space and initialize the pm
 943         * array. Return various errors if the user did something wrong.
 944         */
 945        for (i = 0; i < nr_pages; i++) {
 946                const void __user *p;
 947
 948                err = -EFAULT;
 949                if (get_user(p, pages + i))
 950                        goto out_pm;
 951
 952                pm[i].addr = (unsigned long)p;
 953                if (nodes) {
 954                        int node;
 955
 956                        if (get_user(node, nodes + i))
 957                                goto out_pm;
 958
 959                        err = -ENODEV;
 960                        if (!node_state(node, N_HIGH_MEMORY))
 961                                goto out_pm;
 962
 963                        err = -EACCES;
 964                        if (!node_isset(node, task_nodes))
 965                                goto out_pm;
 966
 967                        pm[i].node = node;
 968                } else
 969                        pm[i].node = 0;        /* anything to not match MAX_NUMNODES */
 970        }
 971        /* End marker */
 972        pm[nr_pages].node = MAX_NUMNODES;
 973
 974        err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
 975        if (err >= 0)
 976                /* Return status information */
 977                for (i = 0; i < nr_pages; i++)
 978                        if (put_user(pm[i].status, status + i))
 979                                err = -EFAULT;
 980
 981out_pm:
 982        vfree(pm);
 983out:
 984        return err;
 985}
 986
 987/*
  988 * Determine the nodes of an array of pages and store them in an array of status.
 989 */
 990static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 991                                const void __user **pages, int *status)
 992{
 993        unsigned long i;
 994
 995        down_read(&mm->mmap_sem);
 996
 997        for (i = 0; i < nr_pages; i++) {
 998                unsigned long addr = (unsigned long)(*pages);
 999                struct vm_area_struct *vma;
1000                struct page *page;
1001                int err = -EFAULT;
1002
1003                vma = find_vma(mm, addr);
1004                if (!vma)
1005                        goto set_status;
1006
1007                page = follow_page(vma, addr, 0);
1008
1009                err = PTR_ERR(page);
1010                if (IS_ERR(page))
1011                        goto set_status;
1012
1013                err = -ENOENT;
1014                /* Use PageReserved to check for zero page */
1015                if (!page || PageReserved(page))
1016                        goto set_status;
1017
1018                err = page_to_nid(page);
1019set_status:
1020                *status = err;
1021
1022                pages++;
1023                status++;
1024        }
1025
1026        up_read(&mm->mmap_sem);
1027}
1028
1029/*
 1030 * Determine the nodes of a user array of pages and store them in
1031 * a user array of status.
1032 */
1033static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1034                         const void __user * __user *pages,
1035                         int __user *status)
1036{
1037#define DO_PAGES_STAT_CHUNK_NR 16
1038        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1039        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1040        unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1041        int err;
1042
1043        for (i = 0; i < nr_pages; i += chunk_nr) {
1044                if (chunk_nr + i > nr_pages)
1045                        chunk_nr = nr_pages - i;
1046
1047                err = copy_from_user(chunk_pages, &pages[i],
1048                                     chunk_nr * sizeof(*chunk_pages));
1049                if (err) {
1050                        err = -EFAULT;
1051                        goto out;
1052                }
1053
1054                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1055
1056                err = copy_to_user(&status[i], chunk_status,
1057                                   chunk_nr * sizeof(*chunk_status));
1058                if (err) {
1059                        err = -EFAULT;
1060                        goto out;
1061                }
1062        }
1063        err = 0;
1064
1065out:
1066        return err;
1067}
1068
1069/*
1070 * Move a list of pages in the address space of the currently executing
1071 * process.
1072 */
1073asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
1074                        const void __user * __user *pages,
1075                        const int __user *nodes,
1076                        int __user *status, int flags)
1077{
1078        struct task_struct *task;
1079        struct mm_struct *mm;
1080        int err;
1081
1082        /* Check flags */
1083        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1084                return -EINVAL;
1085
1086        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1087                return -EPERM;
1088
1089        /* Find the mm_struct */
1090        read_lock(&tasklist_lock);
1091        task = pid ? find_task_by_vpid(pid) : current;
1092        if (!task) {
1093                read_unlock(&tasklist_lock);
1094                return -ESRCH;
1095        }
1096        mm = get_task_mm(task);
1097        read_unlock(&tasklist_lock);
1098
1099        if (!mm)
1100                return -EINVAL;
1101
1102        /*
1103         * Check if this process has the right to modify the specified
1104         * process. The right exists if the process has administrative
1105         * capabilities, superuser privileges or the same
1106         * userid as the target process.
1107         */
1108        if ((current->euid != task->suid) && (current->euid != task->uid) &&
1109            (current->uid != task->suid) && (current->uid != task->uid) &&
1110            !capable(CAP_SYS_NICE)) {
1111                err = -EPERM;
1112                goto out;
1113        }
1114
1115         err = security_task_movememory(task);
1116         if (err)
1117                goto out;
1118
1119        if (nodes) {
1120                err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1121                                    flags);
1122        } else {
1123                err = do_pages_stat(mm, nr_pages, pages, status);
1124        }
1125
1126out:
1127        mmput(mm);
1128        return err;
1129}
1130
1131/*
1132 * Call migration functions in the vma_ops that may prepare
 1133 * memory in a vm for migration. Migration functions may perform
1134 * the migration for vmas that do not have an underlying page struct.
1135 */
1136int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1137        const nodemask_t *from, unsigned long flags)
1138{
1139         struct vm_area_struct *vma;
1140         int err = 0;
1141
1142         for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
1143                 if (vma->vm_ops && vma->vm_ops->migrate) {
1144                         err = vma->vm_ops->migrate(vma, to, from, flags);
1145                         if (err)
1146                                 break;
1147                 }
1148         }
1149         return err;
1150}
1151#endif
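
Note on the flagged location:

Line 528 of the listing above is the lock_page(page) call in writeout().
lock_page() may sleep (in this kernel it is annotated with might_sleep()),
which matches the error class described above. Below is a simplified,
hypothetical sketch of how such an annotation reports the problem at runtime;
the helper name is illustrative, the real kernel uses might_sleep() and
preempt_count().

    #include <linux/kernel.h>
    #include <linux/hardirq.h>
    #include <linux/pagemap.h>

    /* Illustrative stand-in for the kernel's might_sleep() debugging check. */
    static inline void assert_may_sleep(const char *func)
    {
            if (in_atomic() || irqs_disabled())
                    printk(KERN_ERR
                           "BUG: sleeping function called from invalid context: %s\n",
                           func);
    }

    static void example_sleeping_call(struct page *page)
    {
            assert_may_sleep(__func__);     /* would fire if reached in atomic context */
            lock_page(page);                /* may sleep waiting for the page lock */
    }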